diff --git a/sft_pretrain/Full_smoe_perturbed/added_tokens.json b/sft_pretrain/Full_smoe_perturbed/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/added_tokens.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/config.json new file mode 100644 index 0000000000000000000000000000000000000000..987150c78c9255ac53c0408588036e10466fc436 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_perturbed", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/generation_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c6adeb3d6a792896e3c3d384bad71040b4f79d9 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a93577b7cc73c9b2eddf166ec1c3663b4ed855e07c1ac26c5e0278b0a5b79e4 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d859eec78bf60f354994d89cbdf35a6b0e788d90 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b5ee93419e33026e34d53cd520b68b174b94b2bbbc1c79ca6d5b656dd03b129 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00679f8a2f677a91e08f2c2040e649497dcb95e3 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9805c8013f2d532fb94af92e1fd7e19a9297308084d6b618c592f4af0670511 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a831fcf027efcc9ca5deba9c66d5247820da8e4c --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3068b759d18a28e87d0088e50d00e7f3993f18320978dd6f51e2c9d304a80f23 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ecff8f5c1b8788e53f014f70e228e88ebd4d649 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59ba19b363550e9a25e7d805564fee0fed57860909fc1dc7ac1a445a8c18d6e +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46d7a093d49d2e63bba7607dcd668861975c1117 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0881087c748bdb8d5772b9d17a45fde64ad3e2e997c0aed71dac2d850c8be46 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cda3b0ddd711eecc2505350be327593a7c35049 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90af7c7b3d4e31ce6f71032c983c9befd8f520868d0c9dc4524434c7c43af278 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed707c38f9f7e1000129ee39840038fb2d32e506 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a80d91982d89e25f800ec353b89ec3ddcb63dcc51070f4cbe9c7a6ff4dadafdd +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/latest b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/latest new file mode 100644 index 0000000000000000000000000000000000000000..f37da78e3c7eee26ebe5f06b54d6621716edb6b9 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/latest @@ -0,0 +1 @@ +global_step1040 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bfbef6f731f832a552b873d620b9afa58ced3f44 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c18ebfa64747ca7259c4e8a25d9a7dd8483a517e23ce27037d5223f83e2e9c +size 3759043888 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model.safetensors.index.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..01fe755c95da02467d97df3e39228dbbb26b065f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/model.safetensors.index.json @@ -0,0 +1,674 @@ +{ + "metadata": { + "total_size": 8731443232 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_0.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_1.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_2.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_3.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/special_tokens_map.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/tokenizer.model b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/tokenizer_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/trainer_state.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2de4d6c0c5e8d808a2820f5f43575d3359bc1b8c --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/trainer_state.json @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02574398, + "balance_loss_mlp": 1.85189414, + "epoch": 0.00019238168526356292, + "flos": 471022176768.0, + "grad_norm": 12.86455737221305, + "language_loss": 2.79777646, + "learning_rate": 0.0, + "loss": 1.8614465, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 7.2109375, + "step": 1, + "time_per_iteration": 21.83068585395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02254613, + "balance_loss_mlp": 1.76785779, + "epoch": 0.00038476337052712584, + "flos": 505537981440.0, + "grad_norm": 51.581369656319104, + "language_loss": 12.34714699, + "learning_rate": 0.00013726078121135892, + "loss": 12.3696928, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 4.875, + "step": 2, + "time_per_iteration": 2.6192572116851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235864, + "balance_loss_mlp": 1.75177932, + "epoch": 0.0005771450557906887, + "flos": 600333152256.0, + "grad_norm": 53.41660983156924, + "language_loss": 12.32898235, + "learning_rate": 0.00021755319103969496, + "loss": 12.35134125, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 4.84765625, + "step": 3, + "time_per_iteration": 2.887979030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02281771, + "balance_loss_mlp": 1.79577887, + "epoch": 0.0007695267410542517, + "flos": 581496442368.0, + "grad_norm": 15.812083363335244, + "language_loss": 9.24414825, + "learning_rate": 0.00027452156242271784, + "loss": 9.26696682, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 4.8671875, + "step": 4, + "time_per_iteration": 2.6792547702789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02454864, + "balance_loss_mlp": 1.95551991, + "epoch": 0.0009619084263178145, + "flos": 487153164288.0, + "grad_norm": 10.3691594005885, + "language_loss": 9.1886158, + "learning_rate": 0.0003187096642208417, + "loss": 9.21316433, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 4.98828125, + "step": 5, + "time_per_iteration": 2.627883195877075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0247156, + "balance_loss_mlp": 1.97450531, + "epoch": 0.0011542901115813775, + "flos": 561166519296.0, + "grad_norm": 9.061082825397735, + "language_loss": 9.31672573, + "learning_rate": 0.0003548139722510539, + "loss": 9.34144115, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 4.96875, + "step": 6, + "time_per_iteration": 2.697327136993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02496704, + "balance_loss_mlp": 1.9977417, + "epoch": 0.0013466717968449403, + "flos": 534950886912.0, + "grad_norm": 5.1401213461899875, + "language_loss": 8.45638084, + "learning_rate": 0.00038533972973918044, + "loss": 8.48134804, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 4.984375, + "step": 7, + "time_per_iteration": 2.6605119705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02367166, + "balance_loss_mlp": 1.8800292, + "epoch": 0.0015390534821085034, + "flos": 493333587456.0, + "grad_norm": 4.765795170053606, + "language_loss": 7.86978722, + "learning_rate": 0.0004117823436340768, + "loss": 7.89345884, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 4.87890625, + "step": 8, + "time_per_iteration": 2.60813570022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02377529, + "balance_loss_mlp": 1.89153647, + "epoch": 0.0017314351673720662, + "flos": 565775139840.0, + "grad_norm": 2.6394105736579268, + "language_loss": 7.60834789, + "learning_rate": 0.00043510638207938993, + "loss": 7.63212299, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 4.8671875, + "step": 9, + "time_per_iteration": 2.871943712234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0239868, + "balance_loss_mlp": 1.91802776, + "epoch": 0.001923816852635629, + "flos": 594508568064.0, + "grad_norm": 2.7082435786924752, + "language_loss": 7.06748104, + "learning_rate": 0.00045597044543220066, + "loss": 7.09146786, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 4.8125, + "step": 10, + "time_per_iteration": 2.671294689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02381293, + "balance_loss_mlp": 1.90254807, + "epoch": 0.002116198537899192, + "flos": 610894611456.0, + "grad_norm": 2.113301815517677, + "language_loss": 6.83692646, + "learning_rate": 0.00047484428652143135, + "loss": 6.86073971, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.79296875, + "step": 11, + "time_per_iteration": 2.885416269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02427226, + "balance_loss_mlp": 1.95687437, + "epoch": 0.002308580223162755, + "flos": 546174359040.0, + "grad_norm": 1.7416212933802626, + "language_loss": 6.4295001, + "learning_rate": 0.0004920747534624128, + "loss": 6.45377207, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.70703125, + "step": 12, + "time_per_iteration": 2.6201112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503769, + "balance_loss_mlp": 2.03265429, + "epoch": 0.002500961908426318, + "flos": 645923255808.0, + "grad_norm": 2.43618245016211, + "language_loss": 6.0048914, + "learning_rate": 0.0005079252465375872, + "loss": 6.02992916, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.71484375, + "step": 13, + "time_per_iteration": 2.852263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02634854, + "balance_loss_mlp": 2.15916157, + "epoch": 0.0026933435936898806, + "flos": 488848492032.0, + "grad_norm": 4.143842376760835, + "language_loss": 5.42230844, + "learning_rate": 0.0005226005109505393, + "loss": 5.44865704, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 4.76171875, + "step": 14, + "time_per_iteration": 2.5524611473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02844198, + "balance_loss_mlp": 2.3646903, + "epoch": 0.0028857252789534437, + "flos": 435525628416.0, + "grad_norm": 5.672862092220106, + "language_loss": 4.15845776, + "learning_rate": 0.0005362628552605367, + "loss": 4.18689966, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 4.80078125, + "step": 15, + "time_per_iteration": 2.7353649139404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03208902, + "balance_loss_mlp": 2.72252893, + "epoch": 0.0030781069642170067, + "flos": 597840826368.0, + "grad_norm": 3.947061509829782, + "language_loss": 2.26971245, + "learning_rate": 0.0005490431248454357, + "loss": 2.30180168, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 4.87109375, + "step": 16, + "time_per_iteration": 2.676703929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03601284, + "balance_loss_mlp": 3.10232162, + "epoch": 0.0032704886494805694, + "flos": 1541510280192.0, + "grad_norm": 0.6213816402988768, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.793064, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 5.0, + "step": 17, + "time_per_iteration": 6.1610119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334326, + "balance_loss_mlp": 2.85841203, + "epoch": 0.0034628703347441324, + "flos": 474970237440.0, + "grad_norm": 2.8341915883282045, + "language_loss": 1.71282685, + "learning_rate": 0.0005723671632907488, + "loss": 1.74625945, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 4.85546875, + "step": 18, + "time_per_iteration": 2.638371467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02881518, + "balance_loss_mlp": 2.39934015, + "epoch": 0.0036552520200076955, + "flos": 449477743104.0, + "grad_norm": 2.8867361132515086, + "language_loss": 1.68530536, + "learning_rate": 0.0005830738490244919, + "loss": 1.71412063, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.828125, + "step": 19, + "time_per_iteration": 2.56374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02402526, + "balance_loss_mlp": 1.92301893, + "epoch": 0.003847633705271258, + "flos": 637350563328.0, + "grad_norm": 0.6925173808128176, + "language_loss": 1.38203168, + "learning_rate": 0.0005932312266435596, + "loss": 1.406057, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.80078125, + "step": 20, + "time_per_iteration": 2.763998508453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02421171, + "balance_loss_mlp": 1.94814897, + "epoch": 0.004040015390534821, + "flos": 590590158336.0, + "grad_norm": 1.6265477944222306, + "language_loss": 1.40919662, + "learning_rate": 0.0006028929207788754, + "loss": 1.43340826, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.734375, + "step": 21, + "time_per_iteration": 2.746016502380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575294, + "balance_loss_mlp": 2.10036469, + "epoch": 0.004232397075798384, + "flos": 757865812992.0, + "grad_norm": 1.576079326940489, + "language_loss": 1.40810275, + "learning_rate": 0.0006121050677327902, + "loss": 1.43385565, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.75390625, + "step": 22, + "time_per_iteration": 2.9607386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550906, + "balance_loss_mlp": 2.07025433, + "epoch": 0.004424778761061947, + "flos": 527726415360.0, + "grad_norm": 0.6323448080178445, + "language_loss": 1.22419024, + "learning_rate": 0.0006209076479463684, + "loss": 1.24969923, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.8125, + "step": 23, + "time_per_iteration": 2.5966527462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02511897, + "balance_loss_mlp": 2.02285314, + "epoch": 0.00461716044632551, + "flos": 549217907712.0, + "grad_norm": 0.22573529074246063, + "language_loss": 1.26396596, + "learning_rate": 0.0006293355346737718, + "loss": 1.28908491, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.8984375, + "step": 24, + "time_per_iteration": 2.672264575958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02557217, + "balance_loss_mlp": 2.05978036, + "epoch": 0.004809542131589073, + "flos": 568751559168.0, + "grad_norm": 0.10471299124135865, + "language_loss": 1.20974565, + "learning_rate": 0.0006374193284416834, + "loss": 1.23531783, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.96875, + "step": 25, + "time_per_iteration": 2.7392375469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02658191, + "balance_loss_mlp": 2.15503263, + "epoch": 0.005001923816852636, + "flos": 471583584768.0, + "grad_norm": 0.16888144752152706, + "language_loss": 1.20314312, + "learning_rate": 0.0006451860277489461, + "loss": 1.22972512, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.02734375, + "step": 26, + "time_per_iteration": 2.6047253608703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02722422, + "balance_loss_mlp": 2.21582985, + "epoch": 0.005194305502116198, + "flos": 416380743168.0, + "grad_norm": 0.22424567034217777, + "language_loss": 1.28844571, + "learning_rate": 0.0006526595731190848, + "loss": 1.31566989, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.0625, + "step": 27, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02743244, + "balance_loss_mlp": 2.2351265, + "epoch": 0.005386687187379761, + "flos": 629995835904.0, + "grad_norm": 0.15642653525507078, + "language_loss": 1.18914986, + "learning_rate": 0.0006598612921618983, + "loss": 1.2165823, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.078125, + "step": 28, + "time_per_iteration": 2.8519153594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02748247, + "balance_loss_mlp": 2.24051118, + "epoch": 0.005579068872643324, + "flos": 888019997184.0, + "grad_norm": 0.1209301216257677, + "language_loss": 1.12191987, + "learning_rate": 0.0006668102665011454, + "loss": 1.14940238, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.07421875, + "step": 29, + "time_per_iteration": 3.2244889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02691091, + "balance_loss_mlp": 2.18411779, + "epoch": 0.005771450557906887, + "flos": 548657952768.0, + "grad_norm": 0.1098895199150706, + "language_loss": 1.21368051, + "learning_rate": 0.0006735236364718957, + "loss": 1.24059153, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.06640625, + "step": 30, + "time_per_iteration": 2.642730474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02653145, + "balance_loss_mlp": 2.14769816, + "epoch": 0.00596383224317045, + "flos": 533068907520.0, + "grad_norm": 0.11046596793449442, + "language_loss": 1.1970098, + "learning_rate": 0.0006800168558381346, + "loss": 1.22354114, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.05078125, + "step": 31, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0257592, + "balance_loss_mlp": 2.07123542, + "epoch": 0.0061562139284340135, + "flos": 590162460672.0, + "grad_norm": 0.10949645130098669, + "language_loss": 1.22987807, + "learning_rate": 0.0006863039060567947, + "loss": 1.25563729, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.04296875, + "step": 32, + "time_per_iteration": 2.733224868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505923, + "balance_loss_mlp": 2.00390816, + "epoch": 0.006348595613697576, + "flos": 619441107456.0, + "grad_norm": 0.0835016489973258, + "language_loss": 1.14437437, + "learning_rate": 0.0006923974775611263, + "loss": 1.16943359, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.015625, + "step": 33, + "time_per_iteration": 2.820788621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02482464, + "balance_loss_mlp": 1.98159432, + "epoch": 0.006540977298961139, + "flos": 779298908160.0, + "grad_norm": 0.08776573315434787, + "language_loss": 1.10869515, + "learning_rate": 0.0006983091239737814, + "loss": 1.13351965, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.00390625, + "step": 34, + "time_per_iteration": 2.9917590618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02373805, + "balance_loss_mlp": 1.87636864, + "epoch": 0.006733358984224702, + "flos": 668372201472.0, + "grad_norm": 0.0744368555221442, + "language_loss": 1.09626412, + "learning_rate": 0.0007040493939600222, + "loss": 1.12000227, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 4.96875, + "step": 35, + "time_per_iteration": 2.813040256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308046, + "balance_loss_mlp": 1.81175399, + "epoch": 0.006925740669488265, + "flos": 565495162368.0, + "grad_norm": 0.06560236116646054, + "language_loss": 1.0974791, + "learning_rate": 0.0007096279445021078, + "loss": 1.12055957, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 4.95703125, + "step": 36, + "time_per_iteration": 2.715013027191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02240602, + "balance_loss_mlp": 1.74888754, + "epoch": 0.007118122354751828, + "flos": 551111347200.0, + "grad_norm": 0.05581405617561486, + "language_loss": 1.16120386, + "learning_rate": 0.0007150536386503726, + "loss": 1.18360972, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.91015625, + "step": 37, + "time_per_iteration": 2.8262643814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218804, + "balance_loss_mlp": 1.7293781, + "epoch": 0.007310504040015391, + "flos": 703813807104.0, + "grad_norm": 0.06412720029508237, + "language_loss": 1.08394384, + "learning_rate": 0.0007203346302358509, + "loss": 1.10613179, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.890625, + "step": 38, + "time_per_iteration": 2.9149320125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0220325, + "balance_loss_mlp": 1.71954608, + "epoch": 0.007502885725278953, + "flos": 600500338176.0, + "grad_norm": 0.08018675586540955, + "language_loss": 1.13587177, + "learning_rate": 0.000725478437577282, + "loss": 1.15790427, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.84375, + "step": 39, + "time_per_iteration": 2.7649383544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194939, + "balance_loss_mlp": 1.71237946, + "epoch": 0.007695267410542516, + "flos": 561427031040.0, + "grad_norm": 0.11080304178085185, + "language_loss": 1.08546591, + "learning_rate": 0.0007304920078549186, + "loss": 1.10741532, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.83203125, + "step": 40, + "time_per_iteration": 2.7245187759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02164234, + "balance_loss_mlp": 1.68548942, + "epoch": 0.007887649095806078, + "flos": 509230808064.0, + "grad_norm": 0.12864951336881933, + "language_loss": 1.10053396, + "learning_rate": 0.0007353817735343603, + "loss": 1.12217629, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.79296875, + "step": 41, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02109951, + "balance_loss_mlp": 1.63425827, + "epoch": 0.008080030781069641, + "flos": 504904166400.0, + "grad_norm": 0.0888118324595499, + "language_loss": 1.05816543, + "learning_rate": 0.0007401537019902344, + "loss": 1.07926488, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.76171875, + "step": 42, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065976, + "balance_loss_mlp": 1.59219027, + "epoch": 0.008272412466333205, + "flos": 519106059264.0, + "grad_norm": 0.08974821197730459, + "language_loss": 1.0785954, + "learning_rate": 0.0007448133392900729, + "loss": 1.09925508, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.7421875, + "step": 43, + "time_per_iteration": 2.677175998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955434, + "balance_loss_mlp": 1.4839375, + "epoch": 0.008464794151596768, + "flos": 609183820800.0, + "grad_norm": 0.06237767914218564, + "language_loss": 1.03785229, + "learning_rate": 0.0007493658489441491, + "loss": 1.05740666, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.71875, + "step": 44, + "time_per_iteration": 2.8553237915039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01864539, + "balance_loss_mlp": 1.39800107, + "epoch": 0.00865717583686033, + "flos": 539006283264.0, + "grad_norm": 0.049849947719683325, + "language_loss": 1.08088911, + "learning_rate": 0.0007538160463002316, + "loss": 1.09953451, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.66796875, + "step": 45, + "time_per_iteration": 2.637796640396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01780353, + "balance_loss_mlp": 1.31572247, + "epoch": 0.008849557522123894, + "flos": 509009227776.0, + "grad_norm": 0.046919324832442044, + "language_loss": 1.11748755, + "learning_rate": 0.0007581684291577274, + "loss": 1.1352911, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.6484375, + "step": 46, + "time_per_iteration": 2.5655901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764453, + "balance_loss_mlp": 1.30211222, + "epoch": 0.009041939207387457, + "flos": 626507125248.0, + "grad_norm": 0.05937298040562763, + "language_loss": 1.13580585, + "learning_rate": 0.0007624272050891776, + "loss": 1.15345049, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.625, + "step": 47, + "time_per_iteration": 2.804643392562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776852, + "balance_loss_mlp": 1.31908798, + "epoch": 0.00923432089265102, + "flos": 550609789440.0, + "grad_norm": 0.07500714899038924, + "language_loss": 1.03489327, + "learning_rate": 0.0007665963158851307, + "loss": 1.05266178, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.578125, + "step": 48, + "time_per_iteration": 2.781435489654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01771411, + "balance_loss_mlp": 1.3170805, + "epoch": 0.009426702577914583, + "flos": 563678310912.0, + "grad_norm": 0.07921486390615404, + "language_loss": 1.12758589, + "learning_rate": 0.0007706794594783609, + "loss": 1.14529991, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.54296875, + "step": 49, + "time_per_iteration": 2.739976644515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017484, + "balance_loss_mlp": 1.29483247, + "epoch": 0.009619084263178146, + "flos": 617925700608.0, + "grad_norm": 0.05671895540127436, + "language_loss": 1.10915053, + "learning_rate": 0.0007746801096530423, + "loss": 1.12663448, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.53515625, + "step": 50, + "time_per_iteration": 2.7333760261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01715641, + "balance_loss_mlp": 1.2616924, + "epoch": 0.009811465948441709, + "flos": 542488263168.0, + "grad_norm": 0.04785443300923319, + "language_loss": 1.16231108, + "learning_rate": 0.0007786015338021173, + "loss": 1.17946756, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.5390625, + "step": 51, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01700387, + "balance_loss_mlp": 1.24720073, + "epoch": 0.010003847633705272, + "flos": 536976583680.0, + "grad_norm": 0.04536583817216675, + "language_loss": 1.08076, + "learning_rate": 0.0007824468089603051, + "loss": 1.0977639, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.53125, + "step": 52, + "time_per_iteration": 2.6839513778686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01675834, + "balance_loss_mlp": 1.2218852, + "epoch": 0.010196229318968833, + "flos": 910805316096.0, + "grad_norm": 0.04374839581732082, + "language_loss": 1.0833261, + "learning_rate": 0.0007862188363098669, + "loss": 1.10008454, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.5390625, + "step": 53, + "time_per_iteration": 3.1748838424682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01650634, + "balance_loss_mlp": 1.19477725, + "epoch": 0.010388611004232396, + "flos": 586969190400.0, + "grad_norm": 0.045477377455174536, + "language_loss": 1.08262885, + "learning_rate": 0.0007899203543304438, + "loss": 1.09913516, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.55859375, + "step": 54, + "time_per_iteration": 2.7011117935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01588572, + "balance_loss_mlp": 1.13195276, + "epoch": 0.01058099268949596, + "flos": 503471351808.0, + "grad_norm": 0.05216939031034974, + "language_loss": 1.22650576, + "learning_rate": 0.0007935539507422731, + "loss": 1.24239147, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.56640625, + "step": 55, + "time_per_iteration": 2.6142656803131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553155, + "balance_loss_mlp": 1.09462798, + "epoch": 0.010773374374759523, + "flos": 545558008320.0, + "grad_norm": 0.04278176221573414, + "language_loss": 1.12836909, + "learning_rate": 0.0007971220733732573, + "loss": 1.14390063, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.5859375, + "step": 56, + "time_per_iteration": 2.718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586959, + "balance_loss_mlp": 1.1318655, + "epoch": 0.010965756060023086, + "flos": 527285982720.0, + "grad_norm": 0.06958617519474361, + "language_loss": 1.08844507, + "learning_rate": 0.0008006270400641869, + "loss": 1.10431468, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.55078125, + "step": 57, + "time_per_iteration": 2.702324628829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01576177, + "balance_loss_mlp": 1.12375367, + "epoch": 0.011158137745286649, + "flos": 578097054720.0, + "grad_norm": 0.08376433329063605, + "language_loss": 1.09231043, + "learning_rate": 0.0008040710477125043, + "loss": 1.10807228, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.5234375, + "step": 58, + "time_per_iteration": 2.733733892440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587306, + "balance_loss_mlp": 1.13793492, + "epoch": 0.011350519430550212, + "flos": 530314068480.0, + "grad_norm": 0.056261163559927586, + "language_loss": 1.098104, + "learning_rate": 0.0008074561805429771, + "loss": 1.11397719, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.4921875, + "step": 59, + "time_per_iteration": 2.604173183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_mlp": 1.0886867, + "epoch": 0.011542901115813775, + "flos": 556970133504.0, + "grad_norm": 0.07546157909609297, + "language_loss": 1.07214928, + "learning_rate": 0.0008107844176832545, + "loss": 1.08748412, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.45703125, + "step": 60, + "time_per_iteration": 2.670180082321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01515203, + "balance_loss_mlp": 1.07155395, + "epoch": 0.011735282801077338, + "flos": 573175529472.0, + "grad_norm": 0.06932920743779293, + "language_loss": 1.09267807, + "learning_rate": 0.0008140576401132568, + "loss": 1.10783005, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.44921875, + "step": 61, + "time_per_iteration": 2.635917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01537914, + "balance_loss_mlp": 1.0965538, + "epoch": 0.0119276644863409, + "flos": 616716467712.0, + "grad_norm": 0.056166475672555005, + "language_loss": 1.10548615, + "learning_rate": 0.0008172776370494935, + "loss": 1.12086535, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.42578125, + "step": 62, + "time_per_iteration": 2.709764242172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.015397, + "balance_loss_mlp": 1.10024714, + "epoch": 0.012120046171604464, + "flos": 502084199424.0, + "grad_norm": 0.046962065793300374, + "language_loss": 1.17909575, + "learning_rate": 0.0008204461118185703, + "loss": 1.19449282, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.40625, + "step": 63, + "time_per_iteration": 2.5971004962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545078, + "balance_loss_mlp": 1.10943925, + "epoch": 0.012312427856868027, + "flos": 474301493760.0, + "grad_norm": 0.04671162143151921, + "language_loss": 1.07277906, + "learning_rate": 0.0008235646872681536, + "loss": 1.08822989, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 4.3671875, + "step": 64, + "time_per_iteration": 2.567622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01534227, + "balance_loss_mlp": 1.10240316, + "epoch": 0.012504809542131588, + "flos": 539470910976.0, + "grad_norm": 0.04435006978162803, + "language_loss": 1.0673492, + "learning_rate": 0.0008266349107584288, + "loss": 1.08269131, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 4.328125, + "step": 65, + "time_per_iteration": 2.6833384037017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149994, + "balance_loss_mlp": 1.07345641, + "epoch": 0.012697191227395151, + "flos": 609856567296.0, + "grad_norm": 0.04524096047594039, + "language_loss": 1.09403265, + "learning_rate": 0.0008296582587724851, + "loss": 1.10903215, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 4.2734375, + "step": 66, + "time_per_iteration": 2.692337989807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01482262, + "balance_loss_mlp": 1.05806744, + "epoch": 0.012889572912658714, + "flos": 769397460480.0, + "grad_norm": 0.04198159389490698, + "language_loss": 1.06809163, + "learning_rate": 0.0008326361411800136, + "loss": 1.08291411, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 4.25, + "step": 67, + "time_per_iteration": 2.923720598220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474655, + "balance_loss_mlp": 1.05503809, + "epoch": 0.013081954597922277, + "flos": 535020744192.0, + "grad_norm": 0.041919130945389606, + "language_loss": 1.07100165, + "learning_rate": 0.0008355699051851403, + "loss": 1.0857482, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 4.203125, + "step": 68, + "time_per_iteration": 2.7417044639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462817, + "balance_loss_mlp": 1.04701531, + "epoch": 0.01327433628318584, + "flos": 574180646400.0, + "grad_norm": 0.041322055356332446, + "language_loss": 1.14468551, + "learning_rate": 0.0008384608389860635, + "loss": 1.15931368, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 4.1640625, + "step": 69, + "time_per_iteration": 2.6545376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450151, + "balance_loss_mlp": 1.03930819, + "epoch": 0.013466717968449404, + "flos": 498259115520.0, + "grad_norm": 0.039605765449237204, + "language_loss": 1.04742777, + "learning_rate": 0.000841310175171381, + "loss": 1.06192923, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 4.11328125, + "step": 70, + "time_per_iteration": 2.5687999725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441096, + "balance_loss_mlp": 1.03101599, + "epoch": 0.013659099653712967, + "flos": 566621803008.0, + "grad_norm": 0.03646297128801074, + "language_loss": 1.03104186, + "learning_rate": 0.000844119093875517, + "loss": 1.04545283, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 4.1015625, + "step": 71, + "time_per_iteration": 2.698259115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433469, + "balance_loss_mlp": 1.02720368, + "epoch": 0.01385148133897653, + "flos": 574942715904.0, + "grad_norm": 0.02854119406997066, + "language_loss": 1.07372236, + "learning_rate": 0.0008468887257134666, + "loss": 1.08805704, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 4.06445312, + "step": 72, + "time_per_iteration": 2.7074387073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422625, + "balance_loss_mlp": 1.01941192, + "epoch": 0.014043863024240093, + "flos": 577958066688.0, + "grad_norm": 0.03113282173853564, + "language_loss": 1.10314119, + "learning_rate": 0.0008496201545131264, + "loss": 1.11736751, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 4.03515625, + "step": 73, + "time_per_iteration": 2.725660562515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425762, + "balance_loss_mlp": 1.02655351, + "epoch": 0.014236244709503656, + "flos": 940263883776.0, + "grad_norm": 0.033199488198319166, + "language_loss": 1.07624495, + "learning_rate": 0.0008523144198617317, + "loss": 1.0905025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.99414062, + "step": 74, + "time_per_iteration": 3.2577481269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437934, + "balance_loss_mlp": 1.04139662, + "epoch": 0.014428626394767219, + "flos": 529495603200.0, + "grad_norm": 0.03119178099318558, + "language_loss": 1.07016373, + "learning_rate": 0.0008549725194813783, + "loss": 1.08454299, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.96679688, + "step": 75, + "time_per_iteration": 2.727982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437754, + "balance_loss_mlp": 1.0446496, + "epoch": 0.014621008080030782, + "flos": 805282226688.0, + "grad_norm": 0.02968258762679391, + "language_loss": 1.06415534, + "learning_rate": 0.0008575954114472099, + "loss": 1.07853293, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.93164062, + "step": 76, + "time_per_iteration": 3.172807455062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143975, + "balance_loss_mlp": 1.04950643, + "epoch": 0.014813389765294343, + "flos": 698356521984.0, + "grad_norm": 0.031905123056971844, + "language_loss": 1.03629625, + "learning_rate": 0.0008601840162606118, + "loss": 1.05069387, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.90234375, + "step": 77, + "time_per_iteration": 3.029114007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438585, + "balance_loss_mlp": 1.05158365, + "epoch": 0.015005771450557906, + "flos": 598164464640.0, + "grad_norm": 0.026994348673938514, + "language_loss": 1.09661531, + "learning_rate": 0.000862739218788641, + "loss": 1.11100101, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.86914062, + "step": 78, + "time_per_iteration": 2.795952320098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440626, + "balance_loss_mlp": 1.05705774, + "epoch": 0.01519815313582147, + "flos": 550492268544.0, + "grad_norm": 0.029495859587709627, + "language_loss": 1.07574832, + "learning_rate": 0.0008652618700799138, + "loss": 1.09015465, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.83789062, + "step": 79, + "time_per_iteration": 2.6552224159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430975, + "balance_loss_mlp": 1.05084014, + "epoch": 0.015390534821085032, + "flos": 431440032768.0, + "grad_norm": 0.037998818197719206, + "language_loss": 1.07206631, + "learning_rate": 0.0008677527890662774, + "loss": 1.08637595, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.80664062, + "step": 80, + "time_per_iteration": 2.530073881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424927, + "balance_loss_mlp": 1.04727161, + "epoch": 0.015582916506348595, + "flos": 525184424448.0, + "grad_norm": 0.03521308344632083, + "language_loss": 1.08168781, + "learning_rate": 0.0008702127641587799, + "loss": 1.09593713, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.78125, + "step": 81, + "time_per_iteration": 2.6248533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01426595, + "balance_loss_mlp": 1.05141926, + "epoch": 0.015775298191612157, + "flos": 576616576512.0, + "grad_norm": 0.026523126631237747, + "language_loss": 1.036394, + "learning_rate": 0.0008726425547457192, + "loss": 1.05065989, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.75585938, + "step": 82, + "time_per_iteration": 2.759159564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424967, + "balance_loss_mlp": 1.05303442, + "epoch": 0.01596767987687572, + "flos": 611439103488.0, + "grad_norm": 0.03656915183129864, + "language_loss": 1.03032446, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457414, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.72265625, + "step": 83, + "time_per_iteration": 2.739105224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431924, + "balance_loss_mlp": 1.06151688, + "epoch": 0.016160061562139283, + "flos": 568232537088.0, + "grad_norm": 0.03323001720600938, + "language_loss": 1.08511543, + "learning_rate": 0.0008774144832015932, + "loss": 1.09943461, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.70703125, + "step": 84, + "time_per_iteration": 2.7144806385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02085876, + "balance_loss_mlp": 1.68762207, + "epoch": 0.016352443247402846, + "flos": 1414499701248.0, + "grad_norm": 0.1388747380481991, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76860189, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.984375, + "step": 85, + "time_per_iteration": 4.569611072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450774, + "balance_loss_mlp": 1.08532572, + "epoch": 0.01654482493266641, + "flos": 731785165824.0, + "grad_norm": 0.04601998260491519, + "language_loss": 1.03772068, + "learning_rate": 0.0008820741205014318, + "loss": 1.05222845, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.65625, + "step": 86, + "time_per_iteration": 2.8604419231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014606, + "balance_loss_mlp": 1.09744096, + "epoch": 0.016737206617929972, + "flos": 537404281344.0, + "grad_norm": 0.03433335749497543, + "language_loss": 1.05140662, + "learning_rate": 0.0008843634575408404, + "loss": 1.06601262, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.62695312, + "step": 87, + "time_per_iteration": 2.677731513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145769, + "balance_loss_mlp": 1.09777355, + "epoch": 0.016929588303193535, + "flos": 538129420800.0, + "grad_norm": 0.05036212092144492, + "language_loss": 1.06815004, + "learning_rate": 0.0008866266301555082, + "loss": 1.08272696, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.59765625, + "step": 88, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145347, + "balance_loss_mlp": 1.09622347, + "epoch": 0.017121969988457098, + "flos": 527791543296.0, + "grad_norm": 0.030252065691096418, + "language_loss": 1.07441962, + "learning_rate": 0.0008888642296509615, + "loss": 1.08895445, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.56445312, + "step": 89, + "time_per_iteration": 2.590280771255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145473, + "balance_loss_mlp": 1.10034442, + "epoch": 0.01731435167372066, + "flos": 626767636992.0, + "grad_norm": 0.041554939890322294, + "language_loss": 1.12743318, + "learning_rate": 0.0008910768275115906, + "loss": 1.14198053, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.54101562, + "step": 90, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145373, + "balance_loss_mlp": 1.10220587, + "epoch": 0.017506733358984224, + "flos": 497384254464.0, + "grad_norm": 0.05646737130307679, + "language_loss": 1.07978606, + "learning_rate": 0.0008932649762767675, + "loss": 1.0943234, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.50976562, + "step": 91, + "time_per_iteration": 2.5964808464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01457202, + "balance_loss_mlp": 1.10911036, + "epoch": 0.017699115044247787, + "flos": 747217758720.0, + "grad_norm": 0.04050166442287704, + "language_loss": 1.1018101, + "learning_rate": 0.0008954292103690864, + "loss": 1.11638212, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.47851562, + "step": 92, + "time_per_iteration": 2.9288997650146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01459372, + "balance_loss_mlp": 1.11395121, + "epoch": 0.01789149672951135, + "flos": 516520407552.0, + "grad_norm": 0.054281950557984966, + "language_loss": 1.12496912, + "learning_rate": 0.0008975700468778296, + "loss": 1.13956285, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.45117188, + "step": 93, + "time_per_iteration": 2.5800487995147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462727, + "balance_loss_mlp": 1.11978543, + "epoch": 0.018083878414774913, + "flos": 587229702144.0, + "grad_norm": 0.04557553976021738, + "language_loss": 1.05795836, + "learning_rate": 0.0008996879863005366, + "loss": 1.07258558, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.42578125, + "step": 94, + "time_per_iteration": 2.6668198108673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146929, + "balance_loss_mlp": 1.12882805, + "epoch": 0.018276260100038477, + "flos": 498369905664.0, + "grad_norm": 0.055406629054909326, + "language_loss": 1.06168532, + "learning_rate": 0.0009017835132453337, + "loss": 1.07637823, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.40234375, + "step": 95, + "time_per_iteration": 2.588728904724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146889, + "balance_loss_mlp": 1.1312896, + "epoch": 0.01846864178530204, + "flos": 641232043008.0, + "grad_norm": 0.04012691806662063, + "language_loss": 1.05874133, + "learning_rate": 0.0009038570970964896, + "loss": 1.0734303, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.37890625, + "step": 96, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464817, + "balance_loss_mlp": 1.12912345, + "epoch": 0.018661023470565603, + "flos": 512667125760.0, + "grad_norm": 0.027884025705687265, + "language_loss": 1.03269148, + "learning_rate": 0.0009059091926454854, + "loss": 1.04733968, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.359375, + "step": 97, + "time_per_iteration": 2.6100950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470726, + "balance_loss_mlp": 1.13694024, + "epoch": 0.018853405155829166, + "flos": 932696308224.0, + "grad_norm": 0.03936003805775877, + "language_loss": 1.02435613, + "learning_rate": 0.0009079402406897198, + "loss": 1.03906357, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.33984375, + "step": 98, + "time_per_iteration": 3.2489542961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467854, + "balance_loss_mlp": 1.13616598, + "epoch": 0.01904578684109273, + "flos": 577586764800.0, + "grad_norm": 0.036005296184057074, + "language_loss": 1.04073858, + "learning_rate": 0.0009099506686008212, + "loss": 1.05541718, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.31835938, + "step": 99, + "time_per_iteration": 2.7905051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467812, + "balance_loss_mlp": 1.13822246, + "epoch": 0.019238168526356292, + "flos": 559520856576.0, + "grad_norm": 0.02696843746399884, + "language_loss": 1.07409596, + "learning_rate": 0.0009119408908644013, + "loss": 1.08877409, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.296875, + "step": 100, + "time_per_iteration": 2.7075607776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456893, + "balance_loss_mlp": 1.12882876, + "epoch": 0.019430550211619855, + "flos": 725103184896.0, + "grad_norm": 0.03304065923870771, + "language_loss": 1.12780023, + "learning_rate": 0.0009139113095929519, + "loss": 1.14236927, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.28125, + "step": 101, + "time_per_iteration": 2.86230731010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460167, + "balance_loss_mlp": 1.13439226, + "epoch": 0.019622931896883418, + "flos": 500456001024.0, + "grad_norm": 0.030619133870748612, + "language_loss": 1.06594038, + "learning_rate": 0.0009158623150134762, + "loss": 1.08054209, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 3.2578125, + "step": 102, + "time_per_iteration": 2.563690185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458611, + "balance_loss_mlp": 1.13569677, + "epoch": 0.01981531358214698, + "flos": 510281587200.0, + "grad_norm": 0.03276303076426602, + "language_loss": 1.06164801, + "learning_rate": 0.000917794285931332, + "loss": 1.0762341, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 3.22851562, + "step": 103, + "time_per_iteration": 2.6599903106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462945, + "balance_loss_mlp": 1.1421293, + "epoch": 0.020007695267410544, + "flos": 522392655360.0, + "grad_norm": 0.026505304013468463, + "language_loss": 0.98227251, + "learning_rate": 0.0009197075901716639, + "loss": 0.99690199, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 3.20703125, + "step": 104, + "time_per_iteration": 2.726245880126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469463, + "balance_loss_mlp": 1.14998221, + "epoch": 0.020200076952674107, + "flos": 534443324928.0, + "grad_norm": 0.029933884589862427, + "language_loss": 1.08736229, + "learning_rate": 0.0009216025849997171, + "loss": 1.10205698, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 3.19335938, + "step": 105, + "time_per_iteration": 2.8023486137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468836, + "balance_loss_mlp": 1.15221632, + "epoch": 0.020392458637937667, + "flos": 686082270720.0, + "grad_norm": 0.024520994280375335, + "language_loss": 1.03054178, + "learning_rate": 0.0009234796175212258, + "loss": 1.04523015, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 3.1640625, + "step": 106, + "time_per_iteration": 2.9396088123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469456, + "balance_loss_mlp": 1.15512502, + "epoch": 0.02058484032320123, + "flos": 703414307328.0, + "grad_norm": 0.02898567585615155, + "language_loss": 1.07201982, + "learning_rate": 0.000925339025064007, + "loss": 1.08671439, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 3.140625, + "step": 107, + "time_per_iteration": 2.9473297595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_mlp": 1.16001439, + "epoch": 0.020777222008464793, + "flos": 640326982656.0, + "grad_norm": 0.02770789473723963, + "language_loss": 0.99879742, + "learning_rate": 0.0009271811355418027, + "loss": 1.01352561, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 3.125, + "step": 108, + "time_per_iteration": 2.8551387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469504, + "balance_loss_mlp": 1.15803361, + "epoch": 0.020969603693728356, + "flos": 683320700928.0, + "grad_norm": 0.029161506766480293, + "language_loss": 1.06637371, + "learning_rate": 0.0009290062678013548, + "loss": 1.08106875, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 3.11132812, + "step": 109, + "time_per_iteration": 2.821951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468675, + "balance_loss_mlp": 1.15949392, + "epoch": 0.02116198537899192, + "flos": 534419129856.0, + "grad_norm": 0.03188637458086245, + "language_loss": 1.05070233, + "learning_rate": 0.0009308147319536321, + "loss": 1.06538928, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 3.08789062, + "step": 110, + "time_per_iteration": 2.6315042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469018, + "balance_loss_mlp": 1.16212535, + "epoch": 0.021354367064255482, + "flos": 718727377920.0, + "grad_norm": 0.030955966903197116, + "language_loss": 1.11490715, + "learning_rate": 0.0009326068296900676, + "loss": 1.12959719, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 3.06445312, + "step": 111, + "time_per_iteration": 2.8208162784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474326, + "balance_loss_mlp": 1.16934085, + "epoch": 0.021546748749519045, + "flos": 520623467520.0, + "grad_norm": 0.027870670355515197, + "language_loss": 1.02138007, + "learning_rate": 0.0009343828545846161, + "loss": 1.03612328, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 3.04492188, + "step": 112, + "time_per_iteration": 2.759277105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474098, + "balance_loss_mlp": 1.17063916, + "epoch": 0.021739130434782608, + "flos": 506161062912.0, + "grad_norm": 0.03372988233582904, + "language_loss": 1.06662297, + "learning_rate": 0.0009361430923823841, + "loss": 1.08136404, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 3.02929688, + "step": 113, + "time_per_iteration": 2.565107822418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471087, + "balance_loss_mlp": 1.1693449, + "epoch": 0.02193151212004617, + "flos": 464426242560.0, + "grad_norm": 0.03803370713592907, + "language_loss": 1.10115385, + "learning_rate": 0.0009378878212755459, + "loss": 1.11586463, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 3.01171875, + "step": 114, + "time_per_iteration": 2.491929292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471993, + "balance_loss_mlp": 1.17253923, + "epoch": 0.022123893805309734, + "flos": 553331701248.0, + "grad_norm": 0.029753755152528143, + "language_loss": 1.00006115, + "learning_rate": 0.0009396173121672103, + "loss": 1.014781, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.98828125, + "step": 115, + "time_per_iteration": 2.6869561672210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473585, + "balance_loss_mlp": 1.1754663, + "epoch": 0.022316275490573297, + "flos": 637378761216.0, + "grad_norm": 0.032022590728611564, + "language_loss": 1.0593642, + "learning_rate": 0.0009413318289238633, + "loss": 1.07410002, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.97460938, + "step": 116, + "time_per_iteration": 2.7639846801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474428, + "balance_loss_mlp": 1.17859828, + "epoch": 0.02250865717583686, + "flos": 800315039232.0, + "grad_norm": 0.032750944460810345, + "language_loss": 0.98115921, + "learning_rate": 0.0009430316286169771, + "loss": 0.99590349, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.95117188, + "step": 117, + "time_per_iteration": 3.020703077316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469481, + "balance_loss_mlp": 1.17536783, + "epoch": 0.022701038861100423, + "flos": 457062782976.0, + "grad_norm": 0.027209249322999743, + "language_loss": 1.0327785, + "learning_rate": 0.0009447169617543361, + "loss": 1.04747331, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.9375, + "step": 118, + "time_per_iteration": 2.5938501358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466386, + "balance_loss_mlp": 1.17437065, + "epoch": 0.022893420546363986, + "flos": 584186153472.0, + "grad_norm": 0.028075325054819567, + "language_loss": 1.10005641, + "learning_rate": 0.0009463880725016029, + "loss": 1.11472011, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.91992188, + "step": 119, + "time_per_iteration": 2.7082488536834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467196, + "balance_loss_mlp": 1.17861414, + "epoch": 0.02308580223162755, + "flos": 562477810176.0, + "grad_norm": 0.032360539397207934, + "language_loss": 1.05048943, + "learning_rate": 0.0009480451988946134, + "loss": 1.06516147, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.89257812, + "step": 120, + "time_per_iteration": 2.808687686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461098, + "balance_loss_mlp": 1.17423272, + "epoch": 0.023278183916891113, + "flos": 772645125120.0, + "grad_norm": 0.033180722862994706, + "language_loss": 1.06113267, + "learning_rate": 0.0009496885730428627, + "loss": 1.07574379, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.875, + "step": 121, + "time_per_iteration": 3.0043137073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466426, + "balance_loss_mlp": 1.18070555, + "epoch": 0.023470565602154676, + "flos": 554430144000.0, + "grad_norm": 0.030787275004595428, + "language_loss": 1.04567683, + "learning_rate": 0.0009513184213246156, + "loss": 1.06034112, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.86328125, + "step": 122, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462554, + "balance_loss_mlp": 1.17835939, + "epoch": 0.02366294728741824, + "flos": 561166519296.0, + "grad_norm": 0.030499039091632818, + "language_loss": 1.08099937, + "learning_rate": 0.0009529349645740552, + "loss": 1.09562504, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.84765625, + "step": 123, + "time_per_iteration": 2.69850492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460088, + "balance_loss_mlp": 1.17741883, + "epoch": 0.0238553289726818, + "flos": 469516955136.0, + "grad_norm": 0.026549221517309443, + "language_loss": 1.06623578, + "learning_rate": 0.0009545384182608524, + "loss": 1.08083653, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.83203125, + "step": 124, + "time_per_iteration": 2.5435874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462583, + "balance_loss_mlp": 1.18144011, + "epoch": 0.024047710657945365, + "flos": 561103392768.0, + "grad_norm": 0.03287811385355005, + "language_loss": 1.04055512, + "learning_rate": 0.0009561289926625252, + "loss": 1.05518079, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.81640625, + "step": 125, + "time_per_iteration": 2.6661720275878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464029, + "balance_loss_mlp": 1.18460226, + "epoch": 0.024240092343208928, + "flos": 505770295296.0, + "grad_norm": 0.030159442314643806, + "language_loss": 1.08985233, + "learning_rate": 0.0009577068930299292, + "loss": 1.10449266, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.79882812, + "step": 126, + "time_per_iteration": 2.596027135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456959, + "balance_loss_mlp": 1.17944014, + "epoch": 0.02443247402847249, + "flos": 436752325632.0, + "grad_norm": 0.03465787530540315, + "language_loss": 1.04454637, + "learning_rate": 0.0009592723197462087, + "loss": 1.05911589, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.77929688, + "step": 127, + "time_per_iteration": 2.6355836391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145855, + "balance_loss_mlp": 1.18236613, + "epoch": 0.024624855713736054, + "flos": 685068421632.0, + "grad_norm": 0.03103018628328697, + "language_loss": 1.00976562, + "learning_rate": 0.0009608254684795125, + "loss": 1.02435124, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.765625, + "step": 128, + "time_per_iteration": 2.956745147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01452077, + "balance_loss_mlp": 1.17741859, + "epoch": 0.024817237398999614, + "flos": 526113679872.0, + "grad_norm": 0.03378324138815482, + "language_loss": 1.03947771, + "learning_rate": 0.0009623665303297678, + "loss": 1.05399847, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.75, + "step": 129, + "time_per_iteration": 2.762612819671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145448, + "balance_loss_mlp": 1.18115723, + "epoch": 0.025009619084263177, + "flos": 656886216192.0, + "grad_norm": 0.03318348770393379, + "language_loss": 1.08023834, + "learning_rate": 0.0009638956919697878, + "loss": 1.09478307, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.73339844, + "step": 130, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453293, + "balance_loss_mlp": 1.18130565, + "epoch": 0.02520200076952674, + "flos": 455369456640.0, + "grad_norm": 0.028803226470227133, + "language_loss": 1.00211501, + "learning_rate": 0.0009654131357809714, + "loss": 1.01664793, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.71875, + "step": 131, + "time_per_iteration": 2.593409776687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454951, + "balance_loss_mlp": 1.18534708, + "epoch": 0.025394382454790303, + "flos": 841268324352.0, + "grad_norm": 0.035993676074610494, + "language_loss": 1.09494662, + "learning_rate": 0.0009669190399838441, + "loss": 1.10949612, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.69824219, + "step": 132, + "time_per_iteration": 3.1307294368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454062, + "balance_loss_mlp": 1.18588877, + "epoch": 0.025586764140053866, + "flos": 582228312576.0, + "grad_norm": 0.03305283337163912, + "language_loss": 1.02299893, + "learning_rate": 0.0009684135787636724, + "loss": 1.03753948, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.68359375, + "step": 133, + "time_per_iteration": 2.8118627071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454726, + "balance_loss_mlp": 1.18798327, + "epoch": 0.02577914582531743, + "flos": 791677218816.0, + "grad_norm": 0.03011124606519955, + "language_loss": 1.06380379, + "learning_rate": 0.0009698969223913726, + "loss": 1.07835102, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.66894531, + "step": 134, + "time_per_iteration": 3.0371806621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450237, + "balance_loss_mlp": 1.18454385, + "epoch": 0.025971527510580992, + "flos": 596062906368.0, + "grad_norm": 0.030569012833979448, + "language_loss": 1.08986592, + "learning_rate": 0.0009713692373399265, + "loss": 1.10436833, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.65820312, + "step": 135, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01684837, + "balance_loss_mlp": 1.39873505, + "epoch": 0.026163909195844555, + "flos": 1581074411520.0, + "grad_norm": 0.08870187959024729, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81141067, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.8671875, + "step": 136, + "time_per_iteration": 5.94019627571106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0161422, + "balance_loss_mlp": 1.33116913, + "epoch": 0.026356290881108118, + "flos": 1505160886272.0, + "grad_norm": 0.07212137850421584, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79425257, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.8359375, + "step": 137, + "time_per_iteration": 4.865153074264526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469938, + "balance_loss_mlp": 1.20901299, + "epoch": 0.02654867256637168, + "flos": 598340382720.0, + "grad_norm": 0.040535745966457745, + "language_loss": 1.01652551, + "learning_rate": 0.0009757216201974225, + "loss": 1.03122485, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.609375, + "step": 138, + "time_per_iteration": 2.8955435752868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487517, + "balance_loss_mlp": 1.22802222, + "epoch": 0.026741054251635244, + "flos": 546135427584.0, + "grad_norm": 0.04340470282065083, + "language_loss": 1.06732666, + "learning_rate": 0.0009771514130396581, + "loss": 1.08220184, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.59472656, + "step": 139, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01498511, + "balance_loss_mlp": 1.24044681, + "epoch": 0.026933435936898807, + "flos": 507845657088.0, + "grad_norm": 0.04879945782970011, + "language_loss": 1.07520163, + "learning_rate": 0.00097857095638274, + "loss": 1.09018672, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.58007812, + "step": 140, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01492411, + "balance_loss_mlp": 1.23558652, + "epoch": 0.02712581762216237, + "flos": 742253299200.0, + "grad_norm": 0.043929969627725114, + "language_loss": 0.98754954, + "learning_rate": 0.0009799803961288726, + "loss": 1.00247359, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.5703125, + "step": 141, + "time_per_iteration": 3.008998394012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470778, + "balance_loss_mlp": 1.21567059, + "epoch": 0.027318199307425933, + "flos": 849777890304.0, + "grad_norm": 0.03716164217421175, + "language_loss": 1.04960537, + "learning_rate": 0.000981379875086876, + "loss": 1.06431305, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.55371094, + "step": 142, + "time_per_iteration": 3.057098865509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469037, + "balance_loss_mlp": 1.21535933, + "epoch": 0.027510580992689496, + "flos": 576638043648.0, + "grad_norm": 0.03712962317624948, + "language_loss": 1.00046849, + "learning_rate": 0.0009827695330590185, + "loss": 1.01515889, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.5390625, + "step": 143, + "time_per_iteration": 2.638338327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450228, + "balance_loss_mlp": 1.19750416, + "epoch": 0.02770296267795306, + "flos": 773789230080.0, + "grad_norm": 0.030455330453953735, + "language_loss": 0.99027133, + "learning_rate": 0.0009841495069248256, + "loss": 1.00477362, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.52929688, + "step": 144, + "time_per_iteration": 2.981438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441391, + "balance_loss_mlp": 1.19009781, + "epoch": 0.027895344363216622, + "flos": 570448888320.0, + "grad_norm": 0.031624263879455494, + "language_loss": 0.98723662, + "learning_rate": 0.0009855199307219871, + "loss": 1.00165045, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.51464844, + "step": 145, + "time_per_iteration": 2.6923046112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440125, + "balance_loss_mlp": 1.1903578, + "epoch": 0.028087726048480186, + "flos": 548408174592.0, + "grad_norm": 0.029995844711875903, + "language_loss": 1.00586843, + "learning_rate": 0.0009868809357244854, + "loss": 1.02026975, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.49902344, + "step": 146, + "time_per_iteration": 2.6284868717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01436833, + "balance_loss_mlp": 1.18782902, + "epoch": 0.02828010773374375, + "flos": 525872633856.0, + "grad_norm": 0.03288909570778387, + "language_loss": 1.05042541, + "learning_rate": 0.0009882326505180556, + "loss": 1.06479371, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.49121094, + "step": 147, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425728, + "balance_loss_mlp": 1.1783452, + "epoch": 0.02847248941900731, + "flos": 773771765760.0, + "grad_norm": 0.031738987003727674, + "language_loss": 1.02499485, + "learning_rate": 0.0009895752010730906, + "loss": 1.03925204, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.47460938, + "step": 148, + "time_per_iteration": 2.9316182136535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424571, + "balance_loss_mlp": 1.17785549, + "epoch": 0.028664871104270875, + "flos": 535469908992.0, + "grad_norm": 0.028294299214345536, + "language_loss": 1.0900923, + "learning_rate": 0.0009909087108150867, + "loss": 1.10433793, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.46777344, + "step": 149, + "time_per_iteration": 2.697423219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014274, + "balance_loss_mlp": 1.18182933, + "epoch": 0.028857252789534438, + "flos": 368604487680.0, + "grad_norm": 0.03525963963400797, + "language_loss": 1.09753942, + "learning_rate": 0.0009922333006927371, + "loss": 1.11181331, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.45605469, + "step": 150, + "time_per_iteration": 2.483644723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433542, + "balance_loss_mlp": 1.18911529, + "epoch": 0.029049634474798, + "flos": 516483477504.0, + "grad_norm": 0.03341635886009217, + "language_loss": 1.03220332, + "learning_rate": 0.0009935490892437632, + "loss": 1.04653883, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.44433594, + "step": 151, + "time_per_iteration": 2.604599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438911, + "balance_loss_mlp": 1.19553363, + "epoch": 0.029242016160061564, + "flos": 589348724736.0, + "grad_norm": 0.030166761621646727, + "language_loss": 1.01782072, + "learning_rate": 0.0009948561926585687, + "loss": 1.03220987, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.43359375, + "step": 152, + "time_per_iteration": 2.7724709510803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445258, + "balance_loss_mlp": 1.20350146, + "epoch": 0.029434397845325123, + "flos": 553136317440.0, + "grad_norm": 0.030739210798008048, + "language_loss": 1.05873716, + "learning_rate": 0.0009961547248418122, + "loss": 1.07318974, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.41699219, + "step": 153, + "time_per_iteration": 2.6247737407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440878, + "balance_loss_mlp": 1.19988418, + "epoch": 0.029626779530588686, + "flos": 604607400960.0, + "grad_norm": 0.030186385343499288, + "language_loss": 1.02632022, + "learning_rate": 0.0009974447974719707, + "loss": 1.04072905, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.40917969, + "step": 154, + "time_per_iteration": 2.730053663253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431891, + "balance_loss_mlp": 1.19194651, + "epoch": 0.02981916121585225, + "flos": 622217413632.0, + "grad_norm": 0.02801027733601246, + "language_loss": 1.04305005, + "learning_rate": 0.0009987265200589763, + "loss": 1.05736899, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.3984375, + "step": 155, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423605, + "balance_loss_mlp": 1.18537688, + "epoch": 0.030011542901115813, + "flos": 662879987712.0, + "grad_norm": 0.0349007823819893, + "language_loss": 1.04218483, + "learning_rate": 0.001, + "loss": 1.05642092, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.38085938, + "step": 156, + "time_per_iteration": 2.8801028728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420835, + "balance_loss_mlp": 1.18289316, + "epoch": 0.030203924586379376, + "flos": 652818084864.0, + "grad_norm": 0.029403473562715665, + "language_loss": 1.01930022, + "learning_rate": 0.0009999999029413921, + "loss": 1.03350854, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.37792969, + "step": 157, + "time_per_iteration": 2.8549368381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415444, + "balance_loss_mlp": 1.17921925, + "epoch": 0.03039630627164294, + "flos": 532443824640.0, + "grad_norm": 0.03295212675068383, + "language_loss": 1.02716291, + "learning_rate": 0.0009999996117656068, + "loss": 1.04131734, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.36035156, + "step": 158, + "time_per_iteration": 2.6989729404449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_mlp": 1.17530584, + "epoch": 0.030588687956906502, + "flos": 587294830080.0, + "grad_norm": 0.0291076208082698, + "language_loss": 0.96305156, + "learning_rate": 0.0009999991264727564, + "loss": 0.97715545, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.34863281, + "step": 159, + "time_per_iteration": 2.7609338760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140999, + "balance_loss_mlp": 1.1752907, + "epoch": 0.030781069642170065, + "flos": 514286592000.0, + "grad_norm": 0.030494101007586163, + "language_loss": 1.0725081, + "learning_rate": 0.0009999984470630296, + "loss": 1.08660805, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.34472656, + "step": 160, + "time_per_iteration": 2.5805158615112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410287, + "balance_loss_mlp": 1.17711365, + "epoch": 0.030973451327433628, + "flos": 719559304704.0, + "grad_norm": 0.025032822394785544, + "language_loss": 0.95934659, + "learning_rate": 0.0009999975735366902, + "loss": 0.97344947, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.32910156, + "step": 161, + "time_per_iteration": 3.078343629837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409543, + "balance_loss_mlp": 1.17675149, + "epoch": 0.03116583301269719, + "flos": 1111614400512.0, + "grad_norm": 0.029903967107167622, + "language_loss": 0.98009437, + "learning_rate": 0.0009999965058940775, + "loss": 0.99418974, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.32519531, + "step": 162, + "time_per_iteration": 3.49137544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_mlp": 1.17689729, + "epoch": 0.031358214697960754, + "flos": 451833082368.0, + "grad_norm": 0.11336845133687022, + "language_loss": 1.0463953, + "learning_rate": 0.0009999952441356057, + "loss": 1.06047678, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.30957031, + "step": 163, + "time_per_iteration": 2.531280755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406979, + "balance_loss_mlp": 1.17676246, + "epoch": 0.031550596383224314, + "flos": 1257085658112.0, + "grad_norm": 0.03183858769064714, + "language_loss": 1.05248928, + "learning_rate": 0.000999993788261765, + "loss": 1.06655908, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.30078125, + "step": 164, + "time_per_iteration": 3.5714328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408503, + "balance_loss_mlp": 1.17943025, + "epoch": 0.03174297806848788, + "flos": 669322924032.0, + "grad_norm": 0.03191781964215587, + "language_loss": 1.06263065, + "learning_rate": 0.00099999213827312, + "loss": 1.07671571, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.29101562, + "step": 165, + "time_per_iteration": 2.7947938442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409995, + "balance_loss_mlp": 1.18101788, + "epoch": 0.03193535975375144, + "flos": 552363514368.0, + "grad_norm": 0.03891580789868065, + "language_loss": 1.01044345, + "learning_rate": 0.000999990294170312, + "loss": 1.0245434, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.29003906, + "step": 166, + "time_per_iteration": 2.6462574005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140342, + "balance_loss_mlp": 1.17577803, + "epoch": 0.032127741439015006, + "flos": 544739543040.0, + "grad_norm": 0.03757156138401865, + "language_loss": 1.05309296, + "learning_rate": 0.0009999882559540566, + "loss": 1.06712723, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.27636719, + "step": 167, + "time_per_iteration": 2.629549503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140941, + "balance_loss_mlp": 1.18234003, + "epoch": 0.032320123124278566, + "flos": 549513348096.0, + "grad_norm": 0.028659149555752484, + "language_loss": 1.01791751, + "learning_rate": 0.000999986023625145, + "loss": 1.03201175, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.27050781, + "step": 168, + "time_per_iteration": 2.7051401138305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01589355, + "balance_loss_mlp": 1.35360718, + "epoch": 0.03251250480954213, + "flos": 1308815430144.0, + "grad_norm": 0.08201951270186027, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80513763, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.35546875, + "step": 169, + "time_per_iteration": 4.9428627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407645, + "balance_loss_mlp": 1.18257797, + "epoch": 0.03270488649480569, + "flos": 562201835520.0, + "grad_norm": 0.03970113019311383, + "language_loss": 1.02863848, + "learning_rate": 0.0009999809766328958, + "loss": 1.04271495, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.25, + "step": 170, + "time_per_iteration": 2.675811529159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415662, + "balance_loss_mlp": 1.19193029, + "epoch": 0.03289726818006926, + "flos": 483338813952.0, + "grad_norm": 0.03325277263778645, + "language_loss": 1.04760146, + "learning_rate": 0.0009999781619715177, + "loss": 1.06175804, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.23632812, + "step": 171, + "time_per_iteration": 2.5431392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01419714, + "balance_loss_mlp": 1.1972214, + "epoch": 0.03308964986533282, + "flos": 675820254720.0, + "grad_norm": 0.02950894161591202, + "language_loss": 1.04164565, + "learning_rate": 0.000999975153201402, + "loss": 1.05584288, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.22363281, + "step": 172, + "time_per_iteration": 2.812837600708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422366, + "balance_loss_mlp": 1.20044637, + "epoch": 0.033282031550596385, + "flos": 610340660736.0, + "grad_norm": 0.03086814843966846, + "language_loss": 1.02532911, + "learning_rate": 0.0009999719503237174, + "loss": 1.03955269, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 2.21777344, + "step": 173, + "time_per_iteration": 2.755462646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416936, + "balance_loss_mlp": 1.1959697, + "epoch": 0.033474413235859944, + "flos": 468995931648.0, + "grad_norm": 0.048603642070708566, + "language_loss": 1.1131072, + "learning_rate": 0.0009999685533397073, + "loss": 1.12727666, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 2.20800781, + "step": 174, + "time_per_iteration": 2.566751003265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01414495, + "balance_loss_mlp": 1.19438744, + "epoch": 0.03366679492112351, + "flos": 580714907136.0, + "grad_norm": 0.03243683176756354, + "language_loss": 1.02908182, + "learning_rate": 0.00099996496225069, + "loss": 1.04322672, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 2.19921875, + "step": 175, + "time_per_iteration": 2.67861008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407523, + "balance_loss_mlp": 1.1883682, + "epoch": 0.03385917660638707, + "flos": 638885435904.0, + "grad_norm": 0.029120554083078395, + "language_loss": 1.05784094, + "learning_rate": 0.0009999611770580604, + "loss": 1.0719161, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 2.18945312, + "step": 176, + "time_per_iteration": 2.8410942554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401607, + "balance_loss_mlp": 1.18302441, + "epoch": 0.03405155829165064, + "flos": 442739366400.0, + "grad_norm": 0.031490867136515936, + "language_loss": 1.04703283, + "learning_rate": 0.0009999571977632876, + "loss": 1.06104875, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 2.18359375, + "step": 177, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399051, + "balance_loss_mlp": 1.1813277, + "epoch": 0.034243939976914196, + "flos": 467274407424.0, + "grad_norm": 0.029366691437037535, + "language_loss": 1.0724479, + "learning_rate": 0.0009999530243679166, + "loss": 1.08643842, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 2.17480469, + "step": 178, + "time_per_iteration": 2.5423247814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01392432, + "balance_loss_mlp": 1.17556691, + "epoch": 0.03443632166217776, + "flos": 780712257024.0, + "grad_norm": 0.02507202069561695, + "language_loss": 1.01653552, + "learning_rate": 0.0009999486568735675, + "loss": 1.03045988, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 2.16601562, + "step": 179, + "time_per_iteration": 3.111632823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381684, + "balance_loss_mlp": 1.16567647, + "epoch": 0.03462870334744132, + "flos": 1265758407168.0, + "grad_norm": 0.027829136834509844, + "language_loss": 1.02053452, + "learning_rate": 0.0009999440952819362, + "loss": 1.03435147, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 2.15722656, + "step": 180, + "time_per_iteration": 3.6354756355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375883, + "balance_loss_mlp": 1.16035271, + "epoch": 0.03482108503270489, + "flos": 608302228992.0, + "grad_norm": 0.033531921209289, + "language_loss": 1.02966988, + "learning_rate": 0.0009999393395947935, + "loss": 1.04342866, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 2.15234375, + "step": 181, + "time_per_iteration": 2.8509652614593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372611, + "balance_loss_mlp": 1.15774834, + "epoch": 0.03501346671796845, + "flos": 539314458624.0, + "grad_norm": 0.029990628161131794, + "language_loss": 1.05946589, + "learning_rate": 0.0009999343898139858, + "loss": 1.07319212, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 2.14550781, + "step": 182, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375908, + "balance_loss_mlp": 1.16161704, + "epoch": 0.035205848403232015, + "flos": 519498828288.0, + "grad_norm": 0.03419998284579487, + "language_loss": 1.04830694, + "learning_rate": 0.0009999292459414348, + "loss": 1.06206608, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 2.13964844, + "step": 183, + "time_per_iteration": 2.563997983932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386507, + "balance_loss_mlp": 1.17269289, + "epoch": 0.035398230088495575, + "flos": 473333306880.0, + "grad_norm": 0.03346089667402367, + "language_loss": 1.09292293, + "learning_rate": 0.0009999239079791374, + "loss": 1.10678792, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 2.13476562, + "step": 184, + "time_per_iteration": 2.5561137199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387981, + "balance_loss_mlp": 1.17512131, + "epoch": 0.03559061177375914, + "flos": 513094823424.0, + "grad_norm": 0.03551516541146116, + "language_loss": 1.01857162, + "learning_rate": 0.0009999183759291659, + "loss": 1.03245139, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 2.125, + "step": 185, + "time_per_iteration": 2.689763307571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383562, + "balance_loss_mlp": 1.17108345, + "epoch": 0.0357829934590227, + "flos": 478350159360.0, + "grad_norm": 0.03945465081959485, + "language_loss": 1.04534364, + "learning_rate": 0.0009999126497936682, + "loss": 1.05917931, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 2.12109375, + "step": 186, + "time_per_iteration": 2.5142176151275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375295, + "balance_loss_mlp": 1.16415167, + "epoch": 0.03597537514428627, + "flos": 645884324352.0, + "grad_norm": 0.029215470851159726, + "language_loss": 1.06864357, + "learning_rate": 0.0009999067295748676, + "loss": 1.08239663, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 2.10742188, + "step": 187, + "time_per_iteration": 2.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370561, + "balance_loss_mlp": 1.16056204, + "epoch": 0.03616775682954983, + "flos": 582269245440.0, + "grad_norm": 0.03159066859467708, + "language_loss": 1.0519886, + "learning_rate": 0.000999900615275062, + "loss": 1.06569433, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 2.09570312, + "step": 188, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01368603, + "balance_loss_mlp": 1.15898561, + "epoch": 0.03636013851481339, + "flos": 383264277504.0, + "grad_norm": 0.043734318168479426, + "language_loss": 1.10731864, + "learning_rate": 0.0009998943068966256, + "loss": 1.1210047, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 2.09179688, + "step": 189, + "time_per_iteration": 2.4394500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365543, + "balance_loss_mlp": 1.15668833, + "epoch": 0.03655252020007695, + "flos": 584307677184.0, + "grad_norm": 0.02577278402121573, + "language_loss": 1.05579162, + "learning_rate": 0.0009998878044420072, + "loss": 1.06944704, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 2.0859375, + "step": 190, + "time_per_iteration": 2.7022814750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365865, + "balance_loss_mlp": 1.15882242, + "epoch": 0.03674490188534051, + "flos": 472597433856.0, + "grad_norm": 0.03520388751206912, + "language_loss": 1.01277018, + "learning_rate": 0.0009998811079137318, + "loss": 1.02642882, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 2.07324219, + "step": 191, + "time_per_iteration": 2.5930585861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136447, + "balance_loss_mlp": 1.15742755, + "epoch": 0.03693728357060408, + "flos": 529411009536.0, + "grad_norm": 0.03125533686722731, + "language_loss": 1.02464271, + "learning_rate": 0.0009998742173143987, + "loss": 1.0382874, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 2.07324219, + "step": 192, + "time_per_iteration": 2.6235413551330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358793, + "balance_loss_mlp": 1.15222692, + "epoch": 0.03712966525586764, + "flos": 800345238528.0, + "grad_norm": 0.02848545485219292, + "language_loss": 1.02800548, + "learning_rate": 0.0009998671326466833, + "loss": 1.04159343, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 2.06835938, + "step": 193, + "time_per_iteration": 2.991110324859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351781, + "balance_loss_mlp": 1.1463598, + "epoch": 0.037322046941131205, + "flos": 831358144512.0, + "grad_norm": 0.03513998418582105, + "language_loss": 1.0392077, + "learning_rate": 0.0009998598539133362, + "loss": 1.05272543, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 2.05664062, + "step": 194, + "time_per_iteration": 3.0204203128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349328, + "balance_loss_mlp": 1.14371598, + "epoch": 0.037514428626394765, + "flos": 438588642816.0, + "grad_norm": 0.028816536284039847, + "language_loss": 1.04176903, + "learning_rate": 0.0009998523811171828, + "loss": 1.05526221, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 2.05859375, + "step": 195, + "time_per_iteration": 2.5615782737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345129, + "balance_loss_mlp": 1.14047015, + "epoch": 0.03770681031165833, + "flos": 512638927872.0, + "grad_norm": 0.030721230574493993, + "language_loss": 1.05052435, + "learning_rate": 0.0009998447142611248, + "loss": 1.06397557, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 2.04882812, + "step": 196, + "time_per_iteration": 2.6310269832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347072, + "balance_loss_mlp": 1.14289033, + "epoch": 0.03789919199692189, + "flos": 808842069504.0, + "grad_norm": 0.024329502455983587, + "language_loss": 0.97805226, + "learning_rate": 0.0009998368533481387, + "loss": 0.99152303, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 2.04394531, + "step": 197, + "time_per_iteration": 3.0467066764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344143, + "balance_loss_mlp": 1.14043784, + "epoch": 0.03809157368218546, + "flos": 691791335424.0, + "grad_norm": 0.028391473090668865, + "language_loss": 1.00891113, + "learning_rate": 0.0009998287983812762, + "loss": 1.0223527, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 2.0390625, + "step": 198, + "time_per_iteration": 2.8457672595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342798, + "balance_loss_mlp": 1.14023721, + "epoch": 0.03828395536744902, + "flos": 519004001280.0, + "grad_norm": 0.02890411668538335, + "language_loss": 1.07749867, + "learning_rate": 0.0009998205493636646, + "loss": 1.09092665, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 2.02734375, + "step": 199, + "time_per_iteration": 2.66135573387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336213, + "balance_loss_mlp": 1.13432038, + "epoch": 0.038476337052712584, + "flos": 582762071040.0, + "grad_norm": 0.025165239757241963, + "language_loss": 0.99723649, + "learning_rate": 0.0009998121062985063, + "loss": 1.01059866, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 2.02050781, + "step": 200, + "time_per_iteration": 2.70021915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340101, + "balance_loss_mlp": 1.13868463, + "epoch": 0.03866871873797614, + "flos": 578272972800.0, + "grad_norm": 0.025940014565947116, + "language_loss": 1.01401794, + "learning_rate": 0.0009998034691890794, + "loss": 1.02741897, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 2.015625, + "step": 201, + "time_per_iteration": 2.7596118450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134112, + "balance_loss_mlp": 1.14018106, + "epoch": 0.03886110042323971, + "flos": 541771855872.0, + "grad_norm": 0.03045868040347491, + "language_loss": 1.06763899, + "learning_rate": 0.0009997946380387369, + "loss": 1.08105016, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 2.01074219, + "step": 202, + "time_per_iteration": 2.6249613761901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341912, + "balance_loss_mlp": 1.14192665, + "epoch": 0.03905348210850327, + "flos": 719239669248.0, + "grad_norm": 0.02826530469295273, + "language_loss": 1.09111357, + "learning_rate": 0.0009997856128509076, + "loss": 1.1045326, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 2.00097656, + "step": 203, + "time_per_iteration": 2.8254761695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336015, + "balance_loss_mlp": 1.13660145, + "epoch": 0.039245863793766836, + "flos": 428396484096.0, + "grad_norm": 0.028264614074004907, + "language_loss": 1.0366801, + "learning_rate": 0.0009997763936290952, + "loss": 1.05004025, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.99511719, + "step": 204, + "time_per_iteration": 2.4907312393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334897, + "balance_loss_mlp": 1.13624632, + "epoch": 0.039438245479030395, + "flos": 664269141504.0, + "grad_norm": 0.0294297584821439, + "language_loss": 1.09143519, + "learning_rate": 0.0009997669803768789, + "loss": 1.10478401, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.98730469, + "step": 205, + "time_per_iteration": 2.787046194076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332958, + "balance_loss_mlp": 1.13497555, + "epoch": 0.03963062716429396, + "flos": 636495168000.0, + "grad_norm": 0.025164669035445293, + "language_loss": 1.04324186, + "learning_rate": 0.0009997573730979134, + "loss": 1.05657148, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.98242188, + "step": 206, + "time_per_iteration": 2.744339942932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388672, + "balance_loss_mlp": 1.18687439, + "epoch": 0.03982300884955752, + "flos": 1421587186176.0, + "grad_norm": 0.04225268457123109, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80581868, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 2.01953125, + "step": 207, + "time_per_iteration": 4.62822699546814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338974, + "balance_loss_mlp": 1.14251721, + "epoch": 0.04001539053482109, + "flos": 690519702528.0, + "grad_norm": 0.029734692172116686, + "language_loss": 1.02667236, + "learning_rate": 0.0009997375764747294, + "loss": 1.04006195, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.96875, + "step": 208, + "time_per_iteration": 3.0006470680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332342, + "balance_loss_mlp": 1.1360755, + "epoch": 0.04020777222008465, + "flos": 534751500288.0, + "grad_norm": 0.02521302149444487, + "language_loss": 1.00535607, + "learning_rate": 0.0009997273871381967, + "loss": 1.01867938, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.96679688, + "step": 209, + "time_per_iteration": 2.6790220737457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01335368, + "balance_loss_mlp": 1.14005554, + "epoch": 0.040400153905348214, + "flos": 568996608000.0, + "grad_norm": 0.04055154679799505, + "language_loss": 1.05331016, + "learning_rate": 0.0009997170037902862, + "loss": 1.06666374, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.95703125, + "step": 210, + "time_per_iteration": 2.748340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331596, + "balance_loss_mlp": 1.13647389, + "epoch": 0.040592535590611774, + "flos": 714678712320.0, + "grad_norm": 0.0276705792773584, + "language_loss": 1.07916689, + "learning_rate": 0.0009997064264350292, + "loss": 1.09248281, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.95507812, + "step": 211, + "time_per_iteration": 2.8284339904785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332545, + "balance_loss_mlp": 1.13761449, + "epoch": 0.04078491727587533, + "flos": 579206231040.0, + "grad_norm": 0.026753366885260317, + "language_loss": 1.01893198, + "learning_rate": 0.0009996956550765317, + "loss": 1.03225756, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.953125, + "step": 212, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_mlp": 1.13668597, + "epoch": 0.0409772989611389, + "flos": 553368631296.0, + "grad_norm": 0.03340351088011317, + "language_loss": 0.96620274, + "learning_rate": 0.0009996846897189762, + "loss": 0.97951126, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.9453125, + "step": 213, + "time_per_iteration": 2.62785005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327396, + "balance_loss_mlp": 1.13332307, + "epoch": 0.04116968064640246, + "flos": 556764016128.0, + "grad_norm": 0.026256493309422244, + "language_loss": 1.0283711, + "learning_rate": 0.0009996735303666193, + "loss": 1.04164505, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.94433594, + "step": 214, + "time_per_iteration": 2.745412588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324547, + "balance_loss_mlp": 1.13152313, + "epoch": 0.041362062331666026, + "flos": 579651393024.0, + "grad_norm": 0.025801807715809106, + "language_loss": 1.04973316, + "learning_rate": 0.0009996621770237937, + "loss": 1.06297863, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.93359375, + "step": 215, + "time_per_iteration": 2.7359023094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_mlp": 1.12657344, + "epoch": 0.041554444016929586, + "flos": 612700729344.0, + "grad_norm": 0.027594527286323677, + "language_loss": 1.00985026, + "learning_rate": 0.0009996506296949073, + "loss": 1.02304435, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.93164062, + "step": 216, + "time_per_iteration": 2.860781669616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320461, + "balance_loss_mlp": 1.12781918, + "epoch": 0.04174682570219315, + "flos": 529150497792.0, + "grad_norm": 0.030561981852332186, + "language_loss": 1.01172602, + "learning_rate": 0.0009996388883844428, + "loss": 1.02493072, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.9296875, + "step": 217, + "time_per_iteration": 2.614837169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315002, + "balance_loss_mlp": 1.12255037, + "epoch": 0.04193920738745671, + "flos": 512499939840.0, + "grad_norm": 0.024235201889365978, + "language_loss": 1.04092622, + "learning_rate": 0.0009996269530969588, + "loss": 1.05407631, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.92773438, + "step": 218, + "time_per_iteration": 2.5777087211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317845, + "balance_loss_mlp": 1.1255846, + "epoch": 0.04213158907272028, + "flos": 572552448000.0, + "grad_norm": 0.03618883866707401, + "language_loss": 1.04623246, + "learning_rate": 0.0009996148238370888, + "loss": 1.05941105, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.92578125, + "step": 219, + "time_per_iteration": 2.723344564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319419, + "balance_loss_mlp": 1.12830234, + "epoch": 0.04232397075798384, + "flos": 965904098304.0, + "grad_norm": 0.02808123492922437, + "language_loss": 0.99962145, + "learning_rate": 0.0009996025006095421, + "loss": 1.01281559, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.9140625, + "step": 220, + "time_per_iteration": 3.297567844390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355408, + "balance_loss_mlp": 1.16314697, + "epoch": 0.042516352443247404, + "flos": 1472730628608.0, + "grad_norm": 0.031119874656221472, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.79138547, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.92578125, + "step": 221, + "time_per_iteration": 5.484851837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132056, + "balance_loss_mlp": 1.13039756, + "epoch": 0.042708734128510964, + "flos": 655891832832.0, + "grad_norm": 0.027306518139410985, + "language_loss": 0.99887031, + "learning_rate": 0.0009995772722706307, + "loss": 1.0120759, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.90429688, + "step": 222, + "time_per_iteration": 2.801955461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324867, + "balance_loss_mlp": 1.13518083, + "epoch": 0.04290111581377453, + "flos": 432733859328.0, + "grad_norm": 0.025166076900031344, + "language_loss": 1.13987851, + "learning_rate": 0.0009995643671690604, + "loss": 1.15312719, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.89941406, + "step": 223, + "time_per_iteration": 2.4589195251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320058, + "balance_loss_mlp": 1.13142133, + "epoch": 0.04309349749903809, + "flos": 645866860032.0, + "grad_norm": 0.02470776233740571, + "language_loss": 1.01624262, + "learning_rate": 0.0009995512681194023, + "loss": 1.02944326, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.88867188, + "step": 224, + "time_per_iteration": 2.854653835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319788, + "balance_loss_mlp": 1.13124692, + "epoch": 0.04328587918430166, + "flos": 832895745024.0, + "grad_norm": 0.02898896961022835, + "language_loss": 0.98942387, + "learning_rate": 0.0009995379751267417, + "loss": 1.00262189, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.88769531, + "step": 225, + "time_per_iteration": 3.260105609893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317885, + "balance_loss_mlp": 1.12943935, + "epoch": 0.043478260869565216, + "flos": 526115681280.0, + "grad_norm": 0.02601835272599882, + "language_loss": 1.00718379, + "learning_rate": 0.0009995244881962398, + "loss": 1.02036262, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.88671875, + "step": 226, + "time_per_iteration": 2.631685495376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320396, + "balance_loss_mlp": 1.13204539, + "epoch": 0.04367064255482878, + "flos": 440412225024.0, + "grad_norm": 0.02740546356326938, + "language_loss": 1.02089393, + "learning_rate": 0.0009995108073331323, + "loss": 1.03409791, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.88574219, + "step": 227, + "time_per_iteration": 2.6414895057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308962, + "balance_loss_mlp": 1.12156498, + "epoch": 0.04386302424009234, + "flos": 508466737152.0, + "grad_norm": 0.023646446246452554, + "language_loss": 1.04017711, + "learning_rate": 0.0009994969325427309, + "loss": 1.05326676, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.87597656, + "step": 228, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130646, + "balance_loss_mlp": 1.11906338, + "epoch": 0.04405540592535591, + "flos": 541743657984.0, + "grad_norm": 0.02642836262436834, + "language_loss": 1.00691068, + "learning_rate": 0.0009994828638304218, + "loss": 1.0199753, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.87597656, + "step": 229, + "time_per_iteration": 2.604616165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305226, + "balance_loss_mlp": 1.11792421, + "epoch": 0.04424778761061947, + "flos": 447309055488.0, + "grad_norm": 0.039218098968292335, + "language_loss": 1.07079852, + "learning_rate": 0.0009994686012016675, + "loss": 1.08385086, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.875, + "step": 230, + "time_per_iteration": 2.568608045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130487, + "balance_loss_mlp": 1.1187129, + "epoch": 0.044440169295883035, + "flos": 701981492736.0, + "grad_norm": 0.02721662483758601, + "language_loss": 1.06240797, + "learning_rate": 0.000999454144662005, + "loss": 1.07545662, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.86328125, + "step": 231, + "time_per_iteration": 2.9104526042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295735, + "balance_loss_mlp": 1.10957813, + "epoch": 0.044632550981146595, + "flos": 589426587648.0, + "grad_norm": 0.02817980914561194, + "language_loss": 1.003865, + "learning_rate": 0.0009994394942170468, + "loss": 1.01682234, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.86328125, + "step": 232, + "time_per_iteration": 2.674896001815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302928, + "balance_loss_mlp": 1.11667526, + "epoch": 0.04482493266641016, + "flos": 555854226432.0, + "grad_norm": 0.029144066951330677, + "language_loss": 0.98161608, + "learning_rate": 0.0009994246498724808, + "loss": 0.99464536, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.86425781, + "step": 233, + "time_per_iteration": 2.674178123474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302597, + "balance_loss_mlp": 1.11682117, + "epoch": 0.04501731435167372, + "flos": 724069870080.0, + "grad_norm": 0.027038299766394356, + "language_loss": 1.00722432, + "learning_rate": 0.00099940961163407, + "loss": 1.02025032, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.859375, + "step": 234, + "time_per_iteration": 2.8427939414978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301098, + "balance_loss_mlp": 1.11608493, + "epoch": 0.04520969603693728, + "flos": 512797381632.0, + "grad_norm": 0.027022139799708383, + "language_loss": 1.02586675, + "learning_rate": 0.0009993943795076528, + "loss": 1.03887773, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.8515625, + "step": 235, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295671, + "balance_loss_mlp": 1.11094403, + "epoch": 0.04540207772220085, + "flos": 365877846528.0, + "grad_norm": 0.03212133053651388, + "language_loss": 1.0562067, + "learning_rate": 0.0009993789534991427, + "loss": 1.06916356, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.84863281, + "step": 236, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294151, + "balance_loss_mlp": 1.1095196, + "epoch": 0.045594459407464406, + "flos": 523723411968.0, + "grad_norm": 0.029471400038435007, + "language_loss": 1.00276268, + "learning_rate": 0.0009993633336145287, + "loss": 1.01570415, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.84765625, + "step": 237, + "time_per_iteration": 2.6279234886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_mlp": 1.11284053, + "epoch": 0.04578684109272797, + "flos": 673115807232.0, + "grad_norm": 0.032189822363292264, + "language_loss": 1.04537559, + "learning_rate": 0.0009993475198598752, + "loss": 1.05834174, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.83886719, + "step": 238, + "time_per_iteration": 2.98264741897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294154, + "balance_loss_mlp": 1.11047626, + "epoch": 0.04597922277799153, + "flos": 542620520448.0, + "grad_norm": 0.025834809881005002, + "language_loss": 1.01282692, + "learning_rate": 0.0009993315122413212, + "loss": 1.02576852, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.83789062, + "step": 239, + "time_per_iteration": 2.5969364643096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297016, + "balance_loss_mlp": 1.11333883, + "epoch": 0.0461716044632551, + "flos": 459993540096.0, + "grad_norm": 0.025301515003642434, + "language_loss": 1.01210213, + "learning_rate": 0.0009993153107650818, + "loss": 1.02507234, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.83789062, + "step": 240, + "time_per_iteration": 2.590198278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297188, + "balance_loss_mlp": 1.11360526, + "epoch": 0.04636398614851866, + "flos": 456170457600.0, + "grad_norm": 0.0338801607583888, + "language_loss": 1.01026332, + "learning_rate": 0.0009992989154374468, + "loss": 1.0232352, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.83691406, + "step": 241, + "time_per_iteration": 2.5699570178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012963, + "balance_loss_mlp": 1.11271763, + "epoch": 0.046556367833782225, + "flos": 557901390336.0, + "grad_norm": 0.02656657647638049, + "language_loss": 1.0757494, + "learning_rate": 0.0009992823262647817, + "loss": 1.08871233, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.83691406, + "step": 242, + "time_per_iteration": 2.6949496269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293965, + "balance_loss_mlp": 1.11047852, + "epoch": 0.046748749519045785, + "flos": 594087601152.0, + "grad_norm": 0.02772781005565529, + "language_loss": 1.02479577, + "learning_rate": 0.0009992655432535264, + "loss": 1.03773546, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.8359375, + "step": 243, + "time_per_iteration": 2.7783396244049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286985, + "balance_loss_mlp": 1.10454702, + "epoch": 0.04694113120430935, + "flos": 570941713920.0, + "grad_norm": 0.021337056529223342, + "language_loss": 1.01771712, + "learning_rate": 0.0009992485664101973, + "loss": 1.03058696, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.82519531, + "step": 244, + "time_per_iteration": 2.679227590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286082, + "balance_loss_mlp": 1.10364425, + "epoch": 0.04713351288957291, + "flos": 865245411840.0, + "grad_norm": 0.03170954338904746, + "language_loss": 1.04355013, + "learning_rate": 0.000999231395741385, + "loss": 1.05641103, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.82519531, + "step": 245, + "time_per_iteration": 3.0976788997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287082, + "balance_loss_mlp": 1.10473943, + "epoch": 0.04732589457483648, + "flos": 538235481600.0, + "grad_norm": 0.02353809889700427, + "language_loss": 1.02393425, + "learning_rate": 0.0009992140312537557, + "loss": 1.03680515, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.82421875, + "step": 246, + "time_per_iteration": 2.6005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_mlp": 1.1048938, + "epoch": 0.04751827626010004, + "flos": 763271431680.0, + "grad_norm": 0.021903859990429042, + "language_loss": 0.96665001, + "learning_rate": 0.000999196472954051, + "loss": 0.97951376, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.81542969, + "step": 247, + "time_per_iteration": 2.95379638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319153, + "balance_loss_mlp": 1.13833618, + "epoch": 0.0477106579453636, + "flos": 1583125578240.0, + "grad_norm": 0.034344144576267104, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80744004, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.80859375, + "step": 248, + "time_per_iteration": 6.070216655731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286412, + "balance_loss_mlp": 1.10521388, + "epoch": 0.04790303963062716, + "flos": 458692982784.0, + "grad_norm": 0.024476775577385278, + "language_loss": 1.04631317, + "learning_rate": 0.0009991607749457578, + "loss": 1.05917728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.8125, + "step": 249, + "time_per_iteration": 2.5741825103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128623, + "balance_loss_mlp": 1.10503209, + "epoch": 0.04809542131589073, + "flos": 783786004992.0, + "grad_norm": 0.021665977114244464, + "language_loss": 1.0235486, + "learning_rate": 0.0009991426352510286, + "loss": 1.03641105, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.81152344, + "step": 250, + "time_per_iteration": 3.004519462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_mlp": 1.10648286, + "epoch": 0.04828780300115429, + "flos": 560321857536.0, + "grad_norm": 0.028059326531900755, + "language_loss": 1.04456568, + "learning_rate": 0.0009991243017719422, + "loss": 1.05743682, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.8046875, + "step": 251, + "time_per_iteration": 2.666212320327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283793, + "balance_loss_mlp": 1.10364354, + "epoch": 0.048480184686417856, + "flos": 502922130432.0, + "grad_norm": 0.02282661348297379, + "language_loss": 0.985008, + "learning_rate": 0.0009991057745156165, + "loss": 0.99784589, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.80078125, + "step": 252, + "time_per_iteration": 2.6053824424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291534, + "balance_loss_mlp": 1.11186218, + "epoch": 0.048672566371681415, + "flos": 1539469120512.0, + "grad_norm": 0.022804524860740846, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83202517, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.796875, + "step": 253, + "time_per_iteration": 5.005317449569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285445, + "balance_loss_mlp": 1.10500991, + "epoch": 0.04886494805694498, + "flos": 538951888896.0, + "grad_norm": 0.028242285238858512, + "language_loss": 1.06865251, + "learning_rate": 0.0009990681387000943, + "loss": 1.08150697, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.80371094, + "step": 254, + "time_per_iteration": 2.743307590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283321, + "balance_loss_mlp": 1.10317183, + "epoch": 0.04905732974220854, + "flos": 681484383744.0, + "grad_norm": 0.028658365214850164, + "language_loss": 1.02065015, + "learning_rate": 0.0009990490301555093, + "loss": 1.03348327, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.80126953, + "step": 255, + "time_per_iteration": 2.989856719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291977, + "balance_loss_mlp": 1.1134491, + "epoch": 0.04924971142747211, + "flos": 1424274895872.0, + "grad_norm": 0.01325206916769545, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80507129, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.78515625, + "step": 256, + "time_per_iteration": 4.888273477554321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281082, + "balance_loss_mlp": 1.10255432, + "epoch": 0.04944209311273567, + "flos": 1561236587520.0, + "grad_norm": 0.00993410716153638, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80523825, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.78515625, + "step": 257, + "time_per_iteration": 4.983605623245239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_mlp": 1.10786438, + "epoch": 0.04963447479799923, + "flos": 1574170850304.0, + "grad_norm": 0.014798835308040135, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71261322, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.77539062, + "step": 258, + "time_per_iteration": 4.888776540756226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_mlp": 1.10310864, + "epoch": 0.049826856483262794, + "flos": 626498393088.0, + "grad_norm": 0.032236291487241595, + "language_loss": 0.9680413, + "learning_rate": 0.0009989706585723202, + "loss": 0.98086333, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.79003906, + "step": 259, + "time_per_iteration": 2.776397705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280186, + "balance_loss_mlp": 1.10175359, + "epoch": 0.05001923816852635, + "flos": 505155945984.0, + "grad_norm": 0.03442249770662494, + "language_loss": 1.03026366, + "learning_rate": 0.0009989505813633442, + "loss": 1.04306555, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.78271484, + "step": 260, + "time_per_iteration": 2.651773691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281097, + "balance_loss_mlp": 1.10295069, + "epoch": 0.05021161985378992, + "flos": 588467132928.0, + "grad_norm": 0.024781843968885862, + "language_loss": 1.02880228, + "learning_rate": 0.000998930310444573, + "loss": 1.04161322, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.78125, + "step": 261, + "time_per_iteration": 2.730717420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_mlp": 1.08796966, + "epoch": 0.05040400153905348, + "flos": 634402341888.0, + "grad_norm": 0.028473185138455738, + "language_loss": 1.01351452, + "learning_rate": 0.0009989098458238765, + "loss": 1.02617574, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.77929688, + "step": 262, + "time_per_iteration": 2.7717010974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272128, + "balance_loss_mlp": 1.09407711, + "epoch": 0.050596383224317046, + "flos": 554808176640.0, + "grad_norm": 0.03464065468219783, + "language_loss": 1.00597906, + "learning_rate": 0.0009988891875091998, + "loss": 1.01870036, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.77880859, + "step": 263, + "time_per_iteration": 2.8842556476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012725, + "balance_loss_mlp": 1.09444928, + "epoch": 0.050788764909580605, + "flos": 550761512448.0, + "grad_norm": 0.02541343292713684, + "language_loss": 0.95014787, + "learning_rate": 0.0009988683355085636, + "loss": 0.96287298, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.77880859, + "step": 264, + "time_per_iteration": 2.7466378211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_mlp": 1.09527469, + "epoch": 0.05098114659484417, + "flos": 606344388096.0, + "grad_norm": 0.02024934595994547, + "language_loss": 1.03858495, + "learning_rate": 0.000998847289830063, + "loss": 1.05131388, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.77587891, + "step": 265, + "time_per_iteration": 2.821997880935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285574, + "balance_loss_mlp": 1.10761857, + "epoch": 0.05117352828010773, + "flos": 439472236032.0, + "grad_norm": 0.026937538773041583, + "language_loss": 0.97004128, + "learning_rate": 0.0009988260504818682, + "loss": 0.98289704, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.77832031, + "step": 266, + "time_per_iteration": 2.557830333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277028, + "balance_loss_mlp": 1.09907281, + "epoch": 0.0513659099653713, + "flos": 506030807040.0, + "grad_norm": 0.02494960853942852, + "language_loss": 1.03986156, + "learning_rate": 0.000998804617472226, + "loss": 1.05263186, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.77832031, + "step": 267, + "time_per_iteration": 2.644099235534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_mlp": 1.09254682, + "epoch": 0.05155829165063486, + "flos": 696714862080.0, + "grad_norm": 0.027664306986101984, + "language_loss": 0.98796493, + "learning_rate": 0.0009987829908094568, + "loss": 1.00066042, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.76953125, + "step": 268, + "time_per_iteration": 2.8291003704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_mlp": 1.08817983, + "epoch": 0.051750673335898424, + "flos": 1350300294144.0, + "grad_norm": 0.03385083640642466, + "language_loss": 1.06218576, + "learning_rate": 0.0009987611705019569, + "loss": 1.07483661, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.76855469, + "step": 269, + "time_per_iteration": 4.150776624679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264769, + "balance_loss_mlp": 1.08795822, + "epoch": 0.051943055021161984, + "flos": 490589481984.0, + "grad_norm": 0.028250493976035247, + "language_loss": 1.04104686, + "learning_rate": 0.0009987391565581978, + "loss": 1.05369449, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.76757812, + "step": 270, + "time_per_iteration": 2.5921454429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266977, + "balance_loss_mlp": 1.09092879, + "epoch": 0.05213543670642555, + "flos": 546880032768.0, + "grad_norm": 0.026669721507250346, + "language_loss": 0.96455419, + "learning_rate": 0.000998716948986726, + "loss": 0.97722399, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.75976562, + "step": 271, + "time_per_iteration": 2.7835500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_mlp": 1.09264266, + "epoch": 0.05232781839168911, + "flos": 604672528896.0, + "grad_norm": 0.03568520247936263, + "language_loss": 0.99334317, + "learning_rate": 0.0009986945477961633, + "loss": 1.00602722, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.75683594, + "step": 272, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_mlp": 1.0953902, + "epoch": 0.052520200076952676, + "flos": 539655561216.0, + "grad_norm": 0.02343402151836954, + "language_loss": 1.0317328, + "learning_rate": 0.0009986719529952066, + "loss": 1.04444528, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.7578125, + "step": 273, + "time_per_iteration": 2.908298969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_mlp": 1.09266984, + "epoch": 0.052712581762216236, + "flos": 464332916736.0, + "grad_norm": 0.028493663433316604, + "language_loss": 1.03350449, + "learning_rate": 0.000998649164592628, + "loss": 1.0461911, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.75927734, + "step": 274, + "time_per_iteration": 2.5805718898773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_mlp": 1.08735609, + "epoch": 0.0529049634474798, + "flos": 549105116160.0, + "grad_norm": 0.024462560446863554, + "language_loss": 1.01155043, + "learning_rate": 0.0009986261825972748, + "loss": 1.02418458, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.75976562, + "step": 275, + "time_per_iteration": 2.675705909729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_mlp": 1.09334803, + "epoch": 0.05309734513274336, + "flos": 619200061440.0, + "grad_norm": 0.026443817532743642, + "language_loss": 1.03055406, + "learning_rate": 0.000998603007018069, + "loss": 1.04324436, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.75585938, + "step": 276, + "time_per_iteration": 2.77298903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264217, + "balance_loss_mlp": 1.08893192, + "epoch": 0.05328972681800693, + "flos": 606617634816.0, + "grad_norm": 0.022439827576013177, + "language_loss": 1.00613213, + "learning_rate": 0.0009985796378640089, + "loss": 1.01877427, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.75195312, + "step": 277, + "time_per_iteration": 2.693049669265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_mlp": 1.08963549, + "epoch": 0.05348210850327049, + "flos": 605730038784.0, + "grad_norm": 0.02549683888178727, + "language_loss": 1.01102281, + "learning_rate": 0.0009985560751441665, + "loss": 1.02366924, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.74902344, + "step": 278, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262716, + "balance_loss_mlp": 1.08757329, + "epoch": 0.053674490188534055, + "flos": 631997337600.0, + "grad_norm": 0.025192100126554, + "language_loss": 1.03316271, + "learning_rate": 0.00099853231886769, + "loss": 1.04578984, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.75048828, + "step": 279, + "time_per_iteration": 2.8228564262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262121, + "balance_loss_mlp": 1.08712184, + "epoch": 0.053866871873797614, + "flos": 480173741568.0, + "grad_norm": 0.02583251996588833, + "language_loss": 1.02629757, + "learning_rate": 0.0009985083690438024, + "loss": 1.03891873, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.74902344, + "step": 280, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260843, + "balance_loss_mlp": 1.08655906, + "epoch": 0.054059253559061174, + "flos": 789489065472.0, + "grad_norm": 0.023704628566171972, + "language_loss": 0.9340027, + "learning_rate": 0.0009984842256818016, + "loss": 0.94661117, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.74169922, + "step": 281, + "time_per_iteration": 3.084801435470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257985, + "balance_loss_mlp": 1.08379591, + "epoch": 0.05425163524432474, + "flos": 629505011712.0, + "grad_norm": 0.027462270528210347, + "language_loss": 1.04308844, + "learning_rate": 0.0009984598887910613, + "loss": 1.05566835, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.74072266, + "step": 282, + "time_per_iteration": 2.729063034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_mlp": 1.08855665, + "epoch": 0.0544440169295883, + "flos": 616992442368.0, + "grad_norm": 0.02580860229759897, + "language_loss": 0.99945354, + "learning_rate": 0.0009984353583810297, + "loss": 1.01208091, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.74072266, + "step": 283, + "time_per_iteration": 2.812309741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258383, + "balance_loss_mlp": 1.08433735, + "epoch": 0.05463639861485187, + "flos": 648929874432.0, + "grad_norm": 0.0290705298354334, + "language_loss": 1.01989841, + "learning_rate": 0.0009984106344612302, + "loss": 1.03248215, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.73925781, + "step": 284, + "time_per_iteration": 2.785377264022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_mlp": 1.0907625, + "epoch": 0.054828780300115426, + "flos": 798584782848.0, + "grad_norm": 0.03167011835004719, + "language_loss": 0.97435868, + "learning_rate": 0.0009983857170412615, + "loss": 0.9869982, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.73046875, + "step": 285, + "time_per_iteration": 2.9822604656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258353, + "balance_loss_mlp": 1.08511817, + "epoch": 0.05502116198537899, + "flos": 550798442496.0, + "grad_norm": 0.02077828299254123, + "language_loss": 0.96197385, + "learning_rate": 0.000998360606130798, + "loss": 0.9745574, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.73095703, + "step": 286, + "time_per_iteration": 2.8340489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_mlp": 1.09461975, + "epoch": 0.05521354367064255, + "flos": 1410906931200.0, + "grad_norm": 0.010589673029146669, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70339394, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.71484375, + "step": 287, + "time_per_iteration": 4.893908500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126535, + "balance_loss_mlp": 1.09235394, + "epoch": 0.05540592535590612, + "flos": 646611465216.0, + "grad_norm": 0.04031113274469801, + "language_loss": 1.02544129, + "learning_rate": 0.0009983098038774552, + "loss": 1.03809476, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.72851562, + "step": 288, + "time_per_iteration": 2.800687551498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_mlp": 1.08712769, + "epoch": 0.05559830704116968, + "flos": 1514315727360.0, + "grad_norm": 0.011752943348929798, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79428822, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.71289062, + "step": 289, + "time_per_iteration": 4.802466630935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_mlp": 1.08869088, + "epoch": 0.055790688726433245, + "flos": 509334867456.0, + "grad_norm": 0.03460900762027919, + "language_loss": 1.00913107, + "learning_rate": 0.0009982582277800948, + "loss": 1.02174735, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.72802734, + "step": 290, + "time_per_iteration": 2.574007749557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255451, + "balance_loss_mlp": 1.08326483, + "epoch": 0.055983070411696804, + "flos": 659074369536.0, + "grad_norm": 0.03439417592421578, + "language_loss": 1.07703924, + "learning_rate": 0.0009982321495648908, + "loss": 1.08959377, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.72021484, + "step": 291, + "time_per_iteration": 2.8004326820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257264, + "balance_loss_mlp": 1.08503067, + "epoch": 0.05617545209696037, + "flos": 588475865088.0, + "grad_norm": 0.024241847728240208, + "language_loss": 0.9905349, + "learning_rate": 0.0009982058779188115, + "loss": 1.00310755, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.72070312, + "step": 292, + "time_per_iteration": 2.763096570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257503, + "balance_loss_mlp": 1.0853169, + "epoch": 0.05636783378222393, + "flos": 612787324416.0, + "grad_norm": 0.027188079674348095, + "language_loss": 1.06693649, + "learning_rate": 0.0009981794128520567, + "loss": 1.07951164, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.72021484, + "step": 293, + "time_per_iteration": 2.7630960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253426, + "balance_loss_mlp": 1.08123958, + "epoch": 0.0565602154674875, + "flos": 669422980608.0, + "grad_norm": 0.030197403892147204, + "language_loss": 1.03523457, + "learning_rate": 0.000998152754374901, + "loss": 1.04776871, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.72021484, + "step": 294, + "time_per_iteration": 2.8583314418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249713, + "balance_loss_mlp": 1.07743168, + "epoch": 0.05675259715275106, + "flos": 618364131840.0, + "grad_norm": 0.026289358543143387, + "language_loss": 0.99071473, + "learning_rate": 0.0009981259024976943, + "loss": 1.00321186, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.72119141, + "step": 295, + "time_per_iteration": 2.719881534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250566, + "balance_loss_mlp": 1.07814193, + "epoch": 0.05694497883801462, + "flos": 753153133056.0, + "grad_norm": 0.03148267511857758, + "language_loss": 0.97962338, + "learning_rate": 0.0009980988572308612, + "loss": 0.99212909, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.72265625, + "step": 296, + "time_per_iteration": 2.9828195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250905, + "balance_loss_mlp": 1.0789572, + "epoch": 0.05713736052327818, + "flos": 713380882944.0, + "grad_norm": 0.02524811137395651, + "language_loss": 1.00250125, + "learning_rate": 0.0009980716185849015, + "loss": 1.01501024, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.71777344, + "step": 297, + "time_per_iteration": 2.9749252796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251066, + "balance_loss_mlp": 1.07959557, + "epoch": 0.05732974220854175, + "flos": 469935920640.0, + "grad_norm": 0.024054663695119705, + "language_loss": 0.96916056, + "learning_rate": 0.0009980441865703904, + "loss": 0.98167121, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.71289062, + "step": 298, + "time_per_iteration": 2.598325252532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250911, + "balance_loss_mlp": 1.07939255, + "epoch": 0.05752212389380531, + "flos": 602540771328.0, + "grad_norm": 0.025930022992042723, + "language_loss": 1.05563986, + "learning_rate": 0.000998016561197978, + "loss": 1.06814897, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.71337891, + "step": 299, + "time_per_iteration": 2.690300703048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250529, + "balance_loss_mlp": 1.07924938, + "epoch": 0.057714505579068875, + "flos": 679949511168.0, + "grad_norm": 0.025847674874905035, + "language_loss": 0.97115421, + "learning_rate": 0.0009979887424783895, + "loss": 0.98365951, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.7109375, + "step": 300, + "time_per_iteration": 2.863856554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249286, + "balance_loss_mlp": 1.07810116, + "epoch": 0.057906887264332435, + "flos": 597011627520.0, + "grad_norm": 0.02594453351976595, + "language_loss": 0.96475613, + "learning_rate": 0.0009979607304224248, + "loss": 0.97724897, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.70996094, + "step": 301, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248659, + "balance_loss_mlp": 1.0772841, + "epoch": 0.058099268949596, + "flos": 553164515328.0, + "grad_norm": 0.024492956239426298, + "language_loss": 1.0387162, + "learning_rate": 0.000997932525040959, + "loss": 1.05120289, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.71191406, + "step": 302, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252345, + "balance_loss_mlp": 1.08111238, + "epoch": 0.05829165063485956, + "flos": 509230808064.0, + "grad_norm": 0.038324718957869854, + "language_loss": 1.05616117, + "learning_rate": 0.000997904126344943, + "loss": 1.06868458, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.71044922, + "step": 303, + "time_per_iteration": 2.611621141433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125091, + "balance_loss_mlp": 1.080441, + "epoch": 0.05848403232012313, + "flos": 616362630144.0, + "grad_norm": 0.028818083574726525, + "language_loss": 1.02425826, + "learning_rate": 0.0009978755343454018, + "loss": 1.03676736, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.70263672, + "step": 304, + "time_per_iteration": 2.750213384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245805, + "balance_loss_mlp": 1.07490659, + "epoch": 0.05867641400538669, + "flos": 501079082496.0, + "grad_norm": 0.025195073137535502, + "language_loss": 1.02874422, + "learning_rate": 0.0009978467490534355, + "loss": 1.04120219, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.70703125, + "step": 305, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124905, + "balance_loss_mlp": 1.07853293, + "epoch": 0.05886879569065025, + "flos": 532378696704.0, + "grad_norm": 0.026491629776715375, + "language_loss": 0.99473399, + "learning_rate": 0.00099781777048022, + "loss": 1.00722456, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.703125, + "step": 306, + "time_per_iteration": 2.731084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012482, + "balance_loss_mlp": 1.07782638, + "epoch": 0.05906117737591381, + "flos": 490040260608.0, + "grad_norm": 0.025118942729794178, + "language_loss": 1.01122224, + "learning_rate": 0.0009977885986370057, + "loss": 1.02370417, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.70166016, + "step": 307, + "time_per_iteration": 2.548307418823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247075, + "balance_loss_mlp": 1.0766536, + "epoch": 0.05925355906117737, + "flos": 592709180928.0, + "grad_norm": 0.029001286226925486, + "language_loss": 0.96780527, + "learning_rate": 0.000997759233535118, + "loss": 0.98027599, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.70214844, + "step": 308, + "time_per_iteration": 2.7876322269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247056, + "balance_loss_mlp": 1.07668173, + "epoch": 0.05944594074644094, + "flos": 564787487232.0, + "grad_norm": 0.026648157056946717, + "language_loss": 1.03345561, + "learning_rate": 0.0009977296751859576, + "loss": 1.04592621, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.70166016, + "step": 309, + "time_per_iteration": 2.71488094329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_mlp": 1.07958508, + "epoch": 0.0596383224317045, + "flos": 539807284224.0, + "grad_norm": 0.023775477335694146, + "language_loss": 1.04459929, + "learning_rate": 0.0009976999236009998, + "loss": 1.05709469, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.69726562, + "step": 310, + "time_per_iteration": 2.7919182777404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_mlp": 1.08618629, + "epoch": 0.059830704116968066, + "flos": 562052113920.0, + "grad_norm": 0.02942700961653022, + "language_loss": 1.06853497, + "learning_rate": 0.0009976699787917955, + "loss": 1.08109009, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.69091797, + "step": 311, + "time_per_iteration": 2.6729257106781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012565, + "balance_loss_mlp": 1.08789062, + "epoch": 0.060023085802231625, + "flos": 1574047325184.0, + "grad_norm": 0.029063497479097016, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74699497, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.68359375, + "step": 312, + "time_per_iteration": 4.972649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249775, + "balance_loss_mlp": 1.08021212, + "epoch": 0.06021546748749519, + "flos": 483627523584.0, + "grad_norm": 0.0314235925459163, + "language_loss": 0.98280072, + "learning_rate": 0.0009976095095472243, + "loss": 0.9952985, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.69335938, + "step": 313, + "time_per_iteration": 2.5644209384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125234, + "balance_loss_mlp": 1.08287179, + "epoch": 0.06040784917275875, + "flos": 621423143424.0, + "grad_norm": 0.030123719928355924, + "language_loss": 0.99538821, + "learning_rate": 0.0009975789851353334, + "loss": 1.00791156, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.69238281, + "step": 314, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256592, + "balance_loss_mlp": 1.08741045, + "epoch": 0.06060023085802232, + "flos": 484602441216.0, + "grad_norm": 0.026992074473858402, + "language_loss": 1.01683283, + "learning_rate": 0.0009975482675461487, + "loss": 1.02939868, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.68945312, + "step": 315, + "time_per_iteration": 2.67146897315979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_mlp": 1.08054566, + "epoch": 0.06079261254328588, + "flos": 582985652736.0, + "grad_norm": 0.0292304668639163, + "language_loss": 0.99909455, + "learning_rate": 0.0009975173567915952, + "loss": 1.01158559, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.68310547, + "step": 316, + "time_per_iteration": 2.693526268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124983, + "balance_loss_mlp": 1.08131599, + "epoch": 0.060984994228549444, + "flos": 689008298496.0, + "grad_norm": 0.03272213432041067, + "language_loss": 0.93868685, + "learning_rate": 0.000997486252883674, + "loss": 0.95118511, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.68261719, + "step": 317, + "time_per_iteration": 2.837315082550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252509, + "balance_loss_mlp": 1.08399427, + "epoch": 0.061177375913813004, + "flos": 1316747398656.0, + "grad_norm": 0.031012352820614663, + "language_loss": 0.98949343, + "learning_rate": 0.0009974549558344602, + "loss": 1.00201845, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.68261719, + "step": 318, + "time_per_iteration": 3.686920166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_mlp": 1.08321846, + "epoch": 0.06136975759907657, + "flos": 575400612864.0, + "grad_norm": 0.027925836735275204, + "language_loss": 1.08640313, + "learning_rate": 0.000997423465656105, + "loss": 1.09892082, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.68310547, + "step": 319, + "time_per_iteration": 2.7691538333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250553, + "balance_loss_mlp": 1.08218133, + "epoch": 0.06156213928434013, + "flos": 528564346368.0, + "grad_norm": 0.033042319608268485, + "language_loss": 1.06051123, + "learning_rate": 0.0009973917823608335, + "loss": 1.07301688, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.68115234, + "step": 320, + "time_per_iteration": 2.583859443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251303, + "balance_loss_mlp": 1.08364725, + "epoch": 0.061754520969603696, + "flos": 496589984256.0, + "grad_norm": 0.025351519610416894, + "language_loss": 0.99929821, + "learning_rate": 0.0009973599059609462, + "loss": 1.01181126, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.67382812, + "step": 321, + "time_per_iteration": 2.7139415740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246641, + "balance_loss_mlp": 1.07893777, + "epoch": 0.061946902654867256, + "flos": 441044038656.0, + "grad_norm": 0.025867704850659153, + "language_loss": 0.98033404, + "learning_rate": 0.000997327836468819, + "loss": 0.99280047, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.67431641, + "step": 322, + "time_per_iteration": 2.598400831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250362, + "balance_loss_mlp": 1.08280182, + "epoch": 0.06213928434013082, + "flos": 600042441216.0, + "grad_norm": 0.02535167136018297, + "language_loss": 1.01516175, + "learning_rate": 0.000997295573896902, + "loss": 1.02766538, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.67285156, + "step": 323, + "time_per_iteration": 2.8295648097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125071, + "balance_loss_mlp": 1.0847702, + "epoch": 0.06233166602539438, + "flos": 1453114384896.0, + "grad_norm": 0.012451454042686489, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82446748, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.65625, + "step": 324, + "time_per_iteration": 4.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244164, + "balance_loss_mlp": 1.07803345, + "epoch": 0.06252404771065795, + "flos": 1466628794880.0, + "grad_norm": 0.009026829376029815, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79816103, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.65820312, + "step": 325, + "time_per_iteration": 4.859014272689819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252677, + "balance_loss_mlp": 1.08535445, + "epoch": 0.06271642939592151, + "flos": 465235975680.0, + "grad_norm": 0.02899330239765154, + "language_loss": 0.95714885, + "learning_rate": 0.000997197627828043, + "loss": 0.96967566, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.67041016, + "step": 326, + "time_per_iteration": 2.5137081146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250284, + "balance_loss_mlp": 1.08343852, + "epoch": 0.06290881108118507, + "flos": 533431477248.0, + "grad_norm": 0.02712212536791958, + "language_loss": 0.90827119, + "learning_rate": 0.0009971645930629716, + "loss": 0.92077404, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.66552734, + "step": 327, + "time_per_iteration": 2.6867988109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_mlp": 1.08260453, + "epoch": 0.06310119276644863, + "flos": 674767474176.0, + "grad_norm": 0.026247049513885422, + "language_loss": 1.04735494, + "learning_rate": 0.0009971313652814872, + "loss": 1.0598489, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.66503906, + "step": 328, + "time_per_iteration": 2.845618724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245995, + "balance_loss_mlp": 1.07924485, + "epoch": 0.0632935744517122, + "flos": 772050241536.0, + "grad_norm": 0.03020034978800923, + "language_loss": 1.02482498, + "learning_rate": 0.0009970979444964903, + "loss": 1.03728485, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.66455078, + "step": 329, + "time_per_iteration": 2.967315196990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249674, + "balance_loss_mlp": 1.08316231, + "epoch": 0.06348595613697576, + "flos": 562974638592.0, + "grad_norm": 0.027434293654228625, + "language_loss": 1.03562641, + "learning_rate": 0.0009970643307209556, + "loss": 1.04812312, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.66210938, + "step": 330, + "time_per_iteration": 2.7991747856140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_mlp": 1.0814544, + "epoch": 0.06367833782223932, + "flos": 677383325184.0, + "grad_norm": 0.030236705728133754, + "language_loss": 1.00163436, + "learning_rate": 0.0009970305239679334, + "loss": 1.01411343, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.66162109, + "step": 331, + "time_per_iteration": 2.8012547492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243208, + "balance_loss_mlp": 1.07669675, + "epoch": 0.06387071950750288, + "flos": 496348938240.0, + "grad_norm": 0.029279450628507057, + "language_loss": 1.04491925, + "learning_rate": 0.0009969965242505483, + "loss": 1.05735123, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.66210938, + "step": 332, + "time_per_iteration": 2.658085584640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251001, + "balance_loss_mlp": 1.08463287, + "epoch": 0.06406310119276645, + "flos": 534556116480.0, + "grad_norm": 0.029350032940601952, + "language_loss": 1.00548685, + "learning_rate": 0.0009969623315820007, + "loss": 1.01799679, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.66064453, + "step": 333, + "time_per_iteration": 2.6670596599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238877, + "balance_loss_mlp": 1.07246125, + "epoch": 0.06425548287803001, + "flos": 457164840960.0, + "grad_norm": 0.03277849846880731, + "language_loss": 1.00979996, + "learning_rate": 0.000996927945975565, + "loss": 1.02218866, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 1.66113281, + "step": 334, + "time_per_iteration": 2.5448765754699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245409, + "balance_loss_mlp": 1.0792315, + "epoch": 0.06444786456329357, + "flos": 561122858496.0, + "grad_norm": 0.03573042475309631, + "language_loss": 0.98108363, + "learning_rate": 0.0009968933674445906, + "loss": 0.99353766, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 1.65869141, + "step": 335, + "time_per_iteration": 2.679093360900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242425, + "balance_loss_mlp": 1.07672429, + "epoch": 0.06464024624855713, + "flos": 667356350976.0, + "grad_norm": 0.0316377115871937, + "language_loss": 0.99817598, + "learning_rate": 0.0009968585960025028, + "loss": 1.01060021, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 1.65380859, + "step": 336, + "time_per_iteration": 2.9642832279205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246223, + "balance_loss_mlp": 1.08085632, + "epoch": 0.0648326279338207, + "flos": 1524555549696.0, + "grad_norm": 0.012731648189289846, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78899413, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.65039062, + "step": 337, + "time_per_iteration": 4.799122333526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_mlp": 1.07683408, + "epoch": 0.06502500961908426, + "flos": 1145214959616.0, + "grad_norm": 0.030168792806873873, + "language_loss": 0.98216963, + "learning_rate": 0.0009967884744390583, + "loss": 0.99459207, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 1.65087891, + "step": 338, + "time_per_iteration": 3.513155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243978, + "balance_loss_mlp": 1.07865858, + "epoch": 0.06521739130434782, + "flos": 583693327872.0, + "grad_norm": 0.025823410577593665, + "language_loss": 0.98998213, + "learning_rate": 0.0009967531243449256, + "loss": 1.00242186, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 1.64990234, + "step": 339, + "time_per_iteration": 2.6683707237243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_mlp": 1.07453787, + "epoch": 0.06540977298961138, + "flos": 498658615296.0, + "grad_norm": 0.02384437782241591, + "language_loss": 1.06067204, + "learning_rate": 0.000996717581394126, + "loss": 1.07306671, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 1.64599609, + "step": 340, + "time_per_iteration": 2.5471885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236124, + "balance_loss_mlp": 1.07171023, + "epoch": 0.06560215467487496, + "flos": 543903613440.0, + "grad_norm": 0.02318937955413124, + "language_loss": 1.0712086, + "learning_rate": 0.000996681845600459, + "loss": 1.08356977, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 1.640625, + "step": 341, + "time_per_iteration": 2.651742458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240028, + "balance_loss_mlp": 1.07575738, + "epoch": 0.06579453636013852, + "flos": 414351043584.0, + "grad_norm": 0.026316803994829763, + "language_loss": 0.99228215, + "learning_rate": 0.0009966459169777982, + "loss": 1.00468254, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 1.63916016, + "step": 342, + "time_per_iteration": 2.4996230602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244627, + "balance_loss_mlp": 1.08045232, + "epoch": 0.06598691804540208, + "flos": 561680812032.0, + "grad_norm": 0.03097158399986616, + "language_loss": 1.07124209, + "learning_rate": 0.0009966097955400924, + "loss": 1.08368838, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 1.63818359, + "step": 343, + "time_per_iteration": 2.7243080139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238691, + "balance_loss_mlp": 1.07451606, + "epoch": 0.06617929973066564, + "flos": 573301782528.0, + "grad_norm": 0.022915441754152527, + "language_loss": 1.00964892, + "learning_rate": 0.0009965734813013652, + "loss": 1.02203584, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 1.63818359, + "step": 344, + "time_per_iteration": 2.8087360858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237027, + "balance_loss_mlp": 1.07375824, + "epoch": 0.06637168141592921, + "flos": 491464343040.0, + "grad_norm": 0.024444849604151265, + "language_loss": 1.03758335, + "learning_rate": 0.0009965369742757151, + "loss": 1.04995358, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 1.62890625, + "step": 345, + "time_per_iteration": 2.5691587924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237907, + "balance_loss_mlp": 1.07459044, + "epoch": 0.06656406310119277, + "flos": 1081037924352.0, + "grad_norm": 0.024807678995847144, + "language_loss": 0.99529493, + "learning_rate": 0.0009965002744773152, + "loss": 1.00767398, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 1.62939453, + "step": 346, + "time_per_iteration": 3.507969856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239522, + "balance_loss_mlp": 1.07611036, + "epoch": 0.06675644478645633, + "flos": 514723021824.0, + "grad_norm": 0.02663627628784384, + "language_loss": 0.97097999, + "learning_rate": 0.0009964633819204139, + "loss": 0.98337519, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 1.63037109, + "step": 347, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_mlp": 1.09986115, + "epoch": 0.06694882647171989, + "flos": 1450534189056.0, + "grad_norm": 0.030948258254188146, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83063102, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 1.6171875, + "step": 348, + "time_per_iteration": 5.152506589889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236206, + "balance_loss_mlp": 1.07427216, + "epoch": 0.06714120815698346, + "flos": 1555397266944.0, + "grad_norm": 0.0077968992848742235, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76390088, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.61523438, + "step": 349, + "time_per_iteration": 4.909464120864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242005, + "balance_loss_mlp": 1.07873547, + "epoch": 0.06733358984224702, + "flos": 881615992320.0, + "grad_norm": 0.03432587789196913, + "language_loss": 0.97228402, + "learning_rate": 0.000996351547842304, + "loss": 0.98470408, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 1.62890625, + "step": 350, + "time_per_iteration": 3.1799545288085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240315, + "balance_loss_mlp": 1.0778569, + "epoch": 0.06752597152751058, + "flos": 519917793792.0, + "grad_norm": 0.030803186893757592, + "language_loss": 0.96182388, + "learning_rate": 0.0009963138843953744, + "loss": 0.97422707, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 1.62060547, + "step": 351, + "time_per_iteration": 2.5873348712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238163, + "balance_loss_mlp": 1.07565665, + "epoch": 0.06771835321277414, + "flos": 540882258432.0, + "grad_norm": 0.023778523337364334, + "language_loss": 0.99575555, + "learning_rate": 0.000996276028262306, + "loss": 1.00813723, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 1.62109375, + "step": 352, + "time_per_iteration": 2.7943532466888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238104, + "balance_loss_mlp": 1.07583654, + "epoch": 0.0679107348980377, + "flos": 461615007744.0, + "grad_norm": 0.02720743117278016, + "language_loss": 1.06749547, + "learning_rate": 0.0009962379794577964, + "loss": 1.07987642, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 1.61865234, + "step": 353, + "time_per_iteration": 2.589200973510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239427, + "balance_loss_mlp": 1.07711196, + "epoch": 0.06810311658330127, + "flos": 637207572480.0, + "grad_norm": 0.02321502152829773, + "language_loss": 0.95908678, + "learning_rate": 0.000996199737996617, + "loss": 0.97148108, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 1.61914062, + "step": 354, + "time_per_iteration": 2.8822708129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123871, + "balance_loss_mlp": 1.07687151, + "epoch": 0.06829549826856483, + "flos": 465626743296.0, + "grad_norm": 0.030894548658215056, + "language_loss": 1.05554581, + "learning_rate": 0.0009961613038936149, + "loss": 1.06793284, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 1.61425781, + "step": 355, + "time_per_iteration": 2.576930522918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.07456315, + "epoch": 0.06848787995382839, + "flos": 635896281600.0, + "grad_norm": 0.0286185110148739, + "language_loss": 0.9730283, + "learning_rate": 0.000996122677163711, + "loss": 0.98538941, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 1.61132812, + "step": 356, + "time_per_iteration": 2.850829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237686, + "balance_loss_mlp": 1.07637215, + "epoch": 0.06868026163909195, + "flos": 807780556800.0, + "grad_norm": 0.03078602082995562, + "language_loss": 1.03526855, + "learning_rate": 0.000996083857821902, + "loss": 1.04764557, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 1.60888672, + "step": 357, + "time_per_iteration": 3.124053716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237273, + "balance_loss_mlp": 1.07605469, + "epoch": 0.06887264332435553, + "flos": 440151713280.0, + "grad_norm": 0.02263887650004652, + "language_loss": 1.01701617, + "learning_rate": 0.0009960448458832588, + "loss": 1.0293889, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 1.60791016, + "step": 358, + "time_per_iteration": 2.6918816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242041, + "balance_loss_mlp": 1.08077514, + "epoch": 0.06906502500961909, + "flos": 485785477632.0, + "grad_norm": 0.021707311176365728, + "language_loss": 1.01897752, + "learning_rate": 0.000996005641362927, + "loss": 1.03139794, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 1.60839844, + "step": 359, + "time_per_iteration": 2.601358652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238764, + "balance_loss_mlp": 1.07725942, + "epoch": 0.06925740669488265, + "flos": 734885110272.0, + "grad_norm": 0.024380378407611886, + "language_loss": 1.04387617, + "learning_rate": 0.0009959662442761274, + "loss": 1.05626392, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 1.61083984, + "step": 360, + "time_per_iteration": 2.9404215812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236823, + "balance_loss_mlp": 1.07589066, + "epoch": 0.0694497883801462, + "flos": 553570745856.0, + "grad_norm": 0.023221163769242582, + "language_loss": 0.97943044, + "learning_rate": 0.000995926654638155, + "loss": 0.99179876, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 1.60498047, + "step": 361, + "time_per_iteration": 2.811624526977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234495, + "balance_loss_mlp": 1.07413495, + "epoch": 0.06964217006540978, + "flos": 679243837440.0, + "grad_norm": 0.025577226237571565, + "language_loss": 1.00741839, + "learning_rate": 0.00099588687246438, + "loss": 1.01976323, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 1.59912109, + "step": 362, + "time_per_iteration": 2.826204538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235331, + "balance_loss_mlp": 1.0749228, + "epoch": 0.06983455175067334, + "flos": 525260285952.0, + "grad_norm": 0.054619150892928216, + "language_loss": 1.0805161, + "learning_rate": 0.0009958468977702471, + "loss": 1.09286952, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 1.59960938, + "step": 363, + "time_per_iteration": 2.5742297172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_mlp": 1.11000061, + "epoch": 0.0700269334359369, + "flos": 1580173353984.0, + "grad_norm": 0.0347214045967213, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81004167, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.59179688, + "step": 364, + "time_per_iteration": 4.815373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234235, + "balance_loss_mlp": 1.07420838, + "epoch": 0.07021931512120046, + "flos": 1014856659456.0, + "grad_norm": 0.027565425727799023, + "language_loss": 0.95424879, + "learning_rate": 0.0009957663708830612, + "loss": 0.96659118, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 1.59667969, + "step": 365, + "time_per_iteration": 3.3032214641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238249, + "balance_loss_mlp": 1.07874703, + "epoch": 0.07041169680646403, + "flos": 824431114752.0, + "grad_norm": 0.03609893162101238, + "language_loss": 0.99641442, + "learning_rate": 0.0009957258187212714, + "loss": 1.00879693, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 1.59228516, + "step": 366, + "time_per_iteration": 3.143951654434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_mlp": 1.0748291, + "epoch": 0.07060407849172759, + "flos": 1417290743808.0, + "grad_norm": 0.015479474187128486, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80427808, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.578125, + "step": 367, + "time_per_iteration": 4.856614112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232866, + "balance_loss_mlp": 1.07417488, + "epoch": 0.07079646017699115, + "flos": 513941486592.0, + "grad_norm": 0.03158452537667852, + "language_loss": 0.9606331, + "learning_rate": 0.0009956441370400167, + "loss": 0.97296178, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 1.58398438, + "step": 368, + "time_per_iteration": 2.6471550464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231431, + "balance_loss_mlp": 1.07288289, + "epoch": 0.07098884186225471, + "flos": 541548274176.0, + "grad_norm": 0.03366854249700899, + "language_loss": 1.02536654, + "learning_rate": 0.0009956030075522636, + "loss": 1.03768086, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 1.58251953, + "step": 369, + "time_per_iteration": 2.764350175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_mlp": 1.07183695, + "epoch": 0.07118122354751828, + "flos": 549738931200.0, + "grad_norm": 0.025388205653796188, + "language_loss": 1.02520657, + "learning_rate": 0.0009955616856543587, + "loss": 1.03751087, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 1.58300781, + "step": 370, + "time_per_iteration": 2.6488449573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233332, + "balance_loss_mlp": 1.07483125, + "epoch": 0.07137360523278184, + "flos": 622076424192.0, + "grad_norm": 0.025131147277089937, + "language_loss": 0.94016552, + "learning_rate": 0.0009955201713623448, + "loss": 0.95249885, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 1.58203125, + "step": 371, + "time_per_iteration": 2.7475128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231201, + "balance_loss_mlp": 1.07594299, + "epoch": 0.0715659869180454, + "flos": 1505973347328.0, + "grad_norm": 0.011087848535678398, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77903926, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 1.55664062, + "step": 372, + "time_per_iteration": 4.930227518081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.0769937, + "epoch": 0.07175836860330896, + "flos": 496481195520.0, + "grad_norm": 0.02946804107059058, + "language_loss": 1.07406306, + "learning_rate": 0.0009954365656605333, + "loss": 1.08641148, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 1.57910156, + "step": 373, + "time_per_iteration": 2.5494606494903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235693, + "balance_loss_mlp": 1.07862246, + "epoch": 0.07195075028857253, + "flos": 787081333248.0, + "grad_norm": 0.030340412148976308, + "language_loss": 1.00769055, + "learning_rate": 0.0009953944742831947, + "loss": 1.02004743, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 1.57519531, + "step": 374, + "time_per_iteration": 2.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234053, + "balance_loss_mlp": 1.07707787, + "epoch": 0.0721431319738361, + "flos": 594346111488.0, + "grad_norm": 0.024760984543104554, + "language_loss": 1.04227853, + "learning_rate": 0.0009953521905766642, + "loss": 1.05461907, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 1.57421875, + "step": 375, + "time_per_iteration": 2.9470102787017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233349, + "balance_loss_mlp": 1.07642198, + "epoch": 0.07233551365909965, + "flos": 549328697856.0, + "grad_norm": 0.025099095391344205, + "language_loss": 1.02903581, + "learning_rate": 0.0009953097145573577, + "loss": 1.04136944, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 1.57373047, + "step": 376, + "time_per_iteration": 2.656438112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232315, + "balance_loss_mlp": 1.0754832, + "epoch": 0.07252789534436321, + "flos": 959167723008.0, + "grad_norm": 0.028756244795243427, + "language_loss": 1.01008701, + "learning_rate": 0.000995267046241766, + "loss": 1.02241015, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 1.57275391, + "step": 377, + "time_per_iteration": 3.2601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226098, + "balance_loss_mlp": 1.06931448, + "epoch": 0.07272027702962677, + "flos": 508655390208.0, + "grad_norm": 0.025279277167219092, + "language_loss": 1.00209188, + "learning_rate": 0.0009952241856464547, + "loss": 1.01435292, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 1.57226562, + "step": 378, + "time_per_iteration": 2.616483688354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228279, + "balance_loss_mlp": 1.07159042, + "epoch": 0.07291265871489035, + "flos": 613551395328.0, + "grad_norm": 0.025059419305224793, + "language_loss": 1.0761106, + "learning_rate": 0.0009951811327880632, + "loss": 1.08839345, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 1.57128906, + "step": 379, + "time_per_iteration": 2.7666382789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_mlp": 1.07063651, + "epoch": 0.0731050404001539, + "flos": 496741707264.0, + "grad_norm": 0.032880990240464036, + "language_loss": 1.00766444, + "learning_rate": 0.0009951378876833063, + "loss": 1.01993108, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 1.56445312, + "step": 380, + "time_per_iteration": 2.5504086017608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230504, + "balance_loss_mlp": 1.07433975, + "epoch": 0.07329742208541747, + "flos": 641129985024.0, + "grad_norm": 0.0343074889031262, + "language_loss": 1.0780232, + "learning_rate": 0.0009950944503489736, + "loss": 1.0903281, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 1.56591797, + "step": 381, + "time_per_iteration": 2.7695260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231248, + "balance_loss_mlp": 1.07537043, + "epoch": 0.07348980377068103, + "flos": 817740401664.0, + "grad_norm": 0.027198888726283066, + "language_loss": 1.01785743, + "learning_rate": 0.0009950508208019285, + "loss": 1.03016996, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 1.56298828, + "step": 382, + "time_per_iteration": 2.9918277263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227944, + "balance_loss_mlp": 1.07187521, + "epoch": 0.0736821854559446, + "flos": 509669239296.0, + "grad_norm": 0.03113985633155724, + "language_loss": 1.05612254, + "learning_rate": 0.0009950069990591096, + "loss": 1.06840205, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 1.56494141, + "step": 383, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_mlp": 1.09392548, + "epoch": 0.07387456714120816, + "flos": 1558048046592.0, + "grad_norm": 0.03338671968111017, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77649409, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 1.54492188, + "step": 384, + "time_per_iteration": 4.854166269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229749, + "balance_loss_mlp": 1.0736798, + "epoch": 0.07406694882647172, + "flos": 526643435520.0, + "grad_norm": 0.03274978311793036, + "language_loss": 0.98781282, + "learning_rate": 0.0009949187790542777, + "loss": 1.00011039, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 1.56494141, + "step": 385, + "time_per_iteration": 2.728701591491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123258, + "balance_loss_mlp": 1.07636821, + "epoch": 0.07425933051173528, + "flos": 498823799808.0, + "grad_norm": 0.026908846939264777, + "language_loss": 0.94723004, + "learning_rate": 0.0009948743808265148, + "loss": 0.95955586, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 1.56640625, + "step": 386, + "time_per_iteration": 2.6850693225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231135, + "balance_loss_mlp": 1.07511437, + "epoch": 0.07445171219699885, + "flos": 506057003520.0, + "grad_norm": 0.05633654869747302, + "language_loss": 1.04553366, + "learning_rate": 0.0009948297904714782, + "loss": 1.05784488, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 1.56445312, + "step": 387, + "time_per_iteration": 2.6746010780334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231627, + "balance_loss_mlp": 1.07555866, + "epoch": 0.07464409388226241, + "flos": 555116352000.0, + "grad_norm": 0.03450843374667126, + "language_loss": 0.9665134, + "learning_rate": 0.0009947850080064796, + "loss": 0.97882968, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 1.56494141, + "step": 388, + "time_per_iteration": 2.7839057445526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230193, + "balance_loss_mlp": 1.07431459, + "epoch": 0.07483647556752597, + "flos": 778274325504.0, + "grad_norm": 0.021592891008175935, + "language_loss": 1.01240289, + "learning_rate": 0.0009947400334489047, + "loss": 1.02470493, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 1.56298828, + "step": 389, + "time_per_iteration": 2.9945342540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_mlp": 1.07411718, + "epoch": 0.07502885725278953, + "flos": 613681651200.0, + "grad_norm": 0.023383004705128753, + "language_loss": 0.92341155, + "learning_rate": 0.0009946948668162145, + "loss": 0.93570244, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 1.55371094, + "step": 390, + "time_per_iteration": 2.7355024814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122989, + "balance_loss_mlp": 1.07496524, + "epoch": 0.0752212389380531, + "flos": 689854961664.0, + "grad_norm": 0.026752200694656208, + "language_loss": 0.97335494, + "learning_rate": 0.0009946495081259441, + "loss": 0.98565376, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 1.55322266, + "step": 391, + "time_per_iteration": 2.799938678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227768, + "balance_loss_mlp": 1.07303405, + "epoch": 0.07541362062331666, + "flos": 767050853376.0, + "grad_norm": 0.02596026064524479, + "language_loss": 1.01604676, + "learning_rate": 0.0009946039573957035, + "loss": 1.02832437, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 1.55126953, + "step": 392, + "time_per_iteration": 2.932504415512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123199, + "balance_loss_mlp": 1.07768571, + "epoch": 0.07560600230858022, + "flos": 589908679680.0, + "grad_norm": 0.028382748029943367, + "language_loss": 0.97495323, + "learning_rate": 0.000994558214643177, + "loss": 0.98727316, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.752694845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228178, + "balance_loss_mlp": 1.07425475, + "epoch": 0.07579838399384378, + "flos": 751144900608.0, + "grad_norm": 0.028291982513743617, + "language_loss": 0.99160051, + "learning_rate": 0.000994512279886123, + "loss": 1.00388229, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 1.54296875, + "step": 394, + "time_per_iteration": 3.06592059135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228894, + "balance_loss_mlp": 1.07530475, + "epoch": 0.07599076567910736, + "flos": 524550609408.0, + "grad_norm": 0.023352712612718218, + "language_loss": 0.98641121, + "learning_rate": 0.0009944661531423758, + "loss": 0.99870014, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 1.53955078, + "step": 395, + "time_per_iteration": 2.6720728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122919, + "balance_loss_mlp": 1.07555354, + "epoch": 0.07618314736437092, + "flos": 552185594880.0, + "grad_norm": 0.026216962171459895, + "language_loss": 0.97914684, + "learning_rate": 0.000994419834429843, + "loss": 0.99143875, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 1.54003906, + "step": 396, + "time_per_iteration": 2.6652910709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226861, + "balance_loss_mlp": 1.07308066, + "epoch": 0.07637552904963447, + "flos": 699432771072.0, + "grad_norm": 0.029361663168223213, + "language_loss": 1.03114796, + "learning_rate": 0.0009943733237665069, + "loss": 1.0434165, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 1.54150391, + "step": 397, + "time_per_iteration": 2.808711290359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227023, + "balance_loss_mlp": 1.07329071, + "epoch": 0.07656791073489803, + "flos": 580635042816.0, + "grad_norm": 0.02000560632750303, + "language_loss": 1.01598048, + "learning_rate": 0.0009943266211704248, + "loss": 1.02825069, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 1.54101562, + "step": 398, + "time_per_iteration": 2.9420461654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226854, + "balance_loss_mlp": 1.0732646, + "epoch": 0.0767602924201616, + "flos": 418037139456.0, + "grad_norm": 0.02425852476792673, + "language_loss": 1.03237891, + "learning_rate": 0.000994279726659728, + "loss": 1.04464746, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 1.53955078, + "step": 399, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230296, + "balance_loss_mlp": 1.07675469, + "epoch": 0.07695267410542517, + "flos": 483888035328.0, + "grad_norm": 0.030174375239475117, + "language_loss": 1.02145576, + "learning_rate": 0.0009942326402526231, + "loss": 1.03375876, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 1.5390625, + "step": 400, + "time_per_iteration": 2.5265390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224857, + "balance_loss_mlp": 1.07184029, + "epoch": 0.07714505579068873, + "flos": 532026860544.0, + "grad_norm": 0.024483465572707617, + "language_loss": 0.99344772, + "learning_rate": 0.0009941853619673902, + "loss": 1.0056963, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 1.53369141, + "step": 401, + "time_per_iteration": 2.660491704940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224912, + "balance_loss_mlp": 1.07218146, + "epoch": 0.07733743747595229, + "flos": 806439066624.0, + "grad_norm": 0.032921156451595594, + "language_loss": 1.03587961, + "learning_rate": 0.0009941378918223844, + "loss": 1.04812872, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 1.53076172, + "step": 402, + "time_per_iteration": 3.078272819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222316, + "balance_loss_mlp": 1.06972802, + "epoch": 0.07752981916121585, + "flos": 623613298176.0, + "grad_norm": 0.02596227047756477, + "language_loss": 0.96322513, + "learning_rate": 0.0009940902298360354, + "loss": 0.97544825, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 1.52929688, + "step": 403, + "time_per_iteration": 2.78222918510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224993, + "balance_loss_mlp": 1.07288182, + "epoch": 0.07772220084647942, + "flos": 729542618112.0, + "grad_norm": 0.031231063897144088, + "language_loss": 1.06544566, + "learning_rate": 0.0009940423760268473, + "loss": 1.07769561, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 1.52441406, + "step": 404, + "time_per_iteration": 2.8572018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226552, + "balance_loss_mlp": 1.07472658, + "epoch": 0.07791458253174298, + "flos": 556468575744.0, + "grad_norm": 0.029548764371286118, + "language_loss": 0.99639893, + "learning_rate": 0.0009939943304133982, + "loss": 1.00866449, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 1.52148438, + "step": 405, + "time_per_iteration": 2.607412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226106, + "balance_loss_mlp": 1.07409084, + "epoch": 0.07810696421700654, + "flos": 554234760192.0, + "grad_norm": 0.031141101296471768, + "language_loss": 1.06411445, + "learning_rate": 0.0009939460930143416, + "loss": 1.07637548, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 1.5234375, + "step": 406, + "time_per_iteration": 2.6132876873016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223027, + "balance_loss_mlp": 1.07120168, + "epoch": 0.0782993459022701, + "flos": 651878095872.0, + "grad_norm": 0.023437908852709077, + "language_loss": 1.00106847, + "learning_rate": 0.0009938976638484043, + "loss": 1.01329875, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 1.52148438, + "step": 407, + "time_per_iteration": 2.905681610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218745, + "balance_loss_mlp": 1.06691968, + "epoch": 0.07849172758753367, + "flos": 497160672768.0, + "grad_norm": 0.02891290096917658, + "language_loss": 0.99991584, + "learning_rate": 0.0009938490429343887, + "loss": 1.01210332, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 1.52148438, + "step": 408, + "time_per_iteration": 2.539567708969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222677, + "balance_loss_mlp": 1.07066166, + "epoch": 0.07868410927279723, + "flos": 579075975168.0, + "grad_norm": 0.030601656563413092, + "language_loss": 0.99965751, + "learning_rate": 0.0009938002302911709, + "loss": 1.01188421, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 1.5234375, + "step": 409, + "time_per_iteration": 2.732064962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220028, + "balance_loss_mlp": 1.0680126, + "epoch": 0.07887649095806079, + "flos": 524066515968.0, + "grad_norm": 0.03256443285635905, + "language_loss": 1.03146362, + "learning_rate": 0.0009937512259377015, + "loss": 1.04366398, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 1.5234375, + "step": 410, + "time_per_iteration": 2.6500303745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221864, + "balance_loss_mlp": 1.07013464, + "epoch": 0.07906887264332435, + "flos": 558437876736.0, + "grad_norm": 0.023780630120827737, + "language_loss": 1.01466393, + "learning_rate": 0.000993702029893006, + "loss": 1.02688265, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 1.52050781, + "step": 411, + "time_per_iteration": 2.7921671867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221791, + "balance_loss_mlp": 1.07010949, + "epoch": 0.07926125432858792, + "flos": 823362871296.0, + "grad_norm": 0.04077078343290612, + "language_loss": 1.01153946, + "learning_rate": 0.0009936526421761838, + "loss": 1.02375734, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 1.52001953, + "step": 412, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217861, + "balance_loss_mlp": 1.06632257, + "epoch": 0.07945363601385148, + "flos": 563393604096.0, + "grad_norm": 0.02717343044282308, + "language_loss": 1.04004121, + "learning_rate": 0.000993603062806409, + "loss": 1.05221987, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 1.51855469, + "step": 413, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219172, + "balance_loss_mlp": 1.06844354, + "epoch": 0.07964601769911504, + "flos": 518884478976.0, + "grad_norm": 0.031245789494761384, + "language_loss": 1.07179379, + "learning_rate": 0.0009935532918029298, + "loss": 1.08398533, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 1.51025391, + "step": 414, + "time_per_iteration": 2.668151617050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224604, + "balance_loss_mlp": 1.07387555, + "epoch": 0.0798383993843786, + "flos": 540300109824.0, + "grad_norm": 0.025221671350570463, + "language_loss": 0.99906069, + "learning_rate": 0.0009935033291850694, + "loss": 1.01130676, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 1.51025391, + "step": 415, + "time_per_iteration": 2.64747953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.06547058, + "epoch": 0.08003078106964218, + "flos": 486121850880.0, + "grad_norm": 0.027121462600521052, + "language_loss": 1.02766061, + "learning_rate": 0.0009934531749722247, + "loss": 1.03982067, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 1.50830078, + "step": 416, + "time_per_iteration": 2.5705764293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121625, + "balance_loss_mlp": 1.06576049, + "epoch": 0.08022316275490574, + "flos": 519275246592.0, + "grad_norm": 0.027391361962933233, + "language_loss": 1.00515926, + "learning_rate": 0.0009934028291838672, + "loss": 1.01732171, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 1.5078125, + "step": 417, + "time_per_iteration": 2.7232770919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219761, + "balance_loss_mlp": 1.0695101, + "epoch": 0.0804155444401693, + "flos": 495046379520.0, + "grad_norm": 0.028534904701295792, + "language_loss": 0.95904237, + "learning_rate": 0.0009933522918395433, + "loss": 0.97123998, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 1.50537109, + "step": 418, + "time_per_iteration": 2.670992374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_mlp": 1.11595154, + "epoch": 0.08060792612543285, + "flos": 1584853833216.0, + "grad_norm": 0.03473829356439328, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79516399, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 1.49609375, + "step": 419, + "time_per_iteration": 4.9051830768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222046, + "balance_loss_mlp": 1.07246244, + "epoch": 0.08080030781069643, + "flos": 526358728704.0, + "grad_norm": 0.03232182071246488, + "language_loss": 1.15746891, + "learning_rate": 0.000993250642561551, + "loss": 1.16968942, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 1.49853516, + "step": 420, + "time_per_iteration": 2.596930503845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224313, + "balance_loss_mlp": 1.07487273, + "epoch": 0.08099268949595999, + "flos": 547756895232.0, + "grad_norm": 0.03306568774928502, + "language_loss": 1.00193918, + "learning_rate": 0.0009931995306673466, + "loss": 1.01418233, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 1.49707031, + "step": 421, + "time_per_iteration": 2.704012155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223697, + "balance_loss_mlp": 1.0744468, + "epoch": 0.08118507118122355, + "flos": 511373299200.0, + "grad_norm": 0.026268861479682264, + "language_loss": 1.0597651, + "learning_rate": 0.000993148227296103, + "loss": 1.07200205, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 1.49511719, + "step": 422, + "time_per_iteration": 2.6110117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224578, + "balance_loss_mlp": 1.0751853, + "epoch": 0.08137745286648711, + "flos": 722001239040.0, + "grad_norm": 0.024088300997991936, + "language_loss": 0.92380643, + "learning_rate": 0.000993096732467738, + "loss": 0.9360522, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 1.49658203, + "step": 423, + "time_per_iteration": 2.9790220260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224383, + "balance_loss_mlp": 1.0753237, + "epoch": 0.08156983455175067, + "flos": 680817641472.0, + "grad_norm": 0.029818930066630327, + "language_loss": 1.0177561, + "learning_rate": 0.0009930450462022435, + "loss": 1.02999997, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 1.49316406, + "step": 424, + "time_per_iteration": 2.8023674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223, + "balance_loss_mlp": 1.07518005, + "epoch": 0.08176221623701424, + "flos": 1456588359168.0, + "grad_norm": 0.012435251357338771, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80412811, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.48046875, + "step": 425, + "time_per_iteration": 4.96533989906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219597, + "balance_loss_mlp": 1.0711571, + "epoch": 0.0819545979222778, + "flos": 1558883071488.0, + "grad_norm": 0.04204100969257126, + "language_loss": 1.00605047, + "learning_rate": 0.0009929410994402065, + "loss": 1.01824641, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 1.48681641, + "step": 426, + "time_per_iteration": 3.850475311279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220758, + "balance_loss_mlp": 1.07236588, + "epoch": 0.08214697960754136, + "flos": 513800497152.0, + "grad_norm": 0.03975912273964659, + "language_loss": 1.03955805, + "learning_rate": 0.0009928888389840196, + "loss": 1.05176568, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 1.48632812, + "step": 427, + "time_per_iteration": 2.6892385482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224824, + "balance_loss_mlp": 1.07633698, + "epoch": 0.08233936129280492, + "flos": 596221360128.0, + "grad_norm": 0.02633667259549893, + "language_loss": 1.0604248, + "learning_rate": 0.0009928363871714147, + "loss": 1.07267296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 1.48730469, + "step": 428, + "time_per_iteration": 2.666851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224039, + "balance_loss_mlp": 1.07550442, + "epoch": 0.08253174297806849, + "flos": 573164795904.0, + "grad_norm": 0.03052010415677114, + "language_loss": 0.99677718, + "learning_rate": 0.0009927837440227556, + "loss": 1.00901759, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 1.48779297, + "step": 429, + "time_per_iteration": 2.810197591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228416, + "balance_loss_mlp": 1.07992899, + "epoch": 0.08272412466333205, + "flos": 624642610176.0, + "grad_norm": 0.029909202440675912, + "language_loss": 0.93710327, + "learning_rate": 0.0009927309095584798, + "loss": 0.94938743, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 1.48730469, + "step": 430, + "time_per_iteration": 2.98052978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122165, + "balance_loss_mlp": 1.07316256, + "epoch": 0.08291650634859561, + "flos": 514994267136.0, + "grad_norm": 0.038201439099628094, + "language_loss": 1.07072532, + "learning_rate": 0.0009926778837991, + "loss": 1.08294177, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 1.48730469, + "step": 431, + "time_per_iteration": 2.613912582397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223506, + "balance_loss_mlp": 1.07516193, + "epoch": 0.08310888803385917, + "flos": 668541388800.0, + "grad_norm": 0.02618037233016902, + "language_loss": 1.04762018, + "learning_rate": 0.000992624666765202, + "loss": 1.05985522, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 1.48583984, + "step": 432, + "time_per_iteration": 2.785602331161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224029, + "balance_loss_mlp": 1.07659137, + "epoch": 0.08330126971912274, + "flos": 584490326016.0, + "grad_norm": 0.023129420064945467, + "language_loss": 1.02043724, + "learning_rate": 0.000992571258477447, + "loss": 1.03267753, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 1.4765625, + "step": 433, + "time_per_iteration": 2.7774012088775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225333, + "balance_loss_mlp": 1.07799041, + "epoch": 0.0834936514043863, + "flos": 562497275904.0, + "grad_norm": 0.02412369992445121, + "language_loss": 0.95710295, + "learning_rate": 0.0009925176589565695, + "loss": 0.9693563, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 1.47558594, + "step": 434, + "time_per_iteration": 2.7975149154663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224713, + "balance_loss_mlp": 1.07751381, + "epoch": 0.08368603308964986, + "flos": 495513008640.0, + "grad_norm": 0.023499028814372425, + "language_loss": 1.06310439, + "learning_rate": 0.0009924638682233791, + "loss": 1.07535148, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 1.47412109, + "step": 435, + "time_per_iteration": 2.5623626708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247864, + "balance_loss_mlp": 1.10328674, + "epoch": 0.08387841477491342, + "flos": 1391808983040.0, + "grad_norm": 0.0329185074425942, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80812454, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.44726562, + "step": 436, + "time_per_iteration": 4.5364601612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219037, + "balance_loss_mlp": 1.07174218, + "epoch": 0.084070796460177, + "flos": 800353970688.0, + "grad_norm": 0.025226905267595717, + "language_loss": 0.95941472, + "learning_rate": 0.0009923557132036668, + "loss": 0.97160506, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 1.47509766, + "step": 437, + "time_per_iteration": 3.031538963317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219746, + "balance_loss_mlp": 1.07226074, + "epoch": 0.08426317814544056, + "flos": 560096274432.0, + "grad_norm": 0.024291343012928023, + "language_loss": 0.99699497, + "learning_rate": 0.0009923013489591345, + "loss": 1.00919247, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 1.47705078, + "step": 438, + "time_per_iteration": 2.741021156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217749, + "balance_loss_mlp": 1.07073975, + "epoch": 0.08445555983070412, + "flos": 811883616768.0, + "grad_norm": 0.02787309358423107, + "language_loss": 0.97740996, + "learning_rate": 0.0009922467935862681, + "loss": 0.98958743, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 1.47216797, + "step": 439, + "time_per_iteration": 3.0727341175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215984, + "balance_loss_mlp": 1.06907046, + "epoch": 0.08464794151596768, + "flos": 511169183232.0, + "grad_norm": 0.02418736148641671, + "language_loss": 1.01547837, + "learning_rate": 0.0009921920471062478, + "loss": 1.0276382, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 1.47119141, + "step": 440, + "time_per_iteration": 2.5793957710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214442, + "balance_loss_mlp": 1.06805265, + "epoch": 0.08484032320123125, + "flos": 557473692672.0, + "grad_norm": 0.02549300900866748, + "language_loss": 0.99590349, + "learning_rate": 0.0009921371095403281, + "loss": 1.00804806, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 1.46582031, + "step": 441, + "time_per_iteration": 2.633976936340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215261, + "balance_loss_mlp": 1.06887233, + "epoch": 0.08503270488649481, + "flos": 528360230400.0, + "grad_norm": 0.023285649852896013, + "language_loss": 1.02823853, + "learning_rate": 0.0009920819809098379, + "loss": 1.04039121, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 1.46582031, + "step": 442, + "time_per_iteration": 2.5975728034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213611, + "balance_loss_mlp": 1.06722176, + "epoch": 0.08522508657175837, + "flos": 615385711104.0, + "grad_norm": 0.021771679570127336, + "language_loss": 0.97986722, + "learning_rate": 0.0009920266612361798, + "loss": 0.99200332, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 1.46582031, + "step": 443, + "time_per_iteration": 2.7284042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214332, + "balance_loss_mlp": 1.06803846, + "epoch": 0.08541746825702193, + "flos": 620986713600.0, + "grad_norm": 0.024601404202987703, + "language_loss": 0.97963679, + "learning_rate": 0.0009919711505408308, + "loss": 0.9917801, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 1.46484375, + "step": 444, + "time_per_iteration": 2.797030448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216522, + "balance_loss_mlp": 1.07051492, + "epoch": 0.08560984994228549, + "flos": 483888035328.0, + "grad_norm": 0.023417740932750293, + "language_loss": 0.96522343, + "learning_rate": 0.000991915448845342, + "loss": 0.97738856, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 1.46191406, + "step": 445, + "time_per_iteration": 2.544638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_mlp": 1.06945765, + "epoch": 0.08580223162754906, + "flos": 518176803840.0, + "grad_norm": 0.025018627604332305, + "language_loss": 1.05275297, + "learning_rate": 0.000991859556171339, + "loss": 1.0649066, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 1.4609375, + "step": 446, + "time_per_iteration": 2.5865097045898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214045, + "balance_loss_mlp": 1.06856191, + "epoch": 0.08599461331281262, + "flos": 532519686144.0, + "grad_norm": 0.025883227843611877, + "language_loss": 1.07190132, + "learning_rate": 0.000991803472540521, + "loss": 1.08404183, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 1.45654297, + "step": 447, + "time_per_iteration": 2.6001055240631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213992, + "balance_loss_mlp": 1.06879497, + "epoch": 0.08618699499807618, + "flos": 791633558016.0, + "grad_norm": 0.022461373320799196, + "language_loss": 1.02303076, + "learning_rate": 0.0009917471979746615, + "loss": 1.03517067, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 1.45361328, + "step": 448, + "time_per_iteration": 2.9621376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218395, + "balance_loss_mlp": 1.07300746, + "epoch": 0.08637937668333974, + "flos": 567114628608.0, + "grad_norm": 0.02449904215267775, + "language_loss": 1.00404847, + "learning_rate": 0.0009916907324956086, + "loss": 1.01623249, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 1.45556641, + "step": 449, + "time_per_iteration": 2.691150188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214944, + "balance_loss_mlp": 1.0697943, + "epoch": 0.08657175836860331, + "flos": 446117286912.0, + "grad_norm": 0.025714213043280993, + "language_loss": 0.97109705, + "learning_rate": 0.0009916340761252837, + "loss": 0.98324645, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 1.453125, + "step": 450, + "time_per_iteration": 2.6118698120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212599, + "balance_loss_mlp": 1.067307, + "epoch": 0.08676414005386687, + "flos": 845588235264.0, + "grad_norm": 0.02612794411743426, + "language_loss": 0.94540501, + "learning_rate": 0.0009915772288856832, + "loss": 0.95753098, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 1.45458984, + "step": 451, + "time_per_iteration": 3.0883219242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213701, + "balance_loss_mlp": 1.06926715, + "epoch": 0.08695652173913043, + "flos": 604483875840.0, + "grad_norm": 0.02003375948944636, + "language_loss": 0.95739877, + "learning_rate": 0.000991520190798877, + "loss": 0.96953583, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 1.44580078, + "step": 452, + "time_per_iteration": 2.8387818336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213572, + "balance_loss_mlp": 1.06928122, + "epoch": 0.08714890342439399, + "flos": 732000015360.0, + "grad_norm": 0.027770143088691506, + "language_loss": 1.06693339, + "learning_rate": 0.0009914629618870089, + "loss": 1.07906914, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 1.44433594, + "step": 453, + "time_per_iteration": 2.9403207302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_mlp": 1.0905838, + "epoch": 0.08734128510965757, + "flos": 1485454044672.0, + "grad_norm": 0.02536208637588336, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79910266, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.43945312, + "step": 454, + "time_per_iteration": 4.803662061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121994, + "balance_loss_mlp": 1.07631683, + "epoch": 0.08753366679492113, + "flos": 1526266340352.0, + "grad_norm": 0.01817690946373191, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82647902, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.4375, + "step": 455, + "time_per_iteration": 4.812621355056763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213204, + "balance_loss_mlp": 1.06919885, + "epoch": 0.08772604848018468, + "flos": 722524263936.0, + "grad_norm": 0.030160618436618963, + "language_loss": 0.98162878, + "learning_rate": 0.0009912901304235883, + "loss": 0.99376082, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 1.44140625, + "step": 456, + "time_per_iteration": 2.9147355556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217818, + "balance_loss_mlp": 1.07386112, + "epoch": 0.08791843016544824, + "flos": 709466476032.0, + "grad_norm": 0.03064824893295274, + "language_loss": 0.96399593, + "learning_rate": 0.000991232138434397, + "loss": 0.97617412, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 1.44091797, + "step": 457, + "time_per_iteration": 2.8735082149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121922, + "balance_loss_mlp": 1.07540572, + "epoch": 0.08811081185071182, + "flos": 474021516288.0, + "grad_norm": 0.03193385229896835, + "language_loss": 1.03185177, + "learning_rate": 0.000991173955731976, + "loss": 1.04404402, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 1.43945312, + "step": 458, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220724, + "balance_loss_mlp": 1.07762539, + "epoch": 0.08830319353597538, + "flos": 686314584576.0, + "grad_norm": 0.057581270182385194, + "language_loss": 1.06524456, + "learning_rate": 0.0009911155823389137, + "loss": 1.07745171, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 1.43212891, + "step": 459, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218235, + "balance_loss_mlp": 1.07513571, + "epoch": 0.08849557522123894, + "flos": 574608344064.0, + "grad_norm": 0.027044136096108284, + "language_loss": 1.01923048, + "learning_rate": 0.000991057018277873, + "loss": 1.03141284, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 1.43212891, + "step": 460, + "time_per_iteration": 2.746169090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212445, + "balance_loss_mlp": 1.0693934, + "epoch": 0.0886879569065025, + "flos": 565627419648.0, + "grad_norm": 0.031092379840733354, + "language_loss": 1.03267121, + "learning_rate": 0.0009909982635715898, + "loss": 1.04479575, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 1.43164062, + "step": 461, + "time_per_iteration": 2.6196396350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212854, + "balance_loss_mlp": 1.06956458, + "epoch": 0.08888033859176607, + "flos": 564956674560.0, + "grad_norm": 0.030181357689894217, + "language_loss": 1.02059078, + "learning_rate": 0.0009909393182428751, + "loss": 1.03271937, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 1.43408203, + "step": 462, + "time_per_iteration": 2.679793357849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216843, + "balance_loss_mlp": 1.07345808, + "epoch": 0.08907272027702963, + "flos": 466742650368.0, + "grad_norm": 0.029240136547664795, + "language_loss": 0.9639132, + "learning_rate": 0.000990880182314614, + "loss": 0.97608161, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 1.43505859, + "step": 463, + "time_per_iteration": 2.712097644805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212421, + "balance_loss_mlp": 1.06922734, + "epoch": 0.08926510196229319, + "flos": 682843338240.0, + "grad_norm": 0.026287763165510035, + "language_loss": 0.96174729, + "learning_rate": 0.0009908208558097643, + "loss": 0.97387147, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 1.43310547, + "step": 464, + "time_per_iteration": 2.906903028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217208, + "balance_loss_mlp": 1.07406175, + "epoch": 0.08945748364755675, + "flos": 597821360640.0, + "grad_norm": 0.024374741633963998, + "language_loss": 0.98668623, + "learning_rate": 0.000990761338751359, + "loss": 0.99885827, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 1.43261719, + "step": 465, + "time_per_iteration": 2.7994933128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225639, + "balance_loss_mlp": 1.08506775, + "epoch": 0.08964986533282032, + "flos": 1589340930048.0, + "grad_norm": 0.02575129149720033, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74885261, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.40625, + "step": 466, + "time_per_iteration": 4.9763429164886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221953, + "balance_loss_mlp": 1.07861578, + "epoch": 0.08984224701808388, + "flos": 534549385728.0, + "grad_norm": 0.024628184063577727, + "language_loss": 1.01551545, + "learning_rate": 0.0009906417330663815, + "loss": 1.02773499, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 1.43457031, + "step": 467, + "time_per_iteration": 2.614560842514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232523, + "balance_loss_mlp": 1.08994913, + "epoch": 0.09003462870334744, + "flos": 479850103296.0, + "grad_norm": 0.03230737833956583, + "language_loss": 0.98222148, + "learning_rate": 0.0009905816444862442, + "loss": 0.99454677, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 1.42675781, + "step": 468, + "time_per_iteration": 2.598146438598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223867, + "balance_loss_mlp": 1.08124495, + "epoch": 0.090227010388611, + "flos": 654902178816.0, + "grad_norm": 0.027522185030294237, + "language_loss": 0.95659769, + "learning_rate": 0.0009905213654454216, + "loss": 0.96883637, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 1.42724609, + "step": 469, + "time_per_iteration": 2.8876352310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219852, + "balance_loss_mlp": 1.07737279, + "epoch": 0.09041939207387456, + "flos": 619358515200.0, + "grad_norm": 0.023282407360439072, + "language_loss": 1.03878951, + "learning_rate": 0.0009904608959673158, + "loss": 1.0509882, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 1.42578125, + "step": 470, + "time_per_iteration": 2.7882330417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213781, + "balance_loss_mlp": 1.0718745, + "epoch": 0.09061177375913813, + "flos": 455295596544.0, + "grad_norm": 0.02882877970469751, + "language_loss": 1.04707062, + "learning_rate": 0.000990400236075403, + "loss": 1.05920839, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 1.41992188, + "step": 471, + "time_per_iteration": 2.5016987323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_mlp": 1.07574117, + "epoch": 0.0908041554444017, + "flos": 545308230144.0, + "grad_norm": 0.02444258884202674, + "language_loss": 1.01020849, + "learning_rate": 0.0009903393857932338, + "loss": 1.02238584, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 1.42089844, + "step": 472, + "time_per_iteration": 2.644397497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218613, + "balance_loss_mlp": 1.07732654, + "epoch": 0.09099653712966525, + "flos": 565466964480.0, + "grad_norm": 0.02685769494428931, + "language_loss": 0.99245131, + "learning_rate": 0.0009902783451444317, + "loss": 1.00463748, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 1.41357422, + "step": 473, + "time_per_iteration": 2.7087745666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214499, + "balance_loss_mlp": 1.07292593, + "epoch": 0.09118891881492881, + "flos": 475501994496.0, + "grad_norm": 0.029476649456104027, + "language_loss": 1.02896917, + "learning_rate": 0.0009902171141526956, + "loss": 1.04111421, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 1.41650391, + "step": 474, + "time_per_iteration": 2.5271990299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215154, + "balance_loss_mlp": 1.07410538, + "epoch": 0.09138130050019239, + "flos": 546990822912.0, + "grad_norm": 0.02490932279529465, + "language_loss": 0.89845926, + "learning_rate": 0.000990155692841797, + "loss": 0.9106108, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 1.41113281, + "step": 475, + "time_per_iteration": 2.958740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214039, + "balance_loss_mlp": 1.07303798, + "epoch": 0.09157368218545595, + "flos": 733973319168.0, + "grad_norm": 0.02740759839690251, + "language_loss": 1.01869047, + "learning_rate": 0.0009900940812355818, + "loss": 1.03083086, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 1.41064453, + "step": 476, + "time_per_iteration": 2.891787528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205639, + "balance_loss_mlp": 1.06478107, + "epoch": 0.0917660638707195, + "flos": 612072918528.0, + "grad_norm": 0.029261712768775452, + "language_loss": 0.99624813, + "learning_rate": 0.00099003227935797, + "loss": 1.0083046, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 1.40917969, + "step": 477, + "time_per_iteration": 2.7569031715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207057, + "balance_loss_mlp": 1.06605613, + "epoch": 0.09195844555598306, + "flos": 657018473472.0, + "grad_norm": 0.026965523070242428, + "language_loss": 1.02860427, + "learning_rate": 0.000989970287232955, + "loss": 1.04067481, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 1.41064453, + "step": 478, + "time_per_iteration": 2.7705225944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212938, + "balance_loss_mlp": 1.07212758, + "epoch": 0.09215082724124664, + "flos": 477540426240.0, + "grad_norm": 0.02578247385618595, + "language_loss": 0.99767786, + "learning_rate": 0.0009899081048846043, + "loss": 1.00980723, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 1.40869141, + "step": 479, + "time_per_iteration": 2.5488922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215229, + "balance_loss_mlp": 1.07437098, + "epoch": 0.0923432089265102, + "flos": 525325413888.0, + "grad_norm": 0.029009434883925433, + "language_loss": 1.05276799, + "learning_rate": 0.0009898457323370593, + "loss": 1.06492031, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 1.40917969, + "step": 480, + "time_per_iteration": 2.5628790855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213957, + "balance_loss_mlp": 1.07314658, + "epoch": 0.09253559061177376, + "flos": 546638986752.0, + "grad_norm": 0.030643020391807937, + "language_loss": 1.01694977, + "learning_rate": 0.000989783169614535, + "loss": 1.02908933, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 1.40869141, + "step": 481, + "time_per_iteration": 2.6431851387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206421, + "balance_loss_mlp": 1.06718445, + "epoch": 0.09272797229703732, + "flos": 1541334362112.0, + "grad_norm": 0.00793715508899474, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79959178, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.39257812, + "step": 482, + "time_per_iteration": 4.84259295463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211177, + "balance_loss_mlp": 1.07041514, + "epoch": 0.09292035398230089, + "flos": 691064194560.0, + "grad_norm": 0.029391602229229655, + "language_loss": 0.99036419, + "learning_rate": 0.000989657473741779, + "loss": 1.00247598, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 1.40820312, + "step": 483, + "time_per_iteration": 2.8193717002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210505, + "balance_loss_mlp": 1.06964695, + "epoch": 0.09311273566756445, + "flos": 510822076416.0, + "grad_norm": 0.026713621627667553, + "language_loss": 1.0060308, + "learning_rate": 0.0009895943406403465, + "loss": 1.01813591, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 1.40917969, + "step": 484, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210956, + "balance_loss_mlp": 1.07071841, + "epoch": 0.09330511735282801, + "flos": 660583045632.0, + "grad_norm": 0.02538483632370611, + "language_loss": 0.94170594, + "learning_rate": 0.0009895310174615338, + "loss": 0.95381546, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 1.40283203, + "step": 485, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210991, + "balance_loss_mlp": 1.0725174, + "epoch": 0.09349749903809157, + "flos": 1456021673472.0, + "grad_norm": 0.008074315810691821, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.7692951, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.38476562, + "step": 486, + "time_per_iteration": 4.652726888656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208546, + "balance_loss_mlp": 1.06868994, + "epoch": 0.09368988072335514, + "flos": 521899829760.0, + "grad_norm": 0.021962490795067104, + "language_loss": 0.97574425, + "learning_rate": 0.0009894038009701782, + "loss": 0.98782969, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 1.39892578, + "step": 487, + "time_per_iteration": 2.647747755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207771, + "balance_loss_mlp": 1.06786692, + "epoch": 0.0938822624086187, + "flos": 498751941120.0, + "grad_norm": 0.02403393711112831, + "language_loss": 1.01297927, + "learning_rate": 0.0009893399077070253, + "loss": 1.02505696, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 1.39941406, + "step": 488, + "time_per_iteration": 2.5559775829315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209251, + "balance_loss_mlp": 1.07006216, + "epoch": 0.09407464409388226, + "flos": 534223746048.0, + "grad_norm": 0.02465812888810929, + "language_loss": 0.94380867, + "learning_rate": 0.0009892758244652718, + "loss": 0.95590127, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 1.39208984, + "step": 489, + "time_per_iteration": 2.6696364879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203933, + "balance_loss_mlp": 1.06398153, + "epoch": 0.09426702577914582, + "flos": 587090714112.0, + "grad_norm": 0.02607881729553482, + "language_loss": 1.01920152, + "learning_rate": 0.0009892115512697968, + "loss": 1.03124094, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 1.39990234, + "step": 490, + "time_per_iteration": 2.645073652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205245, + "balance_loss_mlp": 1.06524527, + "epoch": 0.0944594074644094, + "flos": 504463733760.0, + "grad_norm": 0.02086232355550113, + "language_loss": 1.01703966, + "learning_rate": 0.0009891470881455537, + "loss": 1.02909207, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 1.40039062, + "step": 491, + "time_per_iteration": 2.669978618621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207443, + "balance_loss_mlp": 1.06777763, + "epoch": 0.09465178914967295, + "flos": 572114016768.0, + "grad_norm": 0.026976181820206353, + "language_loss": 1.00743008, + "learning_rate": 0.0009890824351175692, + "loss": 1.01950443, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 1.39697266, + "step": 492, + "time_per_iteration": 2.6572952270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207157, + "balance_loss_mlp": 1.06796801, + "epoch": 0.09484417083493651, + "flos": 550418408448.0, + "grad_norm": 0.023611014675858334, + "language_loss": 1.04079592, + "learning_rate": 0.0009890175922109435, + "loss": 1.05286753, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 1.39208984, + "step": 493, + "time_per_iteration": 2.622361183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120413, + "balance_loss_mlp": 1.06498933, + "epoch": 0.09503655252020007, + "flos": 825271047168.0, + "grad_norm": 0.02510100112233158, + "language_loss": 1.0275588, + "learning_rate": 0.0009889525594508513, + "loss": 1.03960025, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 1.39160156, + "step": 494, + "time_per_iteration": 3.0307581424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202477, + "balance_loss_mlp": 1.06333554, + "epoch": 0.09522893420546363, + "flos": 405517839360.0, + "grad_norm": 0.02234367718934989, + "language_loss": 0.96151906, + "learning_rate": 0.0009888873368625404, + "loss": 0.97354376, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 1.39160156, + "step": 495, + "time_per_iteration": 2.4793317317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205465, + "balance_loss_mlp": 1.06665742, + "epoch": 0.0954213158907272, + "flos": 692255963136.0, + "grad_norm": 0.025506351191757377, + "language_loss": 1.00908709, + "learning_rate": 0.0009888219244713326, + "loss": 1.02114165, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 1.38818359, + "step": 496, + "time_per_iteration": 2.865914821624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206499, + "balance_loss_mlp": 1.06773937, + "epoch": 0.09561369757599077, + "flos": 520074246144.0, + "grad_norm": 0.030124833611481355, + "language_loss": 1.02319717, + "learning_rate": 0.0009887563223026229, + "loss": 1.03526211, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 1.38671875, + "step": 497, + "time_per_iteration": 2.689708948135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210899, + "balance_loss_mlp": 1.07376099, + "epoch": 0.09580607926125433, + "flos": 1388781623808.0, + "grad_norm": 0.014650036919455408, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80279064, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 1.37109375, + "step": 498, + "time_per_iteration": 4.940208196640015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203477, + "balance_loss_mlp": 1.06476545, + "epoch": 0.09599846094651789, + "flos": 718825433088.0, + "grad_norm": 0.028840614245688557, + "language_loss": 0.98952407, + "learning_rate": 0.0009886245487346482, + "loss": 1.00155878, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 1.38427734, + "step": 499, + "time_per_iteration": 3.023056745529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205479, + "balance_loss_mlp": 1.06690967, + "epoch": 0.09619084263178146, + "flos": 386893977600.0, + "grad_norm": 0.031706482821381415, + "language_loss": 1.0340035, + "learning_rate": 0.0009885583773865422, + "loss": 1.04605842, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 1.38183594, + "step": 500, + "time_per_iteration": 2.422914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202787, + "balance_loss_mlp": 1.06479073, + "epoch": 0.09638322431704502, + "flos": 535172467200.0, + "grad_norm": 0.02878579188863982, + "language_loss": 0.99392897, + "learning_rate": 0.0009884920163632524, + "loss": 1.00595689, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 1.37988281, + "step": 501, + "time_per_iteration": 2.6820154190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203474, + "balance_loss_mlp": 1.0655731, + "epoch": 0.09657560600230858, + "flos": 501656501760.0, + "grad_norm": 0.02635733095705931, + "language_loss": 1.03128934, + "learning_rate": 0.000988425465690543, + "loss": 1.04332411, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 1.37890625, + "step": 502, + "time_per_iteration": 2.605536699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204627, + "balance_loss_mlp": 1.06677341, + "epoch": 0.09676798768757214, + "flos": 530331532800.0, + "grad_norm": 0.023374032620567947, + "language_loss": 1.00861204, + "learning_rate": 0.0009883587253942505, + "loss": 1.02065825, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 1.37841797, + "step": 503, + "time_per_iteration": 2.7548091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204765, + "balance_loss_mlp": 1.06686366, + "epoch": 0.09696036937283571, + "flos": 464556498432.0, + "grad_norm": 0.029206950172382878, + "language_loss": 1.0685035, + "learning_rate": 0.0009882917955002862, + "loss": 1.08055115, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 1.37890625, + "step": 504, + "time_per_iteration": 2.520970344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200777, + "balance_loss_mlp": 1.06297076, + "epoch": 0.09715275105809927, + "flos": 536010398208.0, + "grad_norm": 0.02484338661637091, + "language_loss": 0.9770751, + "learning_rate": 0.0009882246760346343, + "loss": 0.98908287, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 1.37695312, + "step": 505, + "time_per_iteration": 2.6314897537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204578, + "balance_loss_mlp": 1.06672478, + "epoch": 0.09734513274336283, + "flos": 455881747968.0, + "grad_norm": 0.02756591702740651, + "language_loss": 1.04990697, + "learning_rate": 0.0009881573670233533, + "loss": 1.06195283, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 1.37451172, + "step": 506, + "time_per_iteration": 2.492464780807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203948, + "balance_loss_mlp": 1.06619, + "epoch": 0.09753751442862639, + "flos": 509827693056.0, + "grad_norm": 0.02954706972608782, + "language_loss": 0.97619581, + "learning_rate": 0.0009880898684925747, + "loss": 0.98823535, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 1.37353516, + "step": 507, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120195, + "balance_loss_mlp": 1.06438243, + "epoch": 0.09772989611388996, + "flos": 485246989824.0, + "grad_norm": 0.02487380392257162, + "language_loss": 0.96617985, + "learning_rate": 0.0009880221804685037, + "loss": 0.97819936, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 1.37158203, + "step": 508, + "time_per_iteration": 2.5352439880371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209412, + "balance_loss_mlp": 1.0741806, + "epoch": 0.09792227779915352, + "flos": 1569316454400.0, + "grad_norm": 0.016823619827393988, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80553836, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.3515625, + "step": 509, + "time_per_iteration": 4.694217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205455, + "balance_loss_mlp": 1.06831706, + "epoch": 0.09811465948441708, + "flos": 588914296320.0, + "grad_norm": 0.032012577058462416, + "language_loss": 1.03636336, + "learning_rate": 0.0009878862360456733, + "loss": 1.04841793, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 1.37011719, + "step": 510, + "time_per_iteration": 2.73879337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208431, + "balance_loss_mlp": 1.07148337, + "epoch": 0.09830704116968064, + "flos": 614128814592.0, + "grad_norm": 0.028115444050206044, + "language_loss": 0.94855493, + "learning_rate": 0.0009878179796996922, + "loss": 0.96063924, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 1.36914062, + "step": 511, + "time_per_iteration": 2.6949734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207361, + "balance_loss_mlp": 1.07050836, + "epoch": 0.09849942285494422, + "flos": 539935538688.0, + "grad_norm": 0.022608937638108787, + "language_loss": 0.9790619, + "learning_rate": 0.0009877495339659754, + "loss": 0.99113548, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 1.36816406, + "step": 512, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214076, + "balance_loss_mlp": 1.0773195, + "epoch": 0.09869180454020778, + "flos": 621603064320.0, + "grad_norm": 0.029833187637910333, + "language_loss": 0.94261241, + "learning_rate": 0.000987680898871096, + "loss": 0.95475316, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 1.3671875, + "step": 513, + "time_per_iteration": 2.6975760459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120845, + "balance_loss_mlp": 1.07145417, + "epoch": 0.09888418622547133, + "flos": 813059922432.0, + "grad_norm": 0.032512892127392744, + "language_loss": 0.9726817, + "learning_rate": 0.0009876120744417, + "loss": 0.98476619, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 1.36767578, + "step": 514, + "time_per_iteration": 2.9514927864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214576, + "balance_loss_mlp": 1.07762837, + "epoch": 0.0990765679107349, + "flos": 536857061376.0, + "grad_norm": 0.028495408786163776, + "language_loss": 1.0346663, + "learning_rate": 0.0009875430607045078, + "loss": 1.04681206, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 1.36523438, + "step": 515, + "time_per_iteration": 2.669271230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209323, + "balance_loss_mlp": 1.07242322, + "epoch": 0.09926894959599845, + "flos": 588970692096.0, + "grad_norm": 0.026228231589839293, + "language_loss": 0.98752952, + "learning_rate": 0.000987473857686313, + "loss": 0.9996227, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 1.36474609, + "step": 516, + "time_per_iteration": 2.7055716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120601, + "balance_loss_mlp": 1.06934881, + "epoch": 0.09946133128126203, + "flos": 642386881536.0, + "grad_norm": 0.0302129460476142, + "language_loss": 1.04248524, + "learning_rate": 0.0009874044654139824, + "loss": 1.05454528, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 1.36230469, + "step": 517, + "time_per_iteration": 2.726618528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200307, + "balance_loss_mlp": 1.06340742, + "epoch": 0.09965371296652559, + "flos": 466725186048.0, + "grad_norm": 0.03251153136411229, + "language_loss": 1.02563679, + "learning_rate": 0.0009873348839144563, + "loss": 1.03763986, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 1.36474609, + "step": 518, + "time_per_iteration": 2.5855953693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200913, + "balance_loss_mlp": 1.06439471, + "epoch": 0.09984609465178915, + "flos": 484558780416.0, + "grad_norm": 0.029627125773621466, + "language_loss": 1.03352094, + "learning_rate": 0.000987265113214749, + "loss": 1.04552996, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 1.36279297, + "step": 519, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201703, + "balance_loss_mlp": 1.06566191, + "epoch": 0.1000384763370527, + "flos": 570095050752.0, + "grad_norm": 0.028931775658430137, + "language_loss": 1.07544637, + "learning_rate": 0.0009871951533419476, + "loss": 1.08746338, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 1.35986328, + "step": 520, + "time_per_iteration": 2.6423709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200484, + "balance_loss_mlp": 1.06439495, + "epoch": 0.10023085802231628, + "flos": 546925694976.0, + "grad_norm": 0.025491893219336172, + "language_loss": 0.95403761, + "learning_rate": 0.0009871250043232132, + "loss": 0.96604246, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 1.36035156, + "step": 521, + "time_per_iteration": 2.7604362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198813, + "balance_loss_mlp": 1.06205583, + "epoch": 0.10042323970757984, + "flos": 504439538688.0, + "grad_norm": 0.029888360913216814, + "language_loss": 0.96113187, + "learning_rate": 0.0009870546661857797, + "loss": 0.97311997, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 1.36328125, + "step": 522, + "time_per_iteration": 2.578458547592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195212, + "balance_loss_mlp": 1.05931365, + "epoch": 0.1006156213928434, + "flos": 771724601856.0, + "grad_norm": 0.029426081780707294, + "language_loss": 1.05752206, + "learning_rate": 0.0009869841389569553, + "loss": 1.0694741, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 1.35839844, + "step": 523, + "time_per_iteration": 2.958531618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.05846703, + "epoch": 0.10080800307810696, + "flos": 491008447488.0, + "grad_norm": 0.024593893632090205, + "language_loss": 0.96497846, + "learning_rate": 0.0009869134226641206, + "loss": 0.97692204, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 1.35839844, + "step": 524, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196113, + "balance_loss_mlp": 1.06030965, + "epoch": 0.10100038476337053, + "flos": 455712560640.0, + "grad_norm": 0.026556514945601337, + "language_loss": 0.98348475, + "learning_rate": 0.0009868425173347303, + "loss": 0.99544585, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 1.35742188, + "step": 525, + "time_per_iteration": 2.6460907459259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196515, + "balance_loss_mlp": 1.06099772, + "epoch": 0.10119276644863409, + "flos": 557573749248.0, + "grad_norm": 0.022458491608374247, + "language_loss": 1.03332829, + "learning_rate": 0.0009867714229963125, + "loss": 1.04529333, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 1.35449219, + "step": 526, + "time_per_iteration": 2.693362236022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119647, + "balance_loss_mlp": 1.0609529, + "epoch": 0.10138514813389765, + "flos": 517219350528.0, + "grad_norm": 0.028969258136437262, + "language_loss": 1.0161202, + "learning_rate": 0.000986700139676468, + "loss": 1.02808487, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 1.35449219, + "step": 527, + "time_per_iteration": 2.5826644897460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202893, + "balance_loss_mlp": 1.06742311, + "epoch": 0.10157752981916121, + "flos": 501563175936.0, + "grad_norm": 0.023004964960346017, + "language_loss": 0.98490077, + "learning_rate": 0.0009866286674028717, + "loss": 0.99692971, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 1.35400391, + "step": 528, + "time_per_iteration": 2.626595973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204326, + "balance_loss_mlp": 1.06876123, + "epoch": 0.10176991150442478, + "flos": 658093447680.0, + "grad_norm": 0.024381421822087013, + "language_loss": 0.95674849, + "learning_rate": 0.0009865570062032717, + "loss": 0.96879184, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 1.35498047, + "step": 529, + "time_per_iteration": 2.916924238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203456, + "balance_loss_mlp": 1.0680815, + "epoch": 0.10196229318968834, + "flos": 574402226688.0, + "grad_norm": 0.021344584600364362, + "language_loss": 0.99175954, + "learning_rate": 0.0009864851561054893, + "loss": 1.00379407, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 1.35302734, + "step": 530, + "time_per_iteration": 2.750075578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203649, + "balance_loss_mlp": 1.06856096, + "epoch": 0.1021546748749519, + "flos": 519255780864.0, + "grad_norm": 0.027896087186932737, + "language_loss": 0.99157, + "learning_rate": 0.0009864131171374191, + "loss": 1.00360656, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 1.35009766, + "step": 531, + "time_per_iteration": 2.6506359577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202329, + "balance_loss_mlp": 1.06728852, + "epoch": 0.10234705656021546, + "flos": 610953008640.0, + "grad_norm": 0.021304730024267197, + "language_loss": 0.98848057, + "learning_rate": 0.0009863408893270292, + "loss": 1.0005039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 1.34960938, + "step": 532, + "time_per_iteration": 2.827632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202805, + "balance_loss_mlp": 1.06776476, + "epoch": 0.10253943824547904, + "flos": 602912073216.0, + "grad_norm": 0.02650069508154076, + "language_loss": 0.95645475, + "learning_rate": 0.0009862684727023605, + "loss": 0.96848285, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 1.34960938, + "step": 533, + "time_per_iteration": 2.730771541595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206135, + "balance_loss_mlp": 1.07152414, + "epoch": 0.1027318199307426, + "flos": 664156349952.0, + "grad_norm": 0.02579556790717569, + "language_loss": 0.96718729, + "learning_rate": 0.0009861958672915283, + "loss": 0.97924864, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 1.34521484, + "step": 534, + "time_per_iteration": 2.825239419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202189, + "balance_loss_mlp": 1.06776834, + "epoch": 0.10292420161600616, + "flos": 684529933824.0, + "grad_norm": 0.02492376876437301, + "language_loss": 0.95656139, + "learning_rate": 0.0009861230731227201, + "loss": 0.96858335, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 1.34326172, + "step": 535, + "time_per_iteration": 2.858596086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203815, + "balance_loss_mlp": 1.06958508, + "epoch": 0.10311658330126972, + "flos": 491268959232.0, + "grad_norm": 0.02833674325523021, + "language_loss": 0.99709427, + "learning_rate": 0.0009860500902241973, + "loss": 1.00913239, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 1.34130859, + "step": 536, + "time_per_iteration": 2.5780303478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197149, + "balance_loss_mlp": 1.06291902, + "epoch": 0.10330896498653329, + "flos": 432686195712.0, + "grad_norm": 0.024484943889946764, + "language_loss": 1.03652823, + "learning_rate": 0.0009859769186242942, + "loss": 1.0484997, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 1.34130859, + "step": 537, + "time_per_iteration": 2.5104598999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119791, + "balance_loss_mlp": 1.06415713, + "epoch": 0.10350134667179685, + "flos": 550641990144.0, + "grad_norm": 0.0271300181774947, + "language_loss": 0.97886324, + "learning_rate": 0.0009859035583514187, + "loss": 0.99084234, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 1.33642578, + "step": 538, + "time_per_iteration": 2.6156880855560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197994, + "balance_loss_mlp": 1.06395507, + "epoch": 0.10369372835706041, + "flos": 641826926592.0, + "grad_norm": 0.024416305433678544, + "language_loss": 1.00991774, + "learning_rate": 0.0009858300094340517, + "loss": 1.02189767, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 1.33935547, + "step": 539, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198436, + "balance_loss_mlp": 1.06468332, + "epoch": 0.10388611004232397, + "flos": 522765958656.0, + "grad_norm": 0.025798430155835095, + "language_loss": 0.9342165, + "learning_rate": 0.0009857562719007473, + "loss": 0.94620085, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 1.33642578, + "step": 540, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204547, + "balance_loss_mlp": 1.07122386, + "epoch": 0.10407849172758753, + "flos": 703739947008.0, + "grad_norm": 0.023593197084580173, + "language_loss": 0.95331407, + "learning_rate": 0.0009856823457801331, + "loss": 0.96535957, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 1.33203125, + "step": 541, + "time_per_iteration": 2.889531373977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202711, + "balance_loss_mlp": 1.06924474, + "epoch": 0.1042708734128511, + "flos": 503944711680.0, + "grad_norm": 0.023957714626313076, + "language_loss": 1.02856565, + "learning_rate": 0.00098560823110091, + "loss": 1.04059267, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 1.33349609, + "step": 542, + "time_per_iteration": 2.6067047119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205134, + "balance_loss_mlp": 1.07185781, + "epoch": 0.10446325509811466, + "flos": 486640872960.0, + "grad_norm": 0.0231214260398276, + "language_loss": 1.01405394, + "learning_rate": 0.000985533927891851, + "loss": 1.02610517, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 1.33154297, + "step": 543, + "time_per_iteration": 2.6622776985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201388, + "balance_loss_mlp": 1.06820762, + "epoch": 0.10465563678337822, + "flos": 569713015296.0, + "grad_norm": 0.023482705287667723, + "language_loss": 1.01015687, + "learning_rate": 0.0009854594361818044, + "loss": 1.02217078, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 1.33056641, + "step": 544, + "time_per_iteration": 2.7061924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195672, + "balance_loss_mlp": 1.06244385, + "epoch": 0.10484801846864178, + "flos": 627242998272.0, + "grad_norm": 0.023194608242680787, + "language_loss": 0.99799937, + "learning_rate": 0.0009853847559996897, + "loss": 1.00995612, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 1.33105469, + "step": 545, + "time_per_iteration": 2.742445707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192128, + "balance_loss_mlp": 1.05885231, + "epoch": 0.10504040015390535, + "flos": 744812754432.0, + "grad_norm": 0.025865682249952955, + "language_loss": 0.99192667, + "learning_rate": 0.0009853098873745, + "loss": 1.00384796, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 1.33154297, + "step": 546, + "time_per_iteration": 3.0260400772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192867, + "balance_loss_mlp": 1.05997264, + "epoch": 0.10523278183916891, + "flos": 587842050048.0, + "grad_norm": 0.02599355243407578, + "language_loss": 0.98197657, + "learning_rate": 0.0009852348303353027, + "loss": 0.99390525, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 1.32763672, + "step": 547, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191481, + "balance_loss_mlp": 1.05844367, + "epoch": 0.10542516352443247, + "flos": 871145857536.0, + "grad_norm": 0.02495252935664815, + "language_loss": 0.91398883, + "learning_rate": 0.000985159584911237, + "loss": 0.92590368, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 1.32910156, + "step": 548, + "time_per_iteration": 3.1012043952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119193, + "balance_loss_mlp": 1.05913138, + "epoch": 0.10561754520969603, + "flos": 506412842496.0, + "grad_norm": 0.025955858684814606, + "language_loss": 0.9925828, + "learning_rate": 0.0009850841511315162, + "loss": 1.00450206, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 1.32666016, + "step": 549, + "time_per_iteration": 2.626220464706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192876, + "balance_loss_mlp": 1.06022012, + "epoch": 0.1058099268949596, + "flos": 561147053568.0, + "grad_norm": 0.02554357007654854, + "language_loss": 0.98952115, + "learning_rate": 0.0009850085290254256, + "loss": 1.00144982, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 1.32519531, + "step": 550, + "time_per_iteration": 2.7464635372161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.06161487, + "epoch": 0.10600230858022316, + "flos": 563159288832.0, + "grad_norm": 0.020736613501838204, + "language_loss": 0.9519307, + "learning_rate": 0.0009849327186223246, + "loss": 0.9638744, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 1.32617188, + "step": 551, + "time_per_iteration": 2.7678163051605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199655, + "balance_loss_mlp": 1.06728542, + "epoch": 0.10619469026548672, + "flos": 495317624832.0, + "grad_norm": 0.02236411826292933, + "language_loss": 1.02411103, + "learning_rate": 0.000984856719951646, + "loss": 1.03610754, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 1.32226562, + "step": 552, + "time_per_iteration": 2.5607285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.06404662, + "epoch": 0.10638707195075028, + "flos": 677463916032.0, + "grad_norm": 0.025808282690500464, + "language_loss": 1.00531495, + "learning_rate": 0.0009847805330428943, + "loss": 1.01727724, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 1.3203125, + "step": 553, + "time_per_iteration": 2.8748667240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190787, + "balance_loss_mlp": 1.05860806, + "epoch": 0.10657945363601386, + "flos": 489035143680.0, + "grad_norm": 0.02571681940882287, + "language_loss": 1.04715252, + "learning_rate": 0.0009847041579256481, + "loss": 1.05906045, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 1.3203125, + "step": 554, + "time_per_iteration": 2.56693696975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191519, + "balance_loss_mlp": 1.05948246, + "epoch": 0.10677183532127742, + "flos": 483970627584.0, + "grad_norm": 0.020874824601389917, + "language_loss": 1.01746583, + "learning_rate": 0.0009846275946295592, + "loss": 1.02938092, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 1.31884766, + "step": 555, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195781, + "balance_loss_mlp": 1.06369734, + "epoch": 0.10696421700654098, + "flos": 657581156352.0, + "grad_norm": 0.023085993180182653, + "language_loss": 0.93557143, + "learning_rate": 0.0009845508431843518, + "loss": 0.94752926, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 1.31933594, + "step": 556, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192823, + "balance_loss_mlp": 1.06088233, + "epoch": 0.10715659869180454, + "flos": 568792492032.0, + "grad_norm": 0.026087632201688016, + "language_loss": 0.9692713, + "learning_rate": 0.0009844739036198233, + "loss": 0.9811995, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 1.31787109, + "step": 557, + "time_per_iteration": 2.6583988666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192362, + "balance_loss_mlp": 1.06051683, + "epoch": 0.10734898037706811, + "flos": 541743657984.0, + "grad_norm": 0.02708275038302545, + "language_loss": 1.03564882, + "learning_rate": 0.0009843967759658448, + "loss": 1.04757237, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 1.31689453, + "step": 558, + "time_per_iteration": 2.6571173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209854, + "balance_loss_mlp": 1.07920074, + "epoch": 0.10754136206233167, + "flos": 1479731518464.0, + "grad_norm": 0.021017403581586082, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73977602, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.30664062, + "step": 559, + "time_per_iteration": 4.901749134063721 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191994, + "balance_loss_mlp": 1.06024349, + "epoch": 0.10773374374759523, + "flos": 513411730944.0, + "grad_norm": 0.02623387515623986, + "language_loss": 1.03025067, + "learning_rate": 0.000984241956509384, + "loss": 1.04217052, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 1.31591797, + "step": 560, + "time_per_iteration": 2.642380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011916, + "balance_loss_mlp": 1.06013584, + "epoch": 0.10792612543285879, + "flos": 497477580288.0, + "grad_norm": 0.029111560342126648, + "language_loss": 1.01683569, + "learning_rate": 0.0009841642647670078, + "loss": 1.02875161, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 1.31298828, + "step": 561, + "time_per_iteration": 2.5994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.06027901, + "epoch": 0.10811850711812235, + "flos": 736836946944.0, + "grad_norm": 0.027918527501713815, + "language_loss": 0.94711685, + "learning_rate": 0.0009840863850553944, + "loss": 0.95903373, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 1.3125, + "step": 562, + "time_per_iteration": 2.980377435684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193377, + "balance_loss_mlp": 1.06215191, + "epoch": 0.10831088880338592, + "flos": 612676534272.0, + "grad_norm": 0.025174626098757973, + "language_loss": 0.99795747, + "learning_rate": 0.0009840083174047782, + "loss": 1.00989127, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 1.31054688, + "step": 563, + "time_per_iteration": 2.7209153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194645, + "balance_loss_mlp": 1.0633713, + "epoch": 0.10850327048864948, + "flos": 557497887744.0, + "grad_norm": 0.021851565940339403, + "language_loss": 0.93414235, + "learning_rate": 0.0009839300618454685, + "loss": 0.94608879, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 1.31103516, + "step": 564, + "time_per_iteration": 2.833120584487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194873, + "balance_loss_mlp": 1.06402934, + "epoch": 0.10869565217391304, + "flos": 604436212224.0, + "grad_norm": 0.021697209366751603, + "language_loss": 0.98980927, + "learning_rate": 0.0009838516184078466, + "loss": 1.00175798, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 1.30664062, + "step": 565, + "time_per_iteration": 2.805722236633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193483, + "balance_loss_mlp": 1.06263876, + "epoch": 0.1088880338591766, + "flos": 527205391872.0, + "grad_norm": 0.024778377976546286, + "language_loss": 0.97356248, + "learning_rate": 0.0009837729871223669, + "loss": 0.98549736, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 1.30664062, + "step": 566, + "time_per_iteration": 2.652186155319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119656, + "balance_loss_mlp": 1.0658114, + "epoch": 0.10908041554444017, + "flos": 621416412672.0, + "grad_norm": 0.023487449334803984, + "language_loss": 0.99301046, + "learning_rate": 0.0009836941680195568, + "loss": 1.00497603, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 1.30566406, + "step": 567, + "time_per_iteration": 2.7732484340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.06144011, + "epoch": 0.10927279722970373, + "flos": 899673168384.0, + "grad_norm": 0.026216288845653656, + "language_loss": 0.95416081, + "learning_rate": 0.0009836151611300166, + "loss": 0.96608174, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 1.3046875, + "step": 568, + "time_per_iteration": 3.174981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.06049693, + "epoch": 0.10946517891496729, + "flos": 529699719168.0, + "grad_norm": 0.02336242427092275, + "language_loss": 1.03071296, + "learning_rate": 0.0009835359664844194, + "loss": 1.04262161, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 1.30273438, + "step": 569, + "time_per_iteration": 2.595041513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190102, + "balance_loss_mlp": 1.06173706, + "epoch": 0.10965756060023085, + "flos": 1563991426560.0, + "grad_norm": 0.006726678932110135, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82226908, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 4.911731719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193915, + "balance_loss_mlp": 1.0634526, + "epoch": 0.10984994228549443, + "flos": 514099940352.0, + "grad_norm": 0.027266515996607284, + "language_loss": 1.00165153, + "learning_rate": 0.0009833770140481118, + "loss": 1.01359057, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 1.30273438, + "step": 571, + "time_per_iteration": 2.6079747676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197777, + "balance_loss_mlp": 1.06741011, + "epoch": 0.11004232397075799, + "flos": 956273895936.0, + "grad_norm": 0.026548665437539986, + "language_loss": 0.90315044, + "learning_rate": 0.000983297256319112, + "loss": 0.91512823, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 1.30175781, + "step": 572, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_mlp": 1.05776477, + "epoch": 0.11023470565602154, + "flos": 489228526080.0, + "grad_norm": 0.026034490292812715, + "language_loss": 0.95817071, + "learning_rate": 0.000983217310957477, + "loss": 0.97005343, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 1.30322266, + "step": 573, + "time_per_iteration": 2.7447898387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190883, + "balance_loss_mlp": 1.06056309, + "epoch": 0.1104270873412851, + "flos": 656990275584.0, + "grad_norm": 0.026590820610190004, + "language_loss": 1.00224817, + "learning_rate": 0.000983137177994244, + "loss": 1.01415706, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 1.30126953, + "step": 574, + "time_per_iteration": 2.846140146255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185115, + "balance_loss_mlp": 1.0552249, + "epoch": 0.11061946902654868, + "flos": 724747345920.0, + "grad_norm": 0.019709272455133778, + "language_loss": 0.93286896, + "learning_rate": 0.0009830568574605235, + "loss": 0.94472009, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 1.29736328, + "step": 575, + "time_per_iteration": 2.922821044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185727, + "balance_loss_mlp": 1.05569339, + "epoch": 0.11081185071181224, + "flos": 836867822592.0, + "grad_norm": 0.025292755419638515, + "language_loss": 0.97880363, + "learning_rate": 0.0009829763493874992, + "loss": 0.99066085, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 1.29833984, + "step": 576, + "time_per_iteration": 3.022394895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183726, + "balance_loss_mlp": 1.05412149, + "epoch": 0.1110042323970758, + "flos": 610282263552.0, + "grad_norm": 0.023453623229808367, + "language_loss": 1.02838886, + "learning_rate": 0.0009828956538064264, + "loss": 1.04022622, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 1.29541016, + "step": 577, + "time_per_iteration": 2.817147970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182671, + "balance_loss_mlp": 1.05316234, + "epoch": 0.11119661408233936, + "flos": 597039825408.0, + "grad_norm": 0.025026186935027953, + "language_loss": 0.99076784, + "learning_rate": 0.0009828147707486344, + "loss": 1.00259459, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 1.29492188, + "step": 578, + "time_per_iteration": 2.6778078079223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186939, + "balance_loss_mlp": 1.05752516, + "epoch": 0.11138899576760293, + "flos": 556887541248.0, + "grad_norm": 0.027590262528076937, + "language_loss": 0.96720088, + "learning_rate": 0.0009827337002455245, + "loss": 0.97907031, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 1.29394531, + "step": 579, + "time_per_iteration": 2.6259562969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188781, + "balance_loss_mlp": 1.05951095, + "epoch": 0.11158137745286649, + "flos": 691062193152.0, + "grad_norm": 0.0223692175133054, + "language_loss": 0.94567806, + "learning_rate": 0.0009826524423285712, + "loss": 0.9575659, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 1.29150391, + "step": 580, + "time_per_iteration": 2.9144554138183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118328, + "balance_loss_mlp": 1.05386627, + "epoch": 0.11177375913813005, + "flos": 764306747904.0, + "grad_norm": 0.02877171771660235, + "language_loss": 0.97941083, + "learning_rate": 0.0009825709970293218, + "loss": 0.9912436, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 1.29296875, + "step": 581, + "time_per_iteration": 2.8999927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181128, + "balance_loss_mlp": 1.05223894, + "epoch": 0.11196614082339361, + "flos": 808030334976.0, + "grad_norm": 0.029325346048851512, + "language_loss": 1.03732872, + "learning_rate": 0.0009824893643793956, + "loss": 1.04913998, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 1.28857422, + "step": 582, + "time_per_iteration": 3.0697131156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.05731773, + "epoch": 0.11215852250865718, + "flos": 559724972544.0, + "grad_norm": 0.028740695003145394, + "language_loss": 0.98446089, + "learning_rate": 0.0009824075444104857, + "loss": 0.99632728, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 1.29150391, + "step": 583, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190407, + "balance_loss_mlp": 1.06147003, + "epoch": 0.11235090419392074, + "flos": 514575301632.0, + "grad_norm": 0.02293328270345756, + "language_loss": 1.02460003, + "learning_rate": 0.000982325537154357, + "loss": 1.03650403, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 1.28808594, + "step": 584, + "time_per_iteration": 2.590156078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188149, + "balance_loss_mlp": 1.05954635, + "epoch": 0.1125432858791843, + "flos": 492432529920.0, + "grad_norm": 0.028214107652977688, + "language_loss": 1.0381788, + "learning_rate": 0.0009822433426428484, + "loss": 1.05006027, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 1.28564453, + "step": 585, + "time_per_iteration": 2.566488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188321, + "balance_loss_mlp": 1.05957532, + "epoch": 0.11273566756444786, + "flos": 511727136768.0, + "grad_norm": 0.027438709113267498, + "language_loss": 0.95940274, + "learning_rate": 0.0009821609609078697, + "loss": 0.971286, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 1.28710938, + "step": 586, + "time_per_iteration": 2.6117701530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189545, + "balance_loss_mlp": 1.06098938, + "epoch": 0.11292804924971142, + "flos": 623639494656.0, + "grad_norm": 0.025949033694362005, + "language_loss": 0.97216725, + "learning_rate": 0.0009820783919814045, + "loss": 0.98406273, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 1.28515625, + "step": 587, + "time_per_iteration": 2.798182249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181783, + "balance_loss_mlp": 1.05360925, + "epoch": 0.113120430934975, + "flos": 479038368768.0, + "grad_norm": 0.03012596671256698, + "language_loss": 0.94172156, + "learning_rate": 0.0009819956358955095, + "loss": 0.95353937, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 1.28125, + "step": 588, + "time_per_iteration": 2.54179310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197707, + "balance_loss_mlp": 1.06905663, + "epoch": 0.11331281262023855, + "flos": 467990814720.0, + "grad_norm": 0.02502737191739997, + "language_loss": 0.9542653, + "learning_rate": 0.0009819126926823127, + "loss": 0.96624243, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 1.28613281, + "step": 589, + "time_per_iteration": 2.5262975692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191554, + "balance_loss_mlp": 1.06333208, + "epoch": 0.11350519430550211, + "flos": 651610853376.0, + "grad_norm": 0.023462259875113876, + "language_loss": 0.96713853, + "learning_rate": 0.000981829562374016, + "loss": 0.97905409, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 1.28173828, + "step": 590, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192039, + "balance_loss_mlp": 1.06415117, + "epoch": 0.11369757599076567, + "flos": 558860845056.0, + "grad_norm": 0.030341732837715945, + "language_loss": 1.07369685, + "learning_rate": 0.0009817462450028933, + "loss": 1.08561718, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 1.27832031, + "step": 591, + "time_per_iteration": 2.638333559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_mlp": 1.06215453, + "epoch": 0.11388995767602925, + "flos": 572305397760.0, + "grad_norm": 0.0238596111294556, + "language_loss": 0.94198918, + "learning_rate": 0.0009816627406012916, + "loss": 0.9538886, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 1.27734375, + "step": 592, + "time_per_iteration": 2.800842523574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191939, + "balance_loss_mlp": 1.06395626, + "epoch": 0.1140823393612928, + "flos": 741743009280.0, + "grad_norm": 0.025351621893671843, + "language_loss": 0.93787777, + "learning_rate": 0.0009815790492016295, + "loss": 0.94979715, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 1.27929688, + "step": 593, + "time_per_iteration": 2.9331579208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191026, + "balance_loss_mlp": 1.06337643, + "epoch": 0.11427472104655637, + "flos": 700251236352.0, + "grad_norm": 0.02689478502881467, + "language_loss": 0.96601468, + "learning_rate": 0.0009814951708363993, + "loss": 0.97792494, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 1.27587891, + "step": 594, + "time_per_iteration": 2.832094192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200218, + "balance_loss_mlp": 1.07414246, + "epoch": 0.11446710273181993, + "flos": 1480352598528.0, + "grad_norm": 0.020191453180706247, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79191208, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 1.25976562, + "step": 595, + "time_per_iteration": 4.752530574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187485, + "balance_loss_mlp": 1.06026483, + "epoch": 0.1146594844170835, + "flos": 495912508416.0, + "grad_norm": 0.02910362847653251, + "language_loss": 0.97498882, + "learning_rate": 0.0009813268533395648, + "loss": 0.98686367, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 1.27148438, + "step": 596, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187961, + "balance_loss_mlp": 1.06093144, + "epoch": 0.11485186610234706, + "flos": 475790704128.0, + "grad_norm": 0.02927093575191284, + "language_loss": 0.98108673, + "learning_rate": 0.0009812424142733073, + "loss": 0.99296629, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 1.26953125, + "step": 597, + "time_per_iteration": 2.5622098445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187255, + "balance_loss_mlp": 1.06046438, + "epoch": 0.11504424778761062, + "flos": 732619094016.0, + "grad_norm": 0.02047017320895946, + "language_loss": 0.92490959, + "learning_rate": 0.000981157788372175, + "loss": 0.93678212, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 1.26708984, + "step": 598, + "time_per_iteration": 3.017120599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185489, + "balance_loss_mlp": 1.05855536, + "epoch": 0.11523662947287418, + "flos": 546962625024.0, + "grad_norm": 0.02044602685826044, + "language_loss": 0.96609688, + "learning_rate": 0.0009810729756690223, + "loss": 0.97795177, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 1.26855469, + "step": 599, + "time_per_iteration": 2.7182610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190213, + "balance_loss_mlp": 1.06323159, + "epoch": 0.11542901115813775, + "flos": 776387616768.0, + "grad_norm": 0.023703305464208416, + "language_loss": 0.99939269, + "learning_rate": 0.0009809879761967766, + "loss": 1.01129484, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 1.26904297, + "step": 600, + "time_per_iteration": 2.9586148262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189892, + "balance_loss_mlp": 1.06319618, + "epoch": 0.11562139284340131, + "flos": 732212863488.0, + "grad_norm": 0.024193120208057816, + "language_loss": 0.99113685, + "learning_rate": 0.0009809027899884378, + "loss": 1.00303578, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 1.26611328, + "step": 601, + "time_per_iteration": 2.885070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183816, + "balance_loss_mlp": 1.05731082, + "epoch": 0.11581377452866487, + "flos": 537039710208.0, + "grad_norm": 0.022696091128935367, + "language_loss": 0.96568906, + "learning_rate": 0.0009808174170770779, + "loss": 0.97752714, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 1.26416016, + "step": 602, + "time_per_iteration": 2.7809743881225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191742, + "balance_loss_mlp": 1.0662384, + "epoch": 0.11600615621392843, + "flos": 1559211617280.0, + "grad_norm": 0.013792800863456836, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86089987, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 1.25390625, + "step": 603, + "time_per_iteration": 4.860181570053101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187966, + "balance_loss_mlp": 1.06169963, + "epoch": 0.116198537899192, + "flos": 538467795456.0, + "grad_norm": 0.022659628017063727, + "language_loss": 1.02766323, + "learning_rate": 0.0009806461112779462, + "loss": 1.03954291, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 1.26171875, + "step": 604, + "time_per_iteration": 2.614189863204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187324, + "balance_loss_mlp": 1.06091404, + "epoch": 0.11639091958445556, + "flos": 455137142784.0, + "grad_norm": 0.0301649070939891, + "language_loss": 1.00891566, + "learning_rate": 0.0009805601784566814, + "loss": 1.02078903, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 1.26318359, + "step": 605, + "time_per_iteration": 2.470878839492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119223, + "balance_loss_mlp": 1.06658351, + "epoch": 0.11658330126971912, + "flos": 556151668224.0, + "grad_norm": 0.025758302551065336, + "language_loss": 1.05099356, + "learning_rate": 0.0009804740590654089, + "loss": 1.0629158, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 1.25537109, + "step": 606, + "time_per_iteration": 2.631462812423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_mlp": 1.06588733, + "epoch": 0.11677568295498268, + "flos": 717600737280.0, + "grad_norm": 0.02545612001836415, + "language_loss": 0.99629396, + "learning_rate": 0.0009803877531375635, + "loss": 1.00821078, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 1.25683594, + "step": 607, + "time_per_iteration": 2.879645586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191881, + "balance_loss_mlp": 1.06613898, + "epoch": 0.11696806464024626, + "flos": 610898614272.0, + "grad_norm": 0.023619167708177922, + "language_loss": 0.99668628, + "learning_rate": 0.0009803012607066523, + "loss": 1.008605, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 1.25634766, + "step": 608, + "time_per_iteration": 2.717660427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189661, + "balance_loss_mlp": 1.06406212, + "epoch": 0.11716044632550981, + "flos": 521415736320.0, + "grad_norm": 0.023557070356346427, + "language_loss": 0.97414643, + "learning_rate": 0.0009802145818062543, + "loss": 0.98604298, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 1.25488281, + "step": 609, + "time_per_iteration": 2.7209720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190685, + "balance_loss_mlp": 1.064991, + "epoch": 0.11735282801077337, + "flos": 508488204288.0, + "grad_norm": 0.03039581956620226, + "language_loss": 1.01476204, + "learning_rate": 0.0009801277164700212, + "loss": 1.02666891, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 1.25585938, + "step": 610, + "time_per_iteration": 2.5900633335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190447, + "balance_loss_mlp": 1.06489623, + "epoch": 0.11754520969603693, + "flos": 687835995648.0, + "grad_norm": 0.028512829376260446, + "language_loss": 0.97853899, + "learning_rate": 0.0009800406647316776, + "loss": 0.99044347, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 1.25439453, + "step": 611, + "time_per_iteration": 2.8018290996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_mlp": 1.06088257, + "epoch": 0.1177375913813005, + "flos": 1545756331008.0, + "grad_norm": 0.00764509792440145, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78099126, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 1.24023438, + "step": 612, + "time_per_iteration": 4.767510175704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_mlp": 1.05974686, + "epoch": 0.11792997306656407, + "flos": 521537260032.0, + "grad_norm": 0.0290479345737112, + "language_loss": 0.97953087, + "learning_rate": 0.000979866002183916, + "loss": 0.99138713, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 1.2578125, + "step": 613, + "time_per_iteration": 2.6752681732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182111, + "balance_loss_mlp": 1.05632174, + "epoch": 0.11812235475182763, + "flos": 667488608256.0, + "grad_norm": 0.030776001440310688, + "language_loss": 0.9883132, + "learning_rate": 0.0009797783914423082, + "loss": 1.00013435, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 1.25683594, + "step": 614, + "time_per_iteration": 2.8556718826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182577, + "balance_loss_mlp": 1.05697787, + "epoch": 0.11831473643709119, + "flos": 622504121856.0, + "grad_norm": 0.02739500646081478, + "language_loss": 0.93579996, + "learning_rate": 0.0009796905944342094, + "loss": 0.94762576, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 1.25488281, + "step": 615, + "time_per_iteration": 2.80253267288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187072, + "balance_loss_mlp": 1.06152117, + "epoch": 0.11850711812235475, + "flos": 457694596608.0, + "grad_norm": 0.020858577781052552, + "language_loss": 0.96166766, + "learning_rate": 0.0009796026111937057, + "loss": 0.9735384, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 1.25439453, + "step": 616, + "time_per_iteration": 2.5763044357299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189497, + "balance_loss_mlp": 1.06404102, + "epoch": 0.11869949980761832, + "flos": 514927137792.0, + "grad_norm": 0.022050319992180305, + "language_loss": 0.96050835, + "learning_rate": 0.0009795144417549552, + "loss": 0.97240329, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 1.25341797, + "step": 617, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186044, + "balance_loss_mlp": 1.06092167, + "epoch": 0.11889188149288188, + "flos": 536156116992.0, + "grad_norm": 0.0238791856796517, + "language_loss": 0.97532642, + "learning_rate": 0.0009794260861521883, + "loss": 0.98718691, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 1.25292969, + "step": 618, + "time_per_iteration": 2.784257173538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_mlp": 1.06445491, + "epoch": 0.11908426317814544, + "flos": 499644266496.0, + "grad_norm": 0.024260475486046627, + "language_loss": 0.96495152, + "learning_rate": 0.0009793375444197075, + "loss": 0.97684348, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 1.25, + "step": 619, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189567, + "balance_loss_mlp": 1.06482673, + "epoch": 0.119276644863409, + "flos": 661067139072.0, + "grad_norm": 0.023292068214373615, + "language_loss": 0.96012962, + "learning_rate": 0.000979248816591888, + "loss": 0.97202522, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 1.25, + "step": 620, + "time_per_iteration": 2.783372640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184512, + "balance_loss_mlp": 1.06001019, + "epoch": 0.11946902654867257, + "flos": 760152021504.0, + "grad_norm": 0.02911418191745056, + "language_loss": 0.95521206, + "learning_rate": 0.0009791599027031766, + "loss": 0.96705711, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 1.24755859, + "step": 621, + "time_per_iteration": 3.04338002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185972, + "balance_loss_mlp": 1.06156564, + "epoch": 0.11966140823393613, + "flos": 682213526016.0, + "grad_norm": 0.0317276180850791, + "language_loss": 0.96021026, + "learning_rate": 0.0009790708027880932, + "loss": 0.97206998, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 1.24658203, + "step": 622, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184547, + "balance_loss_mlp": 1.06171417, + "epoch": 0.11985378991919969, + "flos": 1454298147840.0, + "grad_norm": 0.011779966077399251, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78611839, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 1.23046875, + "step": 623, + "time_per_iteration": 4.88221549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.06291461, + "epoch": 0.12004617160446325, + "flos": 528898718208.0, + "grad_norm": 0.0243802584204396, + "language_loss": 1.01341891, + "learning_rate": 0.0009788920450172487, + "loss": 1.0252955, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 1.25, + "step": 624, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190724, + "balance_loss_mlp": 1.06655562, + "epoch": 0.12023855328972682, + "flos": 475176354816.0, + "grad_norm": 0.025839680970612892, + "language_loss": 0.99598378, + "learning_rate": 0.0009788023872308875, + "loss": 1.00789118, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 1.24414062, + "step": 625, + "time_per_iteration": 2.5168616771698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_mlp": 1.06723785, + "epoch": 0.12043093497499038, + "flos": 1535051880960.0, + "grad_norm": 0.008994278182213968, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76618505, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 1.22460938, + "step": 626, + "time_per_iteration": 4.739393472671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194547, + "balance_loss_mlp": 1.07128501, + "epoch": 0.12062331666025394, + "flos": 540914459136.0, + "grad_norm": 0.025390703641747513, + "language_loss": 1.01758838, + "learning_rate": 0.0009786225140303285, + "loss": 1.02953386, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 1.23486328, + "step": 627, + "time_per_iteration": 2.627995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_mlp": 1.06683803, + "epoch": 0.1208156983455175, + "flos": 512999496192.0, + "grad_norm": 0.027559316114759484, + "language_loss": 1.00245547, + "learning_rate": 0.0009785322986859634, + "loss": 1.0143609, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 1.23925781, + "step": 628, + "time_per_iteration": 2.657465696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011787, + "balance_loss_mlp": 1.05481803, + "epoch": 0.12100808003078108, + "flos": 597589046784.0, + "grad_norm": 0.024406659961039724, + "language_loss": 1.01031506, + "learning_rate": 0.0009784418975588838, + "loss": 1.02210212, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 1.24121094, + "step": 629, + "time_per_iteration": 2.6953535079956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187008, + "balance_loss_mlp": 1.063555, + "epoch": 0.12120046171604464, + "flos": 524066515968.0, + "grad_norm": 0.02180733694842763, + "language_loss": 0.99517697, + "learning_rate": 0.0009783513106841862, + "loss": 1.00704694, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 1.23681641, + "step": 630, + "time_per_iteration": 2.7234978675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189331, + "balance_loss_mlp": 1.06687927, + "epoch": 0.1213928434013082, + "flos": 1557907057152.0, + "grad_norm": 0.011472153843238986, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77922034, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 1.2265625, + "step": 631, + "time_per_iteration": 4.975109100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184278, + "balance_loss_mlp": 1.06072986, + "epoch": 0.12158522508657175, + "flos": 496387869696.0, + "grad_norm": 0.025959921000511615, + "language_loss": 0.96498066, + "learning_rate": 0.0009781695798326854, + "loss": 0.97682351, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 1.23779297, + "step": 632, + "time_per_iteration": 2.5740485191345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_mlp": 1.0608983, + "epoch": 0.12177760677183531, + "flos": 476589703680.0, + "grad_norm": 0.025554774573744533, + "language_loss": 0.96275663, + "learning_rate": 0.0009780784359264365, + "loss": 0.9746002, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 1.23681641, + "step": 633, + "time_per_iteration": 2.604390859603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_mlp": 1.05543518, + "epoch": 0.12196998845709889, + "flos": 1471784635392.0, + "grad_norm": 0.009598735556444526, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75365245, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 1.21289062, + "step": 634, + "time_per_iteration": 4.757449626922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_mlp": 1.05424869, + "epoch": 0.12216237014236245, + "flos": 587748724224.0, + "grad_norm": 0.021555120902870813, + "language_loss": 0.93822527, + "learning_rate": 0.000977895591329867, + "loss": 0.94999647, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 1.23095703, + "step": 635, + "time_per_iteration": 2.7859792709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_mlp": 1.05851305, + "epoch": 0.12235475182762601, + "flos": 599106455040.0, + "grad_norm": 0.023775729584682537, + "language_loss": 0.96009773, + "learning_rate": 0.000977803890710533, + "loss": 0.97191262, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 1.23193359, + "step": 636, + "time_per_iteration": 2.76069712638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180701, + "balance_loss_mlp": 1.05762947, + "epoch": 0.12254713351288957, + "flos": 498760673280.0, + "grad_norm": 0.024707427516876792, + "language_loss": 1.00440359, + "learning_rate": 0.0009777120045912774, + "loss": 1.01621056, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 1.23291016, + "step": 637, + "time_per_iteration": 2.5980072021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118065, + "balance_loss_mlp": 1.05772126, + "epoch": 0.12273951519815314, + "flos": 606980204544.0, + "grad_norm": 0.02489341207380848, + "language_loss": 0.99891078, + "learning_rate": 0.0009776199330077736, + "loss": 1.01071739, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 1.23144531, + "step": 638, + "time_per_iteration": 2.704040288925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181154, + "balance_loss_mlp": 1.05841601, + "epoch": 0.1229318968834167, + "flos": 598984931328.0, + "grad_norm": 0.02631208797714665, + "language_loss": 1.02141118, + "learning_rate": 0.0009775276759957667, + "loss": 1.03322268, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 1.22949219, + "step": 639, + "time_per_iteration": 2.7442896366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.05700564, + "epoch": 0.12312427856868026, + "flos": 679588942848.0, + "grad_norm": 0.026802425502252814, + "language_loss": 1.01084137, + "learning_rate": 0.0009774352335910745, + "loss": 1.02264071, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 1.23144531, + "step": 640, + "time_per_iteration": 2.8294076919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117918, + "balance_loss_mlp": 1.05625129, + "epoch": 0.12331666025394382, + "flos": 610043218944.0, + "grad_norm": 0.020742791942005383, + "language_loss": 1.02118182, + "learning_rate": 0.000977342605829586, + "loss": 1.03297377, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 1.23144531, + "step": 641, + "time_per_iteration": 2.7078418731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180028, + "balance_loss_mlp": 1.05748129, + "epoch": 0.12350904193920739, + "flos": 763840118784.0, + "grad_norm": 0.025027209312251563, + "language_loss": 0.94737858, + "learning_rate": 0.0009772497927472623, + "loss": 0.95917892, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 1.22753906, + "step": 642, + "time_per_iteration": 3.0655579566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177096, + "balance_loss_mlp": 1.05454898, + "epoch": 0.12370142362447095, + "flos": 542049831936.0, + "grad_norm": 0.02608476880613399, + "language_loss": 0.96273685, + "learning_rate": 0.0009771567943801368, + "loss": 0.97450781, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 1.22753906, + "step": 643, + "time_per_iteration": 2.7343406677246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179725, + "balance_loss_mlp": 1.05727291, + "epoch": 0.12389380530973451, + "flos": 549252836352.0, + "grad_norm": 0.02435000122960196, + "language_loss": 0.99357152, + "learning_rate": 0.0009770636107643152, + "loss": 1.00536871, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 1.2265625, + "step": 644, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_mlp": 1.05516136, + "epoch": 0.12408618699499807, + "flos": 541352890368.0, + "grad_norm": 0.02246298440278387, + "language_loss": 0.95392644, + "learning_rate": 0.0009769702419359738, + "loss": 0.96570063, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 1.22460938, + "step": 645, + "time_per_iteration": 2.674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.05904841, + "epoch": 0.12427856868026164, + "flos": 747159361536.0, + "grad_norm": 0.023095982047370255, + "language_loss": 0.97586024, + "learning_rate": 0.000976876687931362, + "loss": 0.98767477, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 1.22607422, + "step": 646, + "time_per_iteration": 2.9833688735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189298, + "balance_loss_mlp": 1.06703711, + "epoch": 0.1244709503655252, + "flos": 534744769536.0, + "grad_norm": 0.03060863164707411, + "language_loss": 0.94044995, + "learning_rate": 0.0009767829487868005, + "loss": 0.95234299, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 1.22460938, + "step": 647, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182997, + "balance_loss_mlp": 1.06073558, + "epoch": 0.12466333205078876, + "flos": 509111285760.0, + "grad_norm": 0.028982594733012217, + "language_loss": 0.98960567, + "learning_rate": 0.000976689024538682, + "loss": 1.00143564, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 1.22460938, + "step": 648, + "time_per_iteration": 2.5837948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183924, + "balance_loss_mlp": 1.06171107, + "epoch": 0.12485571373605232, + "flos": 682639222272.0, + "grad_norm": 0.03213416167398649, + "language_loss": 0.97804081, + "learning_rate": 0.0009765949152234716, + "loss": 0.98988008, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 1.22412109, + "step": 649, + "time_per_iteration": 2.876009702682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_mlp": 1.07243347, + "epoch": 0.1250480954213159, + "flos": 1333198748160.0, + "grad_norm": 0.014891788740719425, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79879445, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 1.2109375, + "step": 650, + "time_per_iteration": 4.675558805465698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_mlp": 1.06152093, + "epoch": 0.12524047710657946, + "flos": 940196754432.0, + "grad_norm": 0.027794334398077363, + "language_loss": 0.91408408, + "learning_rate": 0.0009764061415379919, + "loss": 0.9259119, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 1.21435547, + "step": 651, + "time_per_iteration": 3.260758399963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184193, + "balance_loss_mlp": 1.06288576, + "epoch": 0.12543285879184302, + "flos": 514900941312.0, + "grad_norm": 0.027655948956122736, + "language_loss": 0.97430605, + "learning_rate": 0.0009763114772410109, + "loss": 0.986148, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 1.21484375, + "step": 652, + "time_per_iteration": 2.60402512550354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179616, + "balance_loss_mlp": 1.05849957, + "epoch": 0.12562524047710658, + "flos": 719682829824.0, + "grad_norm": 0.022040452281994895, + "language_loss": 0.94100869, + "learning_rate": 0.0009762166280235146, + "loss": 0.95280486, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 1.21289062, + "step": 653, + "time_per_iteration": 2.953866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177042, + "balance_loss_mlp": 1.05592513, + "epoch": 0.12581762216237014, + "flos": 564798220800.0, + "grad_norm": 0.026345633512325176, + "language_loss": 0.96725851, + "learning_rate": 0.0009761215939223267, + "loss": 0.97902894, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 1.21289062, + "step": 654, + "time_per_iteration": 2.6936216354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176243, + "balance_loss_mlp": 1.0553174, + "epoch": 0.1260100038476337, + "flos": 482900382720.0, + "grad_norm": 0.0302310026354778, + "language_loss": 0.97697163, + "learning_rate": 0.0009760263749743428, + "loss": 0.98873413, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 1.2109375, + "step": 655, + "time_per_iteration": 2.5425992012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173716, + "balance_loss_mlp": 1.05302835, + "epoch": 0.12620238553289725, + "flos": 576701170176.0, + "grad_norm": 0.026173940013352312, + "language_loss": 0.96703827, + "learning_rate": 0.0009759309712165299, + "loss": 0.97877538, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 1.20849609, + "step": 656, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182641, + "balance_loss_mlp": 1.06185794, + "epoch": 0.12639476721816084, + "flos": 532185314304.0, + "grad_norm": 0.024272217680215723, + "language_loss": 1.00863099, + "learning_rate": 0.0009758353826859272, + "loss": 1.02045751, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 1.20947266, + "step": 657, + "time_per_iteration": 2.621317148208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183764, + "balance_loss_mlp": 1.06288576, + "epoch": 0.1265871489034244, + "flos": 691231380480.0, + "grad_norm": 0.02639198012969831, + "language_loss": 0.9913975, + "learning_rate": 0.0009757396094196456, + "loss": 1.00323522, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 1.21044922, + "step": 658, + "time_per_iteration": 2.8867759704589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183942, + "balance_loss_mlp": 1.06311166, + "epoch": 0.12677953058868796, + "flos": 538242212352.0, + "grad_norm": 0.02343039495549204, + "language_loss": 0.91435432, + "learning_rate": 0.0009756436514548673, + "loss": 0.92619371, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 1.20996094, + "step": 659, + "time_per_iteration": 2.8055155277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179962, + "balance_loss_mlp": 1.05903614, + "epoch": 0.12697191227395152, + "flos": 520119908352.0, + "grad_norm": 0.02147737158217614, + "language_loss": 0.94944704, + "learning_rate": 0.0009755475088288466, + "loss": 0.96124667, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 1.2109375, + "step": 660, + "time_per_iteration": 2.713801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179144, + "balance_loss_mlp": 1.05826533, + "epoch": 0.12716429395921508, + "flos": 567665851392.0, + "grad_norm": 0.026687699897107686, + "language_loss": 0.99289566, + "learning_rate": 0.0009754511815789095, + "loss": 1.00468707, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 1.21044922, + "step": 661, + "time_per_iteration": 2.739250898361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176549, + "balance_loss_mlp": 1.05590951, + "epoch": 0.12735667564447864, + "flos": 515141987328.0, + "grad_norm": 0.028028480179563667, + "language_loss": 0.94950283, + "learning_rate": 0.0009753546697424533, + "loss": 0.96126837, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 1.20800781, + "step": 662, + "time_per_iteration": 2.71746826171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180242, + "balance_loss_mlp": 1.05941188, + "epoch": 0.1275490573297422, + "flos": 542321077248.0, + "grad_norm": 0.02443290319898258, + "language_loss": 0.98755229, + "learning_rate": 0.0009752579733569475, + "loss": 0.99935466, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 1.20996094, + "step": 663, + "time_per_iteration": 2.631284713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06030273, + "epoch": 0.12774143901500576, + "flos": 1562024853504.0, + "grad_norm": 0.010147906106003043, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76060903, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 1.19335938, + "step": 664, + "time_per_iteration": 4.941519260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188286, + "balance_loss_mlp": 1.06783676, + "epoch": 0.12793382070026935, + "flos": 614873419776.0, + "grad_norm": 0.028758292375382164, + "language_loss": 1.00255466, + "learning_rate": 0.0009750640270890217, + "loss": 1.01443744, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 1.20605469, + "step": 665, + "time_per_iteration": 2.7382516860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185033, + "balance_loss_mlp": 1.06458378, + "epoch": 0.1281262023855329, + "flos": 709117367808.0, + "grad_norm": 0.02727882395737353, + "language_loss": 1.05972624, + "learning_rate": 0.0009749667772818983, + "loss": 1.0715766, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 1.20605469, + "step": 666, + "time_per_iteration": 2.961103677749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117968, + "balance_loss_mlp": 1.06104279, + "epoch": 0.12831858407079647, + "flos": 1428182572032.0, + "grad_norm": 0.005713660367986308, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78115624, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 1.1875, + "step": 667, + "time_per_iteration": 4.799788475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180825, + "balance_loss_mlp": 1.06056714, + "epoch": 0.12851096575606002, + "flos": 450018232320.0, + "grad_norm": 0.027450705632443572, + "language_loss": 1.04045725, + "learning_rate": 0.0009747717245101093, + "loss": 1.05226541, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 1.20410156, + "step": 668, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181103, + "balance_loss_mlp": 1.0609405, + "epoch": 0.12870334744132358, + "flos": 480909614592.0, + "grad_norm": 0.024743463193645603, + "language_loss": 0.94192064, + "learning_rate": 0.00097467392162117, + "loss": 0.95373166, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 1.203125, + "step": 669, + "time_per_iteration": 2.6341683864593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176215, + "balance_loss_mlp": 1.05609953, + "epoch": 0.12889572912658714, + "flos": 640151064576.0, + "grad_norm": 0.020470833753638586, + "language_loss": 0.98179239, + "learning_rate": 0.0009745759344474708, + "loss": 0.99355447, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 1.20263672, + "step": 670, + "time_per_iteration": 2.8753654956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175464, + "balance_loss_mlp": 1.05530083, + "epoch": 0.1290881108118507, + "flos": 510954333696.0, + "grad_norm": 0.02496408481001148, + "language_loss": 0.98669916, + "learning_rate": 0.0009744777630270536, + "loss": 0.99845386, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 1.203125, + "step": 671, + "time_per_iteration": 2.601480484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173739, + "balance_loss_mlp": 1.05381489, + "epoch": 0.12928049249711426, + "flos": 672290611200.0, + "grad_norm": 0.0267777739546368, + "language_loss": 1.0349828, + "learning_rate": 0.000974379407398032, + "loss": 1.04672015, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 1.20068359, + "step": 672, + "time_per_iteration": 2.8746023178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176311, + "balance_loss_mlp": 1.05633891, + "epoch": 0.12947287418237785, + "flos": 794998743552.0, + "grad_norm": 0.021070447178693698, + "language_loss": 0.89884377, + "learning_rate": 0.0009742808675985913, + "loss": 0.91060686, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 1.20117188, + "step": 673, + "time_per_iteration": 3.106855869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178925, + "balance_loss_mlp": 1.05895269, + "epoch": 0.1296652558676414, + "flos": 486447490560.0, + "grad_norm": 0.028552559493613055, + "language_loss": 1.00707459, + "learning_rate": 0.0009741821436669876, + "loss": 1.0188638, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 1.20117188, + "step": 674, + "time_per_iteration": 2.6221611499786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_mlp": 1.06097043, + "epoch": 0.12985763755290497, + "flos": 454392537600.0, + "grad_norm": 0.03163366532216525, + "language_loss": 1.04449701, + "learning_rate": 0.0009740832356415492, + "loss": 1.05630445, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 1.19921875, + "step": 675, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179614, + "balance_loss_mlp": 1.05968916, + "epoch": 0.13005001923816853, + "flos": 826434617856.0, + "grad_norm": 0.02755997498495484, + "language_loss": 0.99148017, + "learning_rate": 0.0009739841435606756, + "loss": 1.00327623, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 1.20068359, + "step": 676, + "time_per_iteration": 3.026420831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180175, + "balance_loss_mlp": 1.06058431, + "epoch": 0.1302424009234321, + "flos": 532480754688.0, + "grad_norm": 0.02275953253130011, + "language_loss": 0.97366607, + "learning_rate": 0.0009738848674628377, + "loss": 0.98546779, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 1.19726562, + "step": 677, + "time_per_iteration": 2.710205554962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179059, + "balance_loss_mlp": 1.05927801, + "epoch": 0.13043478260869565, + "flos": 526916682240.0, + "grad_norm": 0.02441501439452981, + "language_loss": 0.97902691, + "learning_rate": 0.000973785407386578, + "loss": 0.99081755, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 1.19921875, + "step": 678, + "time_per_iteration": 2.7785394191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184892, + "balance_loss_mlp": 1.06553924, + "epoch": 0.1306271642939592, + "flos": 627416914944.0, + "grad_norm": 0.023801085732510874, + "language_loss": 0.94469249, + "learning_rate": 0.0009736857633705103, + "loss": 0.95654142, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 1.19482422, + "step": 679, + "time_per_iteration": 2.8619470596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177483, + "balance_loss_mlp": 1.05827415, + "epoch": 0.13081954597922277, + "flos": 551840489472.0, + "grad_norm": 0.024512943765722366, + "language_loss": 1.01033652, + "learning_rate": 0.0009735859354533196, + "loss": 1.02211142, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 1.19335938, + "step": 680, + "time_per_iteration": 2.6954457759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176387, + "balance_loss_mlp": 1.05755925, + "epoch": 0.13101192766448633, + "flos": 537955504128.0, + "grad_norm": 0.029188130773433643, + "language_loss": 1.02405858, + "learning_rate": 0.0009734859236737628, + "loss": 1.03582239, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 1.18945312, + "step": 681, + "time_per_iteration": 2.606597661972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172364, + "balance_loss_mlp": 1.05353606, + "epoch": 0.13120430934974991, + "flos": 504513398784.0, + "grad_norm": 0.02625319928532985, + "language_loss": 1.02007055, + "learning_rate": 0.0009733857280706678, + "loss": 1.03179431, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 1.18945312, + "step": 682, + "time_per_iteration": 2.626211404800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_mlp": 1.05010605, + "epoch": 0.13139669103501347, + "flos": 615422641152.0, + "grad_norm": 0.025135553656080285, + "language_loss": 0.9321503, + "learning_rate": 0.000973285348682934, + "loss": 0.94383633, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 1.18603516, + "step": 683, + "time_per_iteration": 2.71779727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190269, + "balance_loss_mlp": 1.07296753, + "epoch": 0.13158907272027703, + "flos": 1488215614464.0, + "grad_norm": 0.025067429703540995, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7908864, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 1.17382812, + "step": 684, + "time_per_iteration": 4.811431169509888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168738, + "balance_loss_mlp": 1.05048192, + "epoch": 0.1317814544055406, + "flos": 987117614592.0, + "grad_norm": 0.026136533405527674, + "language_loss": 0.93269205, + "learning_rate": 0.0009730840387095046, + "loss": 0.94437939, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 1.18359375, + "step": 685, + "time_per_iteration": 3.3154938220977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117288, + "balance_loss_mlp": 1.05443382, + "epoch": 0.13197383609080415, + "flos": 612628870656.0, + "grad_norm": 0.026271684435729213, + "language_loss": 0.99177825, + "learning_rate": 0.0009729831082019642, + "loss": 1.00350702, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 1.18554688, + "step": 686, + "time_per_iteration": 2.79620623588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.06093395, + "epoch": 0.1321662177760677, + "flos": 495554668032.0, + "grad_norm": 0.02508782879826625, + "language_loss": 0.97052312, + "learning_rate": 0.0009728819940660958, + "loss": 0.98231786, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 1.18652344, + "step": 687, + "time_per_iteration": 2.779193162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178983, + "balance_loss_mlp": 1.06067955, + "epoch": 0.13235859946133127, + "flos": 496843765248.0, + "grad_norm": 0.02705130625621755, + "language_loss": 0.97550011, + "learning_rate": 0.0009727806963411557, + "loss": 0.98728997, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 1.18408203, + "step": 688, + "time_per_iteration": 2.5702319145202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.05883551, + "epoch": 0.13255098114659483, + "flos": 512767182336.0, + "grad_norm": 0.022910122085290585, + "language_loss": 0.96022904, + "learning_rate": 0.000972679215066471, + "loss": 0.97200048, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 1.18408203, + "step": 689, + "time_per_iteration": 2.64780592918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178761, + "balance_loss_mlp": 1.06050563, + "epoch": 0.13274336283185842, + "flos": 548399442432.0, + "grad_norm": 0.030606528220640358, + "language_loss": 1.08985806, + "learning_rate": 0.0009725775502814401, + "loss": 1.10164571, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 1.18359375, + "step": 690, + "time_per_iteration": 2.5830535888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06120849, + "epoch": 0.13293574451712198, + "flos": 642002844672.0, + "grad_norm": 0.023439513257655937, + "language_loss": 0.94635952, + "learning_rate": 0.0009724757020255327, + "loss": 0.95815468, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 1.18408203, + "step": 691, + "time_per_iteration": 2.827944278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183334, + "balance_loss_mlp": 1.06517375, + "epoch": 0.13312812620238554, + "flos": 492469459968.0, + "grad_norm": 0.028212898490696088, + "language_loss": 0.96836531, + "learning_rate": 0.0009723736703382902, + "loss": 0.98019874, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 1.18261719, + "step": 692, + "time_per_iteration": 2.6144213676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180114, + "balance_loss_mlp": 1.06200123, + "epoch": 0.1333205078876491, + "flos": 509949216768.0, + "grad_norm": 0.023005533645913036, + "language_loss": 0.90654016, + "learning_rate": 0.0009722714552593244, + "loss": 0.91834128, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 1.18212891, + "step": 693, + "time_per_iteration": 2.600128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180549, + "balance_loss_mlp": 1.06262743, + "epoch": 0.13351288957291266, + "flos": 419591477760.0, + "grad_norm": 0.029950659996273835, + "language_loss": 1.05475199, + "learning_rate": 0.000972169056828319, + "loss": 1.06655741, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 1.18017578, + "step": 694, + "time_per_iteration": 2.466643810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178338, + "balance_loss_mlp": 1.0606066, + "epoch": 0.13370527125817622, + "flos": 617050839552.0, + "grad_norm": 0.021764231653516302, + "language_loss": 0.95444119, + "learning_rate": 0.0009720664750850283, + "loss": 0.96622455, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 1.17822266, + "step": 695, + "time_per_iteration": 2.7776308059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173328, + "balance_loss_mlp": 1.05578816, + "epoch": 0.13389765294343978, + "flos": 627169138176.0, + "grad_norm": 0.026088042391715836, + "language_loss": 1.0165019, + "learning_rate": 0.0009719637100692784, + "loss": 1.0282352, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 1.17626953, + "step": 696, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175294, + "balance_loss_mlp": 1.0578016, + "epoch": 0.13409003462870334, + "flos": 610896612864.0, + "grad_norm": 0.027090913840535472, + "language_loss": 0.92017978, + "learning_rate": 0.0009718607618209661, + "loss": 0.93193275, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 1.17578125, + "step": 697, + "time_per_iteration": 2.8413584232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179845, + "balance_loss_mlp": 1.06235278, + "epoch": 0.13428241631396692, + "flos": 685087887360.0, + "grad_norm": 0.024883061853709334, + "language_loss": 0.95573747, + "learning_rate": 0.0009717576303800595, + "loss": 0.96753585, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 1.17578125, + "step": 698, + "time_per_iteration": 3.047100782394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175386, + "balance_loss_mlp": 1.05794048, + "epoch": 0.13447479799923048, + "flos": 509818960896.0, + "grad_norm": 0.024888049065051182, + "language_loss": 0.95325053, + "learning_rate": 0.0009716543157865975, + "loss": 0.96500432, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 1.17529297, + "step": 699, + "time_per_iteration": 2.7481272220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_mlp": 1.05878782, + "epoch": 0.13466717968449404, + "flos": 899058819072.0, + "grad_norm": 0.023872779385430955, + "language_loss": 0.92076075, + "learning_rate": 0.0009715508180806907, + "loss": 0.93252313, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 1.17529297, + "step": 700, + "time_per_iteration": 3.2107367515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173529, + "balance_loss_mlp": 1.05660856, + "epoch": 0.1348595613697576, + "flos": 991694034432.0, + "grad_norm": 0.023513798430807663, + "language_loss": 1.00262749, + "learning_rate": 0.0009714471373025202, + "loss": 1.01436281, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 1.16992188, + "step": 701, + "time_per_iteration": 3.3966751098632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173715, + "balance_loss_mlp": 1.0566988, + "epoch": 0.13505194305502116, + "flos": 488811561984.0, + "grad_norm": 0.028001983236069502, + "language_loss": 0.99373382, + "learning_rate": 0.0009713432734923386, + "loss": 1.00547099, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 1.17089844, + "step": 702, + "time_per_iteration": 2.615107536315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171408, + "balance_loss_mlp": 1.05439234, + "epoch": 0.13524432474028472, + "flos": 614519582208.0, + "grad_norm": 0.024192478681639117, + "language_loss": 0.96606487, + "learning_rate": 0.0009712392266904696, + "loss": 0.97777891, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 1.17089844, + "step": 703, + "time_per_iteration": 2.7448034286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174325, + "balance_loss_mlp": 1.05740499, + "epoch": 0.13543670642554828, + "flos": 906274558464.0, + "grad_norm": 0.025492480769094515, + "language_loss": 0.96012545, + "learning_rate": 0.0009711349969373076, + "loss": 0.97186869, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 1.16992188, + "step": 704, + "time_per_iteration": 3.1337268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172794, + "balance_loss_mlp": 1.05596876, + "epoch": 0.13562908811081184, + "flos": 551747163648.0, + "grad_norm": 0.026772975251671254, + "language_loss": 0.91034031, + "learning_rate": 0.0009710305842733178, + "loss": 0.9220683, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 1.16894531, + "step": 705, + "time_per_iteration": 2.7571139335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_mlp": 1.05031061, + "epoch": 0.1358214697960754, + "flos": 509037425664.0, + "grad_norm": 0.024292049069741084, + "language_loss": 0.98220038, + "learning_rate": 0.0009709259887390373, + "loss": 0.99387223, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 1.16943359, + "step": 706, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168004, + "balance_loss_mlp": 1.05141699, + "epoch": 0.136013851481339, + "flos": 529923300864.0, + "grad_norm": 0.025926611739077732, + "language_loss": 1.00068641, + "learning_rate": 0.0009708212103750737, + "loss": 1.01236641, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 1.16650391, + "step": 707, + "time_per_iteration": 2.6197190284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168587, + "balance_loss_mlp": 1.05219126, + "epoch": 0.13620623316660255, + "flos": 660320532480.0, + "grad_norm": 0.02235622943703988, + "language_loss": 0.96270919, + "learning_rate": 0.0009707162492221051, + "loss": 0.97439504, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 1.16455078, + "step": 708, + "time_per_iteration": 2.8917648792266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171818, + "balance_loss_mlp": 1.05542207, + "epoch": 0.1363986148518661, + "flos": 673082880000.0, + "grad_norm": 0.027649047287573853, + "language_loss": 0.98132068, + "learning_rate": 0.0009706111053208815, + "loss": 0.99303889, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 1.16455078, + "step": 709, + "time_per_iteration": 2.7827165126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173191, + "balance_loss_mlp": 1.05669987, + "epoch": 0.13659099653712967, + "flos": 474004051968.0, + "grad_norm": 0.02773643003805471, + "language_loss": 0.94597077, + "learning_rate": 0.0009705057787122232, + "loss": 0.9577027, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 1.16552734, + "step": 710, + "time_per_iteration": 2.542836904525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169067, + "balance_loss_mlp": 1.05286229, + "epoch": 0.13678337822239323, + "flos": 453647932416.0, + "grad_norm": 0.0248615327032158, + "language_loss": 0.9884814, + "learning_rate": 0.0009704002694370216, + "loss": 1.00017214, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 1.16259766, + "step": 711, + "time_per_iteration": 2.550527811050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164533, + "balance_loss_mlp": 1.04842281, + "epoch": 0.13697575990765679, + "flos": 520625468928.0, + "grad_norm": 0.0274811578413112, + "language_loss": 0.97066599, + "learning_rate": 0.0009702945775362388, + "loss": 0.98231125, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 1.16162109, + "step": 712, + "time_per_iteration": 2.56953501701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116862, + "balance_loss_mlp": 1.05246294, + "epoch": 0.13716814159292035, + "flos": 481365510144.0, + "grad_norm": 0.025544817797380492, + "language_loss": 0.98621845, + "learning_rate": 0.0009701887030509086, + "loss": 0.99790466, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 1.16210938, + "step": 713, + "time_per_iteration": 2.6443872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_mlp": 1.05663013, + "epoch": 0.1373605232781839, + "flos": 546749776896.0, + "grad_norm": 0.02672517687154734, + "language_loss": 1.02031791, + "learning_rate": 0.0009700826460221346, + "loss": 1.03204811, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 1.16455078, + "step": 714, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_mlp": 1.05508566, + "epoch": 0.1375529049634475, + "flos": 710070091776.0, + "grad_norm": 0.027473841831572973, + "language_loss": 1.03736091, + "learning_rate": 0.0009699764064910921, + "loss": 1.04907441, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 1.16308594, + "step": 715, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_mlp": 1.05281401, + "epoch": 0.13774528664871105, + "flos": 487676189184.0, + "grad_norm": 0.02500038679906112, + "language_loss": 0.96403199, + "learning_rate": 0.0009698699844990268, + "loss": 0.9757241, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 1.16455078, + "step": 716, + "time_per_iteration": 2.638272762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116569, + "balance_loss_mlp": 1.04972363, + "epoch": 0.1379376683339746, + "flos": 681458187264.0, + "grad_norm": 0.024933229917961583, + "language_loss": 0.9565106, + "learning_rate": 0.0009697633800872555, + "loss": 0.96816742, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 1.16015625, + "step": 717, + "time_per_iteration": 2.8989553451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168974, + "balance_loss_mlp": 1.05310297, + "epoch": 0.13813005001923817, + "flos": 612225368064.0, + "grad_norm": 0.02330012063083705, + "language_loss": 1.0130372, + "learning_rate": 0.0009696565932971655, + "loss": 1.02472687, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 1.15917969, + "step": 718, + "time_per_iteration": 2.8472671508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171117, + "balance_loss_mlp": 1.05524576, + "epoch": 0.13832243170450173, + "flos": 589926144000.0, + "grad_norm": 0.027418468702626427, + "language_loss": 0.98498988, + "learning_rate": 0.0009695496241702153, + "loss": 0.99670106, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 1.15917969, + "step": 719, + "time_per_iteration": 2.786895990371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167345, + "balance_loss_mlp": 1.05180764, + "epoch": 0.1385148133897653, + "flos": 701319479808.0, + "grad_norm": 0.026285913371991803, + "language_loss": 0.94868541, + "learning_rate": 0.0009694424727479339, + "loss": 0.96035892, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 1.15576172, + "step": 720, + "time_per_iteration": 2.921644926071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117298, + "balance_loss_mlp": 1.05729949, + "epoch": 0.13870719507502885, + "flos": 599366966784.0, + "grad_norm": 0.024279001882637877, + "language_loss": 0.97845113, + "learning_rate": 0.0009693351390719213, + "loss": 0.99018097, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 1.15722656, + "step": 721, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168632, + "balance_loss_mlp": 1.05304694, + "epoch": 0.1388995767602924, + "flos": 587748724224.0, + "grad_norm": 0.03212240351747381, + "language_loss": 0.98596126, + "learning_rate": 0.000969227623183848, + "loss": 0.99764758, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 1.15625, + "step": 722, + "time_per_iteration": 2.7723541259765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_mlp": 1.05205071, + "epoch": 0.139091958445556, + "flos": 652362189312.0, + "grad_norm": 0.025655198862846312, + "language_loss": 0.99224544, + "learning_rate": 0.0009691199251254554, + "loss": 1.00392079, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 1.15527344, + "step": 723, + "time_per_iteration": 2.8426058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165537, + "balance_loss_mlp": 1.05019021, + "epoch": 0.13928434013081956, + "flos": 576905286144.0, + "grad_norm": 0.022500478429048027, + "language_loss": 0.9243086, + "learning_rate": 0.0009690120449385555, + "loss": 0.93596393, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 1.15380859, + "step": 724, + "time_per_iteration": 2.7558276653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168709, + "balance_loss_mlp": 1.05307627, + "epoch": 0.13947672181608312, + "flos": 564314127360.0, + "grad_norm": 0.02294482348940274, + "language_loss": 1.00981367, + "learning_rate": 0.0009689039826650312, + "loss": 1.02150071, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 1.15673828, + "step": 725, + "time_per_iteration": 2.784708261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211281, + "balance_loss_mlp": 1.09550476, + "epoch": 0.13966910350134668, + "flos": 1524949045248.0, + "grad_norm": 0.02639881420994122, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77734339, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 1.15820312, + "step": 726, + "time_per_iteration": 4.9523255825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171441, + "balance_loss_mlp": 1.05604661, + "epoch": 0.13986148518661023, + "flos": 500855500800.0, + "grad_norm": 0.0321160389091748, + "language_loss": 0.98954523, + "learning_rate": 0.0009686873120259941, + "loss": 1.00125957, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 1.15429688, + "step": 727, + "time_per_iteration": 2.584141731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173326, + "balance_loss_mlp": 1.05850363, + "epoch": 0.1400538668718738, + "flos": 599849058816.0, + "grad_norm": 0.027531106684590426, + "language_loss": 0.93834305, + "learning_rate": 0.0009685787037446004, + "loss": 0.95007634, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 1.1484375, + "step": 728, + "time_per_iteration": 2.770592451095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_mlp": 1.05520177, + "epoch": 0.14024624855713735, + "flos": 595168579584.0, + "grad_norm": 0.026051179565135866, + "language_loss": 0.98294961, + "learning_rate": 0.0009684699135448201, + "loss": 0.99465179, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 1.15039062, + "step": 729, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_mlp": 1.04985154, + "epoch": 0.1404386302424009, + "flos": 507585145344.0, + "grad_norm": 0.02205061924934426, + "language_loss": 0.98307908, + "learning_rate": 0.0009683609414688895, + "loss": 0.99472773, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 1.15039062, + "step": 730, + "time_per_iteration": 2.700016975402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.05254078, + "epoch": 0.14063101192766447, + "flos": 574515018240.0, + "grad_norm": 0.021243768346974407, + "language_loss": 0.95329058, + "learning_rate": 0.0009682517875591154, + "loss": 0.96496415, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 1.1484375, + "step": 731, + "time_per_iteration": 2.743590831756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.05264843, + "epoch": 0.14082339361292806, + "flos": 565764406272.0, + "grad_norm": 0.02284757167221282, + "language_loss": 0.93998873, + "learning_rate": 0.0009681424518578749, + "loss": 0.95166153, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 1.14648438, + "step": 732, + "time_per_iteration": 2.757690668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166596, + "balance_loss_mlp": 1.05215514, + "epoch": 0.14101577529819162, + "flos": 464582694912.0, + "grad_norm": 0.02112517179619274, + "language_loss": 0.95363593, + "learning_rate": 0.000968032934407616, + "loss": 0.96530199, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 1.14453125, + "step": 733, + "time_per_iteration": 2.6260647773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_mlp": 1.05257201, + "epoch": 0.14120815698345518, + "flos": 597261405696.0, + "grad_norm": 0.02235342076428548, + "language_loss": 0.90822989, + "learning_rate": 0.0009679232352508571, + "loss": 0.91990006, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 1.14453125, + "step": 734, + "time_per_iteration": 2.7677996158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167689, + "balance_loss_mlp": 1.05334342, + "epoch": 0.14140053866871874, + "flos": 536231978496.0, + "grad_norm": 0.023954026934244203, + "language_loss": 0.90350544, + "learning_rate": 0.0009678133544301871, + "loss": 0.91518235, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 1.14355469, + "step": 735, + "time_per_iteration": 2.6668286323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165912, + "balance_loss_mlp": 1.05147135, + "epoch": 0.1415929203539823, + "flos": 521276748288.0, + "grad_norm": 0.01836780541558419, + "language_loss": 0.98091269, + "learning_rate": 0.0009677032919882658, + "loss": 0.99257177, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 1.14453125, + "step": 736, + "time_per_iteration": 2.654975652694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_mlp": 1.0601368, + "epoch": 0.14178530203924586, + "flos": 483301883904.0, + "grad_norm": 0.025248480485652293, + "language_loss": 1.00008237, + "learning_rate": 0.000967593047967823, + "loss": 1.01183295, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 1.14941406, + "step": 737, + "time_per_iteration": 2.529147148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167635, + "balance_loss_mlp": 1.05319452, + "epoch": 0.14197768372450942, + "flos": 677839220736.0, + "grad_norm": 0.02278890168576414, + "language_loss": 0.9561522, + "learning_rate": 0.0009674826224116593, + "loss": 0.96782857, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 1.14453125, + "step": 738, + "time_per_iteration": 2.8032455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.05606639, + "epoch": 0.14217006540977298, + "flos": 446992147968.0, + "grad_norm": 0.026055784762538982, + "language_loss": 0.97800839, + "learning_rate": 0.0009673720153626455, + "loss": 0.989712, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 1.14306641, + "step": 739, + "time_per_iteration": 2.629868984222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172861, + "balance_loss_mlp": 1.05889642, + "epoch": 0.14236244709503657, + "flos": 497477580288.0, + "grad_norm": 0.02475738760241807, + "language_loss": 0.95941108, + "learning_rate": 0.0009672612268637235, + "loss": 0.97113973, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 1.13964844, + "step": 740, + "time_per_iteration": 2.6037824153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170194, + "balance_loss_mlp": 1.05618262, + "epoch": 0.14255482878030012, + "flos": 649479095808.0, + "grad_norm": 0.03387034378547869, + "language_loss": 0.95329261, + "learning_rate": 0.0009671502569579048, + "loss": 0.96499455, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 1.14013672, + "step": 741, + "time_per_iteration": 2.7700846195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.05657792, + "epoch": 0.14274721046556368, + "flos": 537274025472.0, + "grad_norm": 0.02433568326488268, + "language_loss": 0.98081231, + "learning_rate": 0.0009670391056882719, + "loss": 0.99251777, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 1.13964844, + "step": 742, + "time_per_iteration": 2.696019172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174036, + "balance_loss_mlp": 1.06002402, + "epoch": 0.14293959215082724, + "flos": 958583572992.0, + "grad_norm": 0.027423351639808666, + "language_loss": 0.96458268, + "learning_rate": 0.0009669277730979776, + "loss": 0.97632295, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 1.14013672, + "step": 743, + "time_per_iteration": 3.2084367275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174905, + "balance_loss_mlp": 1.06103587, + "epoch": 0.1431319738360908, + "flos": 694385719296.0, + "grad_norm": 0.02304461389980259, + "language_loss": 0.94654781, + "learning_rate": 0.0009668162592302449, + "loss": 0.9582969, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 1.13867188, + "step": 744, + "time_per_iteration": 2.8862292766571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184206, + "balance_loss_mlp": 1.07009852, + "epoch": 0.14332435552135436, + "flos": 566502280704.0, + "grad_norm": 0.024928546312887438, + "language_loss": 0.9473027, + "learning_rate": 0.0009667045641283676, + "loss": 0.95914471, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 1.14111328, + "step": 745, + "time_per_iteration": 2.6714677810668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_mlp": 1.05672932, + "epoch": 0.14351673720661792, + "flos": 739695845376.0, + "grad_norm": 0.027004630074695047, + "language_loss": 1.03854704, + "learning_rate": 0.0009665926878357092, + "loss": 1.05025315, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 1.13867188, + "step": 746, + "time_per_iteration": 2.9414963722229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168037, + "balance_loss_mlp": 1.05416811, + "epoch": 0.14370911889188148, + "flos": 550351279104.0, + "grad_norm": 0.024394803732961844, + "language_loss": 0.99195439, + "learning_rate": 0.0009664806303957043, + "loss": 1.00363481, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 1.13867188, + "step": 747, + "time_per_iteration": 2.6798276901245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175063, + "balance_loss_mlp": 1.06109881, + "epoch": 0.14390150057714507, + "flos": 591589271040.0, + "grad_norm": 0.028912253716933817, + "language_loss": 0.96970344, + "learning_rate": 0.0009663683918518571, + "loss": 0.98145401, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 1.13964844, + "step": 748, + "time_per_iteration": 2.894670248031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172034, + "balance_loss_mlp": 1.05845118, + "epoch": 0.14409388226240863, + "flos": 592144496640.0, + "grad_norm": 0.025560266799661176, + "language_loss": 0.96381319, + "learning_rate": 0.0009662559722477428, + "loss": 0.97553355, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 1.13574219, + "step": 749, + "time_per_iteration": 2.702796220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193848, + "balance_loss_mlp": 1.08131409, + "epoch": 0.1442862639476722, + "flos": 1514654828544.0, + "grad_norm": 0.02305864885865106, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77356815, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 1.125, + "step": 750, + "time_per_iteration": 5.010634660720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_mlp": 1.05287659, + "epoch": 0.14447864563293575, + "flos": 497855612928.0, + "grad_norm": 0.023714468612350204, + "language_loss": 0.97989428, + "learning_rate": 0.0009660305900333632, + "loss": 0.99155927, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 1.13623047, + "step": 751, + "time_per_iteration": 2.7064144611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_mlp": 1.05845106, + "epoch": 0.1446710273181993, + "flos": 590794274304.0, + "grad_norm": 0.03190287595859636, + "language_loss": 0.91963172, + "learning_rate": 0.0009659176275105992, + "loss": 0.93135297, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 1.13671875, + "step": 752, + "time_per_iteration": 2.7171401977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171619, + "balance_loss_mlp": 1.05803668, + "epoch": 0.14486340900346287, + "flos": 587012851200.0, + "grad_norm": 0.023715921645424867, + "language_loss": 0.93508279, + "learning_rate": 0.0009658044841025701, + "loss": 0.94679892, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 1.13574219, + "step": 753, + "time_per_iteration": 2.77504563331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172686, + "balance_loss_mlp": 1.05900788, + "epoch": 0.14505579068872643, + "flos": 505740096000.0, + "grad_norm": 0.025730958483317315, + "language_loss": 0.9055903, + "learning_rate": 0.0009656911598532021, + "loss": 0.91731715, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 1.13671875, + "step": 754, + "time_per_iteration": 2.642886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172881, + "balance_loss_mlp": 1.05925071, + "epoch": 0.14524817237399, + "flos": 487815177216.0, + "grad_norm": 0.025261406861214447, + "language_loss": 0.98625988, + "learning_rate": 0.0009655776548064917, + "loss": 0.9979887, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 1.13623047, + "step": 755, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169342, + "balance_loss_mlp": 1.05571139, + "epoch": 0.14544055405925355, + "flos": 729449292288.0, + "grad_norm": 0.025093779151575485, + "language_loss": 0.97407329, + "learning_rate": 0.0009654639690065054, + "loss": 0.98576677, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 1.13623047, + "step": 756, + "time_per_iteration": 2.867976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173831, + "balance_loss_mlp": 1.06024873, + "epoch": 0.14563293574451713, + "flos": 594786544128.0, + "grad_norm": 0.02769433731610086, + "language_loss": 0.96328217, + "learning_rate": 0.00096535010249738, + "loss": 0.97502041, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 1.13574219, + "step": 757, + "time_per_iteration": 2.718595266342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171947, + "balance_loss_mlp": 1.05879402, + "epoch": 0.1458253174297807, + "flos": 561622414848.0, + "grad_norm": 0.027253539371253223, + "language_loss": 0.93671888, + "learning_rate": 0.0009652360553233224, + "loss": 0.94843829, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 1.13134766, + "step": 758, + "time_per_iteration": 2.732665538787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_mlp": 1.06835938, + "epoch": 0.14601769911504425, + "flos": 1561186922496.0, + "grad_norm": 0.016548141494889222, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74954832, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 1.12695312, + "step": 759, + "time_per_iteration": 4.9278404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_mlp": 1.04840457, + "epoch": 0.1462100808003078, + "flos": 867822331392.0, + "grad_norm": 0.024551380524627048, + "language_loss": 0.89752859, + "learning_rate": 0.0009650074191575883, + "loss": 0.90914273, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 1.12988281, + "step": 760, + "time_per_iteration": 3.18084716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011658, + "balance_loss_mlp": 1.05302811, + "epoch": 0.14640246248557137, + "flos": 524029585920.0, + "grad_norm": 0.025729752682943422, + "language_loss": 0.95023656, + "learning_rate": 0.0009648928302546766, + "loss": 0.96189463, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 1.12744141, + "step": 761, + "time_per_iteration": 2.707385301589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161728, + "balance_loss_mlp": 1.04895639, + "epoch": 0.14659484417083493, + "flos": 1032241089024.0, + "grad_norm": 0.022974522077421757, + "language_loss": 0.94352418, + "learning_rate": 0.0009647780608643613, + "loss": 0.95514143, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 1.12744141, + "step": 762, + "time_per_iteration": 3.357776165008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116078, + "balance_loss_mlp": 1.04848516, + "epoch": 0.1467872258560985, + "flos": 501656501760.0, + "grad_norm": 0.027279773355913427, + "language_loss": 0.99627388, + "learning_rate": 0.0009646631110312001, + "loss": 1.00788176, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 1.12255859, + "step": 763, + "time_per_iteration": 2.629650115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159049, + "balance_loss_mlp": 1.04665887, + "epoch": 0.14697960754136205, + "flos": 548935928832.0, + "grad_norm": 0.020644179018096606, + "language_loss": 0.95446718, + "learning_rate": 0.0009645479807998203, + "loss": 0.96605766, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 1.12353516, + "step": 764, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_mlp": 1.04510117, + "epoch": 0.14717198922662564, + "flos": 518901943296.0, + "grad_norm": 0.021535065255329562, + "language_loss": 0.99812603, + "learning_rate": 0.0009644326702149196, + "loss": 1.00970435, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 1.12695312, + "step": 765, + "time_per_iteration": 2.711500406265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158907, + "balance_loss_mlp": 1.04618227, + "epoch": 0.1473643709118892, + "flos": 733483221504.0, + "grad_norm": 0.02504361772442387, + "language_loss": 0.95452881, + "learning_rate": 0.0009643171793212653, + "loss": 0.96611786, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 1.12695312, + "step": 766, + "time_per_iteration": 3.130798578262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163931, + "balance_loss_mlp": 1.05115891, + "epoch": 0.14755675259715276, + "flos": 621668192256.0, + "grad_norm": 0.027740201354691706, + "language_loss": 0.99870968, + "learning_rate": 0.0009642015081636952, + "loss": 1.01034904, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 1.12744141, + "step": 767, + "time_per_iteration": 2.701939344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160055, + "balance_loss_mlp": 1.04761696, + "epoch": 0.14774913428241632, + "flos": 453172571136.0, + "grad_norm": 0.025159341457135456, + "language_loss": 0.98449206, + "learning_rate": 0.0009640856567871166, + "loss": 0.99609256, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 1.12402344, + "step": 768, + "time_per_iteration": 2.516721725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_mlp": 1.05262613, + "epoch": 0.14794151596767988, + "flos": 838654474752.0, + "grad_norm": 0.02612823197324643, + "language_loss": 0.99416363, + "learning_rate": 0.0009639696252365072, + "loss": 1.00581241, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 1.12207031, + "step": 769, + "time_per_iteration": 3.06074857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167068, + "balance_loss_mlp": 1.05472481, + "epoch": 0.14813389765294344, + "flos": 687404295168.0, + "grad_norm": 0.02602975967937929, + "language_loss": 0.89651555, + "learning_rate": 0.0009638534135569144, + "loss": 0.90818626, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 1.12304688, + "step": 770, + "time_per_iteration": 2.9440436363220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169876, + "balance_loss_mlp": 1.05753326, + "epoch": 0.148326279338207, + "flos": 510943600128.0, + "grad_norm": 0.028093178265757666, + "language_loss": 1.01150489, + "learning_rate": 0.0009637370217934554, + "loss": 1.02320373, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 1.12304688, + "step": 771, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166681, + "balance_loss_mlp": 1.05443311, + "epoch": 0.14851866102347056, + "flos": 589331260416.0, + "grad_norm": 0.028336871459981, + "language_loss": 0.90924722, + "learning_rate": 0.0009636204499913175, + "loss": 0.92091405, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 1.12207031, + "step": 772, + "time_per_iteration": 2.8592941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157961, + "balance_loss_mlp": 1.04609525, + "epoch": 0.14871104270873411, + "flos": 692247230976.0, + "grad_norm": 0.030313888046816524, + "language_loss": 0.95830965, + "learning_rate": 0.0009635036981957581, + "loss": 0.96988928, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 1.11816406, + "step": 773, + "time_per_iteration": 2.8690600395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160765, + "balance_loss_mlp": 1.04904246, + "epoch": 0.1489034243939977, + "flos": 656282600448.0, + "grad_norm": 0.02808100337337059, + "language_loss": 0.98035401, + "learning_rate": 0.0009633867664521043, + "loss": 0.99196172, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 1.11669922, + "step": 774, + "time_per_iteration": 2.812833070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159463, + "balance_loss_mlp": 1.04788363, + "epoch": 0.14909580607926126, + "flos": 476795821056.0, + "grad_norm": 0.030787585825694654, + "language_loss": 0.97385693, + "learning_rate": 0.0009632696548057527, + "loss": 0.98545158, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 1.11523438, + "step": 775, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_mlp": 1.04910243, + "epoch": 0.14928818776452482, + "flos": 612283765248.0, + "grad_norm": 0.030552265213122824, + "language_loss": 0.94746792, + "learning_rate": 0.0009631523633021704, + "loss": 0.95907569, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 1.11621094, + "step": 776, + "time_per_iteration": 2.789336919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.04408133, + "epoch": 0.14948056944978838, + "flos": 562916241408.0, + "grad_norm": 0.02653866309736765, + "language_loss": 0.98006344, + "learning_rate": 0.0009630348919868936, + "loss": 0.99161637, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 1.11132812, + "step": 777, + "time_per_iteration": 2.708918571472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115506, + "balance_loss_mlp": 1.04395676, + "epoch": 0.14967295113505194, + "flos": 450111558144.0, + "grad_norm": 0.02761804701826243, + "language_loss": 0.92444694, + "learning_rate": 0.0009629172409055293, + "loss": 0.93599755, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 1.11035156, + "step": 778, + "time_per_iteration": 2.522322177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_mlp": 1.0435462, + "epoch": 0.1498653328203155, + "flos": 572428922880.0, + "grad_norm": 0.02112796064723151, + "language_loss": 0.9446094, + "learning_rate": 0.0009627994101037531, + "loss": 0.9561559, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 1.11035156, + "step": 779, + "time_per_iteration": 2.7606184482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154399, + "balance_loss_mlp": 1.0433439, + "epoch": 0.15005771450557906, + "flos": 632407570944.0, + "grad_norm": 0.02232887996041627, + "language_loss": 0.98232067, + "learning_rate": 0.0009626813996273114, + "loss": 0.99386466, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 1.10986328, + "step": 780, + "time_per_iteration": 2.8442463874816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_mlp": 1.04553461, + "epoch": 0.15025009619084262, + "flos": 579165298176.0, + "grad_norm": 0.021576328362923832, + "language_loss": 0.96611506, + "learning_rate": 0.0009625632095220198, + "loss": 0.97768044, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 1.109375, + "step": 781, + "time_per_iteration": 2.823941469192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156174, + "balance_loss_mlp": 1.04492784, + "epoch": 0.1504424778761062, + "flos": 484856222208.0, + "grad_norm": 0.023769174200548453, + "language_loss": 0.96595448, + "learning_rate": 0.0009624448398337637, + "loss": 0.97751617, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 1.11181641, + "step": 782, + "time_per_iteration": 2.517115354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153917, + "balance_loss_mlp": 1.04286146, + "epoch": 0.15063485956136977, + "flos": 763894513152.0, + "grad_norm": 0.022118467112767815, + "language_loss": 0.97773027, + "learning_rate": 0.0009623262906084984, + "loss": 0.98926944, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 1.10986328, + "step": 783, + "time_per_iteration": 2.9971072673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156171, + "balance_loss_mlp": 1.04554462, + "epoch": 0.15082724124663333, + "flos": 498676079616.0, + "grad_norm": 0.021733375764601555, + "language_loss": 0.99047554, + "learning_rate": 0.0009622075618922486, + "loss": 1.00203729, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 1.10546875, + "step": 784, + "time_per_iteration": 2.7209272384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161923, + "balance_loss_mlp": 1.05110586, + "epoch": 0.15101962293189689, + "flos": 510722019840.0, + "grad_norm": 0.02414763506099098, + "language_loss": 0.95223093, + "learning_rate": 0.0009620886537311091, + "loss": 0.96385014, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 1.10742188, + "step": 785, + "time_per_iteration": 2.668501138687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154688, + "balance_loss_mlp": 1.04406226, + "epoch": 0.15121200461716044, + "flos": 458701714944.0, + "grad_norm": 0.026890312379790088, + "language_loss": 0.97208995, + "learning_rate": 0.000961969566171244, + "loss": 0.98363686, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 1.10546875, + "step": 786, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153217, + "balance_loss_mlp": 1.04278123, + "epoch": 0.151404386302424, + "flos": 539017016832.0, + "grad_norm": 0.02528800532756524, + "language_loss": 1.00058115, + "learning_rate": 0.0009618502992588873, + "loss": 1.01211333, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 1.10351562, + "step": 787, + "time_per_iteration": 2.6463584899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154208, + "balance_loss_mlp": 1.04358232, + "epoch": 0.15159676798768756, + "flos": 689616643584.0, + "grad_norm": 0.023869082053813537, + "language_loss": 0.98612797, + "learning_rate": 0.0009617308530403424, + "loss": 0.99766994, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 1.10546875, + "step": 788, + "time_per_iteration": 3.065110921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158206, + "balance_loss_mlp": 1.04758012, + "epoch": 0.15178914967295112, + "flos": 546432869376.0, + "grad_norm": 0.025092696297707027, + "language_loss": 0.95288265, + "learning_rate": 0.0009616112275619825, + "loss": 0.96446472, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 1.10546875, + "step": 789, + "time_per_iteration": 2.7197253704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_mlp": 1.0478847, + "epoch": 0.1519815313582147, + "flos": 512814845952.0, + "grad_norm": 0.020890571468345706, + "language_loss": 0.90545368, + "learning_rate": 0.0009614914228702503, + "loss": 0.91703737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 1.10400391, + "step": 790, + "time_per_iteration": 2.6894142627716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158071, + "balance_loss_mlp": 1.04782641, + "epoch": 0.15217391304347827, + "flos": 685457187840.0, + "grad_norm": 0.02448742031060442, + "language_loss": 0.96480352, + "learning_rate": 0.0009613714390116581, + "loss": 0.97638422, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 1.1015625, + "step": 791, + "time_per_iteration": 2.9898860454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155788, + "balance_loss_mlp": 1.04568636, + "epoch": 0.15236629472874183, + "flos": 645445893120.0, + "grad_norm": 0.023088199171654812, + "language_loss": 0.93995309, + "learning_rate": 0.0009612512760327879, + "loss": 0.95151103, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 1.10009766, + "step": 792, + "time_per_iteration": 2.855648994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154532, + "balance_loss_mlp": 1.0444783, + "epoch": 0.1525586764140054, + "flos": 413764892160.0, + "grad_norm": 0.024948238648346503, + "language_loss": 0.97790802, + "learning_rate": 0.0009611309339802909, + "loss": 0.98945332, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 1.09960938, + "step": 793, + "time_per_iteration": 2.4684345722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153777, + "balance_loss_mlp": 1.04372334, + "epoch": 0.15275105809926895, + "flos": 804233448960.0, + "grad_norm": 0.02131820977076166, + "language_loss": 0.93039513, + "learning_rate": 0.0009610104129008881, + "loss": 0.94193292, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 1.09960938, + "step": 794, + "time_per_iteration": 3.1013269424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155691, + "balance_loss_mlp": 1.04554129, + "epoch": 0.1529434397845325, + "flos": 613542663168.0, + "grad_norm": 0.024012716250022468, + "language_loss": 0.97966266, + "learning_rate": 0.0009608897128413701, + "loss": 0.99121952, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 1.10058594, + "step": 795, + "time_per_iteration": 2.729837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154149, + "balance_loss_mlp": 1.04419053, + "epoch": 0.15313582146979607, + "flos": 616471418880.0, + "grad_norm": 0.02134077894827986, + "language_loss": 0.93399352, + "learning_rate": 0.0009607688338485965, + "loss": 0.945535, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 1.09863281, + "step": 796, + "time_per_iteration": 2.8517422676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04409015, + "epoch": 0.15332820315505963, + "flos": 794992012800.0, + "grad_norm": 0.02204541106277596, + "language_loss": 0.98951191, + "learning_rate": 0.0009606477759694969, + "loss": 1.00104761, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 1.09375, + "step": 797, + "time_per_iteration": 3.0313384532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153537, + "balance_loss_mlp": 1.0440551, + "epoch": 0.1535205848403232, + "flos": 551256339456.0, + "grad_norm": 0.028291975879130113, + "language_loss": 0.99155664, + "learning_rate": 0.0009605265392510703, + "loss": 1.00309205, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 1.09375, + "step": 798, + "time_per_iteration": 2.6558592319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150991, + "balance_loss_mlp": 1.04122281, + "epoch": 0.15371296652558677, + "flos": 536978585088.0, + "grad_norm": 0.02676367025649214, + "language_loss": 1.00762391, + "learning_rate": 0.0009604051237403846, + "loss": 1.01913381, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 1.09667969, + "step": 799, + "time_per_iteration": 2.6129424571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151653, + "balance_loss_mlp": 1.04198015, + "epoch": 0.15390534821085033, + "flos": 396089751552.0, + "grad_norm": 0.02759928767191203, + "language_loss": 0.9523741, + "learning_rate": 0.0009602835294845776, + "loss": 0.96389061, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 1.09570312, + "step": 800, + "time_per_iteration": 2.4865612983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152453, + "balance_loss_mlp": 1.04297161, + "epoch": 0.1540977298961139, + "flos": 536885259264.0, + "grad_norm": 0.0240348205061721, + "language_loss": 0.99338514, + "learning_rate": 0.0009601617565308565, + "loss": 1.00490952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 1.09375, + "step": 801, + "time_per_iteration": 2.646925449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155144, + "balance_loss_mlp": 1.04551864, + "epoch": 0.15429011158137745, + "flos": 725090449920.0, + "grad_norm": 0.022214532903779557, + "language_loss": 0.94821054, + "learning_rate": 0.0009600398049264977, + "loss": 0.95976186, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 1.09521484, + "step": 802, + "time_per_iteration": 3.0287652015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.04627085, + "epoch": 0.154482493266641, + "flos": 621748783104.0, + "grad_norm": 0.025430739734688717, + "language_loss": 1.02679133, + "learning_rate": 0.0009599176747188469, + "loss": 1.03834927, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 1.09423828, + "step": 803, + "time_per_iteration": 2.8240089416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156206, + "balance_loss_mlp": 1.0467242, + "epoch": 0.15467487495190457, + "flos": 526719297024.0, + "grad_norm": 0.024483654101252486, + "language_loss": 0.90705526, + "learning_rate": 0.0009597953659553196, + "loss": 0.91861731, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 1.09375, + "step": 804, + "time_per_iteration": 2.745878219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153494, + "balance_loss_mlp": 1.04386926, + "epoch": 0.15486725663716813, + "flos": 528759730176.0, + "grad_norm": 0.02516296775651391, + "language_loss": 0.97286022, + "learning_rate": 0.0009596728786833997, + "loss": 0.98439509, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 1.09521484, + "step": 805, + "time_per_iteration": 2.6471030712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_mlp": 1.04244983, + "epoch": 0.1550596383224317, + "flos": 1050278799360.0, + "grad_norm": 0.026563720364072098, + "language_loss": 0.9858942, + "learning_rate": 0.0009595502129506415, + "loss": 0.99741489, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 1.09521484, + "step": 806, + "time_per_iteration": 3.3734352588653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115037, + "balance_loss_mlp": 1.04088783, + "epoch": 0.15525202000769528, + "flos": 614836489728.0, + "grad_norm": 0.02624405223250092, + "language_loss": 0.91745955, + "learning_rate": 0.0009594273688046678, + "loss": 0.92896324, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 1.09375, + "step": 807, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153708, + "balance_loss_mlp": 1.04441667, + "epoch": 0.15544440169295884, + "flos": 534102222336.0, + "grad_norm": 0.028049278390969077, + "language_loss": 0.97350299, + "learning_rate": 0.000959304346293171, + "loss": 0.98504007, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 1.09179688, + "step": 808, + "time_per_iteration": 2.7285830974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164275, + "balance_loss_mlp": 1.05464995, + "epoch": 0.1556367833782224, + "flos": 645886325760.0, + "grad_norm": 0.033021349518653896, + "language_loss": 0.99046445, + "learning_rate": 0.0009591811454639125, + "loss": 1.00210714, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 1.09521484, + "step": 809, + "time_per_iteration": 2.842867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155411, + "balance_loss_mlp": 1.04612005, + "epoch": 0.15582916506348596, + "flos": 544952391168.0, + "grad_norm": 0.02421082053858415, + "language_loss": 0.95793635, + "learning_rate": 0.0009590577663647234, + "loss": 0.96949041, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 1.09179688, + "step": 810, + "time_per_iteration": 2.8207406997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158015, + "balance_loss_mlp": 1.04877126, + "epoch": 0.15602154674874952, + "flos": 581214463488.0, + "grad_norm": 0.022734781081273227, + "language_loss": 0.95110512, + "learning_rate": 0.0009589342090435036, + "loss": 0.96268523, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 1.09130859, + "step": 811, + "time_per_iteration": 2.8413872718811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170356, + "balance_loss_mlp": 1.06068361, + "epoch": 0.15621392843401308, + "flos": 536316572160.0, + "grad_norm": 0.026628933906638022, + "language_loss": 0.97807872, + "learning_rate": 0.0009588104735482223, + "loss": 0.98978221, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 1.09570312, + "step": 812, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164587, + "balance_loss_mlp": 1.05524826, + "epoch": 0.15640631011927664, + "flos": 551981478912.0, + "grad_norm": 0.027865461759282353, + "language_loss": 0.94247007, + "learning_rate": 0.0009586865599269177, + "loss": 0.95411587, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 1.09228516, + "step": 813, + "time_per_iteration": 2.655217409133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159004, + "balance_loss_mlp": 1.04985571, + "epoch": 0.1565986918045402, + "flos": 638635657728.0, + "grad_norm": 0.024501009698068087, + "language_loss": 0.98888743, + "learning_rate": 0.0009585624682276977, + "loss": 1.00047755, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 1.09033203, + "step": 814, + "time_per_iteration": 2.7572293281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160029, + "balance_loss_mlp": 1.05073786, + "epoch": 0.15679107348980378, + "flos": 491781250560.0, + "grad_norm": 0.02545428800843787, + "language_loss": 0.97158241, + "learning_rate": 0.0009584381984987386, + "loss": 0.98318267, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 1.09179688, + "step": 815, + "time_per_iteration": 2.554208517074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160766, + "balance_loss_mlp": 1.05185616, + "epoch": 0.15698345517506734, + "flos": 531002277888.0, + "grad_norm": 0.022736041606184667, + "language_loss": 0.98151159, + "learning_rate": 0.0009583137507882864, + "loss": 0.99311924, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 1.08789062, + "step": 816, + "time_per_iteration": 2.6635444164276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158696, + "balance_loss_mlp": 1.04978669, + "epoch": 0.1571758368603309, + "flos": 547077417984.0, + "grad_norm": 0.024009976747476527, + "language_loss": 0.90921289, + "learning_rate": 0.000958189125144656, + "loss": 0.92079985, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 1.08789062, + "step": 817, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156061, + "balance_loss_mlp": 1.04719925, + "epoch": 0.15736821854559446, + "flos": 566743326720.0, + "grad_norm": 0.021547949482456395, + "language_loss": 0.97883654, + "learning_rate": 0.0009580643216162313, + "loss": 0.99039721, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 1.08740234, + "step": 818, + "time_per_iteration": 2.673997640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157698, + "balance_loss_mlp": 1.04888415, + "epoch": 0.15756060023085802, + "flos": 501953943552.0, + "grad_norm": 0.023826624353146583, + "language_loss": 0.90112716, + "learning_rate": 0.0009579393402514652, + "loss": 0.91270417, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 1.08691406, + "step": 819, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156999, + "balance_loss_mlp": 1.04823244, + "epoch": 0.15775298191612158, + "flos": 520271631360.0, + "grad_norm": 0.023927295219635936, + "language_loss": 0.99075627, + "learning_rate": 0.0009578141810988801, + "loss": 1.00232625, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 1.08642578, + "step": 820, + "time_per_iteration": 2.591036558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.04111433, + "epoch": 0.15794536360138514, + "flos": 467087755776.0, + "grad_norm": 0.026283029611425073, + "language_loss": 1.00067806, + "learning_rate": 0.0009576888442070668, + "loss": 1.01217794, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 1.08740234, + "step": 821, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151894, + "balance_loss_mlp": 1.04279363, + "epoch": 0.1581377452866487, + "flos": 518168071680.0, + "grad_norm": 0.02399653039287492, + "language_loss": 1.01290274, + "learning_rate": 0.0009575633296246854, + "loss": 1.02442169, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 1.08984375, + "step": 822, + "time_per_iteration": 2.579575300216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.04312956, + "epoch": 0.15833012697191226, + "flos": 550837373952.0, + "grad_norm": 0.02407632334340799, + "language_loss": 0.91124117, + "learning_rate": 0.0009574376374004652, + "loss": 0.92275965, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 1.0859375, + "step": 823, + "time_per_iteration": 2.661754608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162901, + "balance_loss_mlp": 1.05446815, + "epoch": 0.15852250865717585, + "flos": 488466456576.0, + "grad_norm": 0.026327967105985502, + "language_loss": 0.90841949, + "learning_rate": 0.000957311767583204, + "loss": 0.92004848, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 1.08300781, + "step": 824, + "time_per_iteration": 2.7887372970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156753, + "balance_loss_mlp": 1.04956055, + "epoch": 0.1587148903424394, + "flos": 1312696909824.0, + "grad_norm": 0.010620587901871582, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.8322835, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 1.0703125, + "step": 825, + "time_per_iteration": 4.766167640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151145, + "balance_loss_mlp": 1.04304576, + "epoch": 0.15890727202770297, + "flos": 467832360960.0, + "grad_norm": 0.02959471781097451, + "language_loss": 1.0376749, + "learning_rate": 0.0009570594953650961, + "loss": 1.04918623, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 1.07958984, + "step": 826, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_mlp": 1.04354417, + "epoch": 0.15909965371296653, + "flos": 778606695936.0, + "grad_norm": 0.024366848241159877, + "language_loss": 0.8923949, + "learning_rate": 0.00095693309306219, + "loss": 0.90391278, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 1.08105469, + "step": 827, + "time_per_iteration": 3.1078274250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_mlp": 1.04449332, + "epoch": 0.1592920353982301, + "flos": 1079962950144.0, + "grad_norm": 0.02547465125103231, + "language_loss": 0.98567259, + "learning_rate": 0.0009568065133621244, + "loss": 0.99719906, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 1.08007812, + "step": 828, + "time_per_iteration": 3.3287436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147109, + "balance_loss_mlp": 1.03872418, + "epoch": 0.15948441708349365, + "flos": 726889837056.0, + "grad_norm": 0.026992334830630314, + "language_loss": 0.93815649, + "learning_rate": 0.0009566797563140422, + "loss": 0.94962764, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 1.08251953, + "step": 829, + "time_per_iteration": 2.8641507625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.03788006, + "epoch": 0.1596767987687572, + "flos": 580075087872.0, + "grad_norm": 0.026140449767567974, + "language_loss": 0.96191794, + "learning_rate": 0.0009565528219671547, + "loss": 0.97337818, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 1.08007812, + "step": 830, + "time_per_iteration": 2.9082329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147169, + "balance_loss_mlp": 1.03902268, + "epoch": 0.15986918045402077, + "flos": 530025358848.0, + "grad_norm": 0.02186736495212519, + "language_loss": 0.93771887, + "learning_rate": 0.0009564257103707418, + "loss": 0.94919056, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 1.08007812, + "step": 831, + "time_per_iteration": 4.109540700912476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.04519463, + "epoch": 0.16006156213928435, + "flos": 575669856768.0, + "grad_norm": 0.025156765484562034, + "language_loss": 1.01463771, + "learning_rate": 0.0009562984215741533, + "loss": 1.02617025, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 1.07910156, + "step": 832, + "time_per_iteration": 2.634381055831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148637, + "balance_loss_mlp": 1.0408721, + "epoch": 0.1602539438245479, + "flos": 516674858496.0, + "grad_norm": 0.023022886756030446, + "language_loss": 0.90665066, + "learning_rate": 0.0009561709556268065, + "loss": 0.91813707, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 1.07617188, + "step": 833, + "time_per_iteration": 2.7094552516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115539, + "balance_loss_mlp": 1.04752922, + "epoch": 0.16044632550981147, + "flos": 622161017856.0, + "grad_norm": 0.02456985500743924, + "language_loss": 1.0306673, + "learning_rate": 0.0009560433125781884, + "loss": 1.04222107, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 1.07714844, + "step": 834, + "time_per_iteration": 2.7217955589294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_mlp": 1.04794765, + "epoch": 0.16063870719507503, + "flos": 562127975424.0, + "grad_norm": 0.02550250825542428, + "language_loss": 1.02622008, + "learning_rate": 0.0009559154924778544, + "loss": 1.03778291, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 1.08203125, + "step": 835, + "time_per_iteration": 4.0438151359558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153381, + "balance_loss_mlp": 1.04509139, + "epoch": 0.1608310888803386, + "flos": 806560590336.0, + "grad_norm": 0.023331498233936678, + "language_loss": 0.93980491, + "learning_rate": 0.0009557874953754284, + "loss": 0.95133871, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 1.08154297, + "step": 836, + "time_per_iteration": 3.0253541469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155161, + "balance_loss_mlp": 1.04739583, + "epoch": 0.16102347056560215, + "flos": 601694108160.0, + "grad_norm": 0.024039154316001603, + "language_loss": 0.9449209, + "learning_rate": 0.0009556593213206038, + "loss": 0.95647246, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 1.07617188, + "step": 837, + "time_per_iteration": 2.815293788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148071, + "balance_loss_mlp": 1.04049647, + "epoch": 0.1612158522508657, + "flos": 554614794240.0, + "grad_norm": 0.024490980939479982, + "language_loss": 0.96443379, + "learning_rate": 0.0009555309703631414, + "loss": 0.9759146, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 1.07421875, + "step": 838, + "time_per_iteration": 2.7353601455688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148397, + "balance_loss_mlp": 1.0406791, + "epoch": 0.16140823393612927, + "flos": 557017797120.0, + "grad_norm": 0.026558461299776022, + "language_loss": 0.98485982, + "learning_rate": 0.0009554024425528722, + "loss": 0.99634379, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 1.07568359, + "step": 839, + "time_per_iteration": 2.801539182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146915, + "balance_loss_mlp": 1.03924477, + "epoch": 0.16160061562139286, + "flos": 544908730368.0, + "grad_norm": 0.023933605454050468, + "language_loss": 0.96992832, + "learning_rate": 0.0009552737379396948, + "loss": 0.98139745, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 1.07519531, + "step": 840, + "time_per_iteration": 2.613037586212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148515, + "balance_loss_mlp": 1.04122651, + "epoch": 0.16179299730665642, + "flos": 605006900736.0, + "grad_norm": 0.020652206840645122, + "language_loss": 0.95695615, + "learning_rate": 0.0009551448565735767, + "loss": 0.96844131, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 1.07128906, + "step": 841, + "time_per_iteration": 2.779979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149052, + "balance_loss_mlp": 1.04128659, + "epoch": 0.16198537899191998, + "flos": 788551077888.0, + "grad_norm": 0.02358864683094414, + "language_loss": 0.96423578, + "learning_rate": 0.0009550157985045543, + "loss": 0.97572625, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 1.07617188, + "step": 842, + "time_per_iteration": 3.0352344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148245, + "balance_loss_mlp": 1.04086173, + "epoch": 0.16217776067718354, + "flos": 520829584896.0, + "grad_norm": 0.02127918945612936, + "language_loss": 0.95624614, + "learning_rate": 0.0009548865637827321, + "loss": 0.96772861, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 1.07226562, + "step": 843, + "time_per_iteration": 2.695211172103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.04027128, + "epoch": 0.1623701423624471, + "flos": 506254388736.0, + "grad_norm": 0.02427958482397641, + "language_loss": 0.99469078, + "learning_rate": 0.0009547571524582838, + "loss": 1.00617111, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 1.07617188, + "step": 844, + "time_per_iteration": 2.586859941482544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_mlp": 1.03842914, + "epoch": 0.16256252404771065, + "flos": 498157057536.0, + "grad_norm": 0.025657026114593633, + "language_loss": 1.02873135, + "learning_rate": 0.0009546275645814512, + "loss": 1.04018748, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 1.0703125, + "step": 845, + "time_per_iteration": 2.735323190689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147597, + "balance_loss_mlp": 1.04040384, + "epoch": 0.16275490573297421, + "flos": 503286701568.0, + "grad_norm": 0.024743383464961046, + "language_loss": 1.00377154, + "learning_rate": 0.0009544978002025446, + "loss": 1.01524746, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 1.0703125, + "step": 846, + "time_per_iteration": 2.5876121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_mlp": 1.04189885, + "epoch": 0.16294728741823777, + "flos": 508353945600.0, + "grad_norm": 0.020876938588178177, + "language_loss": 0.94877481, + "learning_rate": 0.0009543678593719434, + "loss": 0.9602648, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 1.06933594, + "step": 847, + "time_per_iteration": 2.69250750541687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159847, + "balance_loss_mlp": 1.05274892, + "epoch": 0.16313966910350133, + "flos": 510756948480.0, + "grad_norm": 0.020936629725758764, + "language_loss": 0.95534647, + "learning_rate": 0.0009542377421400945, + "loss": 0.96694493, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 1.06933594, + "step": 848, + "time_per_iteration": 2.7832183837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146796, + "balance_loss_mlp": 1.03965068, + "epoch": 0.16333205078876492, + "flos": 545056450560.0, + "grad_norm": 0.023544058946573278, + "language_loss": 0.94486761, + "learning_rate": 0.0009541074485575145, + "loss": 0.95633554, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 1.06982422, + "step": 849, + "time_per_iteration": 2.7163026332855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147161, + "balance_loss_mlp": 1.03996801, + "epoch": 0.16352443247402848, + "flos": 508711785984.0, + "grad_norm": 0.023080110816121054, + "language_loss": 1.00550437, + "learning_rate": 0.0009539769786747874, + "loss": 1.01697588, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 1.0703125, + "step": 850, + "time_per_iteration": 2.5918350219726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152854, + "balance_loss_mlp": 1.04547, + "epoch": 0.16371681415929204, + "flos": 543222134784.0, + "grad_norm": 0.022593715242085626, + "language_loss": 0.90895152, + "learning_rate": 0.0009538463325425665, + "loss": 0.92048007, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 1.07226562, + "step": 851, + "time_per_iteration": 2.701662063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146583, + "balance_loss_mlp": 1.03939056, + "epoch": 0.1639091958445556, + "flos": 521760841728.0, + "grad_norm": 0.025319624949764974, + "language_loss": 0.95562863, + "learning_rate": 0.0009537155102115728, + "loss": 0.96709442, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 1.0703125, + "step": 852, + "time_per_iteration": 2.577416181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.03871727, + "epoch": 0.16410157752981916, + "flos": 548482034688.0, + "grad_norm": 0.022217218078565786, + "language_loss": 0.92332971, + "learning_rate": 0.0009535845117325961, + "loss": 0.93478549, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 1.06689453, + "step": 853, + "time_per_iteration": 2.643528699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148166, + "balance_loss_mlp": 1.04135406, + "epoch": 0.16429395921508272, + "flos": 584025698304.0, + "grad_norm": 0.02024018106959617, + "language_loss": 1.00128078, + "learning_rate": 0.0009534533371564946, + "loss": 1.01276231, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 1.06640625, + "step": 854, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150543, + "balance_loss_mlp": 1.04377949, + "epoch": 0.16448634090034628, + "flos": 531961732608.0, + "grad_norm": 0.02843561601072028, + "language_loss": 1.00094676, + "learning_rate": 0.0009533219865341949, + "loss": 1.01245213, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 1.06591797, + "step": 855, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.05014503, + "epoch": 0.16467872258560984, + "flos": 492960284160.0, + "grad_norm": 0.026495144396752456, + "language_loss": 0.95923662, + "learning_rate": 0.0009531904599166916, + "loss": 0.97080612, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 1.06640625, + "step": 856, + "time_per_iteration": 2.638528823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.04101861, + "epoch": 0.16487110427087343, + "flos": 507259505664.0, + "grad_norm": 0.02303677132947941, + "language_loss": 0.95950538, + "learning_rate": 0.0009530587573550478, + "loss": 0.97098505, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 1.06787109, + "step": 857, + "time_per_iteration": 2.5788354873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.04592896, + "epoch": 0.16506348595613698, + "flos": 1436108714496.0, + "grad_norm": 0.011861304780107247, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75470984, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 1.0546875, + "step": 858, + "time_per_iteration": 5.003005027770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153597, + "balance_loss_mlp": 1.04673755, + "epoch": 0.16525586764140054, + "flos": 478089647616.0, + "grad_norm": 0.02595402254221991, + "language_loss": 0.98057735, + "learning_rate": 0.0009527948246039337, + "loss": 0.99211335, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 1.06689453, + "step": 859, + "time_per_iteration": 2.541255474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152748, + "balance_loss_mlp": 1.04622293, + "epoch": 0.1654482493266641, + "flos": 882540518400.0, + "grad_norm": 0.024187417777422206, + "language_loss": 0.96476752, + "learning_rate": 0.000952662594516931, + "loss": 0.97629499, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 1.06347656, + "step": 860, + "time_per_iteration": 3.102233409881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154678, + "balance_loss_mlp": 1.04791439, + "epoch": 0.16564063101192766, + "flos": 628105124352.0, + "grad_norm": 0.02242324391324738, + "language_loss": 0.93166292, + "learning_rate": 0.0009525301886907234, + "loss": 0.94320977, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 1.06591797, + "step": 861, + "time_per_iteration": 2.871971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151309, + "balance_loss_mlp": 1.04487896, + "epoch": 0.16583301269719122, + "flos": 562592603136.0, + "grad_norm": 0.02248996903194516, + "language_loss": 0.97140592, + "learning_rate": 0.0009523976071767155, + "loss": 0.98291898, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 1.0625, + "step": 862, + "time_per_iteration": 2.653031349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146763, + "balance_loss_mlp": 1.04038036, + "epoch": 0.16602539438245478, + "flos": 568983873024.0, + "grad_norm": 0.020794335354585358, + "language_loss": 0.9646408, + "learning_rate": 0.00095226485002638, + "loss": 0.97610843, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 1.06201172, + "step": 863, + "time_per_iteration": 2.7685163021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147042, + "balance_loss_mlp": 1.04075551, + "epoch": 0.16621777606771834, + "flos": 576021692928.0, + "grad_norm": 0.021581021962121343, + "language_loss": 0.96560466, + "learning_rate": 0.0009521319172912576, + "loss": 0.9770751, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 1.06103516, + "step": 864, + "time_per_iteration": 2.762233257293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149511, + "balance_loss_mlp": 1.0432713, + "epoch": 0.16641015775298193, + "flos": 515597882880.0, + "grad_norm": 0.029880870913045234, + "language_loss": 1.0375855, + "learning_rate": 0.0009519988090229579, + "loss": 1.04908061, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 1.06054688, + "step": 865, + "time_per_iteration": 2.7156929969787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148426, + "balance_loss_mlp": 1.04199588, + "epoch": 0.1666025394382455, + "flos": 622849227264.0, + "grad_norm": 0.023088954173990716, + "language_loss": 0.96669209, + "learning_rate": 0.0009518655252731576, + "loss": 0.9781763, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 1.0625, + "step": 866, + "time_per_iteration": 2.76474928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147261, + "balance_loss_mlp": 1.04102135, + "epoch": 0.16679492112350905, + "flos": 549932313600.0, + "grad_norm": 0.021458749489738967, + "language_loss": 0.98467255, + "learning_rate": 0.0009517320660936022, + "loss": 0.99614513, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 1.06054688, + "step": 867, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151692, + "balance_loss_mlp": 1.04545259, + "epoch": 0.1669873028087726, + "flos": 666865526784.0, + "grad_norm": 0.02209258354681387, + "language_loss": 0.92114806, + "learning_rate": 0.0009515984315361051, + "loss": 0.93266487, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 1.06054688, + "step": 868, + "time_per_iteration": 2.845388412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.04563451, + "epoch": 0.16717968449403617, + "flos": 539603168256.0, + "grad_norm": 0.02501334283432316, + "language_loss": 0.95751995, + "learning_rate": 0.000951464621652548, + "loss": 0.96903574, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 1.05761719, + "step": 869, + "time_per_iteration": 2.623375415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148167, + "balance_loss_mlp": 1.04216599, + "epoch": 0.16737206617929973, + "flos": 531278252544.0, + "grad_norm": 0.02062860382438808, + "language_loss": 0.87610328, + "learning_rate": 0.0009513306364948804, + "loss": 0.88758498, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 1.05810547, + "step": 870, + "time_per_iteration": 2.792346239089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148065, + "balance_loss_mlp": 1.04206407, + "epoch": 0.1675644478645633, + "flos": 481756277760.0, + "grad_norm": 0.023236257285911367, + "language_loss": 0.98118269, + "learning_rate": 0.0009511964761151197, + "loss": 0.99266338, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 1.05810547, + "step": 871, + "time_per_iteration": 2.572923183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152601, + "balance_loss_mlp": 1.04669595, + "epoch": 0.16775682954982685, + "flos": 495541206528.0, + "grad_norm": 0.026661505796453877, + "language_loss": 0.99311042, + "learning_rate": 0.0009510621405653521, + "loss": 1.00463641, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 1.05712891, + "step": 872, + "time_per_iteration": 2.6296472549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_mlp": 1.04484987, + "epoch": 0.1679492112350904, + "flos": 753404912640.0, + "grad_norm": 0.029291148216183213, + "language_loss": 0.93300939, + "learning_rate": 0.0009509276298977309, + "loss": 0.94451261, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 1.05273438, + "step": 873, + "time_per_iteration": 3.0177366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150817, + "balance_loss_mlp": 1.04543638, + "epoch": 0.168141592920354, + "flos": 1137731977728.0, + "grad_norm": 0.021155110884158303, + "language_loss": 0.9134444, + "learning_rate": 0.0009507929441644778, + "loss": 0.92495263, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 1.05175781, + "step": 874, + "time_per_iteration": 3.53277325630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160399, + "balance_loss_mlp": 1.05501771, + "epoch": 0.16833397460561755, + "flos": 633553677312.0, + "grad_norm": 0.025508723945600786, + "language_loss": 0.94342184, + "learning_rate": 0.0009506580834178826, + "loss": 0.95502585, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 1.05175781, + "step": 875, + "time_per_iteration": 2.763296365737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151031, + "balance_loss_mlp": 1.04560196, + "epoch": 0.1685263562908811, + "flos": 542542657536.0, + "grad_norm": 0.0234395143242784, + "language_loss": 1.00066125, + "learning_rate": 0.0009505230477103028, + "loss": 1.01217151, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 1.05224609, + "step": 876, + "time_per_iteration": 2.7256453037261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143495, + "balance_loss_mlp": 1.03801847, + "epoch": 0.16871873797614467, + "flos": 620485155840.0, + "grad_norm": 0.02951425183806971, + "language_loss": 0.91949958, + "learning_rate": 0.0009503878370941641, + "loss": 0.93093449, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 1.05273438, + "step": 877, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143733, + "balance_loss_mlp": 1.038257, + "epoch": 0.16891111966140823, + "flos": 607455565824.0, + "grad_norm": 0.02526909046796152, + "language_loss": 0.99137431, + "learning_rate": 0.0009502524516219595, + "loss": 1.00281167, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 1.05273438, + "step": 878, + "time_per_iteration": 2.7107326984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145725, + "balance_loss_mlp": 1.04005778, + "epoch": 0.1691035013466718, + "flos": 553405561344.0, + "grad_norm": 0.023246247090994255, + "language_loss": 0.99022686, + "learning_rate": 0.0009501168913462506, + "loss": 1.00168419, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 1.0546875, + "step": 879, + "time_per_iteration": 2.654356002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04866791, + "epoch": 0.16929588303193535, + "flos": 1479305822208.0, + "grad_norm": 0.014844444469597292, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.802755, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 1.046875, + "step": 880, + "time_per_iteration": 4.877387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114571, + "balance_loss_mlp": 1.04042399, + "epoch": 0.1694882647171989, + "flos": 927846641664.0, + "grad_norm": 0.023879743421000837, + "language_loss": 0.93963408, + "learning_rate": 0.0009498452465949042, + "loss": 0.95109117, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 1.05078125, + "step": 881, + "time_per_iteration": 3.241151809692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0447762, + "epoch": 0.1696806464024625, + "flos": 547151278080.0, + "grad_norm": 0.02293023114251512, + "language_loss": 0.98854458, + "learning_rate": 0.0009497091622247285, + "loss": 1.0000447, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 1.05029297, + "step": 882, + "time_per_iteration": 2.720453977584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145786, + "balance_loss_mlp": 1.0406431, + "epoch": 0.16987302808772606, + "flos": 530294602752.0, + "grad_norm": 0.02459483675822623, + "language_loss": 1.0302248, + "learning_rate": 0.0009495729032619723, + "loss": 1.04168272, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 1.04931641, + "step": 883, + "time_per_iteration": 2.717176675796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151842, + "balance_loss_mlp": 1.04731977, + "epoch": 0.17006540977298962, + "flos": 756478660608.0, + "grad_norm": 0.02507713686866634, + "language_loss": 0.9295364, + "learning_rate": 0.0009494364697595354, + "loss": 0.94105482, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 1.04589844, + "step": 884, + "time_per_iteration": 2.924898147583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157567, + "balance_loss_mlp": 1.05271089, + "epoch": 0.17025779145825318, + "flos": 559874694144.0, + "grad_norm": 0.025110060032482954, + "language_loss": 0.98774076, + "learning_rate": 0.0009492998617703867, + "loss": 0.99931645, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 1.04833984, + "step": 885, + "time_per_iteration": 2.6759417057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.05104423, + "epoch": 0.17045017314351674, + "flos": 513216347136.0, + "grad_norm": 0.0280627140127875, + "language_loss": 0.96898842, + "learning_rate": 0.0009491630793475619, + "loss": 0.98054218, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 1.04492188, + "step": 886, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149096, + "balance_loss_mlp": 1.04452574, + "epoch": 0.1706425548287803, + "flos": 510012343296.0, + "grad_norm": 0.023090423796267925, + "language_loss": 0.94873035, + "learning_rate": 0.0009490261225441643, + "loss": 0.96022129, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 1.04638672, + "step": 887, + "time_per_iteration": 2.960139513015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_mlp": 1.04508829, + "epoch": 0.17083493651404386, + "flos": 718714642944.0, + "grad_norm": 0.024954435208077393, + "language_loss": 0.98478651, + "learning_rate": 0.0009488889914133656, + "loss": 0.99628592, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 1.04833984, + "step": 888, + "time_per_iteration": 3.0498712062835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_mlp": 1.04649353, + "epoch": 0.17102731819930742, + "flos": 560200333824.0, + "grad_norm": 0.020862133880352407, + "language_loss": 0.97394216, + "learning_rate": 0.0009487516860084047, + "loss": 0.98545229, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 1.046875, + "step": 889, + "time_per_iteration": 2.799579381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115955, + "balance_loss_mlp": 1.0542171, + "epoch": 0.17121969988457098, + "flos": 495764788224.0, + "grad_norm": 0.030159167385703775, + "language_loss": 0.99659365, + "learning_rate": 0.0009486142063825884, + "loss": 1.0081892, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 1.05126953, + "step": 890, + "time_per_iteration": 2.5897767543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05718231, + "epoch": 0.17141208156983456, + "flos": 1552105941504.0, + "grad_norm": 0.012289453069715352, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73586774, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 1.03515625, + "step": 891, + "time_per_iteration": 4.971697807312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05527556, + "epoch": 0.17160446325509812, + "flos": 620700005376.0, + "grad_norm": 0.02677753623279009, + "language_loss": 1.00227833, + "learning_rate": 0.0009483387246819542, + "loss": 1.01388383, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 1.05078125, + "step": 892, + "time_per_iteration": 2.7142419815063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153152, + "balance_loss_mlp": 1.04977417, + "epoch": 0.17179684494036168, + "flos": 1384693300224.0, + "grad_norm": 0.011012484205567044, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.8343873, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 1.03515625, + "step": 893, + "time_per_iteration": 4.678752183914185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159751, + "balance_loss_mlp": 1.05446541, + "epoch": 0.17198922662562524, + "flos": 493641762816.0, + "grad_norm": 0.02464509578240857, + "language_loss": 0.9638195, + "learning_rate": 0.0009480625467392688, + "loss": 0.97541702, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 1.05175781, + "step": 894, + "time_per_iteration": 2.6579103469848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158279, + "balance_loss_mlp": 1.05490112, + "epoch": 0.1721816083108888, + "flos": 1461485689344.0, + "grad_norm": 0.014844728137103481, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79152954, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 1.03515625, + "step": 895, + "time_per_iteration": 4.754615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157074, + "balance_loss_mlp": 1.0523603, + "epoch": 0.17237398999615236, + "flos": 529204892160.0, + "grad_norm": 0.024157534092911288, + "language_loss": 0.95005947, + "learning_rate": 0.0009477856729834196, + "loss": 0.96163023, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 1.046875, + "step": 896, + "time_per_iteration": 2.7640984058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.05742288, + "epoch": 0.17256637168141592, + "flos": 605026366464.0, + "grad_norm": 0.02447501108745492, + "language_loss": 0.9782356, + "learning_rate": 0.0009476469753098809, + "loss": 0.98985219, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 1.04394531, + "step": 897, + "time_per_iteration": 2.7016282081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153769, + "balance_loss_mlp": 1.04957986, + "epoch": 0.17275875336667948, + "flos": 510693821952.0, + "grad_norm": 0.025419887327313116, + "language_loss": 0.94868481, + "learning_rate": 0.0009475081038443738, + "loss": 0.96022242, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 1.04345703, + "step": 898, + "time_per_iteration": 2.5731348991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148609, + "balance_loss_mlp": 1.0446589, + "epoch": 0.17295113505194307, + "flos": 666500955648.0, + "grad_norm": 0.02623291269769982, + "language_loss": 0.95752573, + "learning_rate": 0.0009473690586408124, + "loss": 0.96901178, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 1.04101562, + "step": 899, + "time_per_iteration": 2.8549156188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146417, + "balance_loss_mlp": 1.04227531, + "epoch": 0.17314351673720663, + "flos": 556431645696.0, + "grad_norm": 0.022300666942289, + "language_loss": 0.94826102, + "learning_rate": 0.0009472298397531792, + "loss": 0.9597252, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 1.04296875, + "step": 900, + "time_per_iteration": 2.7165167331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145486, + "balance_loss_mlp": 1.04124928, + "epoch": 0.17333589842247019, + "flos": 504606724608.0, + "grad_norm": 0.023477361471443404, + "language_loss": 0.95443118, + "learning_rate": 0.0009470904472355235, + "loss": 0.96588612, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 1.04394531, + "step": 901, + "time_per_iteration": 2.668320655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_mlp": 1.03967023, + "epoch": 0.17352828010773375, + "flos": 557350167552.0, + "grad_norm": 0.02470997420275152, + "language_loss": 0.90534914, + "learning_rate": 0.0009469508811419626, + "loss": 0.91678727, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 1.04296875, + "step": 902, + "time_per_iteration": 2.714174747467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_mlp": 1.05331421, + "epoch": 0.1737206617929973, + "flos": 1557791537664.0, + "grad_norm": 0.011695515468407039, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7276957, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 1.02539062, + "step": 903, + "time_per_iteration": 4.783574104309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146888, + "balance_loss_mlp": 1.04308009, + "epoch": 0.17391304347826086, + "flos": 517755836928.0, + "grad_norm": 0.027522671456014093, + "language_loss": 0.94518518, + "learning_rate": 0.0009466712284439292, + "loss": 0.95665407, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 1.03955078, + "step": 904, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011486, + "balance_loss_mlp": 1.04503071, + "epoch": 0.17410542516352442, + "flos": 542160622080.0, + "grad_norm": 0.027186859166075866, + "language_loss": 0.99262786, + "learning_rate": 0.0009465311419480276, + "loss": 1.00411391, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 1.03710938, + "step": 905, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153491, + "balance_loss_mlp": 1.05011249, + "epoch": 0.17429780684878798, + "flos": 625081041408.0, + "grad_norm": 0.028950662808853365, + "language_loss": 0.96674442, + "learning_rate": 0.0009463908820933622, + "loss": 0.97827929, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 1.03515625, + "step": 906, + "time_per_iteration": 2.8291828632354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151914, + "balance_loss_mlp": 1.04844034, + "epoch": 0.17449018853405157, + "flos": 576848890368.0, + "grad_norm": 0.03002954803612974, + "language_loss": 0.90420532, + "learning_rate": 0.0009462504489343868, + "loss": 0.91572446, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 1.03613281, + "step": 907, + "time_per_iteration": 2.8554108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_mlp": 1.04341269, + "epoch": 0.17468257021931513, + "flos": 534772967424.0, + "grad_norm": 0.024073731406752365, + "language_loss": 1.01002121, + "learning_rate": 0.0009461098425256222, + "loss": 1.02149189, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 1.03808594, + "step": 908, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114306, + "balance_loss_mlp": 1.03930068, + "epoch": 0.1748749519045787, + "flos": 541808785920.0, + "grad_norm": 0.02493910110608304, + "language_loss": 0.93412566, + "learning_rate": 0.0009459690629216567, + "loss": 0.94555628, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 1.0390625, + "step": 909, + "time_per_iteration": 2.670389413833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150977, + "balance_loss_mlp": 1.04688334, + "epoch": 0.17506733358984225, + "flos": 499626802176.0, + "grad_norm": 0.02402970341263653, + "language_loss": 0.96272469, + "learning_rate": 0.0009458281101771457, + "loss": 0.97423446, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 1.04248047, + "step": 910, + "time_per_iteration": 2.6256320476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153015, + "balance_loss_mlp": 1.04906452, + "epoch": 0.1752597152751058, + "flos": 624132320256.0, + "grad_norm": 0.023679811966199643, + "language_loss": 0.91450173, + "learning_rate": 0.0009456869843468122, + "loss": 0.92603183, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 1.04101562, + "step": 911, + "time_per_iteration": 2.863004207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158204, + "balance_loss_mlp": 1.05434883, + "epoch": 0.17545209696036937, + "flos": 521993155584.0, + "grad_norm": 0.029813530713564303, + "language_loss": 0.92364156, + "learning_rate": 0.0009455456854854459, + "loss": 0.93522358, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 1.04003906, + "step": 912, + "time_per_iteration": 2.616231918334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_mlp": 1.04612815, + "epoch": 0.17564447864563293, + "flos": 462945764352.0, + "grad_norm": 0.02810445184103091, + "language_loss": 0.92624664, + "learning_rate": 0.0009454042136479039, + "loss": 0.93774742, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 1.04101562, + "step": 913, + "time_per_iteration": 2.5944247245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.05199766, + "epoch": 0.1758368603308965, + "flos": 481617289728.0, + "grad_norm": 0.02706355326928303, + "language_loss": 0.91841793, + "learning_rate": 0.0009452625688891103, + "loss": 0.92997456, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 1.03808594, + "step": 914, + "time_per_iteration": 2.580941915512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144051, + "balance_loss_mlp": 1.04200745, + "epoch": 0.17602924201616005, + "flos": 1482084856320.0, + "grad_norm": 0.009713749524187035, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79878789, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 1.02148438, + "step": 915, + "time_per_iteration": 4.592097997665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148996, + "balance_loss_mlp": 1.04523647, + "epoch": 0.17622162370142364, + "flos": 603470026752.0, + "grad_norm": 0.02797967110469985, + "language_loss": 1.03421283, + "learning_rate": 0.0009449787608278015, + "loss": 1.0457027, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 1.0390625, + "step": 916, + "time_per_iteration": 2.755580425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_mlp": 1.04677713, + "epoch": 0.1764140053866872, + "flos": 443605495296.0, + "grad_norm": 0.024189441248888145, + "language_loss": 1.00777316, + "learning_rate": 0.0009448365976354704, + "loss": 1.01927423, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 1.03466797, + "step": 917, + "time_per_iteration": 2.4922571182250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_mlp": 1.04567707, + "epoch": 0.17660638707195075, + "flos": 501591373824.0, + "grad_norm": 0.028333637349232343, + "language_loss": 1.01507974, + "learning_rate": 0.0009446942617422558, + "loss": 1.02657032, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 1.03515625, + "step": 918, + "time_per_iteration": 2.574998378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148191, + "balance_loss_mlp": 1.0448128, + "epoch": 0.17679876875721431, + "flos": 539983202304.0, + "grad_norm": 0.02432410226762854, + "language_loss": 0.94564992, + "learning_rate": 0.0009445517532034176, + "loss": 0.9571318, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 1.03515625, + "step": 919, + "time_per_iteration": 2.7170355319976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153425, + "balance_loss_mlp": 1.05009484, + "epoch": 0.17699115044247787, + "flos": 498715011072.0, + "grad_norm": 0.026165935935680888, + "language_loss": 0.99032271, + "learning_rate": 0.0009444090720742824, + "loss": 1.00185692, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 1.03466797, + "step": 920, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149157, + "balance_loss_mlp": 1.04587448, + "epoch": 0.17718353212774143, + "flos": 663915303936.0, + "grad_norm": 0.025722324934358026, + "language_loss": 0.98290348, + "learning_rate": 0.0009442662184102439, + "loss": 0.99439508, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 1.03417969, + "step": 921, + "time_per_iteration": 2.7612035274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145605, + "balance_loss_mlp": 1.04251313, + "epoch": 0.177375913813005, + "flos": 583847778816.0, + "grad_norm": 0.021564117555322487, + "language_loss": 0.93569565, + "learning_rate": 0.000944123192266763, + "loss": 0.94715166, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 1.03222656, + "step": 922, + "time_per_iteration": 2.8110268115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141792, + "balance_loss_mlp": 1.03865182, + "epoch": 0.17756829549826855, + "flos": 553683537408.0, + "grad_norm": 0.021487036209533367, + "language_loss": 0.92858881, + "learning_rate": 0.0009439799936993671, + "loss": 0.94000673, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 1.03271484, + "step": 923, + "time_per_iteration": 2.7440245151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142202, + "balance_loss_mlp": 1.03901482, + "epoch": 0.17776067718353214, + "flos": 557371634688.0, + "grad_norm": 0.02463154633112553, + "language_loss": 0.97990632, + "learning_rate": 0.0009438366227636511, + "loss": 0.99132836, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 1.03320312, + "step": 924, + "time_per_iteration": 2.7032759189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140208, + "balance_loss_mlp": 1.03721154, + "epoch": 0.1779530588687957, + "flos": 659651788800.0, + "grad_norm": 0.022941473179093813, + "language_loss": 0.94988692, + "learning_rate": 0.0009436930795152763, + "loss": 0.96128899, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 1.03125, + "step": 925, + "time_per_iteration": 2.854522943496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143555, + "balance_loss_mlp": 1.04084456, + "epoch": 0.17814544055405926, + "flos": 645671476224.0, + "grad_norm": 0.02421412975678805, + "language_loss": 0.95479, + "learning_rate": 0.0009435493640099713, + "loss": 0.9662255, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 1.02832031, + "step": 926, + "time_per_iteration": 2.8268251419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143389, + "balance_loss_mlp": 1.04077399, + "epoch": 0.17833782223932282, + "flos": 461884251648.0, + "grad_norm": 0.0252062590806445, + "language_loss": 0.94177145, + "learning_rate": 0.0009434054763035314, + "loss": 0.95320535, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 1.02734375, + "step": 927, + "time_per_iteration": 2.629499673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139685, + "balance_loss_mlp": 1.03706956, + "epoch": 0.17853020392458638, + "flos": 760852965888.0, + "grad_norm": 0.02122720378042075, + "language_loss": 0.93181551, + "learning_rate": 0.0009432614164518185, + "loss": 0.94321233, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 1.02734375, + "step": 928, + "time_per_iteration": 2.9364700317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140803, + "balance_loss_mlp": 1.03818727, + "epoch": 0.17872258560984994, + "flos": 784055248896.0, + "grad_norm": 0.023477252169520995, + "language_loss": 0.93520033, + "learning_rate": 0.000943117184510762, + "loss": 0.94660836, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 1.02734375, + "step": 929, + "time_per_iteration": 3.07600474357605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150169, + "balance_loss_mlp": 1.04831696, + "epoch": 0.1789149672951135, + "flos": 1463031295488.0, + "grad_norm": 0.013755703560815407, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7994014, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 1.01953125, + "step": 930, + "time_per_iteration": 5.029282808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153323, + "balance_loss_mlp": 1.05099344, + "epoch": 0.17910734898037706, + "flos": 504930362880.0, + "grad_norm": 0.023999213273897636, + "language_loss": 0.96652937, + "learning_rate": 0.0009428282045846674, + "loss": 0.97806263, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 1.02441406, + "step": 931, + "time_per_iteration": 2.7112410068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145421, + "balance_loss_mlp": 1.04275823, + "epoch": 0.17929973066564064, + "flos": 747669651456.0, + "grad_norm": 0.02006943819739268, + "language_loss": 0.96385491, + "learning_rate": 0.0009426834567118214, + "loss": 0.97530913, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 1.02783203, + "step": 932, + "time_per_iteration": 3.0711913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143693, + "balance_loss_mlp": 1.04098177, + "epoch": 0.1794921123509042, + "flos": 714572651520.0, + "grad_norm": 0.021210123960592832, + "language_loss": 0.89608383, + "learning_rate": 0.0009425385369740155, + "loss": 0.90752071, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 1.02832031, + "step": 933, + "time_per_iteration": 3.059857130050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114727, + "balance_loss_mlp": 1.0451318, + "epoch": 0.17968449403616776, + "flos": 634361409024.0, + "grad_norm": 0.02299955090486112, + "language_loss": 0.96636283, + "learning_rate": 0.0009423934454275125, + "loss": 0.97783554, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 1.02246094, + "step": 934, + "time_per_iteration": 2.85917592048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146917, + "balance_loss_mlp": 1.04477859, + "epoch": 0.17987687572143132, + "flos": 537378084864.0, + "grad_norm": 0.02461268142415081, + "language_loss": 1.01075852, + "learning_rate": 0.0009422481821286418, + "loss": 1.02222764, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 1.02246094, + "step": 935, + "time_per_iteration": 2.7314486503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150005, + "balance_loss_mlp": 1.04777098, + "epoch": 0.18006925740669488, + "flos": 539119074816.0, + "grad_norm": 0.026258801194945027, + "language_loss": 0.98970592, + "learning_rate": 0.0009421027471337998, + "loss": 1.00120604, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 1.0234375, + "step": 936, + "time_per_iteration": 2.6354496479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151337, + "balance_loss_mlp": 1.04891205, + "epoch": 0.18026163909195844, + "flos": 540534425088.0, + "grad_norm": 0.029056123283387615, + "language_loss": 0.94782555, + "learning_rate": 0.0009419571404994493, + "loss": 0.9593389, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 1.02539062, + "step": 937, + "time_per_iteration": 2.6368348598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_mlp": 1.04649317, + "epoch": 0.180454020777222, + "flos": 501682698240.0, + "grad_norm": 0.026973093946582868, + "language_loss": 1.00715971, + "learning_rate": 0.00094181136228212, + "loss": 1.01864934, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 1.02587891, + "step": 938, + "time_per_iteration": 2.710451602935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145832, + "balance_loss_mlp": 1.043455, + "epoch": 0.18064640246248556, + "flos": 500006836224.0, + "grad_norm": 0.02510488837562242, + "language_loss": 0.93535352, + "learning_rate": 0.0009416654125384077, + "loss": 0.9468118, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 1.02490234, + "step": 939, + "time_per_iteration": 2.728480577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145905, + "balance_loss_mlp": 1.04424286, + "epoch": 0.18083878414774912, + "flos": 1522290808320.0, + "grad_norm": 0.01070150853185005, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80918276, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 1.01757812, + "step": 940, + "time_per_iteration": 4.915560007095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145419, + "balance_loss_mlp": 1.04318535, + "epoch": 0.1810311658330127, + "flos": 728665755648.0, + "grad_norm": 0.023936590350452012, + "language_loss": 0.92724693, + "learning_rate": 0.000941372998698552, + "loss": 0.93870103, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 1.0234375, + "step": 941, + "time_per_iteration": 2.993441343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140689, + "balance_loss_mlp": 1.0385505, + "epoch": 0.18122354751827627, + "flos": 566044383744.0, + "grad_norm": 0.025062658148163358, + "language_loss": 0.94270039, + "learning_rate": 0.0009412265347159336, + "loss": 0.95410728, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 1.02246094, + "step": 942, + "time_per_iteration": 2.731416702270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140669, + "balance_loss_mlp": 1.03848326, + "epoch": 0.18141592920353983, + "flos": 520317293568.0, + "grad_norm": 0.024682729806918415, + "language_loss": 0.94559634, + "learning_rate": 0.0009410798994339829, + "loss": 0.95700312, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 1.02294922, + "step": 943, + "time_per_iteration": 2.6001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.03650522, + "epoch": 0.1816083108888034, + "flos": 513476858880.0, + "grad_norm": 0.022579221317186333, + "language_loss": 0.95589852, + "learning_rate": 0.000940933092909628, + "loss": 0.96728498, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 1.02246094, + "step": 944, + "time_per_iteration": 2.6360957622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_mlp": 1.04550409, + "epoch": 0.18180069257406695, + "flos": 493372518912.0, + "grad_norm": 0.02569410792888805, + "language_loss": 0.9276287, + "learning_rate": 0.0009407861151998649, + "loss": 0.93910229, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 1.01953125, + "step": 945, + "time_per_iteration": 2.6910903453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147749, + "balance_loss_mlp": 1.04608703, + "epoch": 0.1819930742593305, + "flos": 571230423552.0, + "grad_norm": 0.024877151530798884, + "language_loss": 0.95025092, + "learning_rate": 0.0009406389663617552, + "loss": 0.96172833, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 1.01757812, + "step": 946, + "time_per_iteration": 2.689232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_mlp": 1.03669131, + "epoch": 0.18218545594459407, + "flos": 607110460416.0, + "grad_norm": 0.026141117268158143, + "language_loss": 0.96229172, + "learning_rate": 0.000940491646452427, + "loss": 0.97367907, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 1.02148438, + "step": 947, + "time_per_iteration": 2.720996618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136776, + "balance_loss_mlp": 1.03473294, + "epoch": 0.18237783762985763, + "flos": 549738931200.0, + "grad_norm": 0.02114848591843324, + "language_loss": 0.99382234, + "learning_rate": 0.000940344155529075, + "loss": 1.00519001, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 1.02148438, + "step": 948, + "time_per_iteration": 2.655764102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136656, + "balance_loss_mlp": 1.03489935, + "epoch": 0.1825702193151212, + "flos": 451674628608.0, + "grad_norm": 0.027816765537183038, + "language_loss": 0.98392528, + "learning_rate": 0.0009401964936489605, + "loss": 0.99529195, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 1.01855469, + "step": 949, + "time_per_iteration": 2.5372273921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_mlp": 1.03615081, + "epoch": 0.18276260100038477, + "flos": 590384040960.0, + "grad_norm": 0.023066854335363023, + "language_loss": 0.93237805, + "learning_rate": 0.0009400486608694108, + "loss": 0.94375616, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 1.01757812, + "step": 950, + "time_per_iteration": 2.7370681762695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139002, + "balance_loss_mlp": 1.03719783, + "epoch": 0.18295498268564833, + "flos": 788709531648.0, + "grad_norm": 0.02337801281240106, + "language_loss": 0.97100747, + "learning_rate": 0.0009399006572478195, + "loss": 0.98239744, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 1.01904297, + "step": 951, + "time_per_iteration": 3.1136744022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144566, + "balance_loss_mlp": 1.04276168, + "epoch": 0.1831473643709119, + "flos": 579225696768.0, + "grad_norm": 0.024500893588447415, + "language_loss": 0.99522519, + "learning_rate": 0.0009397524828416468, + "loss": 1.00667083, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 1.01904297, + "step": 952, + "time_per_iteration": 2.680551767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.03664696, + "epoch": 0.18333974605617545, + "flos": 567963293184.0, + "grad_norm": 0.023361368133084506, + "language_loss": 1.04812968, + "learning_rate": 0.0009396041377084192, + "loss": 1.05951309, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 1.01806641, + "step": 953, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136097, + "balance_loss_mlp": 1.03443527, + "epoch": 0.183532127741439, + "flos": 528069519360.0, + "grad_norm": 0.02324700647994909, + "language_loss": 0.98137838, + "learning_rate": 0.0009394556219057295, + "loss": 0.99273932, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 1.01757812, + "step": 954, + "time_per_iteration": 2.6928489208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147999, + "balance_loss_mlp": 1.04671907, + "epoch": 0.18372450942670257, + "flos": 595643940864.0, + "grad_norm": 0.02338261009959255, + "language_loss": 0.93879586, + "learning_rate": 0.0009393069354912362, + "loss": 0.95027584, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 1.01367188, + "step": 955, + "time_per_iteration": 2.7496042251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.05067647, + "epoch": 0.18391689111196613, + "flos": 646283824128.0, + "grad_norm": 0.029421035614033756, + "language_loss": 0.90626895, + "learning_rate": 0.0009391580785226649, + "loss": 0.91778857, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 1.01367188, + "step": 956, + "time_per_iteration": 2.9440600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.05253601, + "epoch": 0.18410927279722972, + "flos": 1460391975936.0, + "grad_norm": 0.020211591247266292, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80492932, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 1.0, + "step": 957, + "time_per_iteration": 4.738964796066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138037, + "balance_loss_mlp": 1.03623211, + "epoch": 0.18430165448249328, + "flos": 660003624960.0, + "grad_norm": 0.026926680065899915, + "language_loss": 0.95339954, + "learning_rate": 0.0009388598531545196, + "loss": 0.96477991, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 1.01904297, + "step": 958, + "time_per_iteration": 2.859509229660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138629, + "balance_loss_mlp": 1.03687191, + "epoch": 0.18449403616775684, + "flos": 518949606912.0, + "grad_norm": 0.029778126611616895, + "language_loss": 0.94583583, + "learning_rate": 0.000938710484870727, + "loss": 0.9572221, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 1.01855469, + "step": 959, + "time_per_iteration": 2.565548896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137101, + "balance_loss_mlp": 1.03543901, + "epoch": 0.1846864178530204, + "flos": 553824526848.0, + "grad_norm": 0.027283874554685776, + "language_loss": 0.94945395, + "learning_rate": 0.0009385609462644189, + "loss": 0.96082497, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 1.01757812, + "step": 960, + "time_per_iteration": 2.676379919052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138569, + "balance_loss_mlp": 1.03709817, + "epoch": 0.18487879953828396, + "flos": 467115953664.0, + "grad_norm": 0.025693285519799033, + "language_loss": 0.96468461, + "learning_rate": 0.0009384112373936514, + "loss": 0.97607034, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 1.015625, + "step": 961, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154728, + "balance_loss_mlp": 1.05325735, + "epoch": 0.18507118122354752, + "flos": 649683211776.0, + "grad_norm": 0.02725538915325764, + "language_loss": 1.0098747, + "learning_rate": 0.0009382613583165467, + "loss": 1.02142203, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 1.015625, + "step": 962, + "time_per_iteration": 2.8268754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116263, + "balance_loss_mlp": 1.06125438, + "epoch": 0.18526356290881107, + "flos": 627922475520.0, + "grad_norm": 0.027998512126097927, + "language_loss": 0.99849832, + "learning_rate": 0.0009381113090912928, + "loss": 1.01012468, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 1.01464844, + "step": 963, + "time_per_iteration": 2.7762861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147698, + "balance_loss_mlp": 1.04679894, + "epoch": 0.18545594459407463, + "flos": 433645650432.0, + "grad_norm": 0.027008272304904758, + "language_loss": 0.98634118, + "learning_rate": 0.000937961089776144, + "loss": 0.99781811, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 1.00976562, + "step": 964, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149635, + "balance_loss_mlp": 1.04844999, + "epoch": 0.1856483262793382, + "flos": 750426491904.0, + "grad_norm": 0.028502333826765886, + "language_loss": 0.91998804, + "learning_rate": 0.0009378107004294208, + "loss": 0.93148446, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 1.01269531, + "step": 965, + "time_per_iteration": 2.964561939239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_mlp": 1.05057883, + "epoch": 0.18584070796460178, + "flos": 531401777664.0, + "grad_norm": 0.02451376704559663, + "language_loss": 1.00210857, + "learning_rate": 0.0009376601411095096, + "loss": 1.01362348, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 1.00976562, + "step": 966, + "time_per_iteration": 2.6664164066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150482, + "balance_loss_mlp": 1.04953575, + "epoch": 0.18603308964986534, + "flos": 484083419136.0, + "grad_norm": 0.02282308899195351, + "language_loss": 0.93174511, + "learning_rate": 0.0009375094118748622, + "loss": 0.94324994, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 1.01025391, + "step": 967, + "time_per_iteration": 2.544952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142823, + "balance_loss_mlp": 1.041924, + "epoch": 0.1862254713351289, + "flos": 802681112064.0, + "grad_norm": 0.02495680742184495, + "language_loss": 1.00251484, + "learning_rate": 0.0009373585127839976, + "loss": 1.01394308, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 1.00976562, + "step": 968, + "time_per_iteration": 2.973095417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142113, + "balance_loss_mlp": 1.0413574, + "epoch": 0.18641785302039246, + "flos": 479290148352.0, + "grad_norm": 0.02509872783632802, + "language_loss": 0.9944787, + "learning_rate": 0.0009372074438954994, + "loss": 1.00589979, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 1.00830078, + "step": 969, + "time_per_iteration": 2.5303025245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142663, + "balance_loss_mlp": 1.04181159, + "epoch": 0.18661023470565602, + "flos": 389779072512.0, + "grad_norm": 0.02439046514561532, + "language_loss": 1.00939226, + "learning_rate": 0.0009370562052680181, + "loss": 1.02081895, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 1.00927734, + "step": 970, + "time_per_iteration": 2.5023443698883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.04929316, + "epoch": 0.18680261639091958, + "flos": 565775139840.0, + "grad_norm": 0.02213336285369191, + "language_loss": 0.95379293, + "learning_rate": 0.0009369047969602695, + "loss": 0.96529102, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 1.00585938, + "step": 971, + "time_per_iteration": 2.722823143005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154865, + "balance_loss_mlp": 1.05420506, + "epoch": 0.18699499807618314, + "flos": 480230137344.0, + "grad_norm": 0.029574405329312194, + "language_loss": 0.9913702, + "learning_rate": 0.0009367532190310357, + "loss": 1.00291884, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 1.00732422, + "step": 972, + "time_per_iteration": 2.633387327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.0490092, + "epoch": 0.1871873797614467, + "flos": 554328086016.0, + "grad_norm": 0.02905569815438633, + "language_loss": 0.99535728, + "learning_rate": 0.0009366014715391644, + "loss": 1.00685072, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 1.00390625, + "step": 973, + "time_per_iteration": 2.6549065113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153264, + "balance_loss_mlp": 1.05293763, + "epoch": 0.18737976144671029, + "flos": 553952781312.0, + "grad_norm": 0.023481989115367276, + "language_loss": 0.9123525, + "learning_rate": 0.0009364495545435693, + "loss": 0.92388517, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 1.00390625, + "step": 974, + "time_per_iteration": 4.409714221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_mlp": 1.05479944, + "epoch": 0.18757214313197385, + "flos": 503247770112.0, + "grad_norm": 0.022955013749569684, + "language_loss": 0.97297812, + "learning_rate": 0.0009362974681032297, + "loss": 0.98452938, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 1.00390625, + "step": 975, + "time_per_iteration": 2.61857533454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153706, + "balance_loss_mlp": 1.05352271, + "epoch": 0.1877645248172374, + "flos": 676291613184.0, + "grad_norm": 0.028784531937469084, + "language_loss": 0.98011422, + "learning_rate": 0.0009361452122771907, + "loss": 0.9916513, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 1.00244141, + "step": 976, + "time_per_iteration": 2.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.04923177, + "epoch": 0.18795690650250096, + "flos": 405862944768.0, + "grad_norm": 0.029616845561456457, + "language_loss": 0.95658362, + "learning_rate": 0.0009359927871245635, + "loss": 0.9680773, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 1.00195312, + "step": 977, + "time_per_iteration": 2.563232183456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149302, + "balance_loss_mlp": 1.04916573, + "epoch": 0.18814928818776452, + "flos": 639063355392.0, + "grad_norm": 0.027239481801034963, + "language_loss": 0.98439831, + "learning_rate": 0.0009358401927045246, + "loss": 0.99589127, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 1.00195312, + "step": 978, + "time_per_iteration": 2.8147568702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.04498518, + "epoch": 0.18834166987302808, + "flos": 1140115514880.0, + "grad_norm": 0.022094320674951175, + "language_loss": 0.96123868, + "learning_rate": 0.0009356874290763166, + "loss": 0.9726885, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 1.00048828, + "step": 979, + "time_per_iteration": 3.4719691276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149894, + "balance_loss_mlp": 1.04971051, + "epoch": 0.18853405155829164, + "flos": 505815957504.0, + "grad_norm": 0.02560863383472628, + "language_loss": 0.98637187, + "learning_rate": 0.0009355344962992474, + "loss": 0.99787074, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 1.00244141, + "step": 980, + "time_per_iteration": 2.6199324131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139646, + "balance_loss_mlp": 1.03931963, + "epoch": 0.1887264332435552, + "flos": 609370472448.0, + "grad_norm": 0.02150131271194909, + "language_loss": 0.97900265, + "learning_rate": 0.0009353813944326908, + "loss": 0.99039912, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 1.00390625, + "step": 981, + "time_per_iteration": 2.8862478733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143203, + "balance_loss_mlp": 1.04287672, + "epoch": 0.1889188149288188, + "flos": 553592212992.0, + "grad_norm": 0.027403519760576756, + "language_loss": 0.92598587, + "learning_rate": 0.0009352281235360863, + "loss": 0.93741786, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 1.00390625, + "step": 982, + "time_per_iteration": 2.680797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142003, + "balance_loss_mlp": 1.04167616, + "epoch": 0.18911119661408235, + "flos": 419469954048.0, + "grad_norm": 0.02481781093748577, + "language_loss": 0.92531025, + "learning_rate": 0.0009350746836689389, + "loss": 0.93673027, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 1.00390625, + "step": 983, + "time_per_iteration": 2.5687928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152649, + "balance_loss_mlp": 1.05289459, + "epoch": 0.1893035782993459, + "flos": 1485317784576.0, + "grad_norm": 0.01747927461324531, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82591867, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.99804688, + "step": 984, + "time_per_iteration": 4.978898048400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115218, + "balance_loss_mlp": 1.05237782, + "epoch": 0.18949595998460947, + "flos": 509456391168.0, + "grad_norm": 0.033971943902626214, + "language_loss": 0.94133711, + "learning_rate": 0.0009347672972613634, + "loss": 0.95285892, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.99853516, + "step": 985, + "time_per_iteration": 2.5850014686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153583, + "balance_loss_mlp": 1.05382824, + "epoch": 0.18968834166987303, + "flos": 532192045056.0, + "grad_norm": 0.027626772825507382, + "language_loss": 0.93152702, + "learning_rate": 0.0009346133508402735, + "loss": 0.9430629, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.99804688, + "step": 986, + "time_per_iteration": 2.7262227535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.04658782, + "epoch": 0.1898807233551366, + "flos": 500753442816.0, + "grad_norm": 0.02768975875157221, + "language_loss": 0.95335174, + "learning_rate": 0.0009344592356873166, + "loss": 0.96481234, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.99511719, + "step": 987, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149829, + "balance_loss_mlp": 1.05002666, + "epoch": 0.19007310504040015, + "flos": 603359236608.0, + "grad_norm": 0.02899497531058058, + "language_loss": 0.87347138, + "learning_rate": 0.0009343049518623255, + "loss": 0.88496965, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.99853516, + "step": 988, + "time_per_iteration": 2.726668119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143975, + "balance_loss_mlp": 1.04407787, + "epoch": 0.1902654867256637, + "flos": 602764353024.0, + "grad_norm": 0.022945627178248204, + "language_loss": 0.90576518, + "learning_rate": 0.0009341504994251985, + "loss": 0.91720492, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.99951172, + "step": 989, + "time_per_iteration": 2.8518989086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.05247498, + "epoch": 0.19045786841092727, + "flos": 1579231363584.0, + "grad_norm": 0.011944448483625032, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74672347, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.99414062, + "step": 990, + "time_per_iteration": 5.084089517593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144398, + "balance_loss_mlp": 1.04445326, + "epoch": 0.19065025009619085, + "flos": 683054184960.0, + "grad_norm": 0.025253455013724026, + "language_loss": 0.88680583, + "learning_rate": 0.0009338410889544574, + "loss": 0.8982498, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 1.0, + "step": 991, + "time_per_iteration": 3.007277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_mlp": 1.03949153, + "epoch": 0.1908426317814544, + "flos": 603441828864.0, + "grad_norm": 0.02514183514150974, + "language_loss": 0.96243769, + "learning_rate": 0.000933686131040967, + "loss": 0.97383535, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 1.00341797, + "step": 992, + "time_per_iteration": 2.7673017978668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_mlp": 1.04441845, + "epoch": 0.19103501346671797, + "flos": 587433818112.0, + "grad_norm": 0.025095383977303525, + "language_loss": 0.99126339, + "learning_rate": 0.0009335310047555883, + "loss": 1.00270796, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 1.00097656, + "step": 993, + "time_per_iteration": 2.782841920852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145602, + "balance_loss_mlp": 1.04565716, + "epoch": 0.19122739515198153, + "flos": 546834370560.0, + "grad_norm": 0.0365250692916995, + "language_loss": 0.97246122, + "learning_rate": 0.0009333757101585467, + "loss": 0.98391724, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 1.0, + "step": 994, + "time_per_iteration": 2.6937174797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142239, + "balance_loss_mlp": 1.04229414, + "epoch": 0.1914197768372451, + "flos": 522549107712.0, + "grad_norm": 0.02399514581888075, + "language_loss": 1.00362575, + "learning_rate": 0.0009332202473101329, + "loss": 1.01504803, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 1.0, + "step": 995, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137582, + "balance_loss_mlp": 1.03763652, + "epoch": 0.19161215852250865, + "flos": 612387824640.0, + "grad_norm": 0.024864495797513732, + "language_loss": 0.91319168, + "learning_rate": 0.0009330646162707028, + "loss": 0.92456746, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 1.0, + "step": 996, + "time_per_iteration": 2.7450180053710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113947, + "balance_loss_mlp": 1.03962064, + "epoch": 0.1918045402077722, + "flos": 848182619136.0, + "grad_norm": 0.02592603597590215, + "language_loss": 0.92579019, + "learning_rate": 0.0009329088171006779, + "loss": 0.93718487, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.99902344, + "step": 997, + "time_per_iteration": 3.1890194416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_mlp": 1.04457617, + "epoch": 0.19199692189303577, + "flos": 466892371968.0, + "grad_norm": 0.027577096255712943, + "language_loss": 0.95194477, + "learning_rate": 0.0009327528498605446, + "loss": 0.96338999, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 1.0, + "step": 998, + "time_per_iteration": 2.6845622062683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141712, + "balance_loss_mlp": 1.04143262, + "epoch": 0.19218930357829936, + "flos": 532613011968.0, + "grad_norm": 0.026795980657526523, + "language_loss": 0.98209792, + "learning_rate": 0.0009325967146108548, + "loss": 0.99351501, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 1.00341797, + "step": 999, + "time_per_iteration": 2.690363883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145933, + "balance_loss_mlp": 1.04589295, + "epoch": 0.19238168526356292, + "flos": 602727422976.0, + "grad_norm": 0.025877996038880184, + "language_loss": 0.97816348, + "learning_rate": 0.0009324404114122258, + "loss": 0.98962283, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 1.00097656, + "step": 1000, + "time_per_iteration": 2.717535972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139683, + "balance_loss_mlp": 1.03969073, + "epoch": 0.19257406694882648, + "flos": 573154062336.0, + "grad_norm": 0.0251308575536182, + "language_loss": 0.95425117, + "learning_rate": 0.0009322839403253397, + "loss": 0.96564806, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 1.00048828, + "step": 1001, + "time_per_iteration": 2.8128621578216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147687, + "balance_loss_mlp": 1.04793251, + "epoch": 0.19276644863409004, + "flos": 803156473344.0, + "grad_norm": 0.02827819499351052, + "language_loss": 0.93752921, + "learning_rate": 0.0009321273014109439, + "loss": 0.94900608, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.99804688, + "step": 1002, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115103, + "balance_loss_mlp": 1.05127609, + "epoch": 0.1929588303193536, + "flos": 564479311872.0, + "grad_norm": 0.02425681225612504, + "language_loss": 0.92063946, + "learning_rate": 0.0009319704947298513, + "loss": 0.93214977, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.99804688, + "step": 1003, + "time_per_iteration": 2.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148634, + "balance_loss_mlp": 1.04887998, + "epoch": 0.19315121200461716, + "flos": 627987603456.0, + "grad_norm": 0.023688885680104285, + "language_loss": 0.95116329, + "learning_rate": 0.0009318135203429393, + "loss": 0.96264958, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.99804688, + "step": 1004, + "time_per_iteration": 2.7953245639801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146221, + "balance_loss_mlp": 1.04646707, + "epoch": 0.19334359368988072, + "flos": 518583034368.0, + "grad_norm": 0.02448547542723696, + "language_loss": 0.95706153, + "learning_rate": 0.0009316563783111511, + "loss": 0.9685238, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.99804688, + "step": 1005, + "time_per_iteration": 2.7417562007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141812, + "balance_loss_mlp": 1.04224837, + "epoch": 0.19353597537514428, + "flos": 695399568384.0, + "grad_norm": 0.022656832097962477, + "language_loss": 0.91614294, + "learning_rate": 0.0009314990686954943, + "loss": 0.9275611, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.99609375, + "step": 1006, + "time_per_iteration": 2.921147584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143701, + "balance_loss_mlp": 1.04413795, + "epoch": 0.19372835706040784, + "flos": 1212199226880.0, + "grad_norm": 0.0213605480211332, + "language_loss": 0.89449364, + "learning_rate": 0.000931341591557042, + "loss": 0.90593064, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.99609375, + "step": 1007, + "time_per_iteration": 3.6934237480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142723, + "balance_loss_mlp": 1.04292154, + "epoch": 0.19392073874567142, + "flos": 521684980224.0, + "grad_norm": 0.02492230683936131, + "language_loss": 0.9970367, + "learning_rate": 0.0009311839469569325, + "loss": 1.00846386, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.99853516, + "step": 1008, + "time_per_iteration": 2.66283917427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141437, + "balance_loss_mlp": 1.04187346, + "epoch": 0.19411312043093498, + "flos": 589910681088.0, + "grad_norm": 0.028572464719479444, + "language_loss": 0.9835515, + "learning_rate": 0.0009310261349563687, + "loss": 0.99496591, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.99609375, + "step": 1009, + "time_per_iteration": 2.6913864612579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139912, + "balance_loss_mlp": 1.04034853, + "epoch": 0.19430550211619854, + "flos": 580571916288.0, + "grad_norm": 0.022224830980977262, + "language_loss": 0.9288035, + "learning_rate": 0.0009308681556166186, + "loss": 0.94020259, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.99609375, + "step": 1010, + "time_per_iteration": 2.8937342166900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_mlp": 1.04234338, + "epoch": 0.1944978838014621, + "flos": 622245611520.0, + "grad_norm": 0.028831874511777204, + "language_loss": 1.01060331, + "learning_rate": 0.0009307100089990152, + "loss": 1.02202237, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.99609375, + "step": 1011, + "time_per_iteration": 2.7086822986602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114452, + "balance_loss_mlp": 1.04495597, + "epoch": 0.19469026548672566, + "flos": 599814130176.0, + "grad_norm": 0.02434118582542042, + "language_loss": 0.95591187, + "learning_rate": 0.0009305516951649568, + "loss": 0.96735704, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.99609375, + "step": 1012, + "time_per_iteration": 2.7046425342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114359, + "balance_loss_mlp": 1.04402685, + "epoch": 0.19488264717198922, + "flos": 553247107584.0, + "grad_norm": 0.020712874248618226, + "language_loss": 0.93779677, + "learning_rate": 0.0009303932141759057, + "loss": 0.94923264, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.7684950828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145994, + "balance_loss_mlp": 1.0468123, + "epoch": 0.19507502885725278, + "flos": 667312690176.0, + "grad_norm": 0.029421944235057496, + "language_loss": 0.94045115, + "learning_rate": 0.0009302345660933902, + "loss": 0.95191121, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.9921875, + "step": 1014, + "time_per_iteration": 2.8242082595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.04442382, + "epoch": 0.19526741054251634, + "flos": 672327541248.0, + "grad_norm": 0.024449615989116238, + "language_loss": 0.93477654, + "learning_rate": 0.0009300757509790026, + "loss": 0.94621253, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.9921875, + "step": 1015, + "time_per_iteration": 2.840658664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144964, + "balance_loss_mlp": 1.04578233, + "epoch": 0.19545979222777993, + "flos": 448146986496.0, + "grad_norm": 0.028637929544829934, + "language_loss": 1.02226353, + "learning_rate": 0.0009299167688944005, + "loss": 1.0337131, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.9921875, + "step": 1016, + "time_per_iteration": 2.505427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114266, + "balance_loss_mlp": 1.04376352, + "epoch": 0.1956521739130435, + "flos": 570168910848.0, + "grad_norm": 0.02609870742448671, + "language_loss": 0.93148959, + "learning_rate": 0.0009297576199013063, + "loss": 0.94291621, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.98925781, + "step": 1017, + "time_per_iteration": 2.7357168197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155182, + "balance_loss_mlp": 1.05752563, + "epoch": 0.19584455559830705, + "flos": 1458880571904.0, + "grad_norm": 0.02028337436206496, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74157315, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.9765625, + "step": 1018, + "time_per_iteration": 5.09963059425354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.04962921, + "epoch": 0.1960369372835706, + "flos": 1594481307648.0, + "grad_norm": 0.015251553743586253, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80573392, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.97460938, + "step": 1019, + "time_per_iteration": 6.03454852104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146546, + "balance_loss_mlp": 1.0477457, + "epoch": 0.19622931896883417, + "flos": 617252954112.0, + "grad_norm": 0.02445318741287071, + "language_loss": 0.94190967, + "learning_rate": 0.0009292791720892659, + "loss": 0.9533751, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.98828125, + "step": 1020, + "time_per_iteration": 2.8369834423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147421, + "balance_loss_mlp": 1.0486201, + "epoch": 0.19642170065409773, + "flos": 467207278080.0, + "grad_norm": 0.027280190942869837, + "language_loss": 0.98824823, + "learning_rate": 0.0009291193560807218, + "loss": 0.99972242, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.98828125, + "step": 1021, + "time_per_iteration": 2.5833048820495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.0458802, + "epoch": 0.19661408233936128, + "flos": 516288093696.0, + "grad_norm": 0.025303886608753337, + "language_loss": 0.95740455, + "learning_rate": 0.0009289593734732688, + "loss": 0.96885145, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.98828125, + "step": 1022, + "time_per_iteration": 2.5913774967193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149525, + "balance_loss_mlp": 1.05058122, + "epoch": 0.19680646402462484, + "flos": 393493366272.0, + "grad_norm": 0.0253763529676381, + "language_loss": 1.01103711, + "learning_rate": 0.0009287992243290175, + "loss": 1.02253246, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.98974609, + "step": 1023, + "time_per_iteration": 2.4793736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115501, + "balance_loss_mlp": 1.05635238, + "epoch": 0.19699884570988843, + "flos": 627623032320.0, + "grad_norm": 0.02508480994731895, + "language_loss": 0.99886519, + "learning_rate": 0.0009286389087101435, + "loss": 1.01041532, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.98681641, + "step": 1024, + "time_per_iteration": 2.7772202491760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153446, + "balance_loss_mlp": 1.05483615, + "epoch": 0.197191227395152, + "flos": 559073693184.0, + "grad_norm": 0.02445444816711275, + "language_loss": 0.98426372, + "learning_rate": 0.0009284784266788864, + "loss": 0.99579823, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.98632812, + "step": 1025, + "time_per_iteration": 2.6955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150264, + "balance_loss_mlp": 1.05165374, + "epoch": 0.19738360908041555, + "flos": 666249176064.0, + "grad_norm": 0.021666801749132464, + "language_loss": 0.99231869, + "learning_rate": 0.0009283177782975512, + "loss": 1.00382137, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.98632812, + "step": 1026, + "time_per_iteration": 2.9886229038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05529749, + "epoch": 0.1975759907656791, + "flos": 523510563840.0, + "grad_norm": 0.025961932589349316, + "language_loss": 0.98509014, + "learning_rate": 0.000928156963628507, + "loss": 0.99662918, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.98632812, + "step": 1027, + "time_per_iteration": 2.586740493774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149439, + "balance_loss_mlp": 1.05097175, + "epoch": 0.19776837245094267, + "flos": 463484252160.0, + "grad_norm": 0.02550253779434718, + "language_loss": 0.96135926, + "learning_rate": 0.0009279959827341877, + "loss": 0.97285366, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.98486328, + "step": 1028, + "time_per_iteration": 2.723517894744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146754, + "balance_loss_mlp": 1.04852605, + "epoch": 0.19796075413620623, + "flos": 504057503232.0, + "grad_norm": 0.02160335630411572, + "language_loss": 0.96627682, + "learning_rate": 0.0009278348356770915, + "loss": 0.97774434, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.98242188, + "step": 1029, + "time_per_iteration": 2.566802501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144801, + "balance_loss_mlp": 1.04666746, + "epoch": 0.1981531358214698, + "flos": 508570796544.0, + "grad_norm": 0.024261507948164947, + "language_loss": 0.9528529, + "learning_rate": 0.0009276735225197814, + "loss": 0.96430099, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.98144531, + "step": 1030, + "time_per_iteration": 2.6009340286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145205, + "balance_loss_mlp": 1.04702377, + "epoch": 0.19834551750673335, + "flos": 532639208448.0, + "grad_norm": 0.023062563394134136, + "language_loss": 0.95906407, + "learning_rate": 0.0009275120433248847, + "loss": 0.97051609, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.98193359, + "step": 1031, + "time_per_iteration": 2.684858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145757, + "balance_loss_mlp": 1.0477196, + "epoch": 0.1985378991919969, + "flos": 776969765376.0, + "grad_norm": 0.02469129884935611, + "language_loss": 0.94986421, + "learning_rate": 0.0009273503981550931, + "loss": 0.96132183, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.98046875, + "step": 1032, + "time_per_iteration": 3.058094024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.04737103, + "epoch": 0.1987302808772605, + "flos": 435191256576.0, + "grad_norm": 0.025952536265860523, + "language_loss": 0.96777844, + "learning_rate": 0.0009271885870731626, + "loss": 0.9792316, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.97949219, + "step": 1033, + "time_per_iteration": 2.493664503097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153962, + "balance_loss_mlp": 1.05592442, + "epoch": 0.19892266256252406, + "flos": 554653725696.0, + "grad_norm": 0.029222795446194067, + "language_loss": 1.0035603, + "learning_rate": 0.0009270266101419143, + "loss": 1.01509976, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.98046875, + "step": 1034, + "time_per_iteration": 2.626612901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145521, + "balance_loss_mlp": 1.04748368, + "epoch": 0.19911504424778761, + "flos": 550948164096.0, + "grad_norm": 0.02425528851980561, + "language_loss": 0.92802572, + "learning_rate": 0.0009268644674242328, + "loss": 0.9394809, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.98046875, + "step": 1035, + "time_per_iteration": 2.683253288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148174, + "balance_loss_mlp": 1.04994512, + "epoch": 0.19930742593305117, + "flos": 519312176640.0, + "grad_norm": 0.02646778626346152, + "language_loss": 0.91577774, + "learning_rate": 0.0009267021589830678, + "loss": 0.9272595, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.98242188, + "step": 1036, + "time_per_iteration": 2.7614338397979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218948, + "balance_loss_mlp": 1.11824036, + "epoch": 0.19949980761831473, + "flos": 1512637863936.0, + "grad_norm": 0.02467753292442409, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78846025, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 1.0078125, + "step": 1037, + "time_per_iteration": 4.962339878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114988, + "balance_loss_mlp": 1.05184233, + "epoch": 0.1996921893035783, + "flos": 699439501824.0, + "grad_norm": 0.02757683731024766, + "language_loss": 1.02362621, + "learning_rate": 0.000926377045182406, + "loss": 1.03512502, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.98046875, + "step": 1038, + "time_per_iteration": 2.916594982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155504, + "balance_loss_mlp": 1.05727601, + "epoch": 0.19988457098884185, + "flos": 728394510336.0, + "grad_norm": 0.024851830352508646, + "language_loss": 0.97729039, + "learning_rate": 0.0009262142399491296, + "loss": 0.98884547, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.98242188, + "step": 1039, + "time_per_iteration": 3.0976781845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156606, + "balance_loss_mlp": 1.05837739, + "epoch": 0.2000769526741054, + "flos": 561624416256.0, + "grad_norm": 0.025662568358030838, + "language_loss": 0.98388815, + "learning_rate": 0.0009260512692448105, + "loss": 0.99545419, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.98242188, + "step": 1040, + "time_per_iteration": 2.715479850769043 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2348213263269888.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/training_args.bin b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dec1b7e0db130318069c72434f32c2789119b732 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c077e5103b778b39b648e3a5a2e73e36256d052f444290e14e15f87c36156cb +size 7992 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/zero_to_fp32.py b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-1040/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/added_tokens.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/config.json new file mode 100644 index 0000000000000000000000000000000000000000..987150c78c9255ac53c0408588036e10466fc436 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_perturbed", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/generation_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31dac677130cf68595d5101f9806651931c27bdc --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eecac55b6dc5b7ef8ca67e4b7a3e5d9895ef11e858e4f6e093abe7a03567a9a +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed51ef82b5a3119cb78216b526b616dcc560d290 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a34caa500deb5f148b997fca54c908a37c14edda34cbeca75e3fff6feb0303f6 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3db0030c4b04abda5652a75d4933f08ac4414aa7 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0daaaff789365d425d32c998352e222f074db98b0a9d34f667b5be03fed37f0 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17c161a0237665d86234f699f84dc307728d066e --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75201a66c6877fbf537964dc265cc6af4bab05747edc3532c6a9536bb3dc0dc1 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25ebcedbe811c2ec7927e2c86405d7f00a76246f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:396d82e050ac38fd488f5801a47883ffbff24942549ab7ea36494d97ed82bdbc +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5941cba430e3694305ea57d6b167f5c2b0a6ca6c --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a639bd07baaf16148625f41e3d4722998f0463688f5ec5925fd7c670f871b34b +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44ff71c722ddb06571f83d9253b155c67144c219 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a246015a827b0b32be119d158e355d328acf249092d6a16b284254479e62ee88 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff6b921e72a11c78244dc035cb18883a223cbfbe --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0c9ecea5896da34054c1c76a14c5f74886ac9cba6a0e6fb2d7721d015e0e75 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/latest b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/latest new file mode 100644 index 0000000000000000000000000000000000000000..306b989cc55bbad3d1661dff0bcd6923a752cb0a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/latest @@ -0,0 +1 @@ +global_step2080 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5543340fc664b5541d747a97112e7c7297088af --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e90354399b3a446587b565044bc87359e147a79215b2183e10c2534265b5cc4 +size 3759043888 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model.safetensors.index.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..01fe755c95da02467d97df3e39228dbbb26b065f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/model.safetensors.index.json @@ -0,0 +1,674 @@ +{ + "metadata": { + "total_size": 8731443232 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_0.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_1.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_2.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_3.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/special_tokens_map.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/tokenizer.model b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/tokenizer_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/trainer_state.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c906e6c3db602c615f46c49e3e723f1ca9858d2 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/trainer_state.json @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02574398, + "balance_loss_mlp": 1.85189414, + "epoch": 0.00019238168526356292, + "flos": 471022176768.0, + "grad_norm": 12.86455737221305, + "language_loss": 2.79777646, + "learning_rate": 0.0, + "loss": 1.8614465, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 7.2109375, + "step": 1, + "time_per_iteration": 21.83068585395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02254613, + "balance_loss_mlp": 1.76785779, + "epoch": 0.00038476337052712584, + "flos": 505537981440.0, + "grad_norm": 51.581369656319104, + "language_loss": 12.34714699, + "learning_rate": 0.00013726078121135892, + "loss": 12.3696928, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 4.875, + "step": 2, + "time_per_iteration": 2.6192572116851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235864, + "balance_loss_mlp": 1.75177932, + "epoch": 0.0005771450557906887, + "flos": 600333152256.0, + "grad_norm": 53.41660983156924, + "language_loss": 12.32898235, + "learning_rate": 0.00021755319103969496, + "loss": 12.35134125, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 4.84765625, + "step": 3, + "time_per_iteration": 2.887979030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02281771, + "balance_loss_mlp": 1.79577887, + "epoch": 0.0007695267410542517, + "flos": 581496442368.0, + "grad_norm": 15.812083363335244, + "language_loss": 9.24414825, + "learning_rate": 0.00027452156242271784, + "loss": 9.26696682, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 4.8671875, + "step": 4, + "time_per_iteration": 2.6792547702789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02454864, + "balance_loss_mlp": 1.95551991, + "epoch": 0.0009619084263178145, + "flos": 487153164288.0, + "grad_norm": 10.3691594005885, + "language_loss": 9.1886158, + "learning_rate": 0.0003187096642208417, + "loss": 9.21316433, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 4.98828125, + "step": 5, + "time_per_iteration": 2.627883195877075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0247156, + "balance_loss_mlp": 1.97450531, + "epoch": 0.0011542901115813775, + "flos": 561166519296.0, + "grad_norm": 9.061082825397735, + "language_loss": 9.31672573, + "learning_rate": 0.0003548139722510539, + "loss": 9.34144115, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 4.96875, + "step": 6, + "time_per_iteration": 2.697327136993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02496704, + "balance_loss_mlp": 1.9977417, + "epoch": 0.0013466717968449403, + "flos": 534950886912.0, + "grad_norm": 5.1401213461899875, + "language_loss": 8.45638084, + "learning_rate": 0.00038533972973918044, + "loss": 8.48134804, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 4.984375, + "step": 7, + "time_per_iteration": 2.6605119705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02367166, + "balance_loss_mlp": 1.8800292, + "epoch": 0.0015390534821085034, + "flos": 493333587456.0, + "grad_norm": 4.765795170053606, + "language_loss": 7.86978722, + "learning_rate": 0.0004117823436340768, + "loss": 7.89345884, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 4.87890625, + "step": 8, + "time_per_iteration": 2.60813570022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02377529, + "balance_loss_mlp": 1.89153647, + "epoch": 0.0017314351673720662, + "flos": 565775139840.0, + "grad_norm": 2.6394105736579268, + "language_loss": 7.60834789, + "learning_rate": 0.00043510638207938993, + "loss": 7.63212299, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 4.8671875, + "step": 9, + "time_per_iteration": 2.871943712234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0239868, + "balance_loss_mlp": 1.91802776, + "epoch": 0.001923816852635629, + "flos": 594508568064.0, + "grad_norm": 2.7082435786924752, + "language_loss": 7.06748104, + "learning_rate": 0.00045597044543220066, + "loss": 7.09146786, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 4.8125, + "step": 10, + "time_per_iteration": 2.671294689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02381293, + "balance_loss_mlp": 1.90254807, + "epoch": 0.002116198537899192, + "flos": 610894611456.0, + "grad_norm": 2.113301815517677, + "language_loss": 6.83692646, + "learning_rate": 0.00047484428652143135, + "loss": 6.86073971, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.79296875, + "step": 11, + "time_per_iteration": 2.885416269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02427226, + "balance_loss_mlp": 1.95687437, + "epoch": 0.002308580223162755, + "flos": 546174359040.0, + "grad_norm": 1.7416212933802626, + "language_loss": 6.4295001, + "learning_rate": 0.0004920747534624128, + "loss": 6.45377207, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.70703125, + "step": 12, + "time_per_iteration": 2.6201112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503769, + "balance_loss_mlp": 2.03265429, + "epoch": 0.002500961908426318, + "flos": 645923255808.0, + "grad_norm": 2.43618245016211, + "language_loss": 6.0048914, + "learning_rate": 0.0005079252465375872, + "loss": 6.02992916, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.71484375, + "step": 13, + "time_per_iteration": 2.852263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02634854, + "balance_loss_mlp": 2.15916157, + "epoch": 0.0026933435936898806, + "flos": 488848492032.0, + "grad_norm": 4.143842376760835, + "language_loss": 5.42230844, + "learning_rate": 0.0005226005109505393, + "loss": 5.44865704, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 4.76171875, + "step": 14, + "time_per_iteration": 2.5524611473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02844198, + "balance_loss_mlp": 2.3646903, + "epoch": 0.0028857252789534437, + "flos": 435525628416.0, + "grad_norm": 5.672862092220106, + "language_loss": 4.15845776, + "learning_rate": 0.0005362628552605367, + "loss": 4.18689966, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 4.80078125, + "step": 15, + "time_per_iteration": 2.7353649139404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03208902, + "balance_loss_mlp": 2.72252893, + "epoch": 0.0030781069642170067, + "flos": 597840826368.0, + "grad_norm": 3.947061509829782, + "language_loss": 2.26971245, + "learning_rate": 0.0005490431248454357, + "loss": 2.30180168, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 4.87109375, + "step": 16, + "time_per_iteration": 2.676703929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03601284, + "balance_loss_mlp": 3.10232162, + "epoch": 0.0032704886494805694, + "flos": 1541510280192.0, + "grad_norm": 0.6213816402988768, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.793064, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 5.0, + "step": 17, + "time_per_iteration": 6.1610119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334326, + "balance_loss_mlp": 2.85841203, + "epoch": 0.0034628703347441324, + "flos": 474970237440.0, + "grad_norm": 2.8341915883282045, + "language_loss": 1.71282685, + "learning_rate": 0.0005723671632907488, + "loss": 1.74625945, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 4.85546875, + "step": 18, + "time_per_iteration": 2.638371467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02881518, + "balance_loss_mlp": 2.39934015, + "epoch": 0.0036552520200076955, + "flos": 449477743104.0, + "grad_norm": 2.8867361132515086, + "language_loss": 1.68530536, + "learning_rate": 0.0005830738490244919, + "loss": 1.71412063, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.828125, + "step": 19, + "time_per_iteration": 2.56374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02402526, + "balance_loss_mlp": 1.92301893, + "epoch": 0.003847633705271258, + "flos": 637350563328.0, + "grad_norm": 0.6925173808128176, + "language_loss": 1.38203168, + "learning_rate": 0.0005932312266435596, + "loss": 1.406057, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.80078125, + "step": 20, + "time_per_iteration": 2.763998508453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02421171, + "balance_loss_mlp": 1.94814897, + "epoch": 0.004040015390534821, + "flos": 590590158336.0, + "grad_norm": 1.6265477944222306, + "language_loss": 1.40919662, + "learning_rate": 0.0006028929207788754, + "loss": 1.43340826, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.734375, + "step": 21, + "time_per_iteration": 2.746016502380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575294, + "balance_loss_mlp": 2.10036469, + "epoch": 0.004232397075798384, + "flos": 757865812992.0, + "grad_norm": 1.576079326940489, + "language_loss": 1.40810275, + "learning_rate": 0.0006121050677327902, + "loss": 1.43385565, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.75390625, + "step": 22, + "time_per_iteration": 2.9607386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550906, + "balance_loss_mlp": 2.07025433, + "epoch": 0.004424778761061947, + "flos": 527726415360.0, + "grad_norm": 0.6323448080178445, + "language_loss": 1.22419024, + "learning_rate": 0.0006209076479463684, + "loss": 1.24969923, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.8125, + "step": 23, + "time_per_iteration": 2.5966527462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02511897, + "balance_loss_mlp": 2.02285314, + "epoch": 0.00461716044632551, + "flos": 549217907712.0, + "grad_norm": 0.22573529074246063, + "language_loss": 1.26396596, + "learning_rate": 0.0006293355346737718, + "loss": 1.28908491, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.8984375, + "step": 24, + "time_per_iteration": 2.672264575958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02557217, + "balance_loss_mlp": 2.05978036, + "epoch": 0.004809542131589073, + "flos": 568751559168.0, + "grad_norm": 0.10471299124135865, + "language_loss": 1.20974565, + "learning_rate": 0.0006374193284416834, + "loss": 1.23531783, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.96875, + "step": 25, + "time_per_iteration": 2.7392375469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02658191, + "balance_loss_mlp": 2.15503263, + "epoch": 0.005001923816852636, + "flos": 471583584768.0, + "grad_norm": 0.16888144752152706, + "language_loss": 1.20314312, + "learning_rate": 0.0006451860277489461, + "loss": 1.22972512, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.02734375, + "step": 26, + "time_per_iteration": 2.6047253608703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02722422, + "balance_loss_mlp": 2.21582985, + "epoch": 0.005194305502116198, + "flos": 416380743168.0, + "grad_norm": 0.22424567034217777, + "language_loss": 1.28844571, + "learning_rate": 0.0006526595731190848, + "loss": 1.31566989, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.0625, + "step": 27, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02743244, + "balance_loss_mlp": 2.2351265, + "epoch": 0.005386687187379761, + "flos": 629995835904.0, + "grad_norm": 0.15642653525507078, + "language_loss": 1.18914986, + "learning_rate": 0.0006598612921618983, + "loss": 1.2165823, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.078125, + "step": 28, + "time_per_iteration": 2.8519153594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02748247, + "balance_loss_mlp": 2.24051118, + "epoch": 0.005579068872643324, + "flos": 888019997184.0, + "grad_norm": 0.1209301216257677, + "language_loss": 1.12191987, + "learning_rate": 0.0006668102665011454, + "loss": 1.14940238, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.07421875, + "step": 29, + "time_per_iteration": 3.2244889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02691091, + "balance_loss_mlp": 2.18411779, + "epoch": 0.005771450557906887, + "flos": 548657952768.0, + "grad_norm": 0.1098895199150706, + "language_loss": 1.21368051, + "learning_rate": 0.0006735236364718957, + "loss": 1.24059153, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.06640625, + "step": 30, + "time_per_iteration": 2.642730474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02653145, + "balance_loss_mlp": 2.14769816, + "epoch": 0.00596383224317045, + "flos": 533068907520.0, + "grad_norm": 0.11046596793449442, + "language_loss": 1.1970098, + "learning_rate": 0.0006800168558381346, + "loss": 1.22354114, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.05078125, + "step": 31, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0257592, + "balance_loss_mlp": 2.07123542, + "epoch": 0.0061562139284340135, + "flos": 590162460672.0, + "grad_norm": 0.10949645130098669, + "language_loss": 1.22987807, + "learning_rate": 0.0006863039060567947, + "loss": 1.25563729, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.04296875, + "step": 32, + "time_per_iteration": 2.733224868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505923, + "balance_loss_mlp": 2.00390816, + "epoch": 0.006348595613697576, + "flos": 619441107456.0, + "grad_norm": 0.0835016489973258, + "language_loss": 1.14437437, + "learning_rate": 0.0006923974775611263, + "loss": 1.16943359, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.015625, + "step": 33, + "time_per_iteration": 2.820788621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02482464, + "balance_loss_mlp": 1.98159432, + "epoch": 0.006540977298961139, + "flos": 779298908160.0, + "grad_norm": 0.08776573315434787, + "language_loss": 1.10869515, + "learning_rate": 0.0006983091239737814, + "loss": 1.13351965, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.00390625, + "step": 34, + "time_per_iteration": 2.9917590618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02373805, + "balance_loss_mlp": 1.87636864, + "epoch": 0.006733358984224702, + "flos": 668372201472.0, + "grad_norm": 0.0744368555221442, + "language_loss": 1.09626412, + "learning_rate": 0.0007040493939600222, + "loss": 1.12000227, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 4.96875, + "step": 35, + "time_per_iteration": 2.813040256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308046, + "balance_loss_mlp": 1.81175399, + "epoch": 0.006925740669488265, + "flos": 565495162368.0, + "grad_norm": 0.06560236116646054, + "language_loss": 1.0974791, + "learning_rate": 0.0007096279445021078, + "loss": 1.12055957, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 4.95703125, + "step": 36, + "time_per_iteration": 2.715013027191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02240602, + "balance_loss_mlp": 1.74888754, + "epoch": 0.007118122354751828, + "flos": 551111347200.0, + "grad_norm": 0.05581405617561486, + "language_loss": 1.16120386, + "learning_rate": 0.0007150536386503726, + "loss": 1.18360972, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.91015625, + "step": 37, + "time_per_iteration": 2.8262643814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218804, + "balance_loss_mlp": 1.7293781, + "epoch": 0.007310504040015391, + "flos": 703813807104.0, + "grad_norm": 0.06412720029508237, + "language_loss": 1.08394384, + "learning_rate": 0.0007203346302358509, + "loss": 1.10613179, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.890625, + "step": 38, + "time_per_iteration": 2.9149320125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0220325, + "balance_loss_mlp": 1.71954608, + "epoch": 0.007502885725278953, + "flos": 600500338176.0, + "grad_norm": 0.08018675586540955, + "language_loss": 1.13587177, + "learning_rate": 0.000725478437577282, + "loss": 1.15790427, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.84375, + "step": 39, + "time_per_iteration": 2.7649383544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194939, + "balance_loss_mlp": 1.71237946, + "epoch": 0.007695267410542516, + "flos": 561427031040.0, + "grad_norm": 0.11080304178085185, + "language_loss": 1.08546591, + "learning_rate": 0.0007304920078549186, + "loss": 1.10741532, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.83203125, + "step": 40, + "time_per_iteration": 2.7245187759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02164234, + "balance_loss_mlp": 1.68548942, + "epoch": 0.007887649095806078, + "flos": 509230808064.0, + "grad_norm": 0.12864951336881933, + "language_loss": 1.10053396, + "learning_rate": 0.0007353817735343603, + "loss": 1.12217629, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.79296875, + "step": 41, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02109951, + "balance_loss_mlp": 1.63425827, + "epoch": 0.008080030781069641, + "flos": 504904166400.0, + "grad_norm": 0.0888118324595499, + "language_loss": 1.05816543, + "learning_rate": 0.0007401537019902344, + "loss": 1.07926488, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.76171875, + "step": 42, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065976, + "balance_loss_mlp": 1.59219027, + "epoch": 0.008272412466333205, + "flos": 519106059264.0, + "grad_norm": 0.08974821197730459, + "language_loss": 1.0785954, + "learning_rate": 0.0007448133392900729, + "loss": 1.09925508, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.7421875, + "step": 43, + "time_per_iteration": 2.677175998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955434, + "balance_loss_mlp": 1.4839375, + "epoch": 0.008464794151596768, + "flos": 609183820800.0, + "grad_norm": 0.06237767914218564, + "language_loss": 1.03785229, + "learning_rate": 0.0007493658489441491, + "loss": 1.05740666, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.71875, + "step": 44, + "time_per_iteration": 2.8553237915039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01864539, + "balance_loss_mlp": 1.39800107, + "epoch": 0.00865717583686033, + "flos": 539006283264.0, + "grad_norm": 0.049849947719683325, + "language_loss": 1.08088911, + "learning_rate": 0.0007538160463002316, + "loss": 1.09953451, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.66796875, + "step": 45, + "time_per_iteration": 2.637796640396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01780353, + "balance_loss_mlp": 1.31572247, + "epoch": 0.008849557522123894, + "flos": 509009227776.0, + "grad_norm": 0.046919324832442044, + "language_loss": 1.11748755, + "learning_rate": 0.0007581684291577274, + "loss": 1.1352911, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.6484375, + "step": 46, + "time_per_iteration": 2.5655901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764453, + "balance_loss_mlp": 1.30211222, + "epoch": 0.009041939207387457, + "flos": 626507125248.0, + "grad_norm": 0.05937298040562763, + "language_loss": 1.13580585, + "learning_rate": 0.0007624272050891776, + "loss": 1.15345049, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.625, + "step": 47, + "time_per_iteration": 2.804643392562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776852, + "balance_loss_mlp": 1.31908798, + "epoch": 0.00923432089265102, + "flos": 550609789440.0, + "grad_norm": 0.07500714899038924, + "language_loss": 1.03489327, + "learning_rate": 0.0007665963158851307, + "loss": 1.05266178, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.578125, + "step": 48, + "time_per_iteration": 2.781435489654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01771411, + "balance_loss_mlp": 1.3170805, + "epoch": 0.009426702577914583, + "flos": 563678310912.0, + "grad_norm": 0.07921486390615404, + "language_loss": 1.12758589, + "learning_rate": 0.0007706794594783609, + "loss": 1.14529991, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.54296875, + "step": 49, + "time_per_iteration": 2.739976644515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017484, + "balance_loss_mlp": 1.29483247, + "epoch": 0.009619084263178146, + "flos": 617925700608.0, + "grad_norm": 0.05671895540127436, + "language_loss": 1.10915053, + "learning_rate": 0.0007746801096530423, + "loss": 1.12663448, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.53515625, + "step": 50, + "time_per_iteration": 2.7333760261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01715641, + "balance_loss_mlp": 1.2616924, + "epoch": 0.009811465948441709, + "flos": 542488263168.0, + "grad_norm": 0.04785443300923319, + "language_loss": 1.16231108, + "learning_rate": 0.0007786015338021173, + "loss": 1.17946756, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.5390625, + "step": 51, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01700387, + "balance_loss_mlp": 1.24720073, + "epoch": 0.010003847633705272, + "flos": 536976583680.0, + "grad_norm": 0.04536583817216675, + "language_loss": 1.08076, + "learning_rate": 0.0007824468089603051, + "loss": 1.0977639, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.53125, + "step": 52, + "time_per_iteration": 2.6839513778686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01675834, + "balance_loss_mlp": 1.2218852, + "epoch": 0.010196229318968833, + "flos": 910805316096.0, + "grad_norm": 0.04374839581732082, + "language_loss": 1.0833261, + "learning_rate": 0.0007862188363098669, + "loss": 1.10008454, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.5390625, + "step": 53, + "time_per_iteration": 3.1748838424682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01650634, + "balance_loss_mlp": 1.19477725, + "epoch": 0.010388611004232396, + "flos": 586969190400.0, + "grad_norm": 0.045477377455174536, + "language_loss": 1.08262885, + "learning_rate": 0.0007899203543304438, + "loss": 1.09913516, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.55859375, + "step": 54, + "time_per_iteration": 2.7011117935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01588572, + "balance_loss_mlp": 1.13195276, + "epoch": 0.01058099268949596, + "flos": 503471351808.0, + "grad_norm": 0.05216939031034974, + "language_loss": 1.22650576, + "learning_rate": 0.0007935539507422731, + "loss": 1.24239147, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.56640625, + "step": 55, + "time_per_iteration": 2.6142656803131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553155, + "balance_loss_mlp": 1.09462798, + "epoch": 0.010773374374759523, + "flos": 545558008320.0, + "grad_norm": 0.04278176221573414, + "language_loss": 1.12836909, + "learning_rate": 0.0007971220733732573, + "loss": 1.14390063, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.5859375, + "step": 56, + "time_per_iteration": 2.718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586959, + "balance_loss_mlp": 1.1318655, + "epoch": 0.010965756060023086, + "flos": 527285982720.0, + "grad_norm": 0.06958617519474361, + "language_loss": 1.08844507, + "learning_rate": 0.0008006270400641869, + "loss": 1.10431468, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.55078125, + "step": 57, + "time_per_iteration": 2.702324628829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01576177, + "balance_loss_mlp": 1.12375367, + "epoch": 0.011158137745286649, + "flos": 578097054720.0, + "grad_norm": 0.08376433329063605, + "language_loss": 1.09231043, + "learning_rate": 0.0008040710477125043, + "loss": 1.10807228, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.5234375, + "step": 58, + "time_per_iteration": 2.733733892440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587306, + "balance_loss_mlp": 1.13793492, + "epoch": 0.011350519430550212, + "flos": 530314068480.0, + "grad_norm": 0.056261163559927586, + "language_loss": 1.098104, + "learning_rate": 0.0008074561805429771, + "loss": 1.11397719, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.4921875, + "step": 59, + "time_per_iteration": 2.604173183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_mlp": 1.0886867, + "epoch": 0.011542901115813775, + "flos": 556970133504.0, + "grad_norm": 0.07546157909609297, + "language_loss": 1.07214928, + "learning_rate": 0.0008107844176832545, + "loss": 1.08748412, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.45703125, + "step": 60, + "time_per_iteration": 2.670180082321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01515203, + "balance_loss_mlp": 1.07155395, + "epoch": 0.011735282801077338, + "flos": 573175529472.0, + "grad_norm": 0.06932920743779293, + "language_loss": 1.09267807, + "learning_rate": 0.0008140576401132568, + "loss": 1.10783005, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.44921875, + "step": 61, + "time_per_iteration": 2.635917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01537914, + "balance_loss_mlp": 1.0965538, + "epoch": 0.0119276644863409, + "flos": 616716467712.0, + "grad_norm": 0.056166475672555005, + "language_loss": 1.10548615, + "learning_rate": 0.0008172776370494935, + "loss": 1.12086535, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.42578125, + "step": 62, + "time_per_iteration": 2.709764242172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.015397, + "balance_loss_mlp": 1.10024714, + "epoch": 0.012120046171604464, + "flos": 502084199424.0, + "grad_norm": 0.046962065793300374, + "language_loss": 1.17909575, + "learning_rate": 0.0008204461118185703, + "loss": 1.19449282, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.40625, + "step": 63, + "time_per_iteration": 2.5971004962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545078, + "balance_loss_mlp": 1.10943925, + "epoch": 0.012312427856868027, + "flos": 474301493760.0, + "grad_norm": 0.04671162143151921, + "language_loss": 1.07277906, + "learning_rate": 0.0008235646872681536, + "loss": 1.08822989, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 4.3671875, + "step": 64, + "time_per_iteration": 2.567622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01534227, + "balance_loss_mlp": 1.10240316, + "epoch": 0.012504809542131588, + "flos": 539470910976.0, + "grad_norm": 0.04435006978162803, + "language_loss": 1.0673492, + "learning_rate": 0.0008266349107584288, + "loss": 1.08269131, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 4.328125, + "step": 65, + "time_per_iteration": 2.6833384037017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149994, + "balance_loss_mlp": 1.07345641, + "epoch": 0.012697191227395151, + "flos": 609856567296.0, + "grad_norm": 0.04524096047594039, + "language_loss": 1.09403265, + "learning_rate": 0.0008296582587724851, + "loss": 1.10903215, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 4.2734375, + "step": 66, + "time_per_iteration": 2.692337989807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01482262, + "balance_loss_mlp": 1.05806744, + "epoch": 0.012889572912658714, + "flos": 769397460480.0, + "grad_norm": 0.04198159389490698, + "language_loss": 1.06809163, + "learning_rate": 0.0008326361411800136, + "loss": 1.08291411, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 4.25, + "step": 67, + "time_per_iteration": 2.923720598220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474655, + "balance_loss_mlp": 1.05503809, + "epoch": 0.013081954597922277, + "flos": 535020744192.0, + "grad_norm": 0.041919130945389606, + "language_loss": 1.07100165, + "learning_rate": 0.0008355699051851403, + "loss": 1.0857482, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 4.203125, + "step": 68, + "time_per_iteration": 2.7417044639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462817, + "balance_loss_mlp": 1.04701531, + "epoch": 0.01327433628318584, + "flos": 574180646400.0, + "grad_norm": 0.041322055356332446, + "language_loss": 1.14468551, + "learning_rate": 0.0008384608389860635, + "loss": 1.15931368, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 4.1640625, + "step": 69, + "time_per_iteration": 2.6545376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450151, + "balance_loss_mlp": 1.03930819, + "epoch": 0.013466717968449404, + "flos": 498259115520.0, + "grad_norm": 0.039605765449237204, + "language_loss": 1.04742777, + "learning_rate": 0.000841310175171381, + "loss": 1.06192923, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 4.11328125, + "step": 70, + "time_per_iteration": 2.5687999725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441096, + "balance_loss_mlp": 1.03101599, + "epoch": 0.013659099653712967, + "flos": 566621803008.0, + "grad_norm": 0.03646297128801074, + "language_loss": 1.03104186, + "learning_rate": 0.000844119093875517, + "loss": 1.04545283, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 4.1015625, + "step": 71, + "time_per_iteration": 2.698259115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433469, + "balance_loss_mlp": 1.02720368, + "epoch": 0.01385148133897653, + "flos": 574942715904.0, + "grad_norm": 0.02854119406997066, + "language_loss": 1.07372236, + "learning_rate": 0.0008468887257134666, + "loss": 1.08805704, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 4.06445312, + "step": 72, + "time_per_iteration": 2.7074387073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422625, + "balance_loss_mlp": 1.01941192, + "epoch": 0.014043863024240093, + "flos": 577958066688.0, + "grad_norm": 0.03113282173853564, + "language_loss": 1.10314119, + "learning_rate": 0.0008496201545131264, + "loss": 1.11736751, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 4.03515625, + "step": 73, + "time_per_iteration": 2.725660562515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425762, + "balance_loss_mlp": 1.02655351, + "epoch": 0.014236244709503656, + "flos": 940263883776.0, + "grad_norm": 0.033199488198319166, + "language_loss": 1.07624495, + "learning_rate": 0.0008523144198617317, + "loss": 1.0905025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.99414062, + "step": 74, + "time_per_iteration": 3.2577481269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437934, + "balance_loss_mlp": 1.04139662, + "epoch": 0.014428626394767219, + "flos": 529495603200.0, + "grad_norm": 0.03119178099318558, + "language_loss": 1.07016373, + "learning_rate": 0.0008549725194813783, + "loss": 1.08454299, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.96679688, + "step": 75, + "time_per_iteration": 2.727982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437754, + "balance_loss_mlp": 1.0446496, + "epoch": 0.014621008080030782, + "flos": 805282226688.0, + "grad_norm": 0.02968258762679391, + "language_loss": 1.06415534, + "learning_rate": 0.0008575954114472099, + "loss": 1.07853293, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.93164062, + "step": 76, + "time_per_iteration": 3.172807455062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143975, + "balance_loss_mlp": 1.04950643, + "epoch": 0.014813389765294343, + "flos": 698356521984.0, + "grad_norm": 0.031905123056971844, + "language_loss": 1.03629625, + "learning_rate": 0.0008601840162606118, + "loss": 1.05069387, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.90234375, + "step": 77, + "time_per_iteration": 3.029114007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438585, + "balance_loss_mlp": 1.05158365, + "epoch": 0.015005771450557906, + "flos": 598164464640.0, + "grad_norm": 0.026994348673938514, + "language_loss": 1.09661531, + "learning_rate": 0.000862739218788641, + "loss": 1.11100101, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.86914062, + "step": 78, + "time_per_iteration": 2.795952320098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440626, + "balance_loss_mlp": 1.05705774, + "epoch": 0.01519815313582147, + "flos": 550492268544.0, + "grad_norm": 0.029495859587709627, + "language_loss": 1.07574832, + "learning_rate": 0.0008652618700799138, + "loss": 1.09015465, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.83789062, + "step": 79, + "time_per_iteration": 2.6552224159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430975, + "balance_loss_mlp": 1.05084014, + "epoch": 0.015390534821085032, + "flos": 431440032768.0, + "grad_norm": 0.037998818197719206, + "language_loss": 1.07206631, + "learning_rate": 0.0008677527890662774, + "loss": 1.08637595, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.80664062, + "step": 80, + "time_per_iteration": 2.530073881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424927, + "balance_loss_mlp": 1.04727161, + "epoch": 0.015582916506348595, + "flos": 525184424448.0, + "grad_norm": 0.03521308344632083, + "language_loss": 1.08168781, + "learning_rate": 0.0008702127641587799, + "loss": 1.09593713, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.78125, + "step": 81, + "time_per_iteration": 2.6248533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01426595, + "balance_loss_mlp": 1.05141926, + "epoch": 0.015775298191612157, + "flos": 576616576512.0, + "grad_norm": 0.026523126631237747, + "language_loss": 1.036394, + "learning_rate": 0.0008726425547457192, + "loss": 1.05065989, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.75585938, + "step": 82, + "time_per_iteration": 2.759159564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424967, + "balance_loss_mlp": 1.05303442, + "epoch": 0.01596767987687572, + "flos": 611439103488.0, + "grad_norm": 0.03656915183129864, + "language_loss": 1.03032446, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457414, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.72265625, + "step": 83, + "time_per_iteration": 2.739105224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431924, + "balance_loss_mlp": 1.06151688, + "epoch": 0.016160061562139283, + "flos": 568232537088.0, + "grad_norm": 0.03323001720600938, + "language_loss": 1.08511543, + "learning_rate": 0.0008774144832015932, + "loss": 1.09943461, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.70703125, + "step": 84, + "time_per_iteration": 2.7144806385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02085876, + "balance_loss_mlp": 1.68762207, + "epoch": 0.016352443247402846, + "flos": 1414499701248.0, + "grad_norm": 0.1388747380481991, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76860189, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.984375, + "step": 85, + "time_per_iteration": 4.569611072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450774, + "balance_loss_mlp": 1.08532572, + "epoch": 0.01654482493266641, + "flos": 731785165824.0, + "grad_norm": 0.04601998260491519, + "language_loss": 1.03772068, + "learning_rate": 0.0008820741205014318, + "loss": 1.05222845, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.65625, + "step": 86, + "time_per_iteration": 2.8604419231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014606, + "balance_loss_mlp": 1.09744096, + "epoch": 0.016737206617929972, + "flos": 537404281344.0, + "grad_norm": 0.03433335749497543, + "language_loss": 1.05140662, + "learning_rate": 0.0008843634575408404, + "loss": 1.06601262, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.62695312, + "step": 87, + "time_per_iteration": 2.677731513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145769, + "balance_loss_mlp": 1.09777355, + "epoch": 0.016929588303193535, + "flos": 538129420800.0, + "grad_norm": 0.05036212092144492, + "language_loss": 1.06815004, + "learning_rate": 0.0008866266301555082, + "loss": 1.08272696, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.59765625, + "step": 88, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145347, + "balance_loss_mlp": 1.09622347, + "epoch": 0.017121969988457098, + "flos": 527791543296.0, + "grad_norm": 0.030252065691096418, + "language_loss": 1.07441962, + "learning_rate": 0.0008888642296509615, + "loss": 1.08895445, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.56445312, + "step": 89, + "time_per_iteration": 2.590280771255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145473, + "balance_loss_mlp": 1.10034442, + "epoch": 0.01731435167372066, + "flos": 626767636992.0, + "grad_norm": 0.041554939890322294, + "language_loss": 1.12743318, + "learning_rate": 0.0008910768275115906, + "loss": 1.14198053, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.54101562, + "step": 90, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145373, + "balance_loss_mlp": 1.10220587, + "epoch": 0.017506733358984224, + "flos": 497384254464.0, + "grad_norm": 0.05646737130307679, + "language_loss": 1.07978606, + "learning_rate": 0.0008932649762767675, + "loss": 1.0943234, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.50976562, + "step": 91, + "time_per_iteration": 2.5964808464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01457202, + "balance_loss_mlp": 1.10911036, + "epoch": 0.017699115044247787, + "flos": 747217758720.0, + "grad_norm": 0.04050166442287704, + "language_loss": 1.1018101, + "learning_rate": 0.0008954292103690864, + "loss": 1.11638212, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.47851562, + "step": 92, + "time_per_iteration": 2.9288997650146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01459372, + "balance_loss_mlp": 1.11395121, + "epoch": 0.01789149672951135, + "flos": 516520407552.0, + "grad_norm": 0.054281950557984966, + "language_loss": 1.12496912, + "learning_rate": 0.0008975700468778296, + "loss": 1.13956285, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.45117188, + "step": 93, + "time_per_iteration": 2.5800487995147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462727, + "balance_loss_mlp": 1.11978543, + "epoch": 0.018083878414774913, + "flos": 587229702144.0, + "grad_norm": 0.04557553976021738, + "language_loss": 1.05795836, + "learning_rate": 0.0008996879863005366, + "loss": 1.07258558, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.42578125, + "step": 94, + "time_per_iteration": 2.6668198108673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146929, + "balance_loss_mlp": 1.12882805, + "epoch": 0.018276260100038477, + "flos": 498369905664.0, + "grad_norm": 0.055406629054909326, + "language_loss": 1.06168532, + "learning_rate": 0.0009017835132453337, + "loss": 1.07637823, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.40234375, + "step": 95, + "time_per_iteration": 2.588728904724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146889, + "balance_loss_mlp": 1.1312896, + "epoch": 0.01846864178530204, + "flos": 641232043008.0, + "grad_norm": 0.04012691806662063, + "language_loss": 1.05874133, + "learning_rate": 0.0009038570970964896, + "loss": 1.0734303, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.37890625, + "step": 96, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464817, + "balance_loss_mlp": 1.12912345, + "epoch": 0.018661023470565603, + "flos": 512667125760.0, + "grad_norm": 0.027884025705687265, + "language_loss": 1.03269148, + "learning_rate": 0.0009059091926454854, + "loss": 1.04733968, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.359375, + "step": 97, + "time_per_iteration": 2.6100950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470726, + "balance_loss_mlp": 1.13694024, + "epoch": 0.018853405155829166, + "flos": 932696308224.0, + "grad_norm": 0.03936003805775877, + "language_loss": 1.02435613, + "learning_rate": 0.0009079402406897198, + "loss": 1.03906357, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.33984375, + "step": 98, + "time_per_iteration": 3.2489542961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467854, + "balance_loss_mlp": 1.13616598, + "epoch": 0.01904578684109273, + "flos": 577586764800.0, + "grad_norm": 0.036005296184057074, + "language_loss": 1.04073858, + "learning_rate": 0.0009099506686008212, + "loss": 1.05541718, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.31835938, + "step": 99, + "time_per_iteration": 2.7905051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467812, + "balance_loss_mlp": 1.13822246, + "epoch": 0.019238168526356292, + "flos": 559520856576.0, + "grad_norm": 0.02696843746399884, + "language_loss": 1.07409596, + "learning_rate": 0.0009119408908644013, + "loss": 1.08877409, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.296875, + "step": 100, + "time_per_iteration": 2.7075607776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456893, + "balance_loss_mlp": 1.12882876, + "epoch": 0.019430550211619855, + "flos": 725103184896.0, + "grad_norm": 0.03304065923870771, + "language_loss": 1.12780023, + "learning_rate": 0.0009139113095929519, + "loss": 1.14236927, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.28125, + "step": 101, + "time_per_iteration": 2.86230731010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460167, + "balance_loss_mlp": 1.13439226, + "epoch": 0.019622931896883418, + "flos": 500456001024.0, + "grad_norm": 0.030619133870748612, + "language_loss": 1.06594038, + "learning_rate": 0.0009158623150134762, + "loss": 1.08054209, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 3.2578125, + "step": 102, + "time_per_iteration": 2.563690185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458611, + "balance_loss_mlp": 1.13569677, + "epoch": 0.01981531358214698, + "flos": 510281587200.0, + "grad_norm": 0.03276303076426602, + "language_loss": 1.06164801, + "learning_rate": 0.000917794285931332, + "loss": 1.0762341, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 3.22851562, + "step": 103, + "time_per_iteration": 2.6599903106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462945, + "balance_loss_mlp": 1.1421293, + "epoch": 0.020007695267410544, + "flos": 522392655360.0, + "grad_norm": 0.026505304013468463, + "language_loss": 0.98227251, + "learning_rate": 0.0009197075901716639, + "loss": 0.99690199, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 3.20703125, + "step": 104, + "time_per_iteration": 2.726245880126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469463, + "balance_loss_mlp": 1.14998221, + "epoch": 0.020200076952674107, + "flos": 534443324928.0, + "grad_norm": 0.029933884589862427, + "language_loss": 1.08736229, + "learning_rate": 0.0009216025849997171, + "loss": 1.10205698, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 3.19335938, + "step": 105, + "time_per_iteration": 2.8023486137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468836, + "balance_loss_mlp": 1.15221632, + "epoch": 0.020392458637937667, + "flos": 686082270720.0, + "grad_norm": 0.024520994280375335, + "language_loss": 1.03054178, + "learning_rate": 0.0009234796175212258, + "loss": 1.04523015, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 3.1640625, + "step": 106, + "time_per_iteration": 2.9396088123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469456, + "balance_loss_mlp": 1.15512502, + "epoch": 0.02058484032320123, + "flos": 703414307328.0, + "grad_norm": 0.02898567585615155, + "language_loss": 1.07201982, + "learning_rate": 0.000925339025064007, + "loss": 1.08671439, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 3.140625, + "step": 107, + "time_per_iteration": 2.9473297595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_mlp": 1.16001439, + "epoch": 0.020777222008464793, + "flos": 640326982656.0, + "grad_norm": 0.02770789473723963, + "language_loss": 0.99879742, + "learning_rate": 0.0009271811355418027, + "loss": 1.01352561, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 3.125, + "step": 108, + "time_per_iteration": 2.8551387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469504, + "balance_loss_mlp": 1.15803361, + "epoch": 0.020969603693728356, + "flos": 683320700928.0, + "grad_norm": 0.029161506766480293, + "language_loss": 1.06637371, + "learning_rate": 0.0009290062678013548, + "loss": 1.08106875, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 3.11132812, + "step": 109, + "time_per_iteration": 2.821951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468675, + "balance_loss_mlp": 1.15949392, + "epoch": 0.02116198537899192, + "flos": 534419129856.0, + "grad_norm": 0.03188637458086245, + "language_loss": 1.05070233, + "learning_rate": 0.0009308147319536321, + "loss": 1.06538928, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 3.08789062, + "step": 110, + "time_per_iteration": 2.6315042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469018, + "balance_loss_mlp": 1.16212535, + "epoch": 0.021354367064255482, + "flos": 718727377920.0, + "grad_norm": 0.030955966903197116, + "language_loss": 1.11490715, + "learning_rate": 0.0009326068296900676, + "loss": 1.12959719, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 3.06445312, + "step": 111, + "time_per_iteration": 2.8208162784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474326, + "balance_loss_mlp": 1.16934085, + "epoch": 0.021546748749519045, + "flos": 520623467520.0, + "grad_norm": 0.027870670355515197, + "language_loss": 1.02138007, + "learning_rate": 0.0009343828545846161, + "loss": 1.03612328, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 3.04492188, + "step": 112, + "time_per_iteration": 2.759277105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474098, + "balance_loss_mlp": 1.17063916, + "epoch": 0.021739130434782608, + "flos": 506161062912.0, + "grad_norm": 0.03372988233582904, + "language_loss": 1.06662297, + "learning_rate": 0.0009361430923823841, + "loss": 1.08136404, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 3.02929688, + "step": 113, + "time_per_iteration": 2.565107822418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471087, + "balance_loss_mlp": 1.1693449, + "epoch": 0.02193151212004617, + "flos": 464426242560.0, + "grad_norm": 0.03803370713592907, + "language_loss": 1.10115385, + "learning_rate": 0.0009378878212755459, + "loss": 1.11586463, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 3.01171875, + "step": 114, + "time_per_iteration": 2.491929292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471993, + "balance_loss_mlp": 1.17253923, + "epoch": 0.022123893805309734, + "flos": 553331701248.0, + "grad_norm": 0.029753755152528143, + "language_loss": 1.00006115, + "learning_rate": 0.0009396173121672103, + "loss": 1.014781, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.98828125, + "step": 115, + "time_per_iteration": 2.6869561672210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473585, + "balance_loss_mlp": 1.1754663, + "epoch": 0.022316275490573297, + "flos": 637378761216.0, + "grad_norm": 0.032022590728611564, + "language_loss": 1.0593642, + "learning_rate": 0.0009413318289238633, + "loss": 1.07410002, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.97460938, + "step": 116, + "time_per_iteration": 2.7639846801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474428, + "balance_loss_mlp": 1.17859828, + "epoch": 0.02250865717583686, + "flos": 800315039232.0, + "grad_norm": 0.032750944460810345, + "language_loss": 0.98115921, + "learning_rate": 0.0009430316286169771, + "loss": 0.99590349, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.95117188, + "step": 117, + "time_per_iteration": 3.020703077316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469481, + "balance_loss_mlp": 1.17536783, + "epoch": 0.022701038861100423, + "flos": 457062782976.0, + "grad_norm": 0.027209249322999743, + "language_loss": 1.0327785, + "learning_rate": 0.0009447169617543361, + "loss": 1.04747331, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.9375, + "step": 118, + "time_per_iteration": 2.5938501358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466386, + "balance_loss_mlp": 1.17437065, + "epoch": 0.022893420546363986, + "flos": 584186153472.0, + "grad_norm": 0.028075325054819567, + "language_loss": 1.10005641, + "learning_rate": 0.0009463880725016029, + "loss": 1.11472011, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.91992188, + "step": 119, + "time_per_iteration": 2.7082488536834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467196, + "balance_loss_mlp": 1.17861414, + "epoch": 0.02308580223162755, + "flos": 562477810176.0, + "grad_norm": 0.032360539397207934, + "language_loss": 1.05048943, + "learning_rate": 0.0009480451988946134, + "loss": 1.06516147, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.89257812, + "step": 120, + "time_per_iteration": 2.808687686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461098, + "balance_loss_mlp": 1.17423272, + "epoch": 0.023278183916891113, + "flos": 772645125120.0, + "grad_norm": 0.033180722862994706, + "language_loss": 1.06113267, + "learning_rate": 0.0009496885730428627, + "loss": 1.07574379, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.875, + "step": 121, + "time_per_iteration": 3.0043137073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466426, + "balance_loss_mlp": 1.18070555, + "epoch": 0.023470565602154676, + "flos": 554430144000.0, + "grad_norm": 0.030787275004595428, + "language_loss": 1.04567683, + "learning_rate": 0.0009513184213246156, + "loss": 1.06034112, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.86328125, + "step": 122, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462554, + "balance_loss_mlp": 1.17835939, + "epoch": 0.02366294728741824, + "flos": 561166519296.0, + "grad_norm": 0.030499039091632818, + "language_loss": 1.08099937, + "learning_rate": 0.0009529349645740552, + "loss": 1.09562504, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.84765625, + "step": 123, + "time_per_iteration": 2.69850492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460088, + "balance_loss_mlp": 1.17741883, + "epoch": 0.0238553289726818, + "flos": 469516955136.0, + "grad_norm": 0.026549221517309443, + "language_loss": 1.06623578, + "learning_rate": 0.0009545384182608524, + "loss": 1.08083653, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.83203125, + "step": 124, + "time_per_iteration": 2.5435874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462583, + "balance_loss_mlp": 1.18144011, + "epoch": 0.024047710657945365, + "flos": 561103392768.0, + "grad_norm": 0.03287811385355005, + "language_loss": 1.04055512, + "learning_rate": 0.0009561289926625252, + "loss": 1.05518079, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.81640625, + "step": 125, + "time_per_iteration": 2.6661720275878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464029, + "balance_loss_mlp": 1.18460226, + "epoch": 0.024240092343208928, + "flos": 505770295296.0, + "grad_norm": 0.030159442314643806, + "language_loss": 1.08985233, + "learning_rate": 0.0009577068930299292, + "loss": 1.10449266, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.79882812, + "step": 126, + "time_per_iteration": 2.596027135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456959, + "balance_loss_mlp": 1.17944014, + "epoch": 0.02443247402847249, + "flos": 436752325632.0, + "grad_norm": 0.03465787530540315, + "language_loss": 1.04454637, + "learning_rate": 0.0009592723197462087, + "loss": 1.05911589, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.77929688, + "step": 127, + "time_per_iteration": 2.6355836391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145855, + "balance_loss_mlp": 1.18236613, + "epoch": 0.024624855713736054, + "flos": 685068421632.0, + "grad_norm": 0.03103018628328697, + "language_loss": 1.00976562, + "learning_rate": 0.0009608254684795125, + "loss": 1.02435124, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.765625, + "step": 128, + "time_per_iteration": 2.956745147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01452077, + "balance_loss_mlp": 1.17741859, + "epoch": 0.024817237398999614, + "flos": 526113679872.0, + "grad_norm": 0.03378324138815482, + "language_loss": 1.03947771, + "learning_rate": 0.0009623665303297678, + "loss": 1.05399847, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.75, + "step": 129, + "time_per_iteration": 2.762612819671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145448, + "balance_loss_mlp": 1.18115723, + "epoch": 0.025009619084263177, + "flos": 656886216192.0, + "grad_norm": 0.03318348770393379, + "language_loss": 1.08023834, + "learning_rate": 0.0009638956919697878, + "loss": 1.09478307, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.73339844, + "step": 130, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453293, + "balance_loss_mlp": 1.18130565, + "epoch": 0.02520200076952674, + "flos": 455369456640.0, + "grad_norm": 0.028803226470227133, + "language_loss": 1.00211501, + "learning_rate": 0.0009654131357809714, + "loss": 1.01664793, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.71875, + "step": 131, + "time_per_iteration": 2.593409776687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454951, + "balance_loss_mlp": 1.18534708, + "epoch": 0.025394382454790303, + "flos": 841268324352.0, + "grad_norm": 0.035993676074610494, + "language_loss": 1.09494662, + "learning_rate": 0.0009669190399838441, + "loss": 1.10949612, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.69824219, + "step": 132, + "time_per_iteration": 3.1307294368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454062, + "balance_loss_mlp": 1.18588877, + "epoch": 0.025586764140053866, + "flos": 582228312576.0, + "grad_norm": 0.03305283337163912, + "language_loss": 1.02299893, + "learning_rate": 0.0009684135787636724, + "loss": 1.03753948, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.68359375, + "step": 133, + "time_per_iteration": 2.8118627071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454726, + "balance_loss_mlp": 1.18798327, + "epoch": 0.02577914582531743, + "flos": 791677218816.0, + "grad_norm": 0.03011124606519955, + "language_loss": 1.06380379, + "learning_rate": 0.0009698969223913726, + "loss": 1.07835102, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.66894531, + "step": 134, + "time_per_iteration": 3.0371806621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450237, + "balance_loss_mlp": 1.18454385, + "epoch": 0.025971527510580992, + "flos": 596062906368.0, + "grad_norm": 0.030569012833979448, + "language_loss": 1.08986592, + "learning_rate": 0.0009713692373399265, + "loss": 1.10436833, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.65820312, + "step": 135, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01684837, + "balance_loss_mlp": 1.39873505, + "epoch": 0.026163909195844555, + "flos": 1581074411520.0, + "grad_norm": 0.08870187959024729, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81141067, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.8671875, + "step": 136, + "time_per_iteration": 5.94019627571106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0161422, + "balance_loss_mlp": 1.33116913, + "epoch": 0.026356290881108118, + "flos": 1505160886272.0, + "grad_norm": 0.07212137850421584, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79425257, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.8359375, + "step": 137, + "time_per_iteration": 4.865153074264526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469938, + "balance_loss_mlp": 1.20901299, + "epoch": 0.02654867256637168, + "flos": 598340382720.0, + "grad_norm": 0.040535745966457745, + "language_loss": 1.01652551, + "learning_rate": 0.0009757216201974225, + "loss": 1.03122485, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.609375, + "step": 138, + "time_per_iteration": 2.8955435752868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487517, + "balance_loss_mlp": 1.22802222, + "epoch": 0.026741054251635244, + "flos": 546135427584.0, + "grad_norm": 0.04340470282065083, + "language_loss": 1.06732666, + "learning_rate": 0.0009771514130396581, + "loss": 1.08220184, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.59472656, + "step": 139, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01498511, + "balance_loss_mlp": 1.24044681, + "epoch": 0.026933435936898807, + "flos": 507845657088.0, + "grad_norm": 0.04879945782970011, + "language_loss": 1.07520163, + "learning_rate": 0.00097857095638274, + "loss": 1.09018672, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.58007812, + "step": 140, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01492411, + "balance_loss_mlp": 1.23558652, + "epoch": 0.02712581762216237, + "flos": 742253299200.0, + "grad_norm": 0.043929969627725114, + "language_loss": 0.98754954, + "learning_rate": 0.0009799803961288726, + "loss": 1.00247359, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.5703125, + "step": 141, + "time_per_iteration": 3.008998394012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470778, + "balance_loss_mlp": 1.21567059, + "epoch": 0.027318199307425933, + "flos": 849777890304.0, + "grad_norm": 0.03716164217421175, + "language_loss": 1.04960537, + "learning_rate": 0.000981379875086876, + "loss": 1.06431305, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.55371094, + "step": 142, + "time_per_iteration": 3.057098865509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469037, + "balance_loss_mlp": 1.21535933, + "epoch": 0.027510580992689496, + "flos": 576638043648.0, + "grad_norm": 0.03712962317624948, + "language_loss": 1.00046849, + "learning_rate": 0.0009827695330590185, + "loss": 1.01515889, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.5390625, + "step": 143, + "time_per_iteration": 2.638338327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450228, + "balance_loss_mlp": 1.19750416, + "epoch": 0.02770296267795306, + "flos": 773789230080.0, + "grad_norm": 0.030455330453953735, + "language_loss": 0.99027133, + "learning_rate": 0.0009841495069248256, + "loss": 1.00477362, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.52929688, + "step": 144, + "time_per_iteration": 2.981438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441391, + "balance_loss_mlp": 1.19009781, + "epoch": 0.027895344363216622, + "flos": 570448888320.0, + "grad_norm": 0.031624263879455494, + "language_loss": 0.98723662, + "learning_rate": 0.0009855199307219871, + "loss": 1.00165045, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.51464844, + "step": 145, + "time_per_iteration": 2.6923046112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440125, + "balance_loss_mlp": 1.1903578, + "epoch": 0.028087726048480186, + "flos": 548408174592.0, + "grad_norm": 0.029995844711875903, + "language_loss": 1.00586843, + "learning_rate": 0.0009868809357244854, + "loss": 1.02026975, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.49902344, + "step": 146, + "time_per_iteration": 2.6284868717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01436833, + "balance_loss_mlp": 1.18782902, + "epoch": 0.02828010773374375, + "flos": 525872633856.0, + "grad_norm": 0.03288909570778387, + "language_loss": 1.05042541, + "learning_rate": 0.0009882326505180556, + "loss": 1.06479371, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.49121094, + "step": 147, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425728, + "balance_loss_mlp": 1.1783452, + "epoch": 0.02847248941900731, + "flos": 773771765760.0, + "grad_norm": 0.031738987003727674, + "language_loss": 1.02499485, + "learning_rate": 0.0009895752010730906, + "loss": 1.03925204, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.47460938, + "step": 148, + "time_per_iteration": 2.9316182136535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424571, + "balance_loss_mlp": 1.17785549, + "epoch": 0.028664871104270875, + "flos": 535469908992.0, + "grad_norm": 0.028294299214345536, + "language_loss": 1.0900923, + "learning_rate": 0.0009909087108150867, + "loss": 1.10433793, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.46777344, + "step": 149, + "time_per_iteration": 2.697423219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014274, + "balance_loss_mlp": 1.18182933, + "epoch": 0.028857252789534438, + "flos": 368604487680.0, + "grad_norm": 0.03525963963400797, + "language_loss": 1.09753942, + "learning_rate": 0.0009922333006927371, + "loss": 1.11181331, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.45605469, + "step": 150, + "time_per_iteration": 2.483644723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433542, + "balance_loss_mlp": 1.18911529, + "epoch": 0.029049634474798, + "flos": 516483477504.0, + "grad_norm": 0.03341635886009217, + "language_loss": 1.03220332, + "learning_rate": 0.0009935490892437632, + "loss": 1.04653883, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.44433594, + "step": 151, + "time_per_iteration": 2.604599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438911, + "balance_loss_mlp": 1.19553363, + "epoch": 0.029242016160061564, + "flos": 589348724736.0, + "grad_norm": 0.030166761621646727, + "language_loss": 1.01782072, + "learning_rate": 0.0009948561926585687, + "loss": 1.03220987, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.43359375, + "step": 152, + "time_per_iteration": 2.7724709510803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445258, + "balance_loss_mlp": 1.20350146, + "epoch": 0.029434397845325123, + "flos": 553136317440.0, + "grad_norm": 0.030739210798008048, + "language_loss": 1.05873716, + "learning_rate": 0.0009961547248418122, + "loss": 1.07318974, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.41699219, + "step": 153, + "time_per_iteration": 2.6247737407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440878, + "balance_loss_mlp": 1.19988418, + "epoch": 0.029626779530588686, + "flos": 604607400960.0, + "grad_norm": 0.030186385343499288, + "language_loss": 1.02632022, + "learning_rate": 0.0009974447974719707, + "loss": 1.04072905, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.40917969, + "step": 154, + "time_per_iteration": 2.730053663253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431891, + "balance_loss_mlp": 1.19194651, + "epoch": 0.02981916121585225, + "flos": 622217413632.0, + "grad_norm": 0.02801027733601246, + "language_loss": 1.04305005, + "learning_rate": 0.0009987265200589763, + "loss": 1.05736899, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.3984375, + "step": 155, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423605, + "balance_loss_mlp": 1.18537688, + "epoch": 0.030011542901115813, + "flos": 662879987712.0, + "grad_norm": 0.0349007823819893, + "language_loss": 1.04218483, + "learning_rate": 0.001, + "loss": 1.05642092, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.38085938, + "step": 156, + "time_per_iteration": 2.8801028728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420835, + "balance_loss_mlp": 1.18289316, + "epoch": 0.030203924586379376, + "flos": 652818084864.0, + "grad_norm": 0.029403473562715665, + "language_loss": 1.01930022, + "learning_rate": 0.0009999999029413921, + "loss": 1.03350854, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.37792969, + "step": 157, + "time_per_iteration": 2.8549368381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415444, + "balance_loss_mlp": 1.17921925, + "epoch": 0.03039630627164294, + "flos": 532443824640.0, + "grad_norm": 0.03295212675068383, + "language_loss": 1.02716291, + "learning_rate": 0.0009999996117656068, + "loss": 1.04131734, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.36035156, + "step": 158, + "time_per_iteration": 2.6989729404449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_mlp": 1.17530584, + "epoch": 0.030588687956906502, + "flos": 587294830080.0, + "grad_norm": 0.0291076208082698, + "language_loss": 0.96305156, + "learning_rate": 0.0009999991264727564, + "loss": 0.97715545, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.34863281, + "step": 159, + "time_per_iteration": 2.7609338760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140999, + "balance_loss_mlp": 1.1752907, + "epoch": 0.030781069642170065, + "flos": 514286592000.0, + "grad_norm": 0.030494101007586163, + "language_loss": 1.0725081, + "learning_rate": 0.0009999984470630296, + "loss": 1.08660805, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.34472656, + "step": 160, + "time_per_iteration": 2.5805158615112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410287, + "balance_loss_mlp": 1.17711365, + "epoch": 0.030973451327433628, + "flos": 719559304704.0, + "grad_norm": 0.025032822394785544, + "language_loss": 0.95934659, + "learning_rate": 0.0009999975735366902, + "loss": 0.97344947, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.32910156, + "step": 161, + "time_per_iteration": 3.078343629837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409543, + "balance_loss_mlp": 1.17675149, + "epoch": 0.03116583301269719, + "flos": 1111614400512.0, + "grad_norm": 0.029903967107167622, + "language_loss": 0.98009437, + "learning_rate": 0.0009999965058940775, + "loss": 0.99418974, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.32519531, + "step": 162, + "time_per_iteration": 3.49137544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_mlp": 1.17689729, + "epoch": 0.031358214697960754, + "flos": 451833082368.0, + "grad_norm": 0.11336845133687022, + "language_loss": 1.0463953, + "learning_rate": 0.0009999952441356057, + "loss": 1.06047678, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.30957031, + "step": 163, + "time_per_iteration": 2.531280755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406979, + "balance_loss_mlp": 1.17676246, + "epoch": 0.031550596383224314, + "flos": 1257085658112.0, + "grad_norm": 0.03183858769064714, + "language_loss": 1.05248928, + "learning_rate": 0.000999993788261765, + "loss": 1.06655908, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.30078125, + "step": 164, + "time_per_iteration": 3.5714328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408503, + "balance_loss_mlp": 1.17943025, + "epoch": 0.03174297806848788, + "flos": 669322924032.0, + "grad_norm": 0.03191781964215587, + "language_loss": 1.06263065, + "learning_rate": 0.00099999213827312, + "loss": 1.07671571, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.29101562, + "step": 165, + "time_per_iteration": 2.7947938442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409995, + "balance_loss_mlp": 1.18101788, + "epoch": 0.03193535975375144, + "flos": 552363514368.0, + "grad_norm": 0.03891580789868065, + "language_loss": 1.01044345, + "learning_rate": 0.000999990294170312, + "loss": 1.0245434, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.29003906, + "step": 166, + "time_per_iteration": 2.6462574005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140342, + "balance_loss_mlp": 1.17577803, + "epoch": 0.032127741439015006, + "flos": 544739543040.0, + "grad_norm": 0.03757156138401865, + "language_loss": 1.05309296, + "learning_rate": 0.0009999882559540566, + "loss": 1.06712723, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.27636719, + "step": 167, + "time_per_iteration": 2.629549503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140941, + "balance_loss_mlp": 1.18234003, + "epoch": 0.032320123124278566, + "flos": 549513348096.0, + "grad_norm": 0.028659149555752484, + "language_loss": 1.01791751, + "learning_rate": 0.000999986023625145, + "loss": 1.03201175, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.27050781, + "step": 168, + "time_per_iteration": 2.7051401138305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01589355, + "balance_loss_mlp": 1.35360718, + "epoch": 0.03251250480954213, + "flos": 1308815430144.0, + "grad_norm": 0.08201951270186027, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80513763, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.35546875, + "step": 169, + "time_per_iteration": 4.9428627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407645, + "balance_loss_mlp": 1.18257797, + "epoch": 0.03270488649480569, + "flos": 562201835520.0, + "grad_norm": 0.03970113019311383, + "language_loss": 1.02863848, + "learning_rate": 0.0009999809766328958, + "loss": 1.04271495, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.25, + "step": 170, + "time_per_iteration": 2.675811529159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415662, + "balance_loss_mlp": 1.19193029, + "epoch": 0.03289726818006926, + "flos": 483338813952.0, + "grad_norm": 0.03325277263778645, + "language_loss": 1.04760146, + "learning_rate": 0.0009999781619715177, + "loss": 1.06175804, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.23632812, + "step": 171, + "time_per_iteration": 2.5431392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01419714, + "balance_loss_mlp": 1.1972214, + "epoch": 0.03308964986533282, + "flos": 675820254720.0, + "grad_norm": 0.02950894161591202, + "language_loss": 1.04164565, + "learning_rate": 0.000999975153201402, + "loss": 1.05584288, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.22363281, + "step": 172, + "time_per_iteration": 2.812837600708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422366, + "balance_loss_mlp": 1.20044637, + "epoch": 0.033282031550596385, + "flos": 610340660736.0, + "grad_norm": 0.03086814843966846, + "language_loss": 1.02532911, + "learning_rate": 0.0009999719503237174, + "loss": 1.03955269, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 2.21777344, + "step": 173, + "time_per_iteration": 2.755462646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416936, + "balance_loss_mlp": 1.1959697, + "epoch": 0.033474413235859944, + "flos": 468995931648.0, + "grad_norm": 0.048603642070708566, + "language_loss": 1.1131072, + "learning_rate": 0.0009999685533397073, + "loss": 1.12727666, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 2.20800781, + "step": 174, + "time_per_iteration": 2.566751003265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01414495, + "balance_loss_mlp": 1.19438744, + "epoch": 0.03366679492112351, + "flos": 580714907136.0, + "grad_norm": 0.03243683176756354, + "language_loss": 1.02908182, + "learning_rate": 0.00099996496225069, + "loss": 1.04322672, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 2.19921875, + "step": 175, + "time_per_iteration": 2.67861008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407523, + "balance_loss_mlp": 1.1883682, + "epoch": 0.03385917660638707, + "flos": 638885435904.0, + "grad_norm": 0.029120554083078395, + "language_loss": 1.05784094, + "learning_rate": 0.0009999611770580604, + "loss": 1.0719161, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 2.18945312, + "step": 176, + "time_per_iteration": 2.8410942554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401607, + "balance_loss_mlp": 1.18302441, + "epoch": 0.03405155829165064, + "flos": 442739366400.0, + "grad_norm": 0.031490867136515936, + "language_loss": 1.04703283, + "learning_rate": 0.0009999571977632876, + "loss": 1.06104875, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 2.18359375, + "step": 177, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399051, + "balance_loss_mlp": 1.1813277, + "epoch": 0.034243939976914196, + "flos": 467274407424.0, + "grad_norm": 0.029366691437037535, + "language_loss": 1.0724479, + "learning_rate": 0.0009999530243679166, + "loss": 1.08643842, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 2.17480469, + "step": 178, + "time_per_iteration": 2.5423247814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01392432, + "balance_loss_mlp": 1.17556691, + "epoch": 0.03443632166217776, + "flos": 780712257024.0, + "grad_norm": 0.02507202069561695, + "language_loss": 1.01653552, + "learning_rate": 0.0009999486568735675, + "loss": 1.03045988, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 2.16601562, + "step": 179, + "time_per_iteration": 3.111632823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381684, + "balance_loss_mlp": 1.16567647, + "epoch": 0.03462870334744132, + "flos": 1265758407168.0, + "grad_norm": 0.027829136834509844, + "language_loss": 1.02053452, + "learning_rate": 0.0009999440952819362, + "loss": 1.03435147, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 2.15722656, + "step": 180, + "time_per_iteration": 3.6354756355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375883, + "balance_loss_mlp": 1.16035271, + "epoch": 0.03482108503270489, + "flos": 608302228992.0, + "grad_norm": 0.033531921209289, + "language_loss": 1.02966988, + "learning_rate": 0.0009999393395947935, + "loss": 1.04342866, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 2.15234375, + "step": 181, + "time_per_iteration": 2.8509652614593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372611, + "balance_loss_mlp": 1.15774834, + "epoch": 0.03501346671796845, + "flos": 539314458624.0, + "grad_norm": 0.029990628161131794, + "language_loss": 1.05946589, + "learning_rate": 0.0009999343898139858, + "loss": 1.07319212, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 2.14550781, + "step": 182, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375908, + "balance_loss_mlp": 1.16161704, + "epoch": 0.035205848403232015, + "flos": 519498828288.0, + "grad_norm": 0.03419998284579487, + "language_loss": 1.04830694, + "learning_rate": 0.0009999292459414348, + "loss": 1.06206608, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 2.13964844, + "step": 183, + "time_per_iteration": 2.563997983932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386507, + "balance_loss_mlp": 1.17269289, + "epoch": 0.035398230088495575, + "flos": 473333306880.0, + "grad_norm": 0.03346089667402367, + "language_loss": 1.09292293, + "learning_rate": 0.0009999239079791374, + "loss": 1.10678792, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 2.13476562, + "step": 184, + "time_per_iteration": 2.5561137199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387981, + "balance_loss_mlp": 1.17512131, + "epoch": 0.03559061177375914, + "flos": 513094823424.0, + "grad_norm": 0.03551516541146116, + "language_loss": 1.01857162, + "learning_rate": 0.0009999183759291659, + "loss": 1.03245139, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 2.125, + "step": 185, + "time_per_iteration": 2.689763307571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383562, + "balance_loss_mlp": 1.17108345, + "epoch": 0.0357829934590227, + "flos": 478350159360.0, + "grad_norm": 0.03945465081959485, + "language_loss": 1.04534364, + "learning_rate": 0.0009999126497936682, + "loss": 1.05917931, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 2.12109375, + "step": 186, + "time_per_iteration": 2.5142176151275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375295, + "balance_loss_mlp": 1.16415167, + "epoch": 0.03597537514428627, + "flos": 645884324352.0, + "grad_norm": 0.029215470851159726, + "language_loss": 1.06864357, + "learning_rate": 0.0009999067295748676, + "loss": 1.08239663, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 2.10742188, + "step": 187, + "time_per_iteration": 2.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370561, + "balance_loss_mlp": 1.16056204, + "epoch": 0.03616775682954983, + "flos": 582269245440.0, + "grad_norm": 0.03159066859467708, + "language_loss": 1.0519886, + "learning_rate": 0.000999900615275062, + "loss": 1.06569433, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 2.09570312, + "step": 188, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01368603, + "balance_loss_mlp": 1.15898561, + "epoch": 0.03636013851481339, + "flos": 383264277504.0, + "grad_norm": 0.043734318168479426, + "language_loss": 1.10731864, + "learning_rate": 0.0009998943068966256, + "loss": 1.1210047, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 2.09179688, + "step": 189, + "time_per_iteration": 2.4394500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365543, + "balance_loss_mlp": 1.15668833, + "epoch": 0.03655252020007695, + "flos": 584307677184.0, + "grad_norm": 0.02577278402121573, + "language_loss": 1.05579162, + "learning_rate": 0.0009998878044420072, + "loss": 1.06944704, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 2.0859375, + "step": 190, + "time_per_iteration": 2.7022814750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365865, + "balance_loss_mlp": 1.15882242, + "epoch": 0.03674490188534051, + "flos": 472597433856.0, + "grad_norm": 0.03520388751206912, + "language_loss": 1.01277018, + "learning_rate": 0.0009998811079137318, + "loss": 1.02642882, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 2.07324219, + "step": 191, + "time_per_iteration": 2.5930585861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136447, + "balance_loss_mlp": 1.15742755, + "epoch": 0.03693728357060408, + "flos": 529411009536.0, + "grad_norm": 0.03125533686722731, + "language_loss": 1.02464271, + "learning_rate": 0.0009998742173143987, + "loss": 1.0382874, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 2.07324219, + "step": 192, + "time_per_iteration": 2.6235413551330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358793, + "balance_loss_mlp": 1.15222692, + "epoch": 0.03712966525586764, + "flos": 800345238528.0, + "grad_norm": 0.02848545485219292, + "language_loss": 1.02800548, + "learning_rate": 0.0009998671326466833, + "loss": 1.04159343, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 2.06835938, + "step": 193, + "time_per_iteration": 2.991110324859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351781, + "balance_loss_mlp": 1.1463598, + "epoch": 0.037322046941131205, + "flos": 831358144512.0, + "grad_norm": 0.03513998418582105, + "language_loss": 1.0392077, + "learning_rate": 0.0009998598539133362, + "loss": 1.05272543, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 2.05664062, + "step": 194, + "time_per_iteration": 3.0204203128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349328, + "balance_loss_mlp": 1.14371598, + "epoch": 0.037514428626394765, + "flos": 438588642816.0, + "grad_norm": 0.028816536284039847, + "language_loss": 1.04176903, + "learning_rate": 0.0009998523811171828, + "loss": 1.05526221, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 2.05859375, + "step": 195, + "time_per_iteration": 2.5615782737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345129, + "balance_loss_mlp": 1.14047015, + "epoch": 0.03770681031165833, + "flos": 512638927872.0, + "grad_norm": 0.030721230574493993, + "language_loss": 1.05052435, + "learning_rate": 0.0009998447142611248, + "loss": 1.06397557, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 2.04882812, + "step": 196, + "time_per_iteration": 2.6310269832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347072, + "balance_loss_mlp": 1.14289033, + "epoch": 0.03789919199692189, + "flos": 808842069504.0, + "grad_norm": 0.024329502455983587, + "language_loss": 0.97805226, + "learning_rate": 0.0009998368533481387, + "loss": 0.99152303, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 2.04394531, + "step": 197, + "time_per_iteration": 3.0467066764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344143, + "balance_loss_mlp": 1.14043784, + "epoch": 0.03809157368218546, + "flos": 691791335424.0, + "grad_norm": 0.028391473090668865, + "language_loss": 1.00891113, + "learning_rate": 0.0009998287983812762, + "loss": 1.0223527, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 2.0390625, + "step": 198, + "time_per_iteration": 2.8457672595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342798, + "balance_loss_mlp": 1.14023721, + "epoch": 0.03828395536744902, + "flos": 519004001280.0, + "grad_norm": 0.02890411668538335, + "language_loss": 1.07749867, + "learning_rate": 0.0009998205493636646, + "loss": 1.09092665, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 2.02734375, + "step": 199, + "time_per_iteration": 2.66135573387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336213, + "balance_loss_mlp": 1.13432038, + "epoch": 0.038476337052712584, + "flos": 582762071040.0, + "grad_norm": 0.025165239757241963, + "language_loss": 0.99723649, + "learning_rate": 0.0009998121062985063, + "loss": 1.01059866, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 2.02050781, + "step": 200, + "time_per_iteration": 2.70021915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340101, + "balance_loss_mlp": 1.13868463, + "epoch": 0.03866871873797614, + "flos": 578272972800.0, + "grad_norm": 0.025940014565947116, + "language_loss": 1.01401794, + "learning_rate": 0.0009998034691890794, + "loss": 1.02741897, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 2.015625, + "step": 201, + "time_per_iteration": 2.7596118450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134112, + "balance_loss_mlp": 1.14018106, + "epoch": 0.03886110042323971, + "flos": 541771855872.0, + "grad_norm": 0.03045868040347491, + "language_loss": 1.06763899, + "learning_rate": 0.0009997946380387369, + "loss": 1.08105016, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 2.01074219, + "step": 202, + "time_per_iteration": 2.6249613761901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341912, + "balance_loss_mlp": 1.14192665, + "epoch": 0.03905348210850327, + "flos": 719239669248.0, + "grad_norm": 0.02826530469295273, + "language_loss": 1.09111357, + "learning_rate": 0.0009997856128509076, + "loss": 1.1045326, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 2.00097656, + "step": 203, + "time_per_iteration": 2.8254761695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336015, + "balance_loss_mlp": 1.13660145, + "epoch": 0.039245863793766836, + "flos": 428396484096.0, + "grad_norm": 0.028264614074004907, + "language_loss": 1.0366801, + "learning_rate": 0.0009997763936290952, + "loss": 1.05004025, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.99511719, + "step": 204, + "time_per_iteration": 2.4907312393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334897, + "balance_loss_mlp": 1.13624632, + "epoch": 0.039438245479030395, + "flos": 664269141504.0, + "grad_norm": 0.0294297584821439, + "language_loss": 1.09143519, + "learning_rate": 0.0009997669803768789, + "loss": 1.10478401, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.98730469, + "step": 205, + "time_per_iteration": 2.787046194076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332958, + "balance_loss_mlp": 1.13497555, + "epoch": 0.03963062716429396, + "flos": 636495168000.0, + "grad_norm": 0.025164669035445293, + "language_loss": 1.04324186, + "learning_rate": 0.0009997573730979134, + "loss": 1.05657148, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.98242188, + "step": 206, + "time_per_iteration": 2.744339942932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388672, + "balance_loss_mlp": 1.18687439, + "epoch": 0.03982300884955752, + "flos": 1421587186176.0, + "grad_norm": 0.04225268457123109, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80581868, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 2.01953125, + "step": 207, + "time_per_iteration": 4.62822699546814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338974, + "balance_loss_mlp": 1.14251721, + "epoch": 0.04001539053482109, + "flos": 690519702528.0, + "grad_norm": 0.029734692172116686, + "language_loss": 1.02667236, + "learning_rate": 0.0009997375764747294, + "loss": 1.04006195, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.96875, + "step": 208, + "time_per_iteration": 3.0006470680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332342, + "balance_loss_mlp": 1.1360755, + "epoch": 0.04020777222008465, + "flos": 534751500288.0, + "grad_norm": 0.02521302149444487, + "language_loss": 1.00535607, + "learning_rate": 0.0009997273871381967, + "loss": 1.01867938, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.96679688, + "step": 209, + "time_per_iteration": 2.6790220737457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01335368, + "balance_loss_mlp": 1.14005554, + "epoch": 0.040400153905348214, + "flos": 568996608000.0, + "grad_norm": 0.04055154679799505, + "language_loss": 1.05331016, + "learning_rate": 0.0009997170037902862, + "loss": 1.06666374, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.95703125, + "step": 210, + "time_per_iteration": 2.748340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331596, + "balance_loss_mlp": 1.13647389, + "epoch": 0.040592535590611774, + "flos": 714678712320.0, + "grad_norm": 0.0276705792773584, + "language_loss": 1.07916689, + "learning_rate": 0.0009997064264350292, + "loss": 1.09248281, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.95507812, + "step": 211, + "time_per_iteration": 2.8284339904785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332545, + "balance_loss_mlp": 1.13761449, + "epoch": 0.04078491727587533, + "flos": 579206231040.0, + "grad_norm": 0.026753366885260317, + "language_loss": 1.01893198, + "learning_rate": 0.0009996956550765317, + "loss": 1.03225756, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.953125, + "step": 212, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_mlp": 1.13668597, + "epoch": 0.0409772989611389, + "flos": 553368631296.0, + "grad_norm": 0.03340351088011317, + "language_loss": 0.96620274, + "learning_rate": 0.0009996846897189762, + "loss": 0.97951126, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.9453125, + "step": 213, + "time_per_iteration": 2.62785005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327396, + "balance_loss_mlp": 1.13332307, + "epoch": 0.04116968064640246, + "flos": 556764016128.0, + "grad_norm": 0.026256493309422244, + "language_loss": 1.0283711, + "learning_rate": 0.0009996735303666193, + "loss": 1.04164505, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.94433594, + "step": 214, + "time_per_iteration": 2.745412588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324547, + "balance_loss_mlp": 1.13152313, + "epoch": 0.041362062331666026, + "flos": 579651393024.0, + "grad_norm": 0.025801807715809106, + "language_loss": 1.04973316, + "learning_rate": 0.0009996621770237937, + "loss": 1.06297863, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.93359375, + "step": 215, + "time_per_iteration": 2.7359023094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_mlp": 1.12657344, + "epoch": 0.041554444016929586, + "flos": 612700729344.0, + "grad_norm": 0.027594527286323677, + "language_loss": 1.00985026, + "learning_rate": 0.0009996506296949073, + "loss": 1.02304435, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.93164062, + "step": 216, + "time_per_iteration": 2.860781669616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320461, + "balance_loss_mlp": 1.12781918, + "epoch": 0.04174682570219315, + "flos": 529150497792.0, + "grad_norm": 0.030561981852332186, + "language_loss": 1.01172602, + "learning_rate": 0.0009996388883844428, + "loss": 1.02493072, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.9296875, + "step": 217, + "time_per_iteration": 2.614837169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315002, + "balance_loss_mlp": 1.12255037, + "epoch": 0.04193920738745671, + "flos": 512499939840.0, + "grad_norm": 0.024235201889365978, + "language_loss": 1.04092622, + "learning_rate": 0.0009996269530969588, + "loss": 1.05407631, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.92773438, + "step": 218, + "time_per_iteration": 2.5777087211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317845, + "balance_loss_mlp": 1.1255846, + "epoch": 0.04213158907272028, + "flos": 572552448000.0, + "grad_norm": 0.03618883866707401, + "language_loss": 1.04623246, + "learning_rate": 0.0009996148238370888, + "loss": 1.05941105, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.92578125, + "step": 219, + "time_per_iteration": 2.723344564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319419, + "balance_loss_mlp": 1.12830234, + "epoch": 0.04232397075798384, + "flos": 965904098304.0, + "grad_norm": 0.02808123492922437, + "language_loss": 0.99962145, + "learning_rate": 0.0009996025006095421, + "loss": 1.01281559, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.9140625, + "step": 220, + "time_per_iteration": 3.297567844390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355408, + "balance_loss_mlp": 1.16314697, + "epoch": 0.042516352443247404, + "flos": 1472730628608.0, + "grad_norm": 0.031119874656221472, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.79138547, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.92578125, + "step": 221, + "time_per_iteration": 5.484851837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132056, + "balance_loss_mlp": 1.13039756, + "epoch": 0.042708734128510964, + "flos": 655891832832.0, + "grad_norm": 0.027306518139410985, + "language_loss": 0.99887031, + "learning_rate": 0.0009995772722706307, + "loss": 1.0120759, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.90429688, + "step": 222, + "time_per_iteration": 2.801955461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324867, + "balance_loss_mlp": 1.13518083, + "epoch": 0.04290111581377453, + "flos": 432733859328.0, + "grad_norm": 0.025166076900031344, + "language_loss": 1.13987851, + "learning_rate": 0.0009995643671690604, + "loss": 1.15312719, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.89941406, + "step": 223, + "time_per_iteration": 2.4589195251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320058, + "balance_loss_mlp": 1.13142133, + "epoch": 0.04309349749903809, + "flos": 645866860032.0, + "grad_norm": 0.02470776233740571, + "language_loss": 1.01624262, + "learning_rate": 0.0009995512681194023, + "loss": 1.02944326, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.88867188, + "step": 224, + "time_per_iteration": 2.854653835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319788, + "balance_loss_mlp": 1.13124692, + "epoch": 0.04328587918430166, + "flos": 832895745024.0, + "grad_norm": 0.02898896961022835, + "language_loss": 0.98942387, + "learning_rate": 0.0009995379751267417, + "loss": 1.00262189, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.88769531, + "step": 225, + "time_per_iteration": 3.260105609893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317885, + "balance_loss_mlp": 1.12943935, + "epoch": 0.043478260869565216, + "flos": 526115681280.0, + "grad_norm": 0.02601835272599882, + "language_loss": 1.00718379, + "learning_rate": 0.0009995244881962398, + "loss": 1.02036262, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.88671875, + "step": 226, + "time_per_iteration": 2.631685495376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320396, + "balance_loss_mlp": 1.13204539, + "epoch": 0.04367064255482878, + "flos": 440412225024.0, + "grad_norm": 0.02740546356326938, + "language_loss": 1.02089393, + "learning_rate": 0.0009995108073331323, + "loss": 1.03409791, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.88574219, + "step": 227, + "time_per_iteration": 2.6414895057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308962, + "balance_loss_mlp": 1.12156498, + "epoch": 0.04386302424009234, + "flos": 508466737152.0, + "grad_norm": 0.023646446246452554, + "language_loss": 1.04017711, + "learning_rate": 0.0009994969325427309, + "loss": 1.05326676, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.87597656, + "step": 228, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130646, + "balance_loss_mlp": 1.11906338, + "epoch": 0.04405540592535591, + "flos": 541743657984.0, + "grad_norm": 0.02642836262436834, + "language_loss": 1.00691068, + "learning_rate": 0.0009994828638304218, + "loss": 1.0199753, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.87597656, + "step": 229, + "time_per_iteration": 2.604616165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305226, + "balance_loss_mlp": 1.11792421, + "epoch": 0.04424778761061947, + "flos": 447309055488.0, + "grad_norm": 0.039218098968292335, + "language_loss": 1.07079852, + "learning_rate": 0.0009994686012016675, + "loss": 1.08385086, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.875, + "step": 230, + "time_per_iteration": 2.568608045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130487, + "balance_loss_mlp": 1.1187129, + "epoch": 0.044440169295883035, + "flos": 701981492736.0, + "grad_norm": 0.02721662483758601, + "language_loss": 1.06240797, + "learning_rate": 0.000999454144662005, + "loss": 1.07545662, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.86328125, + "step": 231, + "time_per_iteration": 2.9104526042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295735, + "balance_loss_mlp": 1.10957813, + "epoch": 0.044632550981146595, + "flos": 589426587648.0, + "grad_norm": 0.02817980914561194, + "language_loss": 1.003865, + "learning_rate": 0.0009994394942170468, + "loss": 1.01682234, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.86328125, + "step": 232, + "time_per_iteration": 2.674896001815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302928, + "balance_loss_mlp": 1.11667526, + "epoch": 0.04482493266641016, + "flos": 555854226432.0, + "grad_norm": 0.029144066951330677, + "language_loss": 0.98161608, + "learning_rate": 0.0009994246498724808, + "loss": 0.99464536, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.86425781, + "step": 233, + "time_per_iteration": 2.674178123474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302597, + "balance_loss_mlp": 1.11682117, + "epoch": 0.04501731435167372, + "flos": 724069870080.0, + "grad_norm": 0.027038299766394356, + "language_loss": 1.00722432, + "learning_rate": 0.00099940961163407, + "loss": 1.02025032, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.859375, + "step": 234, + "time_per_iteration": 2.8427939414978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301098, + "balance_loss_mlp": 1.11608493, + "epoch": 0.04520969603693728, + "flos": 512797381632.0, + "grad_norm": 0.027022139799708383, + "language_loss": 1.02586675, + "learning_rate": 0.0009993943795076528, + "loss": 1.03887773, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.8515625, + "step": 235, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295671, + "balance_loss_mlp": 1.11094403, + "epoch": 0.04540207772220085, + "flos": 365877846528.0, + "grad_norm": 0.03212133053651388, + "language_loss": 1.0562067, + "learning_rate": 0.0009993789534991427, + "loss": 1.06916356, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.84863281, + "step": 236, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294151, + "balance_loss_mlp": 1.1095196, + "epoch": 0.045594459407464406, + "flos": 523723411968.0, + "grad_norm": 0.029471400038435007, + "language_loss": 1.00276268, + "learning_rate": 0.0009993633336145287, + "loss": 1.01570415, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.84765625, + "step": 237, + "time_per_iteration": 2.6279234886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_mlp": 1.11284053, + "epoch": 0.04578684109272797, + "flos": 673115807232.0, + "grad_norm": 0.032189822363292264, + "language_loss": 1.04537559, + "learning_rate": 0.0009993475198598752, + "loss": 1.05834174, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.83886719, + "step": 238, + "time_per_iteration": 2.98264741897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294154, + "balance_loss_mlp": 1.11047626, + "epoch": 0.04597922277799153, + "flos": 542620520448.0, + "grad_norm": 0.025834809881005002, + "language_loss": 1.01282692, + "learning_rate": 0.0009993315122413212, + "loss": 1.02576852, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.83789062, + "step": 239, + "time_per_iteration": 2.5969364643096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297016, + "balance_loss_mlp": 1.11333883, + "epoch": 0.0461716044632551, + "flos": 459993540096.0, + "grad_norm": 0.025301515003642434, + "language_loss": 1.01210213, + "learning_rate": 0.0009993153107650818, + "loss": 1.02507234, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.83789062, + "step": 240, + "time_per_iteration": 2.590198278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297188, + "balance_loss_mlp": 1.11360526, + "epoch": 0.04636398614851866, + "flos": 456170457600.0, + "grad_norm": 0.0338801607583888, + "language_loss": 1.01026332, + "learning_rate": 0.0009992989154374468, + "loss": 1.0232352, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.83691406, + "step": 241, + "time_per_iteration": 2.5699570178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012963, + "balance_loss_mlp": 1.11271763, + "epoch": 0.046556367833782225, + "flos": 557901390336.0, + "grad_norm": 0.02656657647638049, + "language_loss": 1.0757494, + "learning_rate": 0.0009992823262647817, + "loss": 1.08871233, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.83691406, + "step": 242, + "time_per_iteration": 2.6949496269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293965, + "balance_loss_mlp": 1.11047852, + "epoch": 0.046748749519045785, + "flos": 594087601152.0, + "grad_norm": 0.02772781005565529, + "language_loss": 1.02479577, + "learning_rate": 0.0009992655432535264, + "loss": 1.03773546, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.8359375, + "step": 243, + "time_per_iteration": 2.7783396244049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286985, + "balance_loss_mlp": 1.10454702, + "epoch": 0.04694113120430935, + "flos": 570941713920.0, + "grad_norm": 0.021337056529223342, + "language_loss": 1.01771712, + "learning_rate": 0.0009992485664101973, + "loss": 1.03058696, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.82519531, + "step": 244, + "time_per_iteration": 2.679227590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286082, + "balance_loss_mlp": 1.10364425, + "epoch": 0.04713351288957291, + "flos": 865245411840.0, + "grad_norm": 0.03170954338904746, + "language_loss": 1.04355013, + "learning_rate": 0.000999231395741385, + "loss": 1.05641103, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.82519531, + "step": 245, + "time_per_iteration": 3.0976788997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287082, + "balance_loss_mlp": 1.10473943, + "epoch": 0.04732589457483648, + "flos": 538235481600.0, + "grad_norm": 0.02353809889700427, + "language_loss": 1.02393425, + "learning_rate": 0.0009992140312537557, + "loss": 1.03680515, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.82421875, + "step": 246, + "time_per_iteration": 2.6005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_mlp": 1.1048938, + "epoch": 0.04751827626010004, + "flos": 763271431680.0, + "grad_norm": 0.021903859990429042, + "language_loss": 0.96665001, + "learning_rate": 0.000999196472954051, + "loss": 0.97951376, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.81542969, + "step": 247, + "time_per_iteration": 2.95379638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319153, + "balance_loss_mlp": 1.13833618, + "epoch": 0.0477106579453636, + "flos": 1583125578240.0, + "grad_norm": 0.034344144576267104, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80744004, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.80859375, + "step": 248, + "time_per_iteration": 6.070216655731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286412, + "balance_loss_mlp": 1.10521388, + "epoch": 0.04790303963062716, + "flos": 458692982784.0, + "grad_norm": 0.024476775577385278, + "language_loss": 1.04631317, + "learning_rate": 0.0009991607749457578, + "loss": 1.05917728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.8125, + "step": 249, + "time_per_iteration": 2.5741825103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128623, + "balance_loss_mlp": 1.10503209, + "epoch": 0.04809542131589073, + "flos": 783786004992.0, + "grad_norm": 0.021665977114244464, + "language_loss": 1.0235486, + "learning_rate": 0.0009991426352510286, + "loss": 1.03641105, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.81152344, + "step": 250, + "time_per_iteration": 3.004519462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_mlp": 1.10648286, + "epoch": 0.04828780300115429, + "flos": 560321857536.0, + "grad_norm": 0.028059326531900755, + "language_loss": 1.04456568, + "learning_rate": 0.0009991243017719422, + "loss": 1.05743682, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.8046875, + "step": 251, + "time_per_iteration": 2.666212320327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283793, + "balance_loss_mlp": 1.10364354, + "epoch": 0.048480184686417856, + "flos": 502922130432.0, + "grad_norm": 0.02282661348297379, + "language_loss": 0.985008, + "learning_rate": 0.0009991057745156165, + "loss": 0.99784589, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.80078125, + "step": 252, + "time_per_iteration": 2.6053824424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291534, + "balance_loss_mlp": 1.11186218, + "epoch": 0.048672566371681415, + "flos": 1539469120512.0, + "grad_norm": 0.022804524860740846, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83202517, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.796875, + "step": 253, + "time_per_iteration": 5.005317449569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285445, + "balance_loss_mlp": 1.10500991, + "epoch": 0.04886494805694498, + "flos": 538951888896.0, + "grad_norm": 0.028242285238858512, + "language_loss": 1.06865251, + "learning_rate": 0.0009990681387000943, + "loss": 1.08150697, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.80371094, + "step": 254, + "time_per_iteration": 2.743307590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283321, + "balance_loss_mlp": 1.10317183, + "epoch": 0.04905732974220854, + "flos": 681484383744.0, + "grad_norm": 0.028658365214850164, + "language_loss": 1.02065015, + "learning_rate": 0.0009990490301555093, + "loss": 1.03348327, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.80126953, + "step": 255, + "time_per_iteration": 2.989856719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291977, + "balance_loss_mlp": 1.1134491, + "epoch": 0.04924971142747211, + "flos": 1424274895872.0, + "grad_norm": 0.01325206916769545, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80507129, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.78515625, + "step": 256, + "time_per_iteration": 4.888273477554321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281082, + "balance_loss_mlp": 1.10255432, + "epoch": 0.04944209311273567, + "flos": 1561236587520.0, + "grad_norm": 0.00993410716153638, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80523825, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.78515625, + "step": 257, + "time_per_iteration": 4.983605623245239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_mlp": 1.10786438, + "epoch": 0.04963447479799923, + "flos": 1574170850304.0, + "grad_norm": 0.014798835308040135, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71261322, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.77539062, + "step": 258, + "time_per_iteration": 4.888776540756226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_mlp": 1.10310864, + "epoch": 0.049826856483262794, + "flos": 626498393088.0, + "grad_norm": 0.032236291487241595, + "language_loss": 0.9680413, + "learning_rate": 0.0009989706585723202, + "loss": 0.98086333, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.79003906, + "step": 259, + "time_per_iteration": 2.776397705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280186, + "balance_loss_mlp": 1.10175359, + "epoch": 0.05001923816852635, + "flos": 505155945984.0, + "grad_norm": 0.03442249770662494, + "language_loss": 1.03026366, + "learning_rate": 0.0009989505813633442, + "loss": 1.04306555, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.78271484, + "step": 260, + "time_per_iteration": 2.651773691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281097, + "balance_loss_mlp": 1.10295069, + "epoch": 0.05021161985378992, + "flos": 588467132928.0, + "grad_norm": 0.024781843968885862, + "language_loss": 1.02880228, + "learning_rate": 0.000998930310444573, + "loss": 1.04161322, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.78125, + "step": 261, + "time_per_iteration": 2.730717420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_mlp": 1.08796966, + "epoch": 0.05040400153905348, + "flos": 634402341888.0, + "grad_norm": 0.028473185138455738, + "language_loss": 1.01351452, + "learning_rate": 0.0009989098458238765, + "loss": 1.02617574, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.77929688, + "step": 262, + "time_per_iteration": 2.7717010974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272128, + "balance_loss_mlp": 1.09407711, + "epoch": 0.050596383224317046, + "flos": 554808176640.0, + "grad_norm": 0.03464065468219783, + "language_loss": 1.00597906, + "learning_rate": 0.0009988891875091998, + "loss": 1.01870036, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.77880859, + "step": 263, + "time_per_iteration": 2.8842556476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012725, + "balance_loss_mlp": 1.09444928, + "epoch": 0.050788764909580605, + "flos": 550761512448.0, + "grad_norm": 0.02541343292713684, + "language_loss": 0.95014787, + "learning_rate": 0.0009988683355085636, + "loss": 0.96287298, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.77880859, + "step": 264, + "time_per_iteration": 2.7466378211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_mlp": 1.09527469, + "epoch": 0.05098114659484417, + "flos": 606344388096.0, + "grad_norm": 0.02024934595994547, + "language_loss": 1.03858495, + "learning_rate": 0.000998847289830063, + "loss": 1.05131388, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.77587891, + "step": 265, + "time_per_iteration": 2.821997880935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285574, + "balance_loss_mlp": 1.10761857, + "epoch": 0.05117352828010773, + "flos": 439472236032.0, + "grad_norm": 0.026937538773041583, + "language_loss": 0.97004128, + "learning_rate": 0.0009988260504818682, + "loss": 0.98289704, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.77832031, + "step": 266, + "time_per_iteration": 2.557830333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277028, + "balance_loss_mlp": 1.09907281, + "epoch": 0.0513659099653713, + "flos": 506030807040.0, + "grad_norm": 0.02494960853942852, + "language_loss": 1.03986156, + "learning_rate": 0.000998804617472226, + "loss": 1.05263186, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.77832031, + "step": 267, + "time_per_iteration": 2.644099235534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_mlp": 1.09254682, + "epoch": 0.05155829165063486, + "flos": 696714862080.0, + "grad_norm": 0.027664306986101984, + "language_loss": 0.98796493, + "learning_rate": 0.0009987829908094568, + "loss": 1.00066042, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.76953125, + "step": 268, + "time_per_iteration": 2.8291003704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_mlp": 1.08817983, + "epoch": 0.051750673335898424, + "flos": 1350300294144.0, + "grad_norm": 0.03385083640642466, + "language_loss": 1.06218576, + "learning_rate": 0.0009987611705019569, + "loss": 1.07483661, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.76855469, + "step": 269, + "time_per_iteration": 4.150776624679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264769, + "balance_loss_mlp": 1.08795822, + "epoch": 0.051943055021161984, + "flos": 490589481984.0, + "grad_norm": 0.028250493976035247, + "language_loss": 1.04104686, + "learning_rate": 0.0009987391565581978, + "loss": 1.05369449, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.76757812, + "step": 270, + "time_per_iteration": 2.5921454429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266977, + "balance_loss_mlp": 1.09092879, + "epoch": 0.05213543670642555, + "flos": 546880032768.0, + "grad_norm": 0.026669721507250346, + "language_loss": 0.96455419, + "learning_rate": 0.000998716948986726, + "loss": 0.97722399, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.75976562, + "step": 271, + "time_per_iteration": 2.7835500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_mlp": 1.09264266, + "epoch": 0.05232781839168911, + "flos": 604672528896.0, + "grad_norm": 0.03568520247936263, + "language_loss": 0.99334317, + "learning_rate": 0.0009986945477961633, + "loss": 1.00602722, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.75683594, + "step": 272, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_mlp": 1.0953902, + "epoch": 0.052520200076952676, + "flos": 539655561216.0, + "grad_norm": 0.02343402151836954, + "language_loss": 1.0317328, + "learning_rate": 0.0009986719529952066, + "loss": 1.04444528, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.7578125, + "step": 273, + "time_per_iteration": 2.908298969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_mlp": 1.09266984, + "epoch": 0.052712581762216236, + "flos": 464332916736.0, + "grad_norm": 0.028493663433316604, + "language_loss": 1.03350449, + "learning_rate": 0.000998649164592628, + "loss": 1.0461911, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.75927734, + "step": 274, + "time_per_iteration": 2.5805718898773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_mlp": 1.08735609, + "epoch": 0.0529049634474798, + "flos": 549105116160.0, + "grad_norm": 0.024462560446863554, + "language_loss": 1.01155043, + "learning_rate": 0.0009986261825972748, + "loss": 1.02418458, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.75976562, + "step": 275, + "time_per_iteration": 2.675705909729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_mlp": 1.09334803, + "epoch": 0.05309734513274336, + "flos": 619200061440.0, + "grad_norm": 0.026443817532743642, + "language_loss": 1.03055406, + "learning_rate": 0.000998603007018069, + "loss": 1.04324436, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.75585938, + "step": 276, + "time_per_iteration": 2.77298903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264217, + "balance_loss_mlp": 1.08893192, + "epoch": 0.05328972681800693, + "flos": 606617634816.0, + "grad_norm": 0.022439827576013177, + "language_loss": 1.00613213, + "learning_rate": 0.0009985796378640089, + "loss": 1.01877427, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.75195312, + "step": 277, + "time_per_iteration": 2.693049669265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_mlp": 1.08963549, + "epoch": 0.05348210850327049, + "flos": 605730038784.0, + "grad_norm": 0.02549683888178727, + "language_loss": 1.01102281, + "learning_rate": 0.0009985560751441665, + "loss": 1.02366924, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.74902344, + "step": 278, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262716, + "balance_loss_mlp": 1.08757329, + "epoch": 0.053674490188534055, + "flos": 631997337600.0, + "grad_norm": 0.025192100126554, + "language_loss": 1.03316271, + "learning_rate": 0.00099853231886769, + "loss": 1.04578984, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.75048828, + "step": 279, + "time_per_iteration": 2.8228564262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262121, + "balance_loss_mlp": 1.08712184, + "epoch": 0.053866871873797614, + "flos": 480173741568.0, + "grad_norm": 0.02583251996588833, + "language_loss": 1.02629757, + "learning_rate": 0.0009985083690438024, + "loss": 1.03891873, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.74902344, + "step": 280, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260843, + "balance_loss_mlp": 1.08655906, + "epoch": 0.054059253559061174, + "flos": 789489065472.0, + "grad_norm": 0.023704628566171972, + "language_loss": 0.9340027, + "learning_rate": 0.0009984842256818016, + "loss": 0.94661117, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.74169922, + "step": 281, + "time_per_iteration": 3.084801435470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257985, + "balance_loss_mlp": 1.08379591, + "epoch": 0.05425163524432474, + "flos": 629505011712.0, + "grad_norm": 0.027462270528210347, + "language_loss": 1.04308844, + "learning_rate": 0.0009984598887910613, + "loss": 1.05566835, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.74072266, + "step": 282, + "time_per_iteration": 2.729063034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_mlp": 1.08855665, + "epoch": 0.0544440169295883, + "flos": 616992442368.0, + "grad_norm": 0.02580860229759897, + "language_loss": 0.99945354, + "learning_rate": 0.0009984353583810297, + "loss": 1.01208091, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.74072266, + "step": 283, + "time_per_iteration": 2.812309741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258383, + "balance_loss_mlp": 1.08433735, + "epoch": 0.05463639861485187, + "flos": 648929874432.0, + "grad_norm": 0.0290705298354334, + "language_loss": 1.01989841, + "learning_rate": 0.0009984106344612302, + "loss": 1.03248215, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.73925781, + "step": 284, + "time_per_iteration": 2.785377264022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_mlp": 1.0907625, + "epoch": 0.054828780300115426, + "flos": 798584782848.0, + "grad_norm": 0.03167011835004719, + "language_loss": 0.97435868, + "learning_rate": 0.0009983857170412615, + "loss": 0.9869982, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.73046875, + "step": 285, + "time_per_iteration": 2.9822604656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258353, + "balance_loss_mlp": 1.08511817, + "epoch": 0.05502116198537899, + "flos": 550798442496.0, + "grad_norm": 0.02077828299254123, + "language_loss": 0.96197385, + "learning_rate": 0.000998360606130798, + "loss": 0.9745574, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.73095703, + "step": 286, + "time_per_iteration": 2.8340489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_mlp": 1.09461975, + "epoch": 0.05521354367064255, + "flos": 1410906931200.0, + "grad_norm": 0.010589673029146669, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70339394, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.71484375, + "step": 287, + "time_per_iteration": 4.893908500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126535, + "balance_loss_mlp": 1.09235394, + "epoch": 0.05540592535590612, + "flos": 646611465216.0, + "grad_norm": 0.04031113274469801, + "language_loss": 1.02544129, + "learning_rate": 0.0009983098038774552, + "loss": 1.03809476, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.72851562, + "step": 288, + "time_per_iteration": 2.800687551498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_mlp": 1.08712769, + "epoch": 0.05559830704116968, + "flos": 1514315727360.0, + "grad_norm": 0.011752943348929798, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79428822, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.71289062, + "step": 289, + "time_per_iteration": 4.802466630935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_mlp": 1.08869088, + "epoch": 0.055790688726433245, + "flos": 509334867456.0, + "grad_norm": 0.03460900762027919, + "language_loss": 1.00913107, + "learning_rate": 0.0009982582277800948, + "loss": 1.02174735, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.72802734, + "step": 290, + "time_per_iteration": 2.574007749557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255451, + "balance_loss_mlp": 1.08326483, + "epoch": 0.055983070411696804, + "flos": 659074369536.0, + "grad_norm": 0.03439417592421578, + "language_loss": 1.07703924, + "learning_rate": 0.0009982321495648908, + "loss": 1.08959377, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.72021484, + "step": 291, + "time_per_iteration": 2.8004326820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257264, + "balance_loss_mlp": 1.08503067, + "epoch": 0.05617545209696037, + "flos": 588475865088.0, + "grad_norm": 0.024241847728240208, + "language_loss": 0.9905349, + "learning_rate": 0.0009982058779188115, + "loss": 1.00310755, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.72070312, + "step": 292, + "time_per_iteration": 2.763096570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257503, + "balance_loss_mlp": 1.0853169, + "epoch": 0.05636783378222393, + "flos": 612787324416.0, + "grad_norm": 0.027188079674348095, + "language_loss": 1.06693649, + "learning_rate": 0.0009981794128520567, + "loss": 1.07951164, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.72021484, + "step": 293, + "time_per_iteration": 2.7630960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253426, + "balance_loss_mlp": 1.08123958, + "epoch": 0.0565602154674875, + "flos": 669422980608.0, + "grad_norm": 0.030197403892147204, + "language_loss": 1.03523457, + "learning_rate": 0.000998152754374901, + "loss": 1.04776871, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.72021484, + "step": 294, + "time_per_iteration": 2.8583314418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249713, + "balance_loss_mlp": 1.07743168, + "epoch": 0.05675259715275106, + "flos": 618364131840.0, + "grad_norm": 0.026289358543143387, + "language_loss": 0.99071473, + "learning_rate": 0.0009981259024976943, + "loss": 1.00321186, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.72119141, + "step": 295, + "time_per_iteration": 2.719881534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250566, + "balance_loss_mlp": 1.07814193, + "epoch": 0.05694497883801462, + "flos": 753153133056.0, + "grad_norm": 0.03148267511857758, + "language_loss": 0.97962338, + "learning_rate": 0.0009980988572308612, + "loss": 0.99212909, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.72265625, + "step": 296, + "time_per_iteration": 2.9828195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250905, + "balance_loss_mlp": 1.0789572, + "epoch": 0.05713736052327818, + "flos": 713380882944.0, + "grad_norm": 0.02524811137395651, + "language_loss": 1.00250125, + "learning_rate": 0.0009980716185849015, + "loss": 1.01501024, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.71777344, + "step": 297, + "time_per_iteration": 2.9749252796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251066, + "balance_loss_mlp": 1.07959557, + "epoch": 0.05732974220854175, + "flos": 469935920640.0, + "grad_norm": 0.024054663695119705, + "language_loss": 0.96916056, + "learning_rate": 0.0009980441865703904, + "loss": 0.98167121, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.71289062, + "step": 298, + "time_per_iteration": 2.598325252532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250911, + "balance_loss_mlp": 1.07939255, + "epoch": 0.05752212389380531, + "flos": 602540771328.0, + "grad_norm": 0.025930022992042723, + "language_loss": 1.05563986, + "learning_rate": 0.000998016561197978, + "loss": 1.06814897, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.71337891, + "step": 299, + "time_per_iteration": 2.690300703048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250529, + "balance_loss_mlp": 1.07924938, + "epoch": 0.057714505579068875, + "flos": 679949511168.0, + "grad_norm": 0.025847674874905035, + "language_loss": 0.97115421, + "learning_rate": 0.0009979887424783895, + "loss": 0.98365951, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.7109375, + "step": 300, + "time_per_iteration": 2.863856554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249286, + "balance_loss_mlp": 1.07810116, + "epoch": 0.057906887264332435, + "flos": 597011627520.0, + "grad_norm": 0.02594453351976595, + "language_loss": 0.96475613, + "learning_rate": 0.0009979607304224248, + "loss": 0.97724897, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.70996094, + "step": 301, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248659, + "balance_loss_mlp": 1.0772841, + "epoch": 0.058099268949596, + "flos": 553164515328.0, + "grad_norm": 0.024492956239426298, + "language_loss": 1.0387162, + "learning_rate": 0.000997932525040959, + "loss": 1.05120289, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.71191406, + "step": 302, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252345, + "balance_loss_mlp": 1.08111238, + "epoch": 0.05829165063485956, + "flos": 509230808064.0, + "grad_norm": 0.038324718957869854, + "language_loss": 1.05616117, + "learning_rate": 0.000997904126344943, + "loss": 1.06868458, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.71044922, + "step": 303, + "time_per_iteration": 2.611621141433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125091, + "balance_loss_mlp": 1.080441, + "epoch": 0.05848403232012313, + "flos": 616362630144.0, + "grad_norm": 0.028818083574726525, + "language_loss": 1.02425826, + "learning_rate": 0.0009978755343454018, + "loss": 1.03676736, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.70263672, + "step": 304, + "time_per_iteration": 2.750213384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245805, + "balance_loss_mlp": 1.07490659, + "epoch": 0.05867641400538669, + "flos": 501079082496.0, + "grad_norm": 0.025195073137535502, + "language_loss": 1.02874422, + "learning_rate": 0.0009978467490534355, + "loss": 1.04120219, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.70703125, + "step": 305, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124905, + "balance_loss_mlp": 1.07853293, + "epoch": 0.05886879569065025, + "flos": 532378696704.0, + "grad_norm": 0.026491629776715375, + "language_loss": 0.99473399, + "learning_rate": 0.00099781777048022, + "loss": 1.00722456, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.703125, + "step": 306, + "time_per_iteration": 2.731084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012482, + "balance_loss_mlp": 1.07782638, + "epoch": 0.05906117737591381, + "flos": 490040260608.0, + "grad_norm": 0.025118942729794178, + "language_loss": 1.01122224, + "learning_rate": 0.0009977885986370057, + "loss": 1.02370417, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.70166016, + "step": 307, + "time_per_iteration": 2.548307418823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247075, + "balance_loss_mlp": 1.0766536, + "epoch": 0.05925355906117737, + "flos": 592709180928.0, + "grad_norm": 0.029001286226925486, + "language_loss": 0.96780527, + "learning_rate": 0.000997759233535118, + "loss": 0.98027599, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.70214844, + "step": 308, + "time_per_iteration": 2.7876322269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247056, + "balance_loss_mlp": 1.07668173, + "epoch": 0.05944594074644094, + "flos": 564787487232.0, + "grad_norm": 0.026648157056946717, + "language_loss": 1.03345561, + "learning_rate": 0.0009977296751859576, + "loss": 1.04592621, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.70166016, + "step": 309, + "time_per_iteration": 2.71488094329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_mlp": 1.07958508, + "epoch": 0.0596383224317045, + "flos": 539807284224.0, + "grad_norm": 0.023775477335694146, + "language_loss": 1.04459929, + "learning_rate": 0.0009976999236009998, + "loss": 1.05709469, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.69726562, + "step": 310, + "time_per_iteration": 2.7919182777404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_mlp": 1.08618629, + "epoch": 0.059830704116968066, + "flos": 562052113920.0, + "grad_norm": 0.02942700961653022, + "language_loss": 1.06853497, + "learning_rate": 0.0009976699787917955, + "loss": 1.08109009, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.69091797, + "step": 311, + "time_per_iteration": 2.6729257106781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012565, + "balance_loss_mlp": 1.08789062, + "epoch": 0.060023085802231625, + "flos": 1574047325184.0, + "grad_norm": 0.029063497479097016, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74699497, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.68359375, + "step": 312, + "time_per_iteration": 4.972649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249775, + "balance_loss_mlp": 1.08021212, + "epoch": 0.06021546748749519, + "flos": 483627523584.0, + "grad_norm": 0.0314235925459163, + "language_loss": 0.98280072, + "learning_rate": 0.0009976095095472243, + "loss": 0.9952985, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.69335938, + "step": 313, + "time_per_iteration": 2.5644209384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125234, + "balance_loss_mlp": 1.08287179, + "epoch": 0.06040784917275875, + "flos": 621423143424.0, + "grad_norm": 0.030123719928355924, + "language_loss": 0.99538821, + "learning_rate": 0.0009975789851353334, + "loss": 1.00791156, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.69238281, + "step": 314, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256592, + "balance_loss_mlp": 1.08741045, + "epoch": 0.06060023085802232, + "flos": 484602441216.0, + "grad_norm": 0.026992074473858402, + "language_loss": 1.01683283, + "learning_rate": 0.0009975482675461487, + "loss": 1.02939868, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.68945312, + "step": 315, + "time_per_iteration": 2.67146897315979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_mlp": 1.08054566, + "epoch": 0.06079261254328588, + "flos": 582985652736.0, + "grad_norm": 0.0292304668639163, + "language_loss": 0.99909455, + "learning_rate": 0.0009975173567915952, + "loss": 1.01158559, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.68310547, + "step": 316, + "time_per_iteration": 2.693526268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124983, + "balance_loss_mlp": 1.08131599, + "epoch": 0.060984994228549444, + "flos": 689008298496.0, + "grad_norm": 0.03272213432041067, + "language_loss": 0.93868685, + "learning_rate": 0.000997486252883674, + "loss": 0.95118511, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.68261719, + "step": 317, + "time_per_iteration": 2.837315082550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252509, + "balance_loss_mlp": 1.08399427, + "epoch": 0.061177375913813004, + "flos": 1316747398656.0, + "grad_norm": 0.031012352820614663, + "language_loss": 0.98949343, + "learning_rate": 0.0009974549558344602, + "loss": 1.00201845, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.68261719, + "step": 318, + "time_per_iteration": 3.686920166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_mlp": 1.08321846, + "epoch": 0.06136975759907657, + "flos": 575400612864.0, + "grad_norm": 0.027925836735275204, + "language_loss": 1.08640313, + "learning_rate": 0.000997423465656105, + "loss": 1.09892082, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.68310547, + "step": 319, + "time_per_iteration": 2.7691538333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250553, + "balance_loss_mlp": 1.08218133, + "epoch": 0.06156213928434013, + "flos": 528564346368.0, + "grad_norm": 0.033042319608268485, + "language_loss": 1.06051123, + "learning_rate": 0.0009973917823608335, + "loss": 1.07301688, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.68115234, + "step": 320, + "time_per_iteration": 2.583859443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251303, + "balance_loss_mlp": 1.08364725, + "epoch": 0.061754520969603696, + "flos": 496589984256.0, + "grad_norm": 0.025351519610416894, + "language_loss": 0.99929821, + "learning_rate": 0.0009973599059609462, + "loss": 1.01181126, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.67382812, + "step": 321, + "time_per_iteration": 2.7139415740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246641, + "balance_loss_mlp": 1.07893777, + "epoch": 0.061946902654867256, + "flos": 441044038656.0, + "grad_norm": 0.025867704850659153, + "language_loss": 0.98033404, + "learning_rate": 0.000997327836468819, + "loss": 0.99280047, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.67431641, + "step": 322, + "time_per_iteration": 2.598400831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250362, + "balance_loss_mlp": 1.08280182, + "epoch": 0.06213928434013082, + "flos": 600042441216.0, + "grad_norm": 0.02535167136018297, + "language_loss": 1.01516175, + "learning_rate": 0.000997295573896902, + "loss": 1.02766538, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.67285156, + "step": 323, + "time_per_iteration": 2.8295648097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125071, + "balance_loss_mlp": 1.0847702, + "epoch": 0.06233166602539438, + "flos": 1453114384896.0, + "grad_norm": 0.012451454042686489, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82446748, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.65625, + "step": 324, + "time_per_iteration": 4.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244164, + "balance_loss_mlp": 1.07803345, + "epoch": 0.06252404771065795, + "flos": 1466628794880.0, + "grad_norm": 0.009026829376029815, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79816103, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.65820312, + "step": 325, + "time_per_iteration": 4.859014272689819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252677, + "balance_loss_mlp": 1.08535445, + "epoch": 0.06271642939592151, + "flos": 465235975680.0, + "grad_norm": 0.02899330239765154, + "language_loss": 0.95714885, + "learning_rate": 0.000997197627828043, + "loss": 0.96967566, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.67041016, + "step": 326, + "time_per_iteration": 2.5137081146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250284, + "balance_loss_mlp": 1.08343852, + "epoch": 0.06290881108118507, + "flos": 533431477248.0, + "grad_norm": 0.02712212536791958, + "language_loss": 0.90827119, + "learning_rate": 0.0009971645930629716, + "loss": 0.92077404, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.66552734, + "step": 327, + "time_per_iteration": 2.6867988109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_mlp": 1.08260453, + "epoch": 0.06310119276644863, + "flos": 674767474176.0, + "grad_norm": 0.026247049513885422, + "language_loss": 1.04735494, + "learning_rate": 0.0009971313652814872, + "loss": 1.0598489, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.66503906, + "step": 328, + "time_per_iteration": 2.845618724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245995, + "balance_loss_mlp": 1.07924485, + "epoch": 0.0632935744517122, + "flos": 772050241536.0, + "grad_norm": 0.03020034978800923, + "language_loss": 1.02482498, + "learning_rate": 0.0009970979444964903, + "loss": 1.03728485, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.66455078, + "step": 329, + "time_per_iteration": 2.967315196990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249674, + "balance_loss_mlp": 1.08316231, + "epoch": 0.06348595613697576, + "flos": 562974638592.0, + "grad_norm": 0.027434293654228625, + "language_loss": 1.03562641, + "learning_rate": 0.0009970643307209556, + "loss": 1.04812312, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.66210938, + "step": 330, + "time_per_iteration": 2.7991747856140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_mlp": 1.0814544, + "epoch": 0.06367833782223932, + "flos": 677383325184.0, + "grad_norm": 0.030236705728133754, + "language_loss": 1.00163436, + "learning_rate": 0.0009970305239679334, + "loss": 1.01411343, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.66162109, + "step": 331, + "time_per_iteration": 2.8012547492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243208, + "balance_loss_mlp": 1.07669675, + "epoch": 0.06387071950750288, + "flos": 496348938240.0, + "grad_norm": 0.029279450628507057, + "language_loss": 1.04491925, + "learning_rate": 0.0009969965242505483, + "loss": 1.05735123, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.66210938, + "step": 332, + "time_per_iteration": 2.658085584640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251001, + "balance_loss_mlp": 1.08463287, + "epoch": 0.06406310119276645, + "flos": 534556116480.0, + "grad_norm": 0.029350032940601952, + "language_loss": 1.00548685, + "learning_rate": 0.0009969623315820007, + "loss": 1.01799679, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.66064453, + "step": 333, + "time_per_iteration": 2.6670596599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238877, + "balance_loss_mlp": 1.07246125, + "epoch": 0.06425548287803001, + "flos": 457164840960.0, + "grad_norm": 0.03277849846880731, + "language_loss": 1.00979996, + "learning_rate": 0.000996927945975565, + "loss": 1.02218866, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 1.66113281, + "step": 334, + "time_per_iteration": 2.5448765754699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245409, + "balance_loss_mlp": 1.0792315, + "epoch": 0.06444786456329357, + "flos": 561122858496.0, + "grad_norm": 0.03573042475309631, + "language_loss": 0.98108363, + "learning_rate": 0.0009968933674445906, + "loss": 0.99353766, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 1.65869141, + "step": 335, + "time_per_iteration": 2.679093360900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242425, + "balance_loss_mlp": 1.07672429, + "epoch": 0.06464024624855713, + "flos": 667356350976.0, + "grad_norm": 0.0316377115871937, + "language_loss": 0.99817598, + "learning_rate": 0.0009968585960025028, + "loss": 1.01060021, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 1.65380859, + "step": 336, + "time_per_iteration": 2.9642832279205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246223, + "balance_loss_mlp": 1.08085632, + "epoch": 0.0648326279338207, + "flos": 1524555549696.0, + "grad_norm": 0.012731648189289846, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78899413, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.65039062, + "step": 337, + "time_per_iteration": 4.799122333526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_mlp": 1.07683408, + "epoch": 0.06502500961908426, + "flos": 1145214959616.0, + "grad_norm": 0.030168792806873873, + "language_loss": 0.98216963, + "learning_rate": 0.0009967884744390583, + "loss": 0.99459207, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 1.65087891, + "step": 338, + "time_per_iteration": 3.513155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243978, + "balance_loss_mlp": 1.07865858, + "epoch": 0.06521739130434782, + "flos": 583693327872.0, + "grad_norm": 0.025823410577593665, + "language_loss": 0.98998213, + "learning_rate": 0.0009967531243449256, + "loss": 1.00242186, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 1.64990234, + "step": 339, + "time_per_iteration": 2.6683707237243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_mlp": 1.07453787, + "epoch": 0.06540977298961138, + "flos": 498658615296.0, + "grad_norm": 0.02384437782241591, + "language_loss": 1.06067204, + "learning_rate": 0.000996717581394126, + "loss": 1.07306671, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 1.64599609, + "step": 340, + "time_per_iteration": 2.5471885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236124, + "balance_loss_mlp": 1.07171023, + "epoch": 0.06560215467487496, + "flos": 543903613440.0, + "grad_norm": 0.02318937955413124, + "language_loss": 1.0712086, + "learning_rate": 0.000996681845600459, + "loss": 1.08356977, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 1.640625, + "step": 341, + "time_per_iteration": 2.651742458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240028, + "balance_loss_mlp": 1.07575738, + "epoch": 0.06579453636013852, + "flos": 414351043584.0, + "grad_norm": 0.026316803994829763, + "language_loss": 0.99228215, + "learning_rate": 0.0009966459169777982, + "loss": 1.00468254, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 1.63916016, + "step": 342, + "time_per_iteration": 2.4996230602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244627, + "balance_loss_mlp": 1.08045232, + "epoch": 0.06598691804540208, + "flos": 561680812032.0, + "grad_norm": 0.03097158399986616, + "language_loss": 1.07124209, + "learning_rate": 0.0009966097955400924, + "loss": 1.08368838, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 1.63818359, + "step": 343, + "time_per_iteration": 2.7243080139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238691, + "balance_loss_mlp": 1.07451606, + "epoch": 0.06617929973066564, + "flos": 573301782528.0, + "grad_norm": 0.022915441754152527, + "language_loss": 1.00964892, + "learning_rate": 0.0009965734813013652, + "loss": 1.02203584, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 1.63818359, + "step": 344, + "time_per_iteration": 2.8087360858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237027, + "balance_loss_mlp": 1.07375824, + "epoch": 0.06637168141592921, + "flos": 491464343040.0, + "grad_norm": 0.024444849604151265, + "language_loss": 1.03758335, + "learning_rate": 0.0009965369742757151, + "loss": 1.04995358, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 1.62890625, + "step": 345, + "time_per_iteration": 2.5691587924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237907, + "balance_loss_mlp": 1.07459044, + "epoch": 0.06656406310119277, + "flos": 1081037924352.0, + "grad_norm": 0.024807678995847144, + "language_loss": 0.99529493, + "learning_rate": 0.0009965002744773152, + "loss": 1.00767398, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 1.62939453, + "step": 346, + "time_per_iteration": 3.507969856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239522, + "balance_loss_mlp": 1.07611036, + "epoch": 0.06675644478645633, + "flos": 514723021824.0, + "grad_norm": 0.02663627628784384, + "language_loss": 0.97097999, + "learning_rate": 0.0009964633819204139, + "loss": 0.98337519, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 1.63037109, + "step": 347, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_mlp": 1.09986115, + "epoch": 0.06694882647171989, + "flos": 1450534189056.0, + "grad_norm": 0.030948258254188146, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83063102, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 1.6171875, + "step": 348, + "time_per_iteration": 5.152506589889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236206, + "balance_loss_mlp": 1.07427216, + "epoch": 0.06714120815698346, + "flos": 1555397266944.0, + "grad_norm": 0.0077968992848742235, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76390088, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.61523438, + "step": 349, + "time_per_iteration": 4.909464120864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242005, + "balance_loss_mlp": 1.07873547, + "epoch": 0.06733358984224702, + "flos": 881615992320.0, + "grad_norm": 0.03432587789196913, + "language_loss": 0.97228402, + "learning_rate": 0.000996351547842304, + "loss": 0.98470408, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 1.62890625, + "step": 350, + "time_per_iteration": 3.1799545288085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240315, + "balance_loss_mlp": 1.0778569, + "epoch": 0.06752597152751058, + "flos": 519917793792.0, + "grad_norm": 0.030803186893757592, + "language_loss": 0.96182388, + "learning_rate": 0.0009963138843953744, + "loss": 0.97422707, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 1.62060547, + "step": 351, + "time_per_iteration": 2.5873348712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238163, + "balance_loss_mlp": 1.07565665, + "epoch": 0.06771835321277414, + "flos": 540882258432.0, + "grad_norm": 0.023778523337364334, + "language_loss": 0.99575555, + "learning_rate": 0.000996276028262306, + "loss": 1.00813723, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 1.62109375, + "step": 352, + "time_per_iteration": 2.7943532466888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238104, + "balance_loss_mlp": 1.07583654, + "epoch": 0.0679107348980377, + "flos": 461615007744.0, + "grad_norm": 0.02720743117278016, + "language_loss": 1.06749547, + "learning_rate": 0.0009962379794577964, + "loss": 1.07987642, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 1.61865234, + "step": 353, + "time_per_iteration": 2.589200973510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239427, + "balance_loss_mlp": 1.07711196, + "epoch": 0.06810311658330127, + "flos": 637207572480.0, + "grad_norm": 0.02321502152829773, + "language_loss": 0.95908678, + "learning_rate": 0.000996199737996617, + "loss": 0.97148108, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 1.61914062, + "step": 354, + "time_per_iteration": 2.8822708129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123871, + "balance_loss_mlp": 1.07687151, + "epoch": 0.06829549826856483, + "flos": 465626743296.0, + "grad_norm": 0.030894548658215056, + "language_loss": 1.05554581, + "learning_rate": 0.0009961613038936149, + "loss": 1.06793284, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 1.61425781, + "step": 355, + "time_per_iteration": 2.576930522918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.07456315, + "epoch": 0.06848787995382839, + "flos": 635896281600.0, + "grad_norm": 0.0286185110148739, + "language_loss": 0.9730283, + "learning_rate": 0.000996122677163711, + "loss": 0.98538941, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 1.61132812, + "step": 356, + "time_per_iteration": 2.850829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237686, + "balance_loss_mlp": 1.07637215, + "epoch": 0.06868026163909195, + "flos": 807780556800.0, + "grad_norm": 0.03078602082995562, + "language_loss": 1.03526855, + "learning_rate": 0.000996083857821902, + "loss": 1.04764557, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 1.60888672, + "step": 357, + "time_per_iteration": 3.124053716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237273, + "balance_loss_mlp": 1.07605469, + "epoch": 0.06887264332435553, + "flos": 440151713280.0, + "grad_norm": 0.02263887650004652, + "language_loss": 1.01701617, + "learning_rate": 0.0009960448458832588, + "loss": 1.0293889, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 1.60791016, + "step": 358, + "time_per_iteration": 2.6918816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242041, + "balance_loss_mlp": 1.08077514, + "epoch": 0.06906502500961909, + "flos": 485785477632.0, + "grad_norm": 0.021707311176365728, + "language_loss": 1.01897752, + "learning_rate": 0.000996005641362927, + "loss": 1.03139794, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 1.60839844, + "step": 359, + "time_per_iteration": 2.601358652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238764, + "balance_loss_mlp": 1.07725942, + "epoch": 0.06925740669488265, + "flos": 734885110272.0, + "grad_norm": 0.024380378407611886, + "language_loss": 1.04387617, + "learning_rate": 0.0009959662442761274, + "loss": 1.05626392, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 1.61083984, + "step": 360, + "time_per_iteration": 2.9404215812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236823, + "balance_loss_mlp": 1.07589066, + "epoch": 0.0694497883801462, + "flos": 553570745856.0, + "grad_norm": 0.023221163769242582, + "language_loss": 0.97943044, + "learning_rate": 0.000995926654638155, + "loss": 0.99179876, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 1.60498047, + "step": 361, + "time_per_iteration": 2.811624526977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234495, + "balance_loss_mlp": 1.07413495, + "epoch": 0.06964217006540978, + "flos": 679243837440.0, + "grad_norm": 0.025577226237571565, + "language_loss": 1.00741839, + "learning_rate": 0.00099588687246438, + "loss": 1.01976323, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 1.59912109, + "step": 362, + "time_per_iteration": 2.826204538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235331, + "balance_loss_mlp": 1.0749228, + "epoch": 0.06983455175067334, + "flos": 525260285952.0, + "grad_norm": 0.054619150892928216, + "language_loss": 1.0805161, + "learning_rate": 0.0009958468977702471, + "loss": 1.09286952, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 1.59960938, + "step": 363, + "time_per_iteration": 2.5742297172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_mlp": 1.11000061, + "epoch": 0.0700269334359369, + "flos": 1580173353984.0, + "grad_norm": 0.0347214045967213, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81004167, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.59179688, + "step": 364, + "time_per_iteration": 4.815373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234235, + "balance_loss_mlp": 1.07420838, + "epoch": 0.07021931512120046, + "flos": 1014856659456.0, + "grad_norm": 0.027565425727799023, + "language_loss": 0.95424879, + "learning_rate": 0.0009957663708830612, + "loss": 0.96659118, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 1.59667969, + "step": 365, + "time_per_iteration": 3.3032214641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238249, + "balance_loss_mlp": 1.07874703, + "epoch": 0.07041169680646403, + "flos": 824431114752.0, + "grad_norm": 0.03609893162101238, + "language_loss": 0.99641442, + "learning_rate": 0.0009957258187212714, + "loss": 1.00879693, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 1.59228516, + "step": 366, + "time_per_iteration": 3.143951654434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_mlp": 1.0748291, + "epoch": 0.07060407849172759, + "flos": 1417290743808.0, + "grad_norm": 0.015479474187128486, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80427808, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.578125, + "step": 367, + "time_per_iteration": 4.856614112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232866, + "balance_loss_mlp": 1.07417488, + "epoch": 0.07079646017699115, + "flos": 513941486592.0, + "grad_norm": 0.03158452537667852, + "language_loss": 0.9606331, + "learning_rate": 0.0009956441370400167, + "loss": 0.97296178, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 1.58398438, + "step": 368, + "time_per_iteration": 2.6471550464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231431, + "balance_loss_mlp": 1.07288289, + "epoch": 0.07098884186225471, + "flos": 541548274176.0, + "grad_norm": 0.03366854249700899, + "language_loss": 1.02536654, + "learning_rate": 0.0009956030075522636, + "loss": 1.03768086, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 1.58251953, + "step": 369, + "time_per_iteration": 2.764350175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_mlp": 1.07183695, + "epoch": 0.07118122354751828, + "flos": 549738931200.0, + "grad_norm": 0.025388205653796188, + "language_loss": 1.02520657, + "learning_rate": 0.0009955616856543587, + "loss": 1.03751087, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 1.58300781, + "step": 370, + "time_per_iteration": 2.6488449573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233332, + "balance_loss_mlp": 1.07483125, + "epoch": 0.07137360523278184, + "flos": 622076424192.0, + "grad_norm": 0.025131147277089937, + "language_loss": 0.94016552, + "learning_rate": 0.0009955201713623448, + "loss": 0.95249885, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 1.58203125, + "step": 371, + "time_per_iteration": 2.7475128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231201, + "balance_loss_mlp": 1.07594299, + "epoch": 0.0715659869180454, + "flos": 1505973347328.0, + "grad_norm": 0.011087848535678398, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77903926, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 1.55664062, + "step": 372, + "time_per_iteration": 4.930227518081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.0769937, + "epoch": 0.07175836860330896, + "flos": 496481195520.0, + "grad_norm": 0.02946804107059058, + "language_loss": 1.07406306, + "learning_rate": 0.0009954365656605333, + "loss": 1.08641148, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 1.57910156, + "step": 373, + "time_per_iteration": 2.5494606494903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235693, + "balance_loss_mlp": 1.07862246, + "epoch": 0.07195075028857253, + "flos": 787081333248.0, + "grad_norm": 0.030340412148976308, + "language_loss": 1.00769055, + "learning_rate": 0.0009953944742831947, + "loss": 1.02004743, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 1.57519531, + "step": 374, + "time_per_iteration": 2.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234053, + "balance_loss_mlp": 1.07707787, + "epoch": 0.0721431319738361, + "flos": 594346111488.0, + "grad_norm": 0.024760984543104554, + "language_loss": 1.04227853, + "learning_rate": 0.0009953521905766642, + "loss": 1.05461907, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 1.57421875, + "step": 375, + "time_per_iteration": 2.9470102787017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233349, + "balance_loss_mlp": 1.07642198, + "epoch": 0.07233551365909965, + "flos": 549328697856.0, + "grad_norm": 0.025099095391344205, + "language_loss": 1.02903581, + "learning_rate": 0.0009953097145573577, + "loss": 1.04136944, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 1.57373047, + "step": 376, + "time_per_iteration": 2.656438112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232315, + "balance_loss_mlp": 1.0754832, + "epoch": 0.07252789534436321, + "flos": 959167723008.0, + "grad_norm": 0.028756244795243427, + "language_loss": 1.01008701, + "learning_rate": 0.000995267046241766, + "loss": 1.02241015, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 1.57275391, + "step": 377, + "time_per_iteration": 3.2601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226098, + "balance_loss_mlp": 1.06931448, + "epoch": 0.07272027702962677, + "flos": 508655390208.0, + "grad_norm": 0.025279277167219092, + "language_loss": 1.00209188, + "learning_rate": 0.0009952241856464547, + "loss": 1.01435292, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 1.57226562, + "step": 378, + "time_per_iteration": 2.616483688354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228279, + "balance_loss_mlp": 1.07159042, + "epoch": 0.07291265871489035, + "flos": 613551395328.0, + "grad_norm": 0.025059419305224793, + "language_loss": 1.0761106, + "learning_rate": 0.0009951811327880632, + "loss": 1.08839345, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 1.57128906, + "step": 379, + "time_per_iteration": 2.7666382789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_mlp": 1.07063651, + "epoch": 0.0731050404001539, + "flos": 496741707264.0, + "grad_norm": 0.032880990240464036, + "language_loss": 1.00766444, + "learning_rate": 0.0009951378876833063, + "loss": 1.01993108, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 1.56445312, + "step": 380, + "time_per_iteration": 2.5504086017608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230504, + "balance_loss_mlp": 1.07433975, + "epoch": 0.07329742208541747, + "flos": 641129985024.0, + "grad_norm": 0.0343074889031262, + "language_loss": 1.0780232, + "learning_rate": 0.0009950944503489736, + "loss": 1.0903281, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 1.56591797, + "step": 381, + "time_per_iteration": 2.7695260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231248, + "balance_loss_mlp": 1.07537043, + "epoch": 0.07348980377068103, + "flos": 817740401664.0, + "grad_norm": 0.027198888726283066, + "language_loss": 1.01785743, + "learning_rate": 0.0009950508208019285, + "loss": 1.03016996, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 1.56298828, + "step": 382, + "time_per_iteration": 2.9918277263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227944, + "balance_loss_mlp": 1.07187521, + "epoch": 0.0736821854559446, + "flos": 509669239296.0, + "grad_norm": 0.03113985633155724, + "language_loss": 1.05612254, + "learning_rate": 0.0009950069990591096, + "loss": 1.06840205, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 1.56494141, + "step": 383, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_mlp": 1.09392548, + "epoch": 0.07387456714120816, + "flos": 1558048046592.0, + "grad_norm": 0.03338671968111017, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77649409, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 1.54492188, + "step": 384, + "time_per_iteration": 4.854166269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229749, + "balance_loss_mlp": 1.0736798, + "epoch": 0.07406694882647172, + "flos": 526643435520.0, + "grad_norm": 0.03274978311793036, + "language_loss": 0.98781282, + "learning_rate": 0.0009949187790542777, + "loss": 1.00011039, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 1.56494141, + "step": 385, + "time_per_iteration": 2.728701591491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123258, + "balance_loss_mlp": 1.07636821, + "epoch": 0.07425933051173528, + "flos": 498823799808.0, + "grad_norm": 0.026908846939264777, + "language_loss": 0.94723004, + "learning_rate": 0.0009948743808265148, + "loss": 0.95955586, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 1.56640625, + "step": 386, + "time_per_iteration": 2.6850693225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231135, + "balance_loss_mlp": 1.07511437, + "epoch": 0.07445171219699885, + "flos": 506057003520.0, + "grad_norm": 0.05633654869747302, + "language_loss": 1.04553366, + "learning_rate": 0.0009948297904714782, + "loss": 1.05784488, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 1.56445312, + "step": 387, + "time_per_iteration": 2.6746010780334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231627, + "balance_loss_mlp": 1.07555866, + "epoch": 0.07464409388226241, + "flos": 555116352000.0, + "grad_norm": 0.03450843374667126, + "language_loss": 0.9665134, + "learning_rate": 0.0009947850080064796, + "loss": 0.97882968, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 1.56494141, + "step": 388, + "time_per_iteration": 2.7839057445526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230193, + "balance_loss_mlp": 1.07431459, + "epoch": 0.07483647556752597, + "flos": 778274325504.0, + "grad_norm": 0.021592891008175935, + "language_loss": 1.01240289, + "learning_rate": 0.0009947400334489047, + "loss": 1.02470493, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 1.56298828, + "step": 389, + "time_per_iteration": 2.9945342540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_mlp": 1.07411718, + "epoch": 0.07502885725278953, + "flos": 613681651200.0, + "grad_norm": 0.023383004705128753, + "language_loss": 0.92341155, + "learning_rate": 0.0009946948668162145, + "loss": 0.93570244, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 1.55371094, + "step": 390, + "time_per_iteration": 2.7355024814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122989, + "balance_loss_mlp": 1.07496524, + "epoch": 0.0752212389380531, + "flos": 689854961664.0, + "grad_norm": 0.026752200694656208, + "language_loss": 0.97335494, + "learning_rate": 0.0009946495081259441, + "loss": 0.98565376, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 1.55322266, + "step": 391, + "time_per_iteration": 2.799938678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227768, + "balance_loss_mlp": 1.07303405, + "epoch": 0.07541362062331666, + "flos": 767050853376.0, + "grad_norm": 0.02596026064524479, + "language_loss": 1.01604676, + "learning_rate": 0.0009946039573957035, + "loss": 1.02832437, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 1.55126953, + "step": 392, + "time_per_iteration": 2.932504415512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123199, + "balance_loss_mlp": 1.07768571, + "epoch": 0.07560600230858022, + "flos": 589908679680.0, + "grad_norm": 0.028382748029943367, + "language_loss": 0.97495323, + "learning_rate": 0.000994558214643177, + "loss": 0.98727316, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.752694845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228178, + "balance_loss_mlp": 1.07425475, + "epoch": 0.07579838399384378, + "flos": 751144900608.0, + "grad_norm": 0.028291982513743617, + "language_loss": 0.99160051, + "learning_rate": 0.000994512279886123, + "loss": 1.00388229, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 1.54296875, + "step": 394, + "time_per_iteration": 3.06592059135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228894, + "balance_loss_mlp": 1.07530475, + "epoch": 0.07599076567910736, + "flos": 524550609408.0, + "grad_norm": 0.023352712612718218, + "language_loss": 0.98641121, + "learning_rate": 0.0009944661531423758, + "loss": 0.99870014, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 1.53955078, + "step": 395, + "time_per_iteration": 2.6720728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122919, + "balance_loss_mlp": 1.07555354, + "epoch": 0.07618314736437092, + "flos": 552185594880.0, + "grad_norm": 0.026216962171459895, + "language_loss": 0.97914684, + "learning_rate": 0.000994419834429843, + "loss": 0.99143875, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 1.54003906, + "step": 396, + "time_per_iteration": 2.6652910709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226861, + "balance_loss_mlp": 1.07308066, + "epoch": 0.07637552904963447, + "flos": 699432771072.0, + "grad_norm": 0.029361663168223213, + "language_loss": 1.03114796, + "learning_rate": 0.0009943733237665069, + "loss": 1.0434165, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 1.54150391, + "step": 397, + "time_per_iteration": 2.808711290359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227023, + "balance_loss_mlp": 1.07329071, + "epoch": 0.07656791073489803, + "flos": 580635042816.0, + "grad_norm": 0.02000560632750303, + "language_loss": 1.01598048, + "learning_rate": 0.0009943266211704248, + "loss": 1.02825069, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 1.54101562, + "step": 398, + "time_per_iteration": 2.9420461654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226854, + "balance_loss_mlp": 1.0732646, + "epoch": 0.0767602924201616, + "flos": 418037139456.0, + "grad_norm": 0.02425852476792673, + "language_loss": 1.03237891, + "learning_rate": 0.000994279726659728, + "loss": 1.04464746, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 1.53955078, + "step": 399, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230296, + "balance_loss_mlp": 1.07675469, + "epoch": 0.07695267410542517, + "flos": 483888035328.0, + "grad_norm": 0.030174375239475117, + "language_loss": 1.02145576, + "learning_rate": 0.0009942326402526231, + "loss": 1.03375876, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 1.5390625, + "step": 400, + "time_per_iteration": 2.5265390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224857, + "balance_loss_mlp": 1.07184029, + "epoch": 0.07714505579068873, + "flos": 532026860544.0, + "grad_norm": 0.024483465572707617, + "language_loss": 0.99344772, + "learning_rate": 0.0009941853619673902, + "loss": 1.0056963, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 1.53369141, + "step": 401, + "time_per_iteration": 2.660491704940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224912, + "balance_loss_mlp": 1.07218146, + "epoch": 0.07733743747595229, + "flos": 806439066624.0, + "grad_norm": 0.032921156451595594, + "language_loss": 1.03587961, + "learning_rate": 0.0009941378918223844, + "loss": 1.04812872, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 1.53076172, + "step": 402, + "time_per_iteration": 3.078272819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222316, + "balance_loss_mlp": 1.06972802, + "epoch": 0.07752981916121585, + "flos": 623613298176.0, + "grad_norm": 0.02596227047756477, + "language_loss": 0.96322513, + "learning_rate": 0.0009940902298360354, + "loss": 0.97544825, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 1.52929688, + "step": 403, + "time_per_iteration": 2.78222918510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224993, + "balance_loss_mlp": 1.07288182, + "epoch": 0.07772220084647942, + "flos": 729542618112.0, + "grad_norm": 0.031231063897144088, + "language_loss": 1.06544566, + "learning_rate": 0.0009940423760268473, + "loss": 1.07769561, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 1.52441406, + "step": 404, + "time_per_iteration": 2.8572018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226552, + "balance_loss_mlp": 1.07472658, + "epoch": 0.07791458253174298, + "flos": 556468575744.0, + "grad_norm": 0.029548764371286118, + "language_loss": 0.99639893, + "learning_rate": 0.0009939943304133982, + "loss": 1.00866449, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 1.52148438, + "step": 405, + "time_per_iteration": 2.607412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226106, + "balance_loss_mlp": 1.07409084, + "epoch": 0.07810696421700654, + "flos": 554234760192.0, + "grad_norm": 0.031141101296471768, + "language_loss": 1.06411445, + "learning_rate": 0.0009939460930143416, + "loss": 1.07637548, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 1.5234375, + "step": 406, + "time_per_iteration": 2.6132876873016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223027, + "balance_loss_mlp": 1.07120168, + "epoch": 0.0782993459022701, + "flos": 651878095872.0, + "grad_norm": 0.023437908852709077, + "language_loss": 1.00106847, + "learning_rate": 0.0009938976638484043, + "loss": 1.01329875, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 1.52148438, + "step": 407, + "time_per_iteration": 2.905681610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218745, + "balance_loss_mlp": 1.06691968, + "epoch": 0.07849172758753367, + "flos": 497160672768.0, + "grad_norm": 0.02891290096917658, + "language_loss": 0.99991584, + "learning_rate": 0.0009938490429343887, + "loss": 1.01210332, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 1.52148438, + "step": 408, + "time_per_iteration": 2.539567708969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222677, + "balance_loss_mlp": 1.07066166, + "epoch": 0.07868410927279723, + "flos": 579075975168.0, + "grad_norm": 0.030601656563413092, + "language_loss": 0.99965751, + "learning_rate": 0.0009938002302911709, + "loss": 1.01188421, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 1.5234375, + "step": 409, + "time_per_iteration": 2.732064962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220028, + "balance_loss_mlp": 1.0680126, + "epoch": 0.07887649095806079, + "flos": 524066515968.0, + "grad_norm": 0.03256443285635905, + "language_loss": 1.03146362, + "learning_rate": 0.0009937512259377015, + "loss": 1.04366398, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 1.5234375, + "step": 410, + "time_per_iteration": 2.6500303745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221864, + "balance_loss_mlp": 1.07013464, + "epoch": 0.07906887264332435, + "flos": 558437876736.0, + "grad_norm": 0.023780630120827737, + "language_loss": 1.01466393, + "learning_rate": 0.000993702029893006, + "loss": 1.02688265, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 1.52050781, + "step": 411, + "time_per_iteration": 2.7921671867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221791, + "balance_loss_mlp": 1.07010949, + "epoch": 0.07926125432858792, + "flos": 823362871296.0, + "grad_norm": 0.04077078343290612, + "language_loss": 1.01153946, + "learning_rate": 0.0009936526421761838, + "loss": 1.02375734, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 1.52001953, + "step": 412, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217861, + "balance_loss_mlp": 1.06632257, + "epoch": 0.07945363601385148, + "flos": 563393604096.0, + "grad_norm": 0.02717343044282308, + "language_loss": 1.04004121, + "learning_rate": 0.000993603062806409, + "loss": 1.05221987, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 1.51855469, + "step": 413, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219172, + "balance_loss_mlp": 1.06844354, + "epoch": 0.07964601769911504, + "flos": 518884478976.0, + "grad_norm": 0.031245789494761384, + "language_loss": 1.07179379, + "learning_rate": 0.0009935532918029298, + "loss": 1.08398533, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 1.51025391, + "step": 414, + "time_per_iteration": 2.668151617050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224604, + "balance_loss_mlp": 1.07387555, + "epoch": 0.0798383993843786, + "flos": 540300109824.0, + "grad_norm": 0.025221671350570463, + "language_loss": 0.99906069, + "learning_rate": 0.0009935033291850694, + "loss": 1.01130676, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 1.51025391, + "step": 415, + "time_per_iteration": 2.64747953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.06547058, + "epoch": 0.08003078106964218, + "flos": 486121850880.0, + "grad_norm": 0.027121462600521052, + "language_loss": 1.02766061, + "learning_rate": 0.0009934531749722247, + "loss": 1.03982067, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 1.50830078, + "step": 416, + "time_per_iteration": 2.5705764293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121625, + "balance_loss_mlp": 1.06576049, + "epoch": 0.08022316275490574, + "flos": 519275246592.0, + "grad_norm": 0.027391361962933233, + "language_loss": 1.00515926, + "learning_rate": 0.0009934028291838672, + "loss": 1.01732171, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 1.5078125, + "step": 417, + "time_per_iteration": 2.7232770919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219761, + "balance_loss_mlp": 1.0695101, + "epoch": 0.0804155444401693, + "flos": 495046379520.0, + "grad_norm": 0.028534904701295792, + "language_loss": 0.95904237, + "learning_rate": 0.0009933522918395433, + "loss": 0.97123998, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 1.50537109, + "step": 418, + "time_per_iteration": 2.670992374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_mlp": 1.11595154, + "epoch": 0.08060792612543285, + "flos": 1584853833216.0, + "grad_norm": 0.03473829356439328, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79516399, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 1.49609375, + "step": 419, + "time_per_iteration": 4.9051830768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222046, + "balance_loss_mlp": 1.07246244, + "epoch": 0.08080030781069643, + "flos": 526358728704.0, + "grad_norm": 0.03232182071246488, + "language_loss": 1.15746891, + "learning_rate": 0.000993250642561551, + "loss": 1.16968942, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 1.49853516, + "step": 420, + "time_per_iteration": 2.596930503845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224313, + "balance_loss_mlp": 1.07487273, + "epoch": 0.08099268949595999, + "flos": 547756895232.0, + "grad_norm": 0.03306568774928502, + "language_loss": 1.00193918, + "learning_rate": 0.0009931995306673466, + "loss": 1.01418233, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 1.49707031, + "step": 421, + "time_per_iteration": 2.704012155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223697, + "balance_loss_mlp": 1.0744468, + "epoch": 0.08118507118122355, + "flos": 511373299200.0, + "grad_norm": 0.026268861479682264, + "language_loss": 1.0597651, + "learning_rate": 0.000993148227296103, + "loss": 1.07200205, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 1.49511719, + "step": 422, + "time_per_iteration": 2.6110117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224578, + "balance_loss_mlp": 1.0751853, + "epoch": 0.08137745286648711, + "flos": 722001239040.0, + "grad_norm": 0.024088300997991936, + "language_loss": 0.92380643, + "learning_rate": 0.000993096732467738, + "loss": 0.9360522, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 1.49658203, + "step": 423, + "time_per_iteration": 2.9790220260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224383, + "balance_loss_mlp": 1.0753237, + "epoch": 0.08156983455175067, + "flos": 680817641472.0, + "grad_norm": 0.029818930066630327, + "language_loss": 1.0177561, + "learning_rate": 0.0009930450462022435, + "loss": 1.02999997, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 1.49316406, + "step": 424, + "time_per_iteration": 2.8023674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223, + "balance_loss_mlp": 1.07518005, + "epoch": 0.08176221623701424, + "flos": 1456588359168.0, + "grad_norm": 0.012435251357338771, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80412811, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.48046875, + "step": 425, + "time_per_iteration": 4.96533989906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219597, + "balance_loss_mlp": 1.0711571, + "epoch": 0.0819545979222778, + "flos": 1558883071488.0, + "grad_norm": 0.04204100969257126, + "language_loss": 1.00605047, + "learning_rate": 0.0009929410994402065, + "loss": 1.01824641, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 1.48681641, + "step": 426, + "time_per_iteration": 3.850475311279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220758, + "balance_loss_mlp": 1.07236588, + "epoch": 0.08214697960754136, + "flos": 513800497152.0, + "grad_norm": 0.03975912273964659, + "language_loss": 1.03955805, + "learning_rate": 0.0009928888389840196, + "loss": 1.05176568, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 1.48632812, + "step": 427, + "time_per_iteration": 2.6892385482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224824, + "balance_loss_mlp": 1.07633698, + "epoch": 0.08233936129280492, + "flos": 596221360128.0, + "grad_norm": 0.02633667259549893, + "language_loss": 1.0604248, + "learning_rate": 0.0009928363871714147, + "loss": 1.07267296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 1.48730469, + "step": 428, + "time_per_iteration": 2.666851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224039, + "balance_loss_mlp": 1.07550442, + "epoch": 0.08253174297806849, + "flos": 573164795904.0, + "grad_norm": 0.03052010415677114, + "language_loss": 0.99677718, + "learning_rate": 0.0009927837440227556, + "loss": 1.00901759, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 1.48779297, + "step": 429, + "time_per_iteration": 2.810197591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228416, + "balance_loss_mlp": 1.07992899, + "epoch": 0.08272412466333205, + "flos": 624642610176.0, + "grad_norm": 0.029909202440675912, + "language_loss": 0.93710327, + "learning_rate": 0.0009927309095584798, + "loss": 0.94938743, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 1.48730469, + "step": 430, + "time_per_iteration": 2.98052978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122165, + "balance_loss_mlp": 1.07316256, + "epoch": 0.08291650634859561, + "flos": 514994267136.0, + "grad_norm": 0.038201439099628094, + "language_loss": 1.07072532, + "learning_rate": 0.0009926778837991, + "loss": 1.08294177, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 1.48730469, + "step": 431, + "time_per_iteration": 2.613912582397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223506, + "balance_loss_mlp": 1.07516193, + "epoch": 0.08310888803385917, + "flos": 668541388800.0, + "grad_norm": 0.02618037233016902, + "language_loss": 1.04762018, + "learning_rate": 0.000992624666765202, + "loss": 1.05985522, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 1.48583984, + "step": 432, + "time_per_iteration": 2.785602331161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224029, + "balance_loss_mlp": 1.07659137, + "epoch": 0.08330126971912274, + "flos": 584490326016.0, + "grad_norm": 0.023129420064945467, + "language_loss": 1.02043724, + "learning_rate": 0.000992571258477447, + "loss": 1.03267753, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 1.4765625, + "step": 433, + "time_per_iteration": 2.7774012088775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225333, + "balance_loss_mlp": 1.07799041, + "epoch": 0.0834936514043863, + "flos": 562497275904.0, + "grad_norm": 0.02412369992445121, + "language_loss": 0.95710295, + "learning_rate": 0.0009925176589565695, + "loss": 0.9693563, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 1.47558594, + "step": 434, + "time_per_iteration": 2.7975149154663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224713, + "balance_loss_mlp": 1.07751381, + "epoch": 0.08368603308964986, + "flos": 495513008640.0, + "grad_norm": 0.023499028814372425, + "language_loss": 1.06310439, + "learning_rate": 0.0009924638682233791, + "loss": 1.07535148, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 1.47412109, + "step": 435, + "time_per_iteration": 2.5623626708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247864, + "balance_loss_mlp": 1.10328674, + "epoch": 0.08387841477491342, + "flos": 1391808983040.0, + "grad_norm": 0.0329185074425942, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80812454, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.44726562, + "step": 436, + "time_per_iteration": 4.5364601612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219037, + "balance_loss_mlp": 1.07174218, + "epoch": 0.084070796460177, + "flos": 800353970688.0, + "grad_norm": 0.025226905267595717, + "language_loss": 0.95941472, + "learning_rate": 0.0009923557132036668, + "loss": 0.97160506, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 1.47509766, + "step": 437, + "time_per_iteration": 3.031538963317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219746, + "balance_loss_mlp": 1.07226074, + "epoch": 0.08426317814544056, + "flos": 560096274432.0, + "grad_norm": 0.024291343012928023, + "language_loss": 0.99699497, + "learning_rate": 0.0009923013489591345, + "loss": 1.00919247, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 1.47705078, + "step": 438, + "time_per_iteration": 2.741021156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217749, + "balance_loss_mlp": 1.07073975, + "epoch": 0.08445555983070412, + "flos": 811883616768.0, + "grad_norm": 0.02787309358423107, + "language_loss": 0.97740996, + "learning_rate": 0.0009922467935862681, + "loss": 0.98958743, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 1.47216797, + "step": 439, + "time_per_iteration": 3.0727341175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215984, + "balance_loss_mlp": 1.06907046, + "epoch": 0.08464794151596768, + "flos": 511169183232.0, + "grad_norm": 0.02418736148641671, + "language_loss": 1.01547837, + "learning_rate": 0.0009921920471062478, + "loss": 1.0276382, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 1.47119141, + "step": 440, + "time_per_iteration": 2.5793957710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214442, + "balance_loss_mlp": 1.06805265, + "epoch": 0.08484032320123125, + "flos": 557473692672.0, + "grad_norm": 0.02549300900866748, + "language_loss": 0.99590349, + "learning_rate": 0.0009921371095403281, + "loss": 1.00804806, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 1.46582031, + "step": 441, + "time_per_iteration": 2.633976936340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215261, + "balance_loss_mlp": 1.06887233, + "epoch": 0.08503270488649481, + "flos": 528360230400.0, + "grad_norm": 0.023285649852896013, + "language_loss": 1.02823853, + "learning_rate": 0.0009920819809098379, + "loss": 1.04039121, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 1.46582031, + "step": 442, + "time_per_iteration": 2.5975728034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213611, + "balance_loss_mlp": 1.06722176, + "epoch": 0.08522508657175837, + "flos": 615385711104.0, + "grad_norm": 0.021771679570127336, + "language_loss": 0.97986722, + "learning_rate": 0.0009920266612361798, + "loss": 0.99200332, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 1.46582031, + "step": 443, + "time_per_iteration": 2.7284042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214332, + "balance_loss_mlp": 1.06803846, + "epoch": 0.08541746825702193, + "flos": 620986713600.0, + "grad_norm": 0.024601404202987703, + "language_loss": 0.97963679, + "learning_rate": 0.0009919711505408308, + "loss": 0.9917801, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 1.46484375, + "step": 444, + "time_per_iteration": 2.797030448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216522, + "balance_loss_mlp": 1.07051492, + "epoch": 0.08560984994228549, + "flos": 483888035328.0, + "grad_norm": 0.023417740932750293, + "language_loss": 0.96522343, + "learning_rate": 0.000991915448845342, + "loss": 0.97738856, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 1.46191406, + "step": 445, + "time_per_iteration": 2.544638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_mlp": 1.06945765, + "epoch": 0.08580223162754906, + "flos": 518176803840.0, + "grad_norm": 0.025018627604332305, + "language_loss": 1.05275297, + "learning_rate": 0.000991859556171339, + "loss": 1.0649066, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 1.4609375, + "step": 446, + "time_per_iteration": 2.5865097045898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214045, + "balance_loss_mlp": 1.06856191, + "epoch": 0.08599461331281262, + "flos": 532519686144.0, + "grad_norm": 0.025883227843611877, + "language_loss": 1.07190132, + "learning_rate": 0.000991803472540521, + "loss": 1.08404183, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 1.45654297, + "step": 447, + "time_per_iteration": 2.6001055240631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213992, + "balance_loss_mlp": 1.06879497, + "epoch": 0.08618699499807618, + "flos": 791633558016.0, + "grad_norm": 0.022461373320799196, + "language_loss": 1.02303076, + "learning_rate": 0.0009917471979746615, + "loss": 1.03517067, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 1.45361328, + "step": 448, + "time_per_iteration": 2.9621376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218395, + "balance_loss_mlp": 1.07300746, + "epoch": 0.08637937668333974, + "flos": 567114628608.0, + "grad_norm": 0.02449904215267775, + "language_loss": 1.00404847, + "learning_rate": 0.0009916907324956086, + "loss": 1.01623249, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 1.45556641, + "step": 449, + "time_per_iteration": 2.691150188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214944, + "balance_loss_mlp": 1.0697943, + "epoch": 0.08657175836860331, + "flos": 446117286912.0, + "grad_norm": 0.025714213043280993, + "language_loss": 0.97109705, + "learning_rate": 0.0009916340761252837, + "loss": 0.98324645, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 1.453125, + "step": 450, + "time_per_iteration": 2.6118698120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212599, + "balance_loss_mlp": 1.067307, + "epoch": 0.08676414005386687, + "flos": 845588235264.0, + "grad_norm": 0.02612794411743426, + "language_loss": 0.94540501, + "learning_rate": 0.0009915772288856832, + "loss": 0.95753098, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 1.45458984, + "step": 451, + "time_per_iteration": 3.0883219242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213701, + "balance_loss_mlp": 1.06926715, + "epoch": 0.08695652173913043, + "flos": 604483875840.0, + "grad_norm": 0.02003375948944636, + "language_loss": 0.95739877, + "learning_rate": 0.000991520190798877, + "loss": 0.96953583, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 1.44580078, + "step": 452, + "time_per_iteration": 2.8387818336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213572, + "balance_loss_mlp": 1.06928122, + "epoch": 0.08714890342439399, + "flos": 732000015360.0, + "grad_norm": 0.027770143088691506, + "language_loss": 1.06693339, + "learning_rate": 0.0009914629618870089, + "loss": 1.07906914, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 1.44433594, + "step": 453, + "time_per_iteration": 2.9403207302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_mlp": 1.0905838, + "epoch": 0.08734128510965757, + "flos": 1485454044672.0, + "grad_norm": 0.02536208637588336, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79910266, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.43945312, + "step": 454, + "time_per_iteration": 4.803662061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121994, + "balance_loss_mlp": 1.07631683, + "epoch": 0.08753366679492113, + "flos": 1526266340352.0, + "grad_norm": 0.01817690946373191, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82647902, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.4375, + "step": 455, + "time_per_iteration": 4.812621355056763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213204, + "balance_loss_mlp": 1.06919885, + "epoch": 0.08772604848018468, + "flos": 722524263936.0, + "grad_norm": 0.030160618436618963, + "language_loss": 0.98162878, + "learning_rate": 0.0009912901304235883, + "loss": 0.99376082, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 1.44140625, + "step": 456, + "time_per_iteration": 2.9147355556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217818, + "balance_loss_mlp": 1.07386112, + "epoch": 0.08791843016544824, + "flos": 709466476032.0, + "grad_norm": 0.03064824893295274, + "language_loss": 0.96399593, + "learning_rate": 0.000991232138434397, + "loss": 0.97617412, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 1.44091797, + "step": 457, + "time_per_iteration": 2.8735082149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121922, + "balance_loss_mlp": 1.07540572, + "epoch": 0.08811081185071182, + "flos": 474021516288.0, + "grad_norm": 0.03193385229896835, + "language_loss": 1.03185177, + "learning_rate": 0.000991173955731976, + "loss": 1.04404402, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 1.43945312, + "step": 458, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220724, + "balance_loss_mlp": 1.07762539, + "epoch": 0.08830319353597538, + "flos": 686314584576.0, + "grad_norm": 0.057581270182385194, + "language_loss": 1.06524456, + "learning_rate": 0.0009911155823389137, + "loss": 1.07745171, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 1.43212891, + "step": 459, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218235, + "balance_loss_mlp": 1.07513571, + "epoch": 0.08849557522123894, + "flos": 574608344064.0, + "grad_norm": 0.027044136096108284, + "language_loss": 1.01923048, + "learning_rate": 0.000991057018277873, + "loss": 1.03141284, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 1.43212891, + "step": 460, + "time_per_iteration": 2.746169090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212445, + "balance_loss_mlp": 1.0693934, + "epoch": 0.0886879569065025, + "flos": 565627419648.0, + "grad_norm": 0.031092379840733354, + "language_loss": 1.03267121, + "learning_rate": 0.0009909982635715898, + "loss": 1.04479575, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 1.43164062, + "step": 461, + "time_per_iteration": 2.6196396350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212854, + "balance_loss_mlp": 1.06956458, + "epoch": 0.08888033859176607, + "flos": 564956674560.0, + "grad_norm": 0.030181357689894217, + "language_loss": 1.02059078, + "learning_rate": 0.0009909393182428751, + "loss": 1.03271937, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 1.43408203, + "step": 462, + "time_per_iteration": 2.679793357849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216843, + "balance_loss_mlp": 1.07345808, + "epoch": 0.08907272027702963, + "flos": 466742650368.0, + "grad_norm": 0.029240136547664795, + "language_loss": 0.9639132, + "learning_rate": 0.000990880182314614, + "loss": 0.97608161, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 1.43505859, + "step": 463, + "time_per_iteration": 2.712097644805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212421, + "balance_loss_mlp": 1.06922734, + "epoch": 0.08926510196229319, + "flos": 682843338240.0, + "grad_norm": 0.026287763165510035, + "language_loss": 0.96174729, + "learning_rate": 0.0009908208558097643, + "loss": 0.97387147, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 1.43310547, + "step": 464, + "time_per_iteration": 2.906903028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217208, + "balance_loss_mlp": 1.07406175, + "epoch": 0.08945748364755675, + "flos": 597821360640.0, + "grad_norm": 0.024374741633963998, + "language_loss": 0.98668623, + "learning_rate": 0.000990761338751359, + "loss": 0.99885827, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 1.43261719, + "step": 465, + "time_per_iteration": 2.7994933128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225639, + "balance_loss_mlp": 1.08506775, + "epoch": 0.08964986533282032, + "flos": 1589340930048.0, + "grad_norm": 0.02575129149720033, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74885261, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.40625, + "step": 466, + "time_per_iteration": 4.9763429164886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221953, + "balance_loss_mlp": 1.07861578, + "epoch": 0.08984224701808388, + "flos": 534549385728.0, + "grad_norm": 0.024628184063577727, + "language_loss": 1.01551545, + "learning_rate": 0.0009906417330663815, + "loss": 1.02773499, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 1.43457031, + "step": 467, + "time_per_iteration": 2.614560842514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232523, + "balance_loss_mlp": 1.08994913, + "epoch": 0.09003462870334744, + "flos": 479850103296.0, + "grad_norm": 0.03230737833956583, + "language_loss": 0.98222148, + "learning_rate": 0.0009905816444862442, + "loss": 0.99454677, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 1.42675781, + "step": 468, + "time_per_iteration": 2.598146438598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223867, + "balance_loss_mlp": 1.08124495, + "epoch": 0.090227010388611, + "flos": 654902178816.0, + "grad_norm": 0.027522185030294237, + "language_loss": 0.95659769, + "learning_rate": 0.0009905213654454216, + "loss": 0.96883637, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 1.42724609, + "step": 469, + "time_per_iteration": 2.8876352310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219852, + "balance_loss_mlp": 1.07737279, + "epoch": 0.09041939207387456, + "flos": 619358515200.0, + "grad_norm": 0.023282407360439072, + "language_loss": 1.03878951, + "learning_rate": 0.0009904608959673158, + "loss": 1.0509882, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 1.42578125, + "step": 470, + "time_per_iteration": 2.7882330417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213781, + "balance_loss_mlp": 1.0718745, + "epoch": 0.09061177375913813, + "flos": 455295596544.0, + "grad_norm": 0.02882877970469751, + "language_loss": 1.04707062, + "learning_rate": 0.000990400236075403, + "loss": 1.05920839, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 1.41992188, + "step": 471, + "time_per_iteration": 2.5016987323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_mlp": 1.07574117, + "epoch": 0.0908041554444017, + "flos": 545308230144.0, + "grad_norm": 0.02444258884202674, + "language_loss": 1.01020849, + "learning_rate": 0.0009903393857932338, + "loss": 1.02238584, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 1.42089844, + "step": 472, + "time_per_iteration": 2.644397497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218613, + "balance_loss_mlp": 1.07732654, + "epoch": 0.09099653712966525, + "flos": 565466964480.0, + "grad_norm": 0.02685769494428931, + "language_loss": 0.99245131, + "learning_rate": 0.0009902783451444317, + "loss": 1.00463748, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 1.41357422, + "step": 473, + "time_per_iteration": 2.7087745666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214499, + "balance_loss_mlp": 1.07292593, + "epoch": 0.09118891881492881, + "flos": 475501994496.0, + "grad_norm": 0.029476649456104027, + "language_loss": 1.02896917, + "learning_rate": 0.0009902171141526956, + "loss": 1.04111421, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 1.41650391, + "step": 474, + "time_per_iteration": 2.5271990299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215154, + "balance_loss_mlp": 1.07410538, + "epoch": 0.09138130050019239, + "flos": 546990822912.0, + "grad_norm": 0.02490932279529465, + "language_loss": 0.89845926, + "learning_rate": 0.000990155692841797, + "loss": 0.9106108, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 1.41113281, + "step": 475, + "time_per_iteration": 2.958740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214039, + "balance_loss_mlp": 1.07303798, + "epoch": 0.09157368218545595, + "flos": 733973319168.0, + "grad_norm": 0.02740759839690251, + "language_loss": 1.01869047, + "learning_rate": 0.0009900940812355818, + "loss": 1.03083086, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 1.41064453, + "step": 476, + "time_per_iteration": 2.891787528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205639, + "balance_loss_mlp": 1.06478107, + "epoch": 0.0917660638707195, + "flos": 612072918528.0, + "grad_norm": 0.029261712768775452, + "language_loss": 0.99624813, + "learning_rate": 0.00099003227935797, + "loss": 1.0083046, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 1.40917969, + "step": 477, + "time_per_iteration": 2.7569031715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207057, + "balance_loss_mlp": 1.06605613, + "epoch": 0.09195844555598306, + "flos": 657018473472.0, + "grad_norm": 0.026965523070242428, + "language_loss": 1.02860427, + "learning_rate": 0.000989970287232955, + "loss": 1.04067481, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 1.41064453, + "step": 478, + "time_per_iteration": 2.7705225944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212938, + "balance_loss_mlp": 1.07212758, + "epoch": 0.09215082724124664, + "flos": 477540426240.0, + "grad_norm": 0.02578247385618595, + "language_loss": 0.99767786, + "learning_rate": 0.0009899081048846043, + "loss": 1.00980723, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 1.40869141, + "step": 479, + "time_per_iteration": 2.5488922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215229, + "balance_loss_mlp": 1.07437098, + "epoch": 0.0923432089265102, + "flos": 525325413888.0, + "grad_norm": 0.029009434883925433, + "language_loss": 1.05276799, + "learning_rate": 0.0009898457323370593, + "loss": 1.06492031, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 1.40917969, + "step": 480, + "time_per_iteration": 2.5628790855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213957, + "balance_loss_mlp": 1.07314658, + "epoch": 0.09253559061177376, + "flos": 546638986752.0, + "grad_norm": 0.030643020391807937, + "language_loss": 1.01694977, + "learning_rate": 0.000989783169614535, + "loss": 1.02908933, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 1.40869141, + "step": 481, + "time_per_iteration": 2.6431851387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206421, + "balance_loss_mlp": 1.06718445, + "epoch": 0.09272797229703732, + "flos": 1541334362112.0, + "grad_norm": 0.00793715508899474, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79959178, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.39257812, + "step": 482, + "time_per_iteration": 4.84259295463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211177, + "balance_loss_mlp": 1.07041514, + "epoch": 0.09292035398230089, + "flos": 691064194560.0, + "grad_norm": 0.029391602229229655, + "language_loss": 0.99036419, + "learning_rate": 0.000989657473741779, + "loss": 1.00247598, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 1.40820312, + "step": 483, + "time_per_iteration": 2.8193717002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210505, + "balance_loss_mlp": 1.06964695, + "epoch": 0.09311273566756445, + "flos": 510822076416.0, + "grad_norm": 0.026713621627667553, + "language_loss": 1.0060308, + "learning_rate": 0.0009895943406403465, + "loss": 1.01813591, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 1.40917969, + "step": 484, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210956, + "balance_loss_mlp": 1.07071841, + "epoch": 0.09330511735282801, + "flos": 660583045632.0, + "grad_norm": 0.02538483632370611, + "language_loss": 0.94170594, + "learning_rate": 0.0009895310174615338, + "loss": 0.95381546, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 1.40283203, + "step": 485, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210991, + "balance_loss_mlp": 1.0725174, + "epoch": 0.09349749903809157, + "flos": 1456021673472.0, + "grad_norm": 0.008074315810691821, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.7692951, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.38476562, + "step": 486, + "time_per_iteration": 4.652726888656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208546, + "balance_loss_mlp": 1.06868994, + "epoch": 0.09368988072335514, + "flos": 521899829760.0, + "grad_norm": 0.021962490795067104, + "language_loss": 0.97574425, + "learning_rate": 0.0009894038009701782, + "loss": 0.98782969, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 1.39892578, + "step": 487, + "time_per_iteration": 2.647747755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207771, + "balance_loss_mlp": 1.06786692, + "epoch": 0.0938822624086187, + "flos": 498751941120.0, + "grad_norm": 0.02403393711112831, + "language_loss": 1.01297927, + "learning_rate": 0.0009893399077070253, + "loss": 1.02505696, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 1.39941406, + "step": 488, + "time_per_iteration": 2.5559775829315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209251, + "balance_loss_mlp": 1.07006216, + "epoch": 0.09407464409388226, + "flos": 534223746048.0, + "grad_norm": 0.02465812888810929, + "language_loss": 0.94380867, + "learning_rate": 0.0009892758244652718, + "loss": 0.95590127, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 1.39208984, + "step": 489, + "time_per_iteration": 2.6696364879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203933, + "balance_loss_mlp": 1.06398153, + "epoch": 0.09426702577914582, + "flos": 587090714112.0, + "grad_norm": 0.02607881729553482, + "language_loss": 1.01920152, + "learning_rate": 0.0009892115512697968, + "loss": 1.03124094, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 1.39990234, + "step": 490, + "time_per_iteration": 2.645073652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205245, + "balance_loss_mlp": 1.06524527, + "epoch": 0.0944594074644094, + "flos": 504463733760.0, + "grad_norm": 0.02086232355550113, + "language_loss": 1.01703966, + "learning_rate": 0.0009891470881455537, + "loss": 1.02909207, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 1.40039062, + "step": 491, + "time_per_iteration": 2.669978618621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207443, + "balance_loss_mlp": 1.06777763, + "epoch": 0.09465178914967295, + "flos": 572114016768.0, + "grad_norm": 0.026976181820206353, + "language_loss": 1.00743008, + "learning_rate": 0.0009890824351175692, + "loss": 1.01950443, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 1.39697266, + "step": 492, + "time_per_iteration": 2.6572952270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207157, + "balance_loss_mlp": 1.06796801, + "epoch": 0.09484417083493651, + "flos": 550418408448.0, + "grad_norm": 0.023611014675858334, + "language_loss": 1.04079592, + "learning_rate": 0.0009890175922109435, + "loss": 1.05286753, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 1.39208984, + "step": 493, + "time_per_iteration": 2.622361183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120413, + "balance_loss_mlp": 1.06498933, + "epoch": 0.09503655252020007, + "flos": 825271047168.0, + "grad_norm": 0.02510100112233158, + "language_loss": 1.0275588, + "learning_rate": 0.0009889525594508513, + "loss": 1.03960025, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 1.39160156, + "step": 494, + "time_per_iteration": 3.0307581424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202477, + "balance_loss_mlp": 1.06333554, + "epoch": 0.09522893420546363, + "flos": 405517839360.0, + "grad_norm": 0.02234367718934989, + "language_loss": 0.96151906, + "learning_rate": 0.0009888873368625404, + "loss": 0.97354376, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 1.39160156, + "step": 495, + "time_per_iteration": 2.4793317317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205465, + "balance_loss_mlp": 1.06665742, + "epoch": 0.0954213158907272, + "flos": 692255963136.0, + "grad_norm": 0.025506351191757377, + "language_loss": 1.00908709, + "learning_rate": 0.0009888219244713326, + "loss": 1.02114165, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 1.38818359, + "step": 496, + "time_per_iteration": 2.865914821624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206499, + "balance_loss_mlp": 1.06773937, + "epoch": 0.09561369757599077, + "flos": 520074246144.0, + "grad_norm": 0.030124833611481355, + "language_loss": 1.02319717, + "learning_rate": 0.0009887563223026229, + "loss": 1.03526211, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 1.38671875, + "step": 497, + "time_per_iteration": 2.689708948135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210899, + "balance_loss_mlp": 1.07376099, + "epoch": 0.09580607926125433, + "flos": 1388781623808.0, + "grad_norm": 0.014650036919455408, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80279064, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 1.37109375, + "step": 498, + "time_per_iteration": 4.940208196640015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203477, + "balance_loss_mlp": 1.06476545, + "epoch": 0.09599846094651789, + "flos": 718825433088.0, + "grad_norm": 0.028840614245688557, + "language_loss": 0.98952407, + "learning_rate": 0.0009886245487346482, + "loss": 1.00155878, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 1.38427734, + "step": 499, + "time_per_iteration": 3.023056745529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205479, + "balance_loss_mlp": 1.06690967, + "epoch": 0.09619084263178146, + "flos": 386893977600.0, + "grad_norm": 0.031706482821381415, + "language_loss": 1.0340035, + "learning_rate": 0.0009885583773865422, + "loss": 1.04605842, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 1.38183594, + "step": 500, + "time_per_iteration": 2.422914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202787, + "balance_loss_mlp": 1.06479073, + "epoch": 0.09638322431704502, + "flos": 535172467200.0, + "grad_norm": 0.02878579188863982, + "language_loss": 0.99392897, + "learning_rate": 0.0009884920163632524, + "loss": 1.00595689, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 1.37988281, + "step": 501, + "time_per_iteration": 2.6820154190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203474, + "balance_loss_mlp": 1.0655731, + "epoch": 0.09657560600230858, + "flos": 501656501760.0, + "grad_norm": 0.02635733095705931, + "language_loss": 1.03128934, + "learning_rate": 0.000988425465690543, + "loss": 1.04332411, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 1.37890625, + "step": 502, + "time_per_iteration": 2.605536699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204627, + "balance_loss_mlp": 1.06677341, + "epoch": 0.09676798768757214, + "flos": 530331532800.0, + "grad_norm": 0.023374032620567947, + "language_loss": 1.00861204, + "learning_rate": 0.0009883587253942505, + "loss": 1.02065825, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 1.37841797, + "step": 503, + "time_per_iteration": 2.7548091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204765, + "balance_loss_mlp": 1.06686366, + "epoch": 0.09696036937283571, + "flos": 464556498432.0, + "grad_norm": 0.029206950172382878, + "language_loss": 1.0685035, + "learning_rate": 0.0009882917955002862, + "loss": 1.08055115, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 1.37890625, + "step": 504, + "time_per_iteration": 2.520970344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200777, + "balance_loss_mlp": 1.06297076, + "epoch": 0.09715275105809927, + "flos": 536010398208.0, + "grad_norm": 0.02484338661637091, + "language_loss": 0.9770751, + "learning_rate": 0.0009882246760346343, + "loss": 0.98908287, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 1.37695312, + "step": 505, + "time_per_iteration": 2.6314897537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204578, + "balance_loss_mlp": 1.06672478, + "epoch": 0.09734513274336283, + "flos": 455881747968.0, + "grad_norm": 0.02756591702740651, + "language_loss": 1.04990697, + "learning_rate": 0.0009881573670233533, + "loss": 1.06195283, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 1.37451172, + "step": 506, + "time_per_iteration": 2.492464780807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203948, + "balance_loss_mlp": 1.06619, + "epoch": 0.09753751442862639, + "flos": 509827693056.0, + "grad_norm": 0.02954706972608782, + "language_loss": 0.97619581, + "learning_rate": 0.0009880898684925747, + "loss": 0.98823535, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 1.37353516, + "step": 507, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120195, + "balance_loss_mlp": 1.06438243, + "epoch": 0.09772989611388996, + "flos": 485246989824.0, + "grad_norm": 0.02487380392257162, + "language_loss": 0.96617985, + "learning_rate": 0.0009880221804685037, + "loss": 0.97819936, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 1.37158203, + "step": 508, + "time_per_iteration": 2.5352439880371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209412, + "balance_loss_mlp": 1.0741806, + "epoch": 0.09792227779915352, + "flos": 1569316454400.0, + "grad_norm": 0.016823619827393988, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80553836, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.3515625, + "step": 509, + "time_per_iteration": 4.694217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205455, + "balance_loss_mlp": 1.06831706, + "epoch": 0.09811465948441708, + "flos": 588914296320.0, + "grad_norm": 0.032012577058462416, + "language_loss": 1.03636336, + "learning_rate": 0.0009878862360456733, + "loss": 1.04841793, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 1.37011719, + "step": 510, + "time_per_iteration": 2.73879337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208431, + "balance_loss_mlp": 1.07148337, + "epoch": 0.09830704116968064, + "flos": 614128814592.0, + "grad_norm": 0.028115444050206044, + "language_loss": 0.94855493, + "learning_rate": 0.0009878179796996922, + "loss": 0.96063924, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 1.36914062, + "step": 511, + "time_per_iteration": 2.6949734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207361, + "balance_loss_mlp": 1.07050836, + "epoch": 0.09849942285494422, + "flos": 539935538688.0, + "grad_norm": 0.022608937638108787, + "language_loss": 0.9790619, + "learning_rate": 0.0009877495339659754, + "loss": 0.99113548, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 1.36816406, + "step": 512, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214076, + "balance_loss_mlp": 1.0773195, + "epoch": 0.09869180454020778, + "flos": 621603064320.0, + "grad_norm": 0.029833187637910333, + "language_loss": 0.94261241, + "learning_rate": 0.000987680898871096, + "loss": 0.95475316, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 1.3671875, + "step": 513, + "time_per_iteration": 2.6975760459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120845, + "balance_loss_mlp": 1.07145417, + "epoch": 0.09888418622547133, + "flos": 813059922432.0, + "grad_norm": 0.032512892127392744, + "language_loss": 0.9726817, + "learning_rate": 0.0009876120744417, + "loss": 0.98476619, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 1.36767578, + "step": 514, + "time_per_iteration": 2.9514927864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214576, + "balance_loss_mlp": 1.07762837, + "epoch": 0.0990765679107349, + "flos": 536857061376.0, + "grad_norm": 0.028495408786163776, + "language_loss": 1.0346663, + "learning_rate": 0.0009875430607045078, + "loss": 1.04681206, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 1.36523438, + "step": 515, + "time_per_iteration": 2.669271230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209323, + "balance_loss_mlp": 1.07242322, + "epoch": 0.09926894959599845, + "flos": 588970692096.0, + "grad_norm": 0.026228231589839293, + "language_loss": 0.98752952, + "learning_rate": 0.000987473857686313, + "loss": 0.9996227, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 1.36474609, + "step": 516, + "time_per_iteration": 2.7055716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120601, + "balance_loss_mlp": 1.06934881, + "epoch": 0.09946133128126203, + "flos": 642386881536.0, + "grad_norm": 0.0302129460476142, + "language_loss": 1.04248524, + "learning_rate": 0.0009874044654139824, + "loss": 1.05454528, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 1.36230469, + "step": 517, + "time_per_iteration": 2.726618528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200307, + "balance_loss_mlp": 1.06340742, + "epoch": 0.09965371296652559, + "flos": 466725186048.0, + "grad_norm": 0.03251153136411229, + "language_loss": 1.02563679, + "learning_rate": 0.0009873348839144563, + "loss": 1.03763986, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 1.36474609, + "step": 518, + "time_per_iteration": 2.5855953693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200913, + "balance_loss_mlp": 1.06439471, + "epoch": 0.09984609465178915, + "flos": 484558780416.0, + "grad_norm": 0.029627125773621466, + "language_loss": 1.03352094, + "learning_rate": 0.000987265113214749, + "loss": 1.04552996, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 1.36279297, + "step": 519, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201703, + "balance_loss_mlp": 1.06566191, + "epoch": 0.1000384763370527, + "flos": 570095050752.0, + "grad_norm": 0.028931775658430137, + "language_loss": 1.07544637, + "learning_rate": 0.0009871951533419476, + "loss": 1.08746338, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 1.35986328, + "step": 520, + "time_per_iteration": 2.6423709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200484, + "balance_loss_mlp": 1.06439495, + "epoch": 0.10023085802231628, + "flos": 546925694976.0, + "grad_norm": 0.025491893219336172, + "language_loss": 0.95403761, + "learning_rate": 0.0009871250043232132, + "loss": 0.96604246, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 1.36035156, + "step": 521, + "time_per_iteration": 2.7604362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198813, + "balance_loss_mlp": 1.06205583, + "epoch": 0.10042323970757984, + "flos": 504439538688.0, + "grad_norm": 0.029888360913216814, + "language_loss": 0.96113187, + "learning_rate": 0.0009870546661857797, + "loss": 0.97311997, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 1.36328125, + "step": 522, + "time_per_iteration": 2.578458547592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195212, + "balance_loss_mlp": 1.05931365, + "epoch": 0.1006156213928434, + "flos": 771724601856.0, + "grad_norm": 0.029426081780707294, + "language_loss": 1.05752206, + "learning_rate": 0.0009869841389569553, + "loss": 1.0694741, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 1.35839844, + "step": 523, + "time_per_iteration": 2.958531618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.05846703, + "epoch": 0.10080800307810696, + "flos": 491008447488.0, + "grad_norm": 0.024593893632090205, + "language_loss": 0.96497846, + "learning_rate": 0.0009869134226641206, + "loss": 0.97692204, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 1.35839844, + "step": 524, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196113, + "balance_loss_mlp": 1.06030965, + "epoch": 0.10100038476337053, + "flos": 455712560640.0, + "grad_norm": 0.026556514945601337, + "language_loss": 0.98348475, + "learning_rate": 0.0009868425173347303, + "loss": 0.99544585, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 1.35742188, + "step": 525, + "time_per_iteration": 2.6460907459259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196515, + "balance_loss_mlp": 1.06099772, + "epoch": 0.10119276644863409, + "flos": 557573749248.0, + "grad_norm": 0.022458491608374247, + "language_loss": 1.03332829, + "learning_rate": 0.0009867714229963125, + "loss": 1.04529333, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 1.35449219, + "step": 526, + "time_per_iteration": 2.693362236022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119647, + "balance_loss_mlp": 1.0609529, + "epoch": 0.10138514813389765, + "flos": 517219350528.0, + "grad_norm": 0.028969258136437262, + "language_loss": 1.0161202, + "learning_rate": 0.000986700139676468, + "loss": 1.02808487, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 1.35449219, + "step": 527, + "time_per_iteration": 2.5826644897460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202893, + "balance_loss_mlp": 1.06742311, + "epoch": 0.10157752981916121, + "flos": 501563175936.0, + "grad_norm": 0.023004964960346017, + "language_loss": 0.98490077, + "learning_rate": 0.0009866286674028717, + "loss": 0.99692971, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 1.35400391, + "step": 528, + "time_per_iteration": 2.626595973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204326, + "balance_loss_mlp": 1.06876123, + "epoch": 0.10176991150442478, + "flos": 658093447680.0, + "grad_norm": 0.024381421822087013, + "language_loss": 0.95674849, + "learning_rate": 0.0009865570062032717, + "loss": 0.96879184, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 1.35498047, + "step": 529, + "time_per_iteration": 2.916924238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203456, + "balance_loss_mlp": 1.0680815, + "epoch": 0.10196229318968834, + "flos": 574402226688.0, + "grad_norm": 0.021344584600364362, + "language_loss": 0.99175954, + "learning_rate": 0.0009864851561054893, + "loss": 1.00379407, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 1.35302734, + "step": 530, + "time_per_iteration": 2.750075578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203649, + "balance_loss_mlp": 1.06856096, + "epoch": 0.1021546748749519, + "flos": 519255780864.0, + "grad_norm": 0.027896087186932737, + "language_loss": 0.99157, + "learning_rate": 0.0009864131171374191, + "loss": 1.00360656, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 1.35009766, + "step": 531, + "time_per_iteration": 2.6506359577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202329, + "balance_loss_mlp": 1.06728852, + "epoch": 0.10234705656021546, + "flos": 610953008640.0, + "grad_norm": 0.021304730024267197, + "language_loss": 0.98848057, + "learning_rate": 0.0009863408893270292, + "loss": 1.0005039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 1.34960938, + "step": 532, + "time_per_iteration": 2.827632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202805, + "balance_loss_mlp": 1.06776476, + "epoch": 0.10253943824547904, + "flos": 602912073216.0, + "grad_norm": 0.02650069508154076, + "language_loss": 0.95645475, + "learning_rate": 0.0009862684727023605, + "loss": 0.96848285, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 1.34960938, + "step": 533, + "time_per_iteration": 2.730771541595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206135, + "balance_loss_mlp": 1.07152414, + "epoch": 0.1027318199307426, + "flos": 664156349952.0, + "grad_norm": 0.02579556790717569, + "language_loss": 0.96718729, + "learning_rate": 0.0009861958672915283, + "loss": 0.97924864, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 1.34521484, + "step": 534, + "time_per_iteration": 2.825239419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202189, + "balance_loss_mlp": 1.06776834, + "epoch": 0.10292420161600616, + "flos": 684529933824.0, + "grad_norm": 0.02492376876437301, + "language_loss": 0.95656139, + "learning_rate": 0.0009861230731227201, + "loss": 0.96858335, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 1.34326172, + "step": 535, + "time_per_iteration": 2.858596086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203815, + "balance_loss_mlp": 1.06958508, + "epoch": 0.10311658330126972, + "flos": 491268959232.0, + "grad_norm": 0.02833674325523021, + "language_loss": 0.99709427, + "learning_rate": 0.0009860500902241973, + "loss": 1.00913239, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 1.34130859, + "step": 536, + "time_per_iteration": 2.5780303478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197149, + "balance_loss_mlp": 1.06291902, + "epoch": 0.10330896498653329, + "flos": 432686195712.0, + "grad_norm": 0.024484943889946764, + "language_loss": 1.03652823, + "learning_rate": 0.0009859769186242942, + "loss": 1.0484997, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 1.34130859, + "step": 537, + "time_per_iteration": 2.5104598999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119791, + "balance_loss_mlp": 1.06415713, + "epoch": 0.10350134667179685, + "flos": 550641990144.0, + "grad_norm": 0.0271300181774947, + "language_loss": 0.97886324, + "learning_rate": 0.0009859035583514187, + "loss": 0.99084234, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 1.33642578, + "step": 538, + "time_per_iteration": 2.6156880855560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197994, + "balance_loss_mlp": 1.06395507, + "epoch": 0.10369372835706041, + "flos": 641826926592.0, + "grad_norm": 0.024416305433678544, + "language_loss": 1.00991774, + "learning_rate": 0.0009858300094340517, + "loss": 1.02189767, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 1.33935547, + "step": 539, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198436, + "balance_loss_mlp": 1.06468332, + "epoch": 0.10388611004232397, + "flos": 522765958656.0, + "grad_norm": 0.025798430155835095, + "language_loss": 0.9342165, + "learning_rate": 0.0009857562719007473, + "loss": 0.94620085, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 1.33642578, + "step": 540, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204547, + "balance_loss_mlp": 1.07122386, + "epoch": 0.10407849172758753, + "flos": 703739947008.0, + "grad_norm": 0.023593197084580173, + "language_loss": 0.95331407, + "learning_rate": 0.0009856823457801331, + "loss": 0.96535957, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 1.33203125, + "step": 541, + "time_per_iteration": 2.889531373977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202711, + "balance_loss_mlp": 1.06924474, + "epoch": 0.1042708734128511, + "flos": 503944711680.0, + "grad_norm": 0.023957714626313076, + "language_loss": 1.02856565, + "learning_rate": 0.00098560823110091, + "loss": 1.04059267, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 1.33349609, + "step": 542, + "time_per_iteration": 2.6067047119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205134, + "balance_loss_mlp": 1.07185781, + "epoch": 0.10446325509811466, + "flos": 486640872960.0, + "grad_norm": 0.0231214260398276, + "language_loss": 1.01405394, + "learning_rate": 0.000985533927891851, + "loss": 1.02610517, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 1.33154297, + "step": 543, + "time_per_iteration": 2.6622776985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201388, + "balance_loss_mlp": 1.06820762, + "epoch": 0.10465563678337822, + "flos": 569713015296.0, + "grad_norm": 0.023482705287667723, + "language_loss": 1.01015687, + "learning_rate": 0.0009854594361818044, + "loss": 1.02217078, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 1.33056641, + "step": 544, + "time_per_iteration": 2.7061924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195672, + "balance_loss_mlp": 1.06244385, + "epoch": 0.10484801846864178, + "flos": 627242998272.0, + "grad_norm": 0.023194608242680787, + "language_loss": 0.99799937, + "learning_rate": 0.0009853847559996897, + "loss": 1.00995612, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 1.33105469, + "step": 545, + "time_per_iteration": 2.742445707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192128, + "balance_loss_mlp": 1.05885231, + "epoch": 0.10504040015390535, + "flos": 744812754432.0, + "grad_norm": 0.025865682249952955, + "language_loss": 0.99192667, + "learning_rate": 0.0009853098873745, + "loss": 1.00384796, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 1.33154297, + "step": 546, + "time_per_iteration": 3.0260400772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192867, + "balance_loss_mlp": 1.05997264, + "epoch": 0.10523278183916891, + "flos": 587842050048.0, + "grad_norm": 0.02599355243407578, + "language_loss": 0.98197657, + "learning_rate": 0.0009852348303353027, + "loss": 0.99390525, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 1.32763672, + "step": 547, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191481, + "balance_loss_mlp": 1.05844367, + "epoch": 0.10542516352443247, + "flos": 871145857536.0, + "grad_norm": 0.02495252935664815, + "language_loss": 0.91398883, + "learning_rate": 0.000985159584911237, + "loss": 0.92590368, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 1.32910156, + "step": 548, + "time_per_iteration": 3.1012043952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119193, + "balance_loss_mlp": 1.05913138, + "epoch": 0.10561754520969603, + "flos": 506412842496.0, + "grad_norm": 0.025955858684814606, + "language_loss": 0.9925828, + "learning_rate": 0.0009850841511315162, + "loss": 1.00450206, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 1.32666016, + "step": 549, + "time_per_iteration": 2.626220464706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192876, + "balance_loss_mlp": 1.06022012, + "epoch": 0.1058099268949596, + "flos": 561147053568.0, + "grad_norm": 0.02554357007654854, + "language_loss": 0.98952115, + "learning_rate": 0.0009850085290254256, + "loss": 1.00144982, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 1.32519531, + "step": 550, + "time_per_iteration": 2.7464635372161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.06161487, + "epoch": 0.10600230858022316, + "flos": 563159288832.0, + "grad_norm": 0.020736613501838204, + "language_loss": 0.9519307, + "learning_rate": 0.0009849327186223246, + "loss": 0.9638744, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 1.32617188, + "step": 551, + "time_per_iteration": 2.7678163051605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199655, + "balance_loss_mlp": 1.06728542, + "epoch": 0.10619469026548672, + "flos": 495317624832.0, + "grad_norm": 0.02236411826292933, + "language_loss": 1.02411103, + "learning_rate": 0.000984856719951646, + "loss": 1.03610754, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 1.32226562, + "step": 552, + "time_per_iteration": 2.5607285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.06404662, + "epoch": 0.10638707195075028, + "flos": 677463916032.0, + "grad_norm": 0.025808282690500464, + "language_loss": 1.00531495, + "learning_rate": 0.0009847805330428943, + "loss": 1.01727724, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 1.3203125, + "step": 553, + "time_per_iteration": 2.8748667240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190787, + "balance_loss_mlp": 1.05860806, + "epoch": 0.10657945363601386, + "flos": 489035143680.0, + "grad_norm": 0.02571681940882287, + "language_loss": 1.04715252, + "learning_rate": 0.0009847041579256481, + "loss": 1.05906045, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 1.3203125, + "step": 554, + "time_per_iteration": 2.56693696975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191519, + "balance_loss_mlp": 1.05948246, + "epoch": 0.10677183532127742, + "flos": 483970627584.0, + "grad_norm": 0.020874824601389917, + "language_loss": 1.01746583, + "learning_rate": 0.0009846275946295592, + "loss": 1.02938092, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 1.31884766, + "step": 555, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195781, + "balance_loss_mlp": 1.06369734, + "epoch": 0.10696421700654098, + "flos": 657581156352.0, + "grad_norm": 0.023085993180182653, + "language_loss": 0.93557143, + "learning_rate": 0.0009845508431843518, + "loss": 0.94752926, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 1.31933594, + "step": 556, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192823, + "balance_loss_mlp": 1.06088233, + "epoch": 0.10715659869180454, + "flos": 568792492032.0, + "grad_norm": 0.026087632201688016, + "language_loss": 0.9692713, + "learning_rate": 0.0009844739036198233, + "loss": 0.9811995, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 1.31787109, + "step": 557, + "time_per_iteration": 2.6583988666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192362, + "balance_loss_mlp": 1.06051683, + "epoch": 0.10734898037706811, + "flos": 541743657984.0, + "grad_norm": 0.02708275038302545, + "language_loss": 1.03564882, + "learning_rate": 0.0009843967759658448, + "loss": 1.04757237, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 1.31689453, + "step": 558, + "time_per_iteration": 2.6571173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209854, + "balance_loss_mlp": 1.07920074, + "epoch": 0.10754136206233167, + "flos": 1479731518464.0, + "grad_norm": 0.021017403581586082, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73977602, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.30664062, + "step": 559, + "time_per_iteration": 4.901749134063721 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191994, + "balance_loss_mlp": 1.06024349, + "epoch": 0.10773374374759523, + "flos": 513411730944.0, + "grad_norm": 0.02623387515623986, + "language_loss": 1.03025067, + "learning_rate": 0.000984241956509384, + "loss": 1.04217052, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 1.31591797, + "step": 560, + "time_per_iteration": 2.642380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011916, + "balance_loss_mlp": 1.06013584, + "epoch": 0.10792612543285879, + "flos": 497477580288.0, + "grad_norm": 0.029111560342126648, + "language_loss": 1.01683569, + "learning_rate": 0.0009841642647670078, + "loss": 1.02875161, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 1.31298828, + "step": 561, + "time_per_iteration": 2.5994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.06027901, + "epoch": 0.10811850711812235, + "flos": 736836946944.0, + "grad_norm": 0.027918527501713815, + "language_loss": 0.94711685, + "learning_rate": 0.0009840863850553944, + "loss": 0.95903373, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 1.3125, + "step": 562, + "time_per_iteration": 2.980377435684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193377, + "balance_loss_mlp": 1.06215191, + "epoch": 0.10831088880338592, + "flos": 612676534272.0, + "grad_norm": 0.025174626098757973, + "language_loss": 0.99795747, + "learning_rate": 0.0009840083174047782, + "loss": 1.00989127, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 1.31054688, + "step": 563, + "time_per_iteration": 2.7209153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194645, + "balance_loss_mlp": 1.0633713, + "epoch": 0.10850327048864948, + "flos": 557497887744.0, + "grad_norm": 0.021851565940339403, + "language_loss": 0.93414235, + "learning_rate": 0.0009839300618454685, + "loss": 0.94608879, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 1.31103516, + "step": 564, + "time_per_iteration": 2.833120584487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194873, + "balance_loss_mlp": 1.06402934, + "epoch": 0.10869565217391304, + "flos": 604436212224.0, + "grad_norm": 0.021697209366751603, + "language_loss": 0.98980927, + "learning_rate": 0.0009838516184078466, + "loss": 1.00175798, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 1.30664062, + "step": 565, + "time_per_iteration": 2.805722236633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193483, + "balance_loss_mlp": 1.06263876, + "epoch": 0.1088880338591766, + "flos": 527205391872.0, + "grad_norm": 0.024778377976546286, + "language_loss": 0.97356248, + "learning_rate": 0.0009837729871223669, + "loss": 0.98549736, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 1.30664062, + "step": 566, + "time_per_iteration": 2.652186155319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119656, + "balance_loss_mlp": 1.0658114, + "epoch": 0.10908041554444017, + "flos": 621416412672.0, + "grad_norm": 0.023487449334803984, + "language_loss": 0.99301046, + "learning_rate": 0.0009836941680195568, + "loss": 1.00497603, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 1.30566406, + "step": 567, + "time_per_iteration": 2.7732484340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.06144011, + "epoch": 0.10927279722970373, + "flos": 899673168384.0, + "grad_norm": 0.026216288845653656, + "language_loss": 0.95416081, + "learning_rate": 0.0009836151611300166, + "loss": 0.96608174, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 1.3046875, + "step": 568, + "time_per_iteration": 3.174981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.06049693, + "epoch": 0.10946517891496729, + "flos": 529699719168.0, + "grad_norm": 0.02336242427092275, + "language_loss": 1.03071296, + "learning_rate": 0.0009835359664844194, + "loss": 1.04262161, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 1.30273438, + "step": 569, + "time_per_iteration": 2.595041513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190102, + "balance_loss_mlp": 1.06173706, + "epoch": 0.10965756060023085, + "flos": 1563991426560.0, + "grad_norm": 0.006726678932110135, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82226908, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 4.911731719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193915, + "balance_loss_mlp": 1.0634526, + "epoch": 0.10984994228549443, + "flos": 514099940352.0, + "grad_norm": 0.027266515996607284, + "language_loss": 1.00165153, + "learning_rate": 0.0009833770140481118, + "loss": 1.01359057, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 1.30273438, + "step": 571, + "time_per_iteration": 2.6079747676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197777, + "balance_loss_mlp": 1.06741011, + "epoch": 0.11004232397075799, + "flos": 956273895936.0, + "grad_norm": 0.026548665437539986, + "language_loss": 0.90315044, + "learning_rate": 0.000983297256319112, + "loss": 0.91512823, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 1.30175781, + "step": 572, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_mlp": 1.05776477, + "epoch": 0.11023470565602154, + "flos": 489228526080.0, + "grad_norm": 0.026034490292812715, + "language_loss": 0.95817071, + "learning_rate": 0.000983217310957477, + "loss": 0.97005343, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 1.30322266, + "step": 573, + "time_per_iteration": 2.7447898387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190883, + "balance_loss_mlp": 1.06056309, + "epoch": 0.1104270873412851, + "flos": 656990275584.0, + "grad_norm": 0.026590820610190004, + "language_loss": 1.00224817, + "learning_rate": 0.000983137177994244, + "loss": 1.01415706, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 1.30126953, + "step": 574, + "time_per_iteration": 2.846140146255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185115, + "balance_loss_mlp": 1.0552249, + "epoch": 0.11061946902654868, + "flos": 724747345920.0, + "grad_norm": 0.019709272455133778, + "language_loss": 0.93286896, + "learning_rate": 0.0009830568574605235, + "loss": 0.94472009, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 1.29736328, + "step": 575, + "time_per_iteration": 2.922821044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185727, + "balance_loss_mlp": 1.05569339, + "epoch": 0.11081185071181224, + "flos": 836867822592.0, + "grad_norm": 0.025292755419638515, + "language_loss": 0.97880363, + "learning_rate": 0.0009829763493874992, + "loss": 0.99066085, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 1.29833984, + "step": 576, + "time_per_iteration": 3.022394895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183726, + "balance_loss_mlp": 1.05412149, + "epoch": 0.1110042323970758, + "flos": 610282263552.0, + "grad_norm": 0.023453623229808367, + "language_loss": 1.02838886, + "learning_rate": 0.0009828956538064264, + "loss": 1.04022622, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 1.29541016, + "step": 577, + "time_per_iteration": 2.817147970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182671, + "balance_loss_mlp": 1.05316234, + "epoch": 0.11119661408233936, + "flos": 597039825408.0, + "grad_norm": 0.025026186935027953, + "language_loss": 0.99076784, + "learning_rate": 0.0009828147707486344, + "loss": 1.00259459, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 1.29492188, + "step": 578, + "time_per_iteration": 2.6778078079223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186939, + "balance_loss_mlp": 1.05752516, + "epoch": 0.11138899576760293, + "flos": 556887541248.0, + "grad_norm": 0.027590262528076937, + "language_loss": 0.96720088, + "learning_rate": 0.0009827337002455245, + "loss": 0.97907031, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 1.29394531, + "step": 579, + "time_per_iteration": 2.6259562969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188781, + "balance_loss_mlp": 1.05951095, + "epoch": 0.11158137745286649, + "flos": 691062193152.0, + "grad_norm": 0.0223692175133054, + "language_loss": 0.94567806, + "learning_rate": 0.0009826524423285712, + "loss": 0.9575659, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 1.29150391, + "step": 580, + "time_per_iteration": 2.9144554138183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118328, + "balance_loss_mlp": 1.05386627, + "epoch": 0.11177375913813005, + "flos": 764306747904.0, + "grad_norm": 0.02877171771660235, + "language_loss": 0.97941083, + "learning_rate": 0.0009825709970293218, + "loss": 0.9912436, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 1.29296875, + "step": 581, + "time_per_iteration": 2.8999927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181128, + "balance_loss_mlp": 1.05223894, + "epoch": 0.11196614082339361, + "flos": 808030334976.0, + "grad_norm": 0.029325346048851512, + "language_loss": 1.03732872, + "learning_rate": 0.0009824893643793956, + "loss": 1.04913998, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 1.28857422, + "step": 582, + "time_per_iteration": 3.0697131156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.05731773, + "epoch": 0.11215852250865718, + "flos": 559724972544.0, + "grad_norm": 0.028740695003145394, + "language_loss": 0.98446089, + "learning_rate": 0.0009824075444104857, + "loss": 0.99632728, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 1.29150391, + "step": 583, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190407, + "balance_loss_mlp": 1.06147003, + "epoch": 0.11235090419392074, + "flos": 514575301632.0, + "grad_norm": 0.02293328270345756, + "language_loss": 1.02460003, + "learning_rate": 0.000982325537154357, + "loss": 1.03650403, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 1.28808594, + "step": 584, + "time_per_iteration": 2.590156078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188149, + "balance_loss_mlp": 1.05954635, + "epoch": 0.1125432858791843, + "flos": 492432529920.0, + "grad_norm": 0.028214107652977688, + "language_loss": 1.0381788, + "learning_rate": 0.0009822433426428484, + "loss": 1.05006027, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 1.28564453, + "step": 585, + "time_per_iteration": 2.566488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188321, + "balance_loss_mlp": 1.05957532, + "epoch": 0.11273566756444786, + "flos": 511727136768.0, + "grad_norm": 0.027438709113267498, + "language_loss": 0.95940274, + "learning_rate": 0.0009821609609078697, + "loss": 0.971286, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 1.28710938, + "step": 586, + "time_per_iteration": 2.6117701530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189545, + "balance_loss_mlp": 1.06098938, + "epoch": 0.11292804924971142, + "flos": 623639494656.0, + "grad_norm": 0.025949033694362005, + "language_loss": 0.97216725, + "learning_rate": 0.0009820783919814045, + "loss": 0.98406273, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 1.28515625, + "step": 587, + "time_per_iteration": 2.798182249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181783, + "balance_loss_mlp": 1.05360925, + "epoch": 0.113120430934975, + "flos": 479038368768.0, + "grad_norm": 0.03012596671256698, + "language_loss": 0.94172156, + "learning_rate": 0.0009819956358955095, + "loss": 0.95353937, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 1.28125, + "step": 588, + "time_per_iteration": 2.54179310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197707, + "balance_loss_mlp": 1.06905663, + "epoch": 0.11331281262023855, + "flos": 467990814720.0, + "grad_norm": 0.02502737191739997, + "language_loss": 0.9542653, + "learning_rate": 0.0009819126926823127, + "loss": 0.96624243, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 1.28613281, + "step": 589, + "time_per_iteration": 2.5262975692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191554, + "balance_loss_mlp": 1.06333208, + "epoch": 0.11350519430550211, + "flos": 651610853376.0, + "grad_norm": 0.023462259875113876, + "language_loss": 0.96713853, + "learning_rate": 0.000981829562374016, + "loss": 0.97905409, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 1.28173828, + "step": 590, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192039, + "balance_loss_mlp": 1.06415117, + "epoch": 0.11369757599076567, + "flos": 558860845056.0, + "grad_norm": 0.030341732837715945, + "language_loss": 1.07369685, + "learning_rate": 0.0009817462450028933, + "loss": 1.08561718, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 1.27832031, + "step": 591, + "time_per_iteration": 2.638333559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_mlp": 1.06215453, + "epoch": 0.11388995767602925, + "flos": 572305397760.0, + "grad_norm": 0.0238596111294556, + "language_loss": 0.94198918, + "learning_rate": 0.0009816627406012916, + "loss": 0.9538886, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 1.27734375, + "step": 592, + "time_per_iteration": 2.800842523574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191939, + "balance_loss_mlp": 1.06395626, + "epoch": 0.1140823393612928, + "flos": 741743009280.0, + "grad_norm": 0.025351621893671843, + "language_loss": 0.93787777, + "learning_rate": 0.0009815790492016295, + "loss": 0.94979715, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 1.27929688, + "step": 593, + "time_per_iteration": 2.9331579208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191026, + "balance_loss_mlp": 1.06337643, + "epoch": 0.11427472104655637, + "flos": 700251236352.0, + "grad_norm": 0.02689478502881467, + "language_loss": 0.96601468, + "learning_rate": 0.0009814951708363993, + "loss": 0.97792494, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 1.27587891, + "step": 594, + "time_per_iteration": 2.832094192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200218, + "balance_loss_mlp": 1.07414246, + "epoch": 0.11446710273181993, + "flos": 1480352598528.0, + "grad_norm": 0.020191453180706247, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79191208, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 1.25976562, + "step": 595, + "time_per_iteration": 4.752530574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187485, + "balance_loss_mlp": 1.06026483, + "epoch": 0.1146594844170835, + "flos": 495912508416.0, + "grad_norm": 0.02910362847653251, + "language_loss": 0.97498882, + "learning_rate": 0.0009813268533395648, + "loss": 0.98686367, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 1.27148438, + "step": 596, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187961, + "balance_loss_mlp": 1.06093144, + "epoch": 0.11485186610234706, + "flos": 475790704128.0, + "grad_norm": 0.02927093575191284, + "language_loss": 0.98108673, + "learning_rate": 0.0009812424142733073, + "loss": 0.99296629, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 1.26953125, + "step": 597, + "time_per_iteration": 2.5622098445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187255, + "balance_loss_mlp": 1.06046438, + "epoch": 0.11504424778761062, + "flos": 732619094016.0, + "grad_norm": 0.02047017320895946, + "language_loss": 0.92490959, + "learning_rate": 0.000981157788372175, + "loss": 0.93678212, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 1.26708984, + "step": 598, + "time_per_iteration": 3.017120599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185489, + "balance_loss_mlp": 1.05855536, + "epoch": 0.11523662947287418, + "flos": 546962625024.0, + "grad_norm": 0.02044602685826044, + "language_loss": 0.96609688, + "learning_rate": 0.0009810729756690223, + "loss": 0.97795177, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 1.26855469, + "step": 599, + "time_per_iteration": 2.7182610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190213, + "balance_loss_mlp": 1.06323159, + "epoch": 0.11542901115813775, + "flos": 776387616768.0, + "grad_norm": 0.023703305464208416, + "language_loss": 0.99939269, + "learning_rate": 0.0009809879761967766, + "loss": 1.01129484, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 1.26904297, + "step": 600, + "time_per_iteration": 2.9586148262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189892, + "balance_loss_mlp": 1.06319618, + "epoch": 0.11562139284340131, + "flos": 732212863488.0, + "grad_norm": 0.024193120208057816, + "language_loss": 0.99113685, + "learning_rate": 0.0009809027899884378, + "loss": 1.00303578, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 1.26611328, + "step": 601, + "time_per_iteration": 2.885070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183816, + "balance_loss_mlp": 1.05731082, + "epoch": 0.11581377452866487, + "flos": 537039710208.0, + "grad_norm": 0.022696091128935367, + "language_loss": 0.96568906, + "learning_rate": 0.0009808174170770779, + "loss": 0.97752714, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 1.26416016, + "step": 602, + "time_per_iteration": 2.7809743881225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191742, + "balance_loss_mlp": 1.0662384, + "epoch": 0.11600615621392843, + "flos": 1559211617280.0, + "grad_norm": 0.013792800863456836, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86089987, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 1.25390625, + "step": 603, + "time_per_iteration": 4.860181570053101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187966, + "balance_loss_mlp": 1.06169963, + "epoch": 0.116198537899192, + "flos": 538467795456.0, + "grad_norm": 0.022659628017063727, + "language_loss": 1.02766323, + "learning_rate": 0.0009806461112779462, + "loss": 1.03954291, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 1.26171875, + "step": 604, + "time_per_iteration": 2.614189863204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187324, + "balance_loss_mlp": 1.06091404, + "epoch": 0.11639091958445556, + "flos": 455137142784.0, + "grad_norm": 0.0301649070939891, + "language_loss": 1.00891566, + "learning_rate": 0.0009805601784566814, + "loss": 1.02078903, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 1.26318359, + "step": 605, + "time_per_iteration": 2.470878839492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119223, + "balance_loss_mlp": 1.06658351, + "epoch": 0.11658330126971912, + "flos": 556151668224.0, + "grad_norm": 0.025758302551065336, + "language_loss": 1.05099356, + "learning_rate": 0.0009804740590654089, + "loss": 1.0629158, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 1.25537109, + "step": 606, + "time_per_iteration": 2.631462812423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_mlp": 1.06588733, + "epoch": 0.11677568295498268, + "flos": 717600737280.0, + "grad_norm": 0.02545612001836415, + "language_loss": 0.99629396, + "learning_rate": 0.0009803877531375635, + "loss": 1.00821078, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 1.25683594, + "step": 607, + "time_per_iteration": 2.879645586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191881, + "balance_loss_mlp": 1.06613898, + "epoch": 0.11696806464024626, + "flos": 610898614272.0, + "grad_norm": 0.023619167708177922, + "language_loss": 0.99668628, + "learning_rate": 0.0009803012607066523, + "loss": 1.008605, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 1.25634766, + "step": 608, + "time_per_iteration": 2.717660427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189661, + "balance_loss_mlp": 1.06406212, + "epoch": 0.11716044632550981, + "flos": 521415736320.0, + "grad_norm": 0.023557070356346427, + "language_loss": 0.97414643, + "learning_rate": 0.0009802145818062543, + "loss": 0.98604298, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 1.25488281, + "step": 609, + "time_per_iteration": 2.7209720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190685, + "balance_loss_mlp": 1.064991, + "epoch": 0.11735282801077337, + "flos": 508488204288.0, + "grad_norm": 0.03039581956620226, + "language_loss": 1.01476204, + "learning_rate": 0.0009801277164700212, + "loss": 1.02666891, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 1.25585938, + "step": 610, + "time_per_iteration": 2.5900633335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190447, + "balance_loss_mlp": 1.06489623, + "epoch": 0.11754520969603693, + "flos": 687835995648.0, + "grad_norm": 0.028512829376260446, + "language_loss": 0.97853899, + "learning_rate": 0.0009800406647316776, + "loss": 0.99044347, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 1.25439453, + "step": 611, + "time_per_iteration": 2.8018290996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_mlp": 1.06088257, + "epoch": 0.1177375913813005, + "flos": 1545756331008.0, + "grad_norm": 0.00764509792440145, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78099126, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 1.24023438, + "step": 612, + "time_per_iteration": 4.767510175704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_mlp": 1.05974686, + "epoch": 0.11792997306656407, + "flos": 521537260032.0, + "grad_norm": 0.0290479345737112, + "language_loss": 0.97953087, + "learning_rate": 0.000979866002183916, + "loss": 0.99138713, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 1.2578125, + "step": 613, + "time_per_iteration": 2.6752681732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182111, + "balance_loss_mlp": 1.05632174, + "epoch": 0.11812235475182763, + "flos": 667488608256.0, + "grad_norm": 0.030776001440310688, + "language_loss": 0.9883132, + "learning_rate": 0.0009797783914423082, + "loss": 1.00013435, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 1.25683594, + "step": 614, + "time_per_iteration": 2.8556718826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182577, + "balance_loss_mlp": 1.05697787, + "epoch": 0.11831473643709119, + "flos": 622504121856.0, + "grad_norm": 0.02739500646081478, + "language_loss": 0.93579996, + "learning_rate": 0.0009796905944342094, + "loss": 0.94762576, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 1.25488281, + "step": 615, + "time_per_iteration": 2.80253267288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187072, + "balance_loss_mlp": 1.06152117, + "epoch": 0.11850711812235475, + "flos": 457694596608.0, + "grad_norm": 0.020858577781052552, + "language_loss": 0.96166766, + "learning_rate": 0.0009796026111937057, + "loss": 0.9735384, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 1.25439453, + "step": 616, + "time_per_iteration": 2.5763044357299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189497, + "balance_loss_mlp": 1.06404102, + "epoch": 0.11869949980761832, + "flos": 514927137792.0, + "grad_norm": 0.022050319992180305, + "language_loss": 0.96050835, + "learning_rate": 0.0009795144417549552, + "loss": 0.97240329, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 1.25341797, + "step": 617, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186044, + "balance_loss_mlp": 1.06092167, + "epoch": 0.11889188149288188, + "flos": 536156116992.0, + "grad_norm": 0.0238791856796517, + "language_loss": 0.97532642, + "learning_rate": 0.0009794260861521883, + "loss": 0.98718691, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 1.25292969, + "step": 618, + "time_per_iteration": 2.784257173538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_mlp": 1.06445491, + "epoch": 0.11908426317814544, + "flos": 499644266496.0, + "grad_norm": 0.024260475486046627, + "language_loss": 0.96495152, + "learning_rate": 0.0009793375444197075, + "loss": 0.97684348, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 1.25, + "step": 619, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189567, + "balance_loss_mlp": 1.06482673, + "epoch": 0.119276644863409, + "flos": 661067139072.0, + "grad_norm": 0.023292068214373615, + "language_loss": 0.96012962, + "learning_rate": 0.000979248816591888, + "loss": 0.97202522, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 1.25, + "step": 620, + "time_per_iteration": 2.783372640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184512, + "balance_loss_mlp": 1.06001019, + "epoch": 0.11946902654867257, + "flos": 760152021504.0, + "grad_norm": 0.02911418191745056, + "language_loss": 0.95521206, + "learning_rate": 0.0009791599027031766, + "loss": 0.96705711, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 1.24755859, + "step": 621, + "time_per_iteration": 3.04338002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185972, + "balance_loss_mlp": 1.06156564, + "epoch": 0.11966140823393613, + "flos": 682213526016.0, + "grad_norm": 0.0317276180850791, + "language_loss": 0.96021026, + "learning_rate": 0.0009790708027880932, + "loss": 0.97206998, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 1.24658203, + "step": 622, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184547, + "balance_loss_mlp": 1.06171417, + "epoch": 0.11985378991919969, + "flos": 1454298147840.0, + "grad_norm": 0.011779966077399251, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78611839, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 1.23046875, + "step": 623, + "time_per_iteration": 4.88221549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.06291461, + "epoch": 0.12004617160446325, + "flos": 528898718208.0, + "grad_norm": 0.0243802584204396, + "language_loss": 1.01341891, + "learning_rate": 0.0009788920450172487, + "loss": 1.0252955, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 1.25, + "step": 624, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190724, + "balance_loss_mlp": 1.06655562, + "epoch": 0.12023855328972682, + "flos": 475176354816.0, + "grad_norm": 0.025839680970612892, + "language_loss": 0.99598378, + "learning_rate": 0.0009788023872308875, + "loss": 1.00789118, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 1.24414062, + "step": 625, + "time_per_iteration": 2.5168616771698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_mlp": 1.06723785, + "epoch": 0.12043093497499038, + "flos": 1535051880960.0, + "grad_norm": 0.008994278182213968, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76618505, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 1.22460938, + "step": 626, + "time_per_iteration": 4.739393472671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194547, + "balance_loss_mlp": 1.07128501, + "epoch": 0.12062331666025394, + "flos": 540914459136.0, + "grad_norm": 0.025390703641747513, + "language_loss": 1.01758838, + "learning_rate": 0.0009786225140303285, + "loss": 1.02953386, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 1.23486328, + "step": 627, + "time_per_iteration": 2.627995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_mlp": 1.06683803, + "epoch": 0.1208156983455175, + "flos": 512999496192.0, + "grad_norm": 0.027559316114759484, + "language_loss": 1.00245547, + "learning_rate": 0.0009785322986859634, + "loss": 1.0143609, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 1.23925781, + "step": 628, + "time_per_iteration": 2.657465696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011787, + "balance_loss_mlp": 1.05481803, + "epoch": 0.12100808003078108, + "flos": 597589046784.0, + "grad_norm": 0.024406659961039724, + "language_loss": 1.01031506, + "learning_rate": 0.0009784418975588838, + "loss": 1.02210212, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 1.24121094, + "step": 629, + "time_per_iteration": 2.6953535079956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187008, + "balance_loss_mlp": 1.063555, + "epoch": 0.12120046171604464, + "flos": 524066515968.0, + "grad_norm": 0.02180733694842763, + "language_loss": 0.99517697, + "learning_rate": 0.0009783513106841862, + "loss": 1.00704694, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 1.23681641, + "step": 630, + "time_per_iteration": 2.7234978675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189331, + "balance_loss_mlp": 1.06687927, + "epoch": 0.1213928434013082, + "flos": 1557907057152.0, + "grad_norm": 0.011472153843238986, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77922034, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 1.2265625, + "step": 631, + "time_per_iteration": 4.975109100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184278, + "balance_loss_mlp": 1.06072986, + "epoch": 0.12158522508657175, + "flos": 496387869696.0, + "grad_norm": 0.025959921000511615, + "language_loss": 0.96498066, + "learning_rate": 0.0009781695798326854, + "loss": 0.97682351, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 1.23779297, + "step": 632, + "time_per_iteration": 2.5740485191345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_mlp": 1.0608983, + "epoch": 0.12177760677183531, + "flos": 476589703680.0, + "grad_norm": 0.025554774573744533, + "language_loss": 0.96275663, + "learning_rate": 0.0009780784359264365, + "loss": 0.9746002, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 1.23681641, + "step": 633, + "time_per_iteration": 2.604390859603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_mlp": 1.05543518, + "epoch": 0.12196998845709889, + "flos": 1471784635392.0, + "grad_norm": 0.009598735556444526, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75365245, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 1.21289062, + "step": 634, + "time_per_iteration": 4.757449626922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_mlp": 1.05424869, + "epoch": 0.12216237014236245, + "flos": 587748724224.0, + "grad_norm": 0.021555120902870813, + "language_loss": 0.93822527, + "learning_rate": 0.000977895591329867, + "loss": 0.94999647, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 1.23095703, + "step": 635, + "time_per_iteration": 2.7859792709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_mlp": 1.05851305, + "epoch": 0.12235475182762601, + "flos": 599106455040.0, + "grad_norm": 0.023775729584682537, + "language_loss": 0.96009773, + "learning_rate": 0.000977803890710533, + "loss": 0.97191262, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 1.23193359, + "step": 636, + "time_per_iteration": 2.76069712638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180701, + "balance_loss_mlp": 1.05762947, + "epoch": 0.12254713351288957, + "flos": 498760673280.0, + "grad_norm": 0.024707427516876792, + "language_loss": 1.00440359, + "learning_rate": 0.0009777120045912774, + "loss": 1.01621056, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 1.23291016, + "step": 637, + "time_per_iteration": 2.5980072021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118065, + "balance_loss_mlp": 1.05772126, + "epoch": 0.12273951519815314, + "flos": 606980204544.0, + "grad_norm": 0.02489341207380848, + "language_loss": 0.99891078, + "learning_rate": 0.0009776199330077736, + "loss": 1.01071739, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 1.23144531, + "step": 638, + "time_per_iteration": 2.704040288925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181154, + "balance_loss_mlp": 1.05841601, + "epoch": 0.1229318968834167, + "flos": 598984931328.0, + "grad_norm": 0.02631208797714665, + "language_loss": 1.02141118, + "learning_rate": 0.0009775276759957667, + "loss": 1.03322268, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 1.22949219, + "step": 639, + "time_per_iteration": 2.7442896366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.05700564, + "epoch": 0.12312427856868026, + "flos": 679588942848.0, + "grad_norm": 0.026802425502252814, + "language_loss": 1.01084137, + "learning_rate": 0.0009774352335910745, + "loss": 1.02264071, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 1.23144531, + "step": 640, + "time_per_iteration": 2.8294076919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117918, + "balance_loss_mlp": 1.05625129, + "epoch": 0.12331666025394382, + "flos": 610043218944.0, + "grad_norm": 0.020742791942005383, + "language_loss": 1.02118182, + "learning_rate": 0.000977342605829586, + "loss": 1.03297377, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 1.23144531, + "step": 641, + "time_per_iteration": 2.7078418731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180028, + "balance_loss_mlp": 1.05748129, + "epoch": 0.12350904193920739, + "flos": 763840118784.0, + "grad_norm": 0.025027209312251563, + "language_loss": 0.94737858, + "learning_rate": 0.0009772497927472623, + "loss": 0.95917892, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 1.22753906, + "step": 642, + "time_per_iteration": 3.0655579566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177096, + "balance_loss_mlp": 1.05454898, + "epoch": 0.12370142362447095, + "flos": 542049831936.0, + "grad_norm": 0.02608476880613399, + "language_loss": 0.96273685, + "learning_rate": 0.0009771567943801368, + "loss": 0.97450781, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 1.22753906, + "step": 643, + "time_per_iteration": 2.7343406677246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179725, + "balance_loss_mlp": 1.05727291, + "epoch": 0.12389380530973451, + "flos": 549252836352.0, + "grad_norm": 0.02435000122960196, + "language_loss": 0.99357152, + "learning_rate": 0.0009770636107643152, + "loss": 1.00536871, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 1.2265625, + "step": 644, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_mlp": 1.05516136, + "epoch": 0.12408618699499807, + "flos": 541352890368.0, + "grad_norm": 0.02246298440278387, + "language_loss": 0.95392644, + "learning_rate": 0.0009769702419359738, + "loss": 0.96570063, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 1.22460938, + "step": 645, + "time_per_iteration": 2.674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.05904841, + "epoch": 0.12427856868026164, + "flos": 747159361536.0, + "grad_norm": 0.023095982047370255, + "language_loss": 0.97586024, + "learning_rate": 0.000976876687931362, + "loss": 0.98767477, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 1.22607422, + "step": 646, + "time_per_iteration": 2.9833688735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189298, + "balance_loss_mlp": 1.06703711, + "epoch": 0.1244709503655252, + "flos": 534744769536.0, + "grad_norm": 0.03060863164707411, + "language_loss": 0.94044995, + "learning_rate": 0.0009767829487868005, + "loss": 0.95234299, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 1.22460938, + "step": 647, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182997, + "balance_loss_mlp": 1.06073558, + "epoch": 0.12466333205078876, + "flos": 509111285760.0, + "grad_norm": 0.028982594733012217, + "language_loss": 0.98960567, + "learning_rate": 0.000976689024538682, + "loss": 1.00143564, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 1.22460938, + "step": 648, + "time_per_iteration": 2.5837948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183924, + "balance_loss_mlp": 1.06171107, + "epoch": 0.12485571373605232, + "flos": 682639222272.0, + "grad_norm": 0.03213416167398649, + "language_loss": 0.97804081, + "learning_rate": 0.0009765949152234716, + "loss": 0.98988008, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 1.22412109, + "step": 649, + "time_per_iteration": 2.876009702682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_mlp": 1.07243347, + "epoch": 0.1250480954213159, + "flos": 1333198748160.0, + "grad_norm": 0.014891788740719425, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79879445, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 1.2109375, + "step": 650, + "time_per_iteration": 4.675558805465698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_mlp": 1.06152093, + "epoch": 0.12524047710657946, + "flos": 940196754432.0, + "grad_norm": 0.027794334398077363, + "language_loss": 0.91408408, + "learning_rate": 0.0009764061415379919, + "loss": 0.9259119, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 1.21435547, + "step": 651, + "time_per_iteration": 3.260758399963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184193, + "balance_loss_mlp": 1.06288576, + "epoch": 0.12543285879184302, + "flos": 514900941312.0, + "grad_norm": 0.027655948956122736, + "language_loss": 0.97430605, + "learning_rate": 0.0009763114772410109, + "loss": 0.986148, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 1.21484375, + "step": 652, + "time_per_iteration": 2.60402512550354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179616, + "balance_loss_mlp": 1.05849957, + "epoch": 0.12562524047710658, + "flos": 719682829824.0, + "grad_norm": 0.022040452281994895, + "language_loss": 0.94100869, + "learning_rate": 0.0009762166280235146, + "loss": 0.95280486, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 1.21289062, + "step": 653, + "time_per_iteration": 2.953866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177042, + "balance_loss_mlp": 1.05592513, + "epoch": 0.12581762216237014, + "flos": 564798220800.0, + "grad_norm": 0.026345633512325176, + "language_loss": 0.96725851, + "learning_rate": 0.0009761215939223267, + "loss": 0.97902894, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 1.21289062, + "step": 654, + "time_per_iteration": 2.6936216354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176243, + "balance_loss_mlp": 1.0553174, + "epoch": 0.1260100038476337, + "flos": 482900382720.0, + "grad_norm": 0.0302310026354778, + "language_loss": 0.97697163, + "learning_rate": 0.0009760263749743428, + "loss": 0.98873413, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 1.2109375, + "step": 655, + "time_per_iteration": 2.5425992012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173716, + "balance_loss_mlp": 1.05302835, + "epoch": 0.12620238553289725, + "flos": 576701170176.0, + "grad_norm": 0.026173940013352312, + "language_loss": 0.96703827, + "learning_rate": 0.0009759309712165299, + "loss": 0.97877538, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 1.20849609, + "step": 656, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182641, + "balance_loss_mlp": 1.06185794, + "epoch": 0.12639476721816084, + "flos": 532185314304.0, + "grad_norm": 0.024272217680215723, + "language_loss": 1.00863099, + "learning_rate": 0.0009758353826859272, + "loss": 1.02045751, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 1.20947266, + "step": 657, + "time_per_iteration": 2.621317148208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183764, + "balance_loss_mlp": 1.06288576, + "epoch": 0.1265871489034244, + "flos": 691231380480.0, + "grad_norm": 0.02639198012969831, + "language_loss": 0.9913975, + "learning_rate": 0.0009757396094196456, + "loss": 1.00323522, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 1.21044922, + "step": 658, + "time_per_iteration": 2.8867759704589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183942, + "balance_loss_mlp": 1.06311166, + "epoch": 0.12677953058868796, + "flos": 538242212352.0, + "grad_norm": 0.02343039495549204, + "language_loss": 0.91435432, + "learning_rate": 0.0009756436514548673, + "loss": 0.92619371, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 1.20996094, + "step": 659, + "time_per_iteration": 2.8055155277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179962, + "balance_loss_mlp": 1.05903614, + "epoch": 0.12697191227395152, + "flos": 520119908352.0, + "grad_norm": 0.02147737158217614, + "language_loss": 0.94944704, + "learning_rate": 0.0009755475088288466, + "loss": 0.96124667, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 1.2109375, + "step": 660, + "time_per_iteration": 2.713801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179144, + "balance_loss_mlp": 1.05826533, + "epoch": 0.12716429395921508, + "flos": 567665851392.0, + "grad_norm": 0.026687699897107686, + "language_loss": 0.99289566, + "learning_rate": 0.0009754511815789095, + "loss": 1.00468707, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 1.21044922, + "step": 661, + "time_per_iteration": 2.739250898361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176549, + "balance_loss_mlp": 1.05590951, + "epoch": 0.12735667564447864, + "flos": 515141987328.0, + "grad_norm": 0.028028480179563667, + "language_loss": 0.94950283, + "learning_rate": 0.0009753546697424533, + "loss": 0.96126837, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 1.20800781, + "step": 662, + "time_per_iteration": 2.71746826171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180242, + "balance_loss_mlp": 1.05941188, + "epoch": 0.1275490573297422, + "flos": 542321077248.0, + "grad_norm": 0.02443290319898258, + "language_loss": 0.98755229, + "learning_rate": 0.0009752579733569475, + "loss": 0.99935466, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 1.20996094, + "step": 663, + "time_per_iteration": 2.631284713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06030273, + "epoch": 0.12774143901500576, + "flos": 1562024853504.0, + "grad_norm": 0.010147906106003043, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76060903, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 1.19335938, + "step": 664, + "time_per_iteration": 4.941519260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188286, + "balance_loss_mlp": 1.06783676, + "epoch": 0.12793382070026935, + "flos": 614873419776.0, + "grad_norm": 0.028758292375382164, + "language_loss": 1.00255466, + "learning_rate": 0.0009750640270890217, + "loss": 1.01443744, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 1.20605469, + "step": 665, + "time_per_iteration": 2.7382516860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185033, + "balance_loss_mlp": 1.06458378, + "epoch": 0.1281262023855329, + "flos": 709117367808.0, + "grad_norm": 0.02727882395737353, + "language_loss": 1.05972624, + "learning_rate": 0.0009749667772818983, + "loss": 1.0715766, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 1.20605469, + "step": 666, + "time_per_iteration": 2.961103677749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117968, + "balance_loss_mlp": 1.06104279, + "epoch": 0.12831858407079647, + "flos": 1428182572032.0, + "grad_norm": 0.005713660367986308, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78115624, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 1.1875, + "step": 667, + "time_per_iteration": 4.799788475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180825, + "balance_loss_mlp": 1.06056714, + "epoch": 0.12851096575606002, + "flos": 450018232320.0, + "grad_norm": 0.027450705632443572, + "language_loss": 1.04045725, + "learning_rate": 0.0009747717245101093, + "loss": 1.05226541, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 1.20410156, + "step": 668, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181103, + "balance_loss_mlp": 1.0609405, + "epoch": 0.12870334744132358, + "flos": 480909614592.0, + "grad_norm": 0.024743463193645603, + "language_loss": 0.94192064, + "learning_rate": 0.00097467392162117, + "loss": 0.95373166, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 1.203125, + "step": 669, + "time_per_iteration": 2.6341683864593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176215, + "balance_loss_mlp": 1.05609953, + "epoch": 0.12889572912658714, + "flos": 640151064576.0, + "grad_norm": 0.020470833753638586, + "language_loss": 0.98179239, + "learning_rate": 0.0009745759344474708, + "loss": 0.99355447, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 1.20263672, + "step": 670, + "time_per_iteration": 2.8753654956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175464, + "balance_loss_mlp": 1.05530083, + "epoch": 0.1290881108118507, + "flos": 510954333696.0, + "grad_norm": 0.02496408481001148, + "language_loss": 0.98669916, + "learning_rate": 0.0009744777630270536, + "loss": 0.99845386, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 1.203125, + "step": 671, + "time_per_iteration": 2.601480484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173739, + "balance_loss_mlp": 1.05381489, + "epoch": 0.12928049249711426, + "flos": 672290611200.0, + "grad_norm": 0.0267777739546368, + "language_loss": 1.0349828, + "learning_rate": 0.000974379407398032, + "loss": 1.04672015, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 1.20068359, + "step": 672, + "time_per_iteration": 2.8746023178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176311, + "balance_loss_mlp": 1.05633891, + "epoch": 0.12947287418237785, + "flos": 794998743552.0, + "grad_norm": 0.021070447178693698, + "language_loss": 0.89884377, + "learning_rate": 0.0009742808675985913, + "loss": 0.91060686, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 1.20117188, + "step": 673, + "time_per_iteration": 3.106855869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178925, + "balance_loss_mlp": 1.05895269, + "epoch": 0.1296652558676414, + "flos": 486447490560.0, + "grad_norm": 0.028552559493613055, + "language_loss": 1.00707459, + "learning_rate": 0.0009741821436669876, + "loss": 1.0188638, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 1.20117188, + "step": 674, + "time_per_iteration": 2.6221611499786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_mlp": 1.06097043, + "epoch": 0.12985763755290497, + "flos": 454392537600.0, + "grad_norm": 0.03163366532216525, + "language_loss": 1.04449701, + "learning_rate": 0.0009740832356415492, + "loss": 1.05630445, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 1.19921875, + "step": 675, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179614, + "balance_loss_mlp": 1.05968916, + "epoch": 0.13005001923816853, + "flos": 826434617856.0, + "grad_norm": 0.02755997498495484, + "language_loss": 0.99148017, + "learning_rate": 0.0009739841435606756, + "loss": 1.00327623, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 1.20068359, + "step": 676, + "time_per_iteration": 3.026420831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180175, + "balance_loss_mlp": 1.06058431, + "epoch": 0.1302424009234321, + "flos": 532480754688.0, + "grad_norm": 0.02275953253130011, + "language_loss": 0.97366607, + "learning_rate": 0.0009738848674628377, + "loss": 0.98546779, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 1.19726562, + "step": 677, + "time_per_iteration": 2.710205554962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179059, + "balance_loss_mlp": 1.05927801, + "epoch": 0.13043478260869565, + "flos": 526916682240.0, + "grad_norm": 0.02441501439452981, + "language_loss": 0.97902691, + "learning_rate": 0.000973785407386578, + "loss": 0.99081755, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 1.19921875, + "step": 678, + "time_per_iteration": 2.7785394191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184892, + "balance_loss_mlp": 1.06553924, + "epoch": 0.1306271642939592, + "flos": 627416914944.0, + "grad_norm": 0.023801085732510874, + "language_loss": 0.94469249, + "learning_rate": 0.0009736857633705103, + "loss": 0.95654142, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 1.19482422, + "step": 679, + "time_per_iteration": 2.8619470596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177483, + "balance_loss_mlp": 1.05827415, + "epoch": 0.13081954597922277, + "flos": 551840489472.0, + "grad_norm": 0.024512943765722366, + "language_loss": 1.01033652, + "learning_rate": 0.0009735859354533196, + "loss": 1.02211142, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 1.19335938, + "step": 680, + "time_per_iteration": 2.6954457759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176387, + "balance_loss_mlp": 1.05755925, + "epoch": 0.13101192766448633, + "flos": 537955504128.0, + "grad_norm": 0.029188130773433643, + "language_loss": 1.02405858, + "learning_rate": 0.0009734859236737628, + "loss": 1.03582239, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 1.18945312, + "step": 681, + "time_per_iteration": 2.606597661972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172364, + "balance_loss_mlp": 1.05353606, + "epoch": 0.13120430934974991, + "flos": 504513398784.0, + "grad_norm": 0.02625319928532985, + "language_loss": 1.02007055, + "learning_rate": 0.0009733857280706678, + "loss": 1.03179431, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 1.18945312, + "step": 682, + "time_per_iteration": 2.626211404800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_mlp": 1.05010605, + "epoch": 0.13139669103501347, + "flos": 615422641152.0, + "grad_norm": 0.025135553656080285, + "language_loss": 0.9321503, + "learning_rate": 0.000973285348682934, + "loss": 0.94383633, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 1.18603516, + "step": 683, + "time_per_iteration": 2.71779727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190269, + "balance_loss_mlp": 1.07296753, + "epoch": 0.13158907272027703, + "flos": 1488215614464.0, + "grad_norm": 0.025067429703540995, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7908864, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 1.17382812, + "step": 684, + "time_per_iteration": 4.811431169509888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168738, + "balance_loss_mlp": 1.05048192, + "epoch": 0.1317814544055406, + "flos": 987117614592.0, + "grad_norm": 0.026136533405527674, + "language_loss": 0.93269205, + "learning_rate": 0.0009730840387095046, + "loss": 0.94437939, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 1.18359375, + "step": 685, + "time_per_iteration": 3.3154938220977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117288, + "balance_loss_mlp": 1.05443382, + "epoch": 0.13197383609080415, + "flos": 612628870656.0, + "grad_norm": 0.026271684435729213, + "language_loss": 0.99177825, + "learning_rate": 0.0009729831082019642, + "loss": 1.00350702, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 1.18554688, + "step": 686, + "time_per_iteration": 2.79620623588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.06093395, + "epoch": 0.1321662177760677, + "flos": 495554668032.0, + "grad_norm": 0.02508782879826625, + "language_loss": 0.97052312, + "learning_rate": 0.0009728819940660958, + "loss": 0.98231786, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 1.18652344, + "step": 687, + "time_per_iteration": 2.779193162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178983, + "balance_loss_mlp": 1.06067955, + "epoch": 0.13235859946133127, + "flos": 496843765248.0, + "grad_norm": 0.02705130625621755, + "language_loss": 0.97550011, + "learning_rate": 0.0009727806963411557, + "loss": 0.98728997, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 1.18408203, + "step": 688, + "time_per_iteration": 2.5702319145202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.05883551, + "epoch": 0.13255098114659483, + "flos": 512767182336.0, + "grad_norm": 0.022910122085290585, + "language_loss": 0.96022904, + "learning_rate": 0.000972679215066471, + "loss": 0.97200048, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 1.18408203, + "step": 689, + "time_per_iteration": 2.64780592918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178761, + "balance_loss_mlp": 1.06050563, + "epoch": 0.13274336283185842, + "flos": 548399442432.0, + "grad_norm": 0.030606528220640358, + "language_loss": 1.08985806, + "learning_rate": 0.0009725775502814401, + "loss": 1.10164571, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 1.18359375, + "step": 690, + "time_per_iteration": 2.5830535888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06120849, + "epoch": 0.13293574451712198, + "flos": 642002844672.0, + "grad_norm": 0.023439513257655937, + "language_loss": 0.94635952, + "learning_rate": 0.0009724757020255327, + "loss": 0.95815468, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 1.18408203, + "step": 691, + "time_per_iteration": 2.827944278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183334, + "balance_loss_mlp": 1.06517375, + "epoch": 0.13312812620238554, + "flos": 492469459968.0, + "grad_norm": 0.028212898490696088, + "language_loss": 0.96836531, + "learning_rate": 0.0009723736703382902, + "loss": 0.98019874, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 1.18261719, + "step": 692, + "time_per_iteration": 2.6144213676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180114, + "balance_loss_mlp": 1.06200123, + "epoch": 0.1333205078876491, + "flos": 509949216768.0, + "grad_norm": 0.023005533645913036, + "language_loss": 0.90654016, + "learning_rate": 0.0009722714552593244, + "loss": 0.91834128, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 1.18212891, + "step": 693, + "time_per_iteration": 2.600128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180549, + "balance_loss_mlp": 1.06262743, + "epoch": 0.13351288957291266, + "flos": 419591477760.0, + "grad_norm": 0.029950659996273835, + "language_loss": 1.05475199, + "learning_rate": 0.000972169056828319, + "loss": 1.06655741, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 1.18017578, + "step": 694, + "time_per_iteration": 2.466643810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178338, + "balance_loss_mlp": 1.0606066, + "epoch": 0.13370527125817622, + "flos": 617050839552.0, + "grad_norm": 0.021764231653516302, + "language_loss": 0.95444119, + "learning_rate": 0.0009720664750850283, + "loss": 0.96622455, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 1.17822266, + "step": 695, + "time_per_iteration": 2.7776308059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173328, + "balance_loss_mlp": 1.05578816, + "epoch": 0.13389765294343978, + "flos": 627169138176.0, + "grad_norm": 0.026088042391715836, + "language_loss": 1.0165019, + "learning_rate": 0.0009719637100692784, + "loss": 1.0282352, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 1.17626953, + "step": 696, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175294, + "balance_loss_mlp": 1.0578016, + "epoch": 0.13409003462870334, + "flos": 610896612864.0, + "grad_norm": 0.027090913840535472, + "language_loss": 0.92017978, + "learning_rate": 0.0009718607618209661, + "loss": 0.93193275, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 1.17578125, + "step": 697, + "time_per_iteration": 2.8413584232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179845, + "balance_loss_mlp": 1.06235278, + "epoch": 0.13428241631396692, + "flos": 685087887360.0, + "grad_norm": 0.024883061853709334, + "language_loss": 0.95573747, + "learning_rate": 0.0009717576303800595, + "loss": 0.96753585, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 1.17578125, + "step": 698, + "time_per_iteration": 3.047100782394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175386, + "balance_loss_mlp": 1.05794048, + "epoch": 0.13447479799923048, + "flos": 509818960896.0, + "grad_norm": 0.024888049065051182, + "language_loss": 0.95325053, + "learning_rate": 0.0009716543157865975, + "loss": 0.96500432, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 1.17529297, + "step": 699, + "time_per_iteration": 2.7481272220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_mlp": 1.05878782, + "epoch": 0.13466717968449404, + "flos": 899058819072.0, + "grad_norm": 0.023872779385430955, + "language_loss": 0.92076075, + "learning_rate": 0.0009715508180806907, + "loss": 0.93252313, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 1.17529297, + "step": 700, + "time_per_iteration": 3.2107367515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173529, + "balance_loss_mlp": 1.05660856, + "epoch": 0.1348595613697576, + "flos": 991694034432.0, + "grad_norm": 0.023513798430807663, + "language_loss": 1.00262749, + "learning_rate": 0.0009714471373025202, + "loss": 1.01436281, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 1.16992188, + "step": 701, + "time_per_iteration": 3.3966751098632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173715, + "balance_loss_mlp": 1.0566988, + "epoch": 0.13505194305502116, + "flos": 488811561984.0, + "grad_norm": 0.028001983236069502, + "language_loss": 0.99373382, + "learning_rate": 0.0009713432734923386, + "loss": 1.00547099, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 1.17089844, + "step": 702, + "time_per_iteration": 2.615107536315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171408, + "balance_loss_mlp": 1.05439234, + "epoch": 0.13524432474028472, + "flos": 614519582208.0, + "grad_norm": 0.024192478681639117, + "language_loss": 0.96606487, + "learning_rate": 0.0009712392266904696, + "loss": 0.97777891, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 1.17089844, + "step": 703, + "time_per_iteration": 2.7448034286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174325, + "balance_loss_mlp": 1.05740499, + "epoch": 0.13543670642554828, + "flos": 906274558464.0, + "grad_norm": 0.025492480769094515, + "language_loss": 0.96012545, + "learning_rate": 0.0009711349969373076, + "loss": 0.97186869, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 1.16992188, + "step": 704, + "time_per_iteration": 3.1337268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172794, + "balance_loss_mlp": 1.05596876, + "epoch": 0.13562908811081184, + "flos": 551747163648.0, + "grad_norm": 0.026772975251671254, + "language_loss": 0.91034031, + "learning_rate": 0.0009710305842733178, + "loss": 0.9220683, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 1.16894531, + "step": 705, + "time_per_iteration": 2.7571139335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_mlp": 1.05031061, + "epoch": 0.1358214697960754, + "flos": 509037425664.0, + "grad_norm": 0.024292049069741084, + "language_loss": 0.98220038, + "learning_rate": 0.0009709259887390373, + "loss": 0.99387223, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 1.16943359, + "step": 706, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168004, + "balance_loss_mlp": 1.05141699, + "epoch": 0.136013851481339, + "flos": 529923300864.0, + "grad_norm": 0.025926611739077732, + "language_loss": 1.00068641, + "learning_rate": 0.0009708212103750737, + "loss": 1.01236641, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 1.16650391, + "step": 707, + "time_per_iteration": 2.6197190284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168587, + "balance_loss_mlp": 1.05219126, + "epoch": 0.13620623316660255, + "flos": 660320532480.0, + "grad_norm": 0.02235622943703988, + "language_loss": 0.96270919, + "learning_rate": 0.0009707162492221051, + "loss": 0.97439504, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 1.16455078, + "step": 708, + "time_per_iteration": 2.8917648792266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171818, + "balance_loss_mlp": 1.05542207, + "epoch": 0.1363986148518661, + "flos": 673082880000.0, + "grad_norm": 0.027649047287573853, + "language_loss": 0.98132068, + "learning_rate": 0.0009706111053208815, + "loss": 0.99303889, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 1.16455078, + "step": 709, + "time_per_iteration": 2.7827165126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173191, + "balance_loss_mlp": 1.05669987, + "epoch": 0.13659099653712967, + "flos": 474004051968.0, + "grad_norm": 0.02773643003805471, + "language_loss": 0.94597077, + "learning_rate": 0.0009705057787122232, + "loss": 0.9577027, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 1.16552734, + "step": 710, + "time_per_iteration": 2.542836904525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169067, + "balance_loss_mlp": 1.05286229, + "epoch": 0.13678337822239323, + "flos": 453647932416.0, + "grad_norm": 0.0248615327032158, + "language_loss": 0.9884814, + "learning_rate": 0.0009704002694370216, + "loss": 1.00017214, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 1.16259766, + "step": 711, + "time_per_iteration": 2.550527811050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164533, + "balance_loss_mlp": 1.04842281, + "epoch": 0.13697575990765679, + "flos": 520625468928.0, + "grad_norm": 0.0274811578413112, + "language_loss": 0.97066599, + "learning_rate": 0.0009702945775362388, + "loss": 0.98231125, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 1.16162109, + "step": 712, + "time_per_iteration": 2.56953501701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116862, + "balance_loss_mlp": 1.05246294, + "epoch": 0.13716814159292035, + "flos": 481365510144.0, + "grad_norm": 0.025544817797380492, + "language_loss": 0.98621845, + "learning_rate": 0.0009701887030509086, + "loss": 0.99790466, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 1.16210938, + "step": 713, + "time_per_iteration": 2.6443872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_mlp": 1.05663013, + "epoch": 0.1373605232781839, + "flos": 546749776896.0, + "grad_norm": 0.02672517687154734, + "language_loss": 1.02031791, + "learning_rate": 0.0009700826460221346, + "loss": 1.03204811, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 1.16455078, + "step": 714, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_mlp": 1.05508566, + "epoch": 0.1375529049634475, + "flos": 710070091776.0, + "grad_norm": 0.027473841831572973, + "language_loss": 1.03736091, + "learning_rate": 0.0009699764064910921, + "loss": 1.04907441, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 1.16308594, + "step": 715, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_mlp": 1.05281401, + "epoch": 0.13774528664871105, + "flos": 487676189184.0, + "grad_norm": 0.02500038679906112, + "language_loss": 0.96403199, + "learning_rate": 0.0009698699844990268, + "loss": 0.9757241, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 1.16455078, + "step": 716, + "time_per_iteration": 2.638272762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116569, + "balance_loss_mlp": 1.04972363, + "epoch": 0.1379376683339746, + "flos": 681458187264.0, + "grad_norm": 0.024933229917961583, + "language_loss": 0.9565106, + "learning_rate": 0.0009697633800872555, + "loss": 0.96816742, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 1.16015625, + "step": 717, + "time_per_iteration": 2.8989553451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168974, + "balance_loss_mlp": 1.05310297, + "epoch": 0.13813005001923817, + "flos": 612225368064.0, + "grad_norm": 0.02330012063083705, + "language_loss": 1.0130372, + "learning_rate": 0.0009696565932971655, + "loss": 1.02472687, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 1.15917969, + "step": 718, + "time_per_iteration": 2.8472671508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171117, + "balance_loss_mlp": 1.05524576, + "epoch": 0.13832243170450173, + "flos": 589926144000.0, + "grad_norm": 0.027418468702626427, + "language_loss": 0.98498988, + "learning_rate": 0.0009695496241702153, + "loss": 0.99670106, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 1.15917969, + "step": 719, + "time_per_iteration": 2.786895990371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167345, + "balance_loss_mlp": 1.05180764, + "epoch": 0.1385148133897653, + "flos": 701319479808.0, + "grad_norm": 0.026285913371991803, + "language_loss": 0.94868541, + "learning_rate": 0.0009694424727479339, + "loss": 0.96035892, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 1.15576172, + "step": 720, + "time_per_iteration": 2.921644926071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117298, + "balance_loss_mlp": 1.05729949, + "epoch": 0.13870719507502885, + "flos": 599366966784.0, + "grad_norm": 0.024279001882637877, + "language_loss": 0.97845113, + "learning_rate": 0.0009693351390719213, + "loss": 0.99018097, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 1.15722656, + "step": 721, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168632, + "balance_loss_mlp": 1.05304694, + "epoch": 0.1388995767602924, + "flos": 587748724224.0, + "grad_norm": 0.03212240351747381, + "language_loss": 0.98596126, + "learning_rate": 0.000969227623183848, + "loss": 0.99764758, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 1.15625, + "step": 722, + "time_per_iteration": 2.7723541259765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_mlp": 1.05205071, + "epoch": 0.139091958445556, + "flos": 652362189312.0, + "grad_norm": 0.025655198862846312, + "language_loss": 0.99224544, + "learning_rate": 0.0009691199251254554, + "loss": 1.00392079, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 1.15527344, + "step": 723, + "time_per_iteration": 2.8426058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165537, + "balance_loss_mlp": 1.05019021, + "epoch": 0.13928434013081956, + "flos": 576905286144.0, + "grad_norm": 0.022500478429048027, + "language_loss": 0.9243086, + "learning_rate": 0.0009690120449385555, + "loss": 0.93596393, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 1.15380859, + "step": 724, + "time_per_iteration": 2.7558276653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168709, + "balance_loss_mlp": 1.05307627, + "epoch": 0.13947672181608312, + "flos": 564314127360.0, + "grad_norm": 0.02294482348940274, + "language_loss": 1.00981367, + "learning_rate": 0.0009689039826650312, + "loss": 1.02150071, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 1.15673828, + "step": 725, + "time_per_iteration": 2.784708261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211281, + "balance_loss_mlp": 1.09550476, + "epoch": 0.13966910350134668, + "flos": 1524949045248.0, + "grad_norm": 0.02639881420994122, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77734339, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 1.15820312, + "step": 726, + "time_per_iteration": 4.9523255825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171441, + "balance_loss_mlp": 1.05604661, + "epoch": 0.13986148518661023, + "flos": 500855500800.0, + "grad_norm": 0.0321160389091748, + "language_loss": 0.98954523, + "learning_rate": 0.0009686873120259941, + "loss": 1.00125957, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 1.15429688, + "step": 727, + "time_per_iteration": 2.584141731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173326, + "balance_loss_mlp": 1.05850363, + "epoch": 0.1400538668718738, + "flos": 599849058816.0, + "grad_norm": 0.027531106684590426, + "language_loss": 0.93834305, + "learning_rate": 0.0009685787037446004, + "loss": 0.95007634, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 1.1484375, + "step": 728, + "time_per_iteration": 2.770592451095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_mlp": 1.05520177, + "epoch": 0.14024624855713735, + "flos": 595168579584.0, + "grad_norm": 0.026051179565135866, + "language_loss": 0.98294961, + "learning_rate": 0.0009684699135448201, + "loss": 0.99465179, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 1.15039062, + "step": 729, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_mlp": 1.04985154, + "epoch": 0.1404386302424009, + "flos": 507585145344.0, + "grad_norm": 0.02205061924934426, + "language_loss": 0.98307908, + "learning_rate": 0.0009683609414688895, + "loss": 0.99472773, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 1.15039062, + "step": 730, + "time_per_iteration": 2.700016975402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.05254078, + "epoch": 0.14063101192766447, + "flos": 574515018240.0, + "grad_norm": 0.021243768346974407, + "language_loss": 0.95329058, + "learning_rate": 0.0009682517875591154, + "loss": 0.96496415, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 1.1484375, + "step": 731, + "time_per_iteration": 2.743590831756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.05264843, + "epoch": 0.14082339361292806, + "flos": 565764406272.0, + "grad_norm": 0.02284757167221282, + "language_loss": 0.93998873, + "learning_rate": 0.0009681424518578749, + "loss": 0.95166153, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 1.14648438, + "step": 732, + "time_per_iteration": 2.757690668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166596, + "balance_loss_mlp": 1.05215514, + "epoch": 0.14101577529819162, + "flos": 464582694912.0, + "grad_norm": 0.02112517179619274, + "language_loss": 0.95363593, + "learning_rate": 0.000968032934407616, + "loss": 0.96530199, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 1.14453125, + "step": 733, + "time_per_iteration": 2.6260647773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_mlp": 1.05257201, + "epoch": 0.14120815698345518, + "flos": 597261405696.0, + "grad_norm": 0.02235342076428548, + "language_loss": 0.90822989, + "learning_rate": 0.0009679232352508571, + "loss": 0.91990006, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 1.14453125, + "step": 734, + "time_per_iteration": 2.7677996158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167689, + "balance_loss_mlp": 1.05334342, + "epoch": 0.14140053866871874, + "flos": 536231978496.0, + "grad_norm": 0.023954026934244203, + "language_loss": 0.90350544, + "learning_rate": 0.0009678133544301871, + "loss": 0.91518235, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 1.14355469, + "step": 735, + "time_per_iteration": 2.6668286323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165912, + "balance_loss_mlp": 1.05147135, + "epoch": 0.1415929203539823, + "flos": 521276748288.0, + "grad_norm": 0.01836780541558419, + "language_loss": 0.98091269, + "learning_rate": 0.0009677032919882658, + "loss": 0.99257177, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 1.14453125, + "step": 736, + "time_per_iteration": 2.654975652694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_mlp": 1.0601368, + "epoch": 0.14178530203924586, + "flos": 483301883904.0, + "grad_norm": 0.025248480485652293, + "language_loss": 1.00008237, + "learning_rate": 0.000967593047967823, + "loss": 1.01183295, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 1.14941406, + "step": 737, + "time_per_iteration": 2.529147148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167635, + "balance_loss_mlp": 1.05319452, + "epoch": 0.14197768372450942, + "flos": 677839220736.0, + "grad_norm": 0.02278890168576414, + "language_loss": 0.9561522, + "learning_rate": 0.0009674826224116593, + "loss": 0.96782857, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 1.14453125, + "step": 738, + "time_per_iteration": 2.8032455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.05606639, + "epoch": 0.14217006540977298, + "flos": 446992147968.0, + "grad_norm": 0.026055784762538982, + "language_loss": 0.97800839, + "learning_rate": 0.0009673720153626455, + "loss": 0.989712, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 1.14306641, + "step": 739, + "time_per_iteration": 2.629868984222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172861, + "balance_loss_mlp": 1.05889642, + "epoch": 0.14236244709503657, + "flos": 497477580288.0, + "grad_norm": 0.02475738760241807, + "language_loss": 0.95941108, + "learning_rate": 0.0009672612268637235, + "loss": 0.97113973, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 1.13964844, + "step": 740, + "time_per_iteration": 2.6037824153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170194, + "balance_loss_mlp": 1.05618262, + "epoch": 0.14255482878030012, + "flos": 649479095808.0, + "grad_norm": 0.03387034378547869, + "language_loss": 0.95329261, + "learning_rate": 0.0009671502569579048, + "loss": 0.96499455, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 1.14013672, + "step": 741, + "time_per_iteration": 2.7700846195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.05657792, + "epoch": 0.14274721046556368, + "flos": 537274025472.0, + "grad_norm": 0.02433568326488268, + "language_loss": 0.98081231, + "learning_rate": 0.0009670391056882719, + "loss": 0.99251777, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 1.13964844, + "step": 742, + "time_per_iteration": 2.696019172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174036, + "balance_loss_mlp": 1.06002402, + "epoch": 0.14293959215082724, + "flos": 958583572992.0, + "grad_norm": 0.027423351639808666, + "language_loss": 0.96458268, + "learning_rate": 0.0009669277730979776, + "loss": 0.97632295, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 1.14013672, + "step": 743, + "time_per_iteration": 3.2084367275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174905, + "balance_loss_mlp": 1.06103587, + "epoch": 0.1431319738360908, + "flos": 694385719296.0, + "grad_norm": 0.02304461389980259, + "language_loss": 0.94654781, + "learning_rate": 0.0009668162592302449, + "loss": 0.9582969, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 1.13867188, + "step": 744, + "time_per_iteration": 2.8862292766571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184206, + "balance_loss_mlp": 1.07009852, + "epoch": 0.14332435552135436, + "flos": 566502280704.0, + "grad_norm": 0.024928546312887438, + "language_loss": 0.9473027, + "learning_rate": 0.0009667045641283676, + "loss": 0.95914471, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 1.14111328, + "step": 745, + "time_per_iteration": 2.6714677810668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_mlp": 1.05672932, + "epoch": 0.14351673720661792, + "flos": 739695845376.0, + "grad_norm": 0.027004630074695047, + "language_loss": 1.03854704, + "learning_rate": 0.0009665926878357092, + "loss": 1.05025315, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 1.13867188, + "step": 746, + "time_per_iteration": 2.9414963722229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168037, + "balance_loss_mlp": 1.05416811, + "epoch": 0.14370911889188148, + "flos": 550351279104.0, + "grad_norm": 0.024394803732961844, + "language_loss": 0.99195439, + "learning_rate": 0.0009664806303957043, + "loss": 1.00363481, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 1.13867188, + "step": 747, + "time_per_iteration": 2.6798276901245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175063, + "balance_loss_mlp": 1.06109881, + "epoch": 0.14390150057714507, + "flos": 591589271040.0, + "grad_norm": 0.028912253716933817, + "language_loss": 0.96970344, + "learning_rate": 0.0009663683918518571, + "loss": 0.98145401, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 1.13964844, + "step": 748, + "time_per_iteration": 2.894670248031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172034, + "balance_loss_mlp": 1.05845118, + "epoch": 0.14409388226240863, + "flos": 592144496640.0, + "grad_norm": 0.025560266799661176, + "language_loss": 0.96381319, + "learning_rate": 0.0009662559722477428, + "loss": 0.97553355, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 1.13574219, + "step": 749, + "time_per_iteration": 2.702796220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193848, + "balance_loss_mlp": 1.08131409, + "epoch": 0.1442862639476722, + "flos": 1514654828544.0, + "grad_norm": 0.02305864885865106, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77356815, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 1.125, + "step": 750, + "time_per_iteration": 5.010634660720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_mlp": 1.05287659, + "epoch": 0.14447864563293575, + "flos": 497855612928.0, + "grad_norm": 0.023714468612350204, + "language_loss": 0.97989428, + "learning_rate": 0.0009660305900333632, + "loss": 0.99155927, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 1.13623047, + "step": 751, + "time_per_iteration": 2.7064144611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_mlp": 1.05845106, + "epoch": 0.1446710273181993, + "flos": 590794274304.0, + "grad_norm": 0.03190287595859636, + "language_loss": 0.91963172, + "learning_rate": 0.0009659176275105992, + "loss": 0.93135297, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 1.13671875, + "step": 752, + "time_per_iteration": 2.7171401977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171619, + "balance_loss_mlp": 1.05803668, + "epoch": 0.14486340900346287, + "flos": 587012851200.0, + "grad_norm": 0.023715921645424867, + "language_loss": 0.93508279, + "learning_rate": 0.0009658044841025701, + "loss": 0.94679892, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 1.13574219, + "step": 753, + "time_per_iteration": 2.77504563331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172686, + "balance_loss_mlp": 1.05900788, + "epoch": 0.14505579068872643, + "flos": 505740096000.0, + "grad_norm": 0.025730958483317315, + "language_loss": 0.9055903, + "learning_rate": 0.0009656911598532021, + "loss": 0.91731715, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 1.13671875, + "step": 754, + "time_per_iteration": 2.642886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172881, + "balance_loss_mlp": 1.05925071, + "epoch": 0.14524817237399, + "flos": 487815177216.0, + "grad_norm": 0.025261406861214447, + "language_loss": 0.98625988, + "learning_rate": 0.0009655776548064917, + "loss": 0.9979887, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 1.13623047, + "step": 755, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169342, + "balance_loss_mlp": 1.05571139, + "epoch": 0.14544055405925355, + "flos": 729449292288.0, + "grad_norm": 0.025093779151575485, + "language_loss": 0.97407329, + "learning_rate": 0.0009654639690065054, + "loss": 0.98576677, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 1.13623047, + "step": 756, + "time_per_iteration": 2.867976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173831, + "balance_loss_mlp": 1.06024873, + "epoch": 0.14563293574451713, + "flos": 594786544128.0, + "grad_norm": 0.02769433731610086, + "language_loss": 0.96328217, + "learning_rate": 0.00096535010249738, + "loss": 0.97502041, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 1.13574219, + "step": 757, + "time_per_iteration": 2.718595266342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171947, + "balance_loss_mlp": 1.05879402, + "epoch": 0.1458253174297807, + "flos": 561622414848.0, + "grad_norm": 0.027253539371253223, + "language_loss": 0.93671888, + "learning_rate": 0.0009652360553233224, + "loss": 0.94843829, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 1.13134766, + "step": 758, + "time_per_iteration": 2.732665538787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_mlp": 1.06835938, + "epoch": 0.14601769911504425, + "flos": 1561186922496.0, + "grad_norm": 0.016548141494889222, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74954832, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 1.12695312, + "step": 759, + "time_per_iteration": 4.9278404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_mlp": 1.04840457, + "epoch": 0.1462100808003078, + "flos": 867822331392.0, + "grad_norm": 0.024551380524627048, + "language_loss": 0.89752859, + "learning_rate": 0.0009650074191575883, + "loss": 0.90914273, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 1.12988281, + "step": 760, + "time_per_iteration": 3.18084716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011658, + "balance_loss_mlp": 1.05302811, + "epoch": 0.14640246248557137, + "flos": 524029585920.0, + "grad_norm": 0.025729752682943422, + "language_loss": 0.95023656, + "learning_rate": 0.0009648928302546766, + "loss": 0.96189463, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 1.12744141, + "step": 761, + "time_per_iteration": 2.707385301589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161728, + "balance_loss_mlp": 1.04895639, + "epoch": 0.14659484417083493, + "flos": 1032241089024.0, + "grad_norm": 0.022974522077421757, + "language_loss": 0.94352418, + "learning_rate": 0.0009647780608643613, + "loss": 0.95514143, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 1.12744141, + "step": 762, + "time_per_iteration": 3.357776165008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116078, + "balance_loss_mlp": 1.04848516, + "epoch": 0.1467872258560985, + "flos": 501656501760.0, + "grad_norm": 0.027279773355913427, + "language_loss": 0.99627388, + "learning_rate": 0.0009646631110312001, + "loss": 1.00788176, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 1.12255859, + "step": 763, + "time_per_iteration": 2.629650115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159049, + "balance_loss_mlp": 1.04665887, + "epoch": 0.14697960754136205, + "flos": 548935928832.0, + "grad_norm": 0.020644179018096606, + "language_loss": 0.95446718, + "learning_rate": 0.0009645479807998203, + "loss": 0.96605766, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 1.12353516, + "step": 764, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_mlp": 1.04510117, + "epoch": 0.14717198922662564, + "flos": 518901943296.0, + "grad_norm": 0.021535065255329562, + "language_loss": 0.99812603, + "learning_rate": 0.0009644326702149196, + "loss": 1.00970435, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 1.12695312, + "step": 765, + "time_per_iteration": 2.711500406265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158907, + "balance_loss_mlp": 1.04618227, + "epoch": 0.1473643709118892, + "flos": 733483221504.0, + "grad_norm": 0.02504361772442387, + "language_loss": 0.95452881, + "learning_rate": 0.0009643171793212653, + "loss": 0.96611786, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 1.12695312, + "step": 766, + "time_per_iteration": 3.130798578262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163931, + "balance_loss_mlp": 1.05115891, + "epoch": 0.14755675259715276, + "flos": 621668192256.0, + "grad_norm": 0.027740201354691706, + "language_loss": 0.99870968, + "learning_rate": 0.0009642015081636952, + "loss": 1.01034904, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 1.12744141, + "step": 767, + "time_per_iteration": 2.701939344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160055, + "balance_loss_mlp": 1.04761696, + "epoch": 0.14774913428241632, + "flos": 453172571136.0, + "grad_norm": 0.025159341457135456, + "language_loss": 0.98449206, + "learning_rate": 0.0009640856567871166, + "loss": 0.99609256, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 1.12402344, + "step": 768, + "time_per_iteration": 2.516721725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_mlp": 1.05262613, + "epoch": 0.14794151596767988, + "flos": 838654474752.0, + "grad_norm": 0.02612823197324643, + "language_loss": 0.99416363, + "learning_rate": 0.0009639696252365072, + "loss": 1.00581241, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 1.12207031, + "step": 769, + "time_per_iteration": 3.06074857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167068, + "balance_loss_mlp": 1.05472481, + "epoch": 0.14813389765294344, + "flos": 687404295168.0, + "grad_norm": 0.02602975967937929, + "language_loss": 0.89651555, + "learning_rate": 0.0009638534135569144, + "loss": 0.90818626, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 1.12304688, + "step": 770, + "time_per_iteration": 2.9440436363220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169876, + "balance_loss_mlp": 1.05753326, + "epoch": 0.148326279338207, + "flos": 510943600128.0, + "grad_norm": 0.028093178265757666, + "language_loss": 1.01150489, + "learning_rate": 0.0009637370217934554, + "loss": 1.02320373, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 1.12304688, + "step": 771, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166681, + "balance_loss_mlp": 1.05443311, + "epoch": 0.14851866102347056, + "flos": 589331260416.0, + "grad_norm": 0.028336871459981, + "language_loss": 0.90924722, + "learning_rate": 0.0009636204499913175, + "loss": 0.92091405, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 1.12207031, + "step": 772, + "time_per_iteration": 2.8592941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157961, + "balance_loss_mlp": 1.04609525, + "epoch": 0.14871104270873411, + "flos": 692247230976.0, + "grad_norm": 0.030313888046816524, + "language_loss": 0.95830965, + "learning_rate": 0.0009635036981957581, + "loss": 0.96988928, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 1.11816406, + "step": 773, + "time_per_iteration": 2.8690600395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160765, + "balance_loss_mlp": 1.04904246, + "epoch": 0.1489034243939977, + "flos": 656282600448.0, + "grad_norm": 0.02808100337337059, + "language_loss": 0.98035401, + "learning_rate": 0.0009633867664521043, + "loss": 0.99196172, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 1.11669922, + "step": 774, + "time_per_iteration": 2.812833070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159463, + "balance_loss_mlp": 1.04788363, + "epoch": 0.14909580607926126, + "flos": 476795821056.0, + "grad_norm": 0.030787585825694654, + "language_loss": 0.97385693, + "learning_rate": 0.0009632696548057527, + "loss": 0.98545158, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 1.11523438, + "step": 775, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_mlp": 1.04910243, + "epoch": 0.14928818776452482, + "flos": 612283765248.0, + "grad_norm": 0.030552265213122824, + "language_loss": 0.94746792, + "learning_rate": 0.0009631523633021704, + "loss": 0.95907569, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 1.11621094, + "step": 776, + "time_per_iteration": 2.789336919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.04408133, + "epoch": 0.14948056944978838, + "flos": 562916241408.0, + "grad_norm": 0.02653866309736765, + "language_loss": 0.98006344, + "learning_rate": 0.0009630348919868936, + "loss": 0.99161637, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 1.11132812, + "step": 777, + "time_per_iteration": 2.708918571472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115506, + "balance_loss_mlp": 1.04395676, + "epoch": 0.14967295113505194, + "flos": 450111558144.0, + "grad_norm": 0.02761804701826243, + "language_loss": 0.92444694, + "learning_rate": 0.0009629172409055293, + "loss": 0.93599755, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 1.11035156, + "step": 778, + "time_per_iteration": 2.522322177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_mlp": 1.0435462, + "epoch": 0.1498653328203155, + "flos": 572428922880.0, + "grad_norm": 0.02112796064723151, + "language_loss": 0.9446094, + "learning_rate": 0.0009627994101037531, + "loss": 0.9561559, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 1.11035156, + "step": 779, + "time_per_iteration": 2.7606184482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154399, + "balance_loss_mlp": 1.0433439, + "epoch": 0.15005771450557906, + "flos": 632407570944.0, + "grad_norm": 0.02232887996041627, + "language_loss": 0.98232067, + "learning_rate": 0.0009626813996273114, + "loss": 0.99386466, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 1.10986328, + "step": 780, + "time_per_iteration": 2.8442463874816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_mlp": 1.04553461, + "epoch": 0.15025009619084262, + "flos": 579165298176.0, + "grad_norm": 0.021576328362923832, + "language_loss": 0.96611506, + "learning_rate": 0.0009625632095220198, + "loss": 0.97768044, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 1.109375, + "step": 781, + "time_per_iteration": 2.823941469192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156174, + "balance_loss_mlp": 1.04492784, + "epoch": 0.1504424778761062, + "flos": 484856222208.0, + "grad_norm": 0.023769174200548453, + "language_loss": 0.96595448, + "learning_rate": 0.0009624448398337637, + "loss": 0.97751617, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 1.11181641, + "step": 782, + "time_per_iteration": 2.517115354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153917, + "balance_loss_mlp": 1.04286146, + "epoch": 0.15063485956136977, + "flos": 763894513152.0, + "grad_norm": 0.022118467112767815, + "language_loss": 0.97773027, + "learning_rate": 0.0009623262906084984, + "loss": 0.98926944, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 1.10986328, + "step": 783, + "time_per_iteration": 2.9971072673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156171, + "balance_loss_mlp": 1.04554462, + "epoch": 0.15082724124663333, + "flos": 498676079616.0, + "grad_norm": 0.021733375764601555, + "language_loss": 0.99047554, + "learning_rate": 0.0009622075618922486, + "loss": 1.00203729, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 1.10546875, + "step": 784, + "time_per_iteration": 2.7209272384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161923, + "balance_loss_mlp": 1.05110586, + "epoch": 0.15101962293189689, + "flos": 510722019840.0, + "grad_norm": 0.02414763506099098, + "language_loss": 0.95223093, + "learning_rate": 0.0009620886537311091, + "loss": 0.96385014, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 1.10742188, + "step": 785, + "time_per_iteration": 2.668501138687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154688, + "balance_loss_mlp": 1.04406226, + "epoch": 0.15121200461716044, + "flos": 458701714944.0, + "grad_norm": 0.026890312379790088, + "language_loss": 0.97208995, + "learning_rate": 0.000961969566171244, + "loss": 0.98363686, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 1.10546875, + "step": 786, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153217, + "balance_loss_mlp": 1.04278123, + "epoch": 0.151404386302424, + "flos": 539017016832.0, + "grad_norm": 0.02528800532756524, + "language_loss": 1.00058115, + "learning_rate": 0.0009618502992588873, + "loss": 1.01211333, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 1.10351562, + "step": 787, + "time_per_iteration": 2.6463584899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154208, + "balance_loss_mlp": 1.04358232, + "epoch": 0.15159676798768756, + "flos": 689616643584.0, + "grad_norm": 0.023869082053813537, + "language_loss": 0.98612797, + "learning_rate": 0.0009617308530403424, + "loss": 0.99766994, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 1.10546875, + "step": 788, + "time_per_iteration": 3.065110921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158206, + "balance_loss_mlp": 1.04758012, + "epoch": 0.15178914967295112, + "flos": 546432869376.0, + "grad_norm": 0.025092696297707027, + "language_loss": 0.95288265, + "learning_rate": 0.0009616112275619825, + "loss": 0.96446472, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 1.10546875, + "step": 789, + "time_per_iteration": 2.7197253704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_mlp": 1.0478847, + "epoch": 0.1519815313582147, + "flos": 512814845952.0, + "grad_norm": 0.020890571468345706, + "language_loss": 0.90545368, + "learning_rate": 0.0009614914228702503, + "loss": 0.91703737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 1.10400391, + "step": 790, + "time_per_iteration": 2.6894142627716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158071, + "balance_loss_mlp": 1.04782641, + "epoch": 0.15217391304347827, + "flos": 685457187840.0, + "grad_norm": 0.02448742031060442, + "language_loss": 0.96480352, + "learning_rate": 0.0009613714390116581, + "loss": 0.97638422, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 1.1015625, + "step": 791, + "time_per_iteration": 2.9898860454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155788, + "balance_loss_mlp": 1.04568636, + "epoch": 0.15236629472874183, + "flos": 645445893120.0, + "grad_norm": 0.023088199171654812, + "language_loss": 0.93995309, + "learning_rate": 0.0009612512760327879, + "loss": 0.95151103, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 1.10009766, + "step": 792, + "time_per_iteration": 2.855648994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154532, + "balance_loss_mlp": 1.0444783, + "epoch": 0.1525586764140054, + "flos": 413764892160.0, + "grad_norm": 0.024948238648346503, + "language_loss": 0.97790802, + "learning_rate": 0.0009611309339802909, + "loss": 0.98945332, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 1.09960938, + "step": 793, + "time_per_iteration": 2.4684345722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153777, + "balance_loss_mlp": 1.04372334, + "epoch": 0.15275105809926895, + "flos": 804233448960.0, + "grad_norm": 0.02131820977076166, + "language_loss": 0.93039513, + "learning_rate": 0.0009610104129008881, + "loss": 0.94193292, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 1.09960938, + "step": 794, + "time_per_iteration": 3.1013269424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155691, + "balance_loss_mlp": 1.04554129, + "epoch": 0.1529434397845325, + "flos": 613542663168.0, + "grad_norm": 0.024012716250022468, + "language_loss": 0.97966266, + "learning_rate": 0.0009608897128413701, + "loss": 0.99121952, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 1.10058594, + "step": 795, + "time_per_iteration": 2.729837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154149, + "balance_loss_mlp": 1.04419053, + "epoch": 0.15313582146979607, + "flos": 616471418880.0, + "grad_norm": 0.02134077894827986, + "language_loss": 0.93399352, + "learning_rate": 0.0009607688338485965, + "loss": 0.945535, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 1.09863281, + "step": 796, + "time_per_iteration": 2.8517422676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04409015, + "epoch": 0.15332820315505963, + "flos": 794992012800.0, + "grad_norm": 0.02204541106277596, + "language_loss": 0.98951191, + "learning_rate": 0.0009606477759694969, + "loss": 1.00104761, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 1.09375, + "step": 797, + "time_per_iteration": 3.0313384532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153537, + "balance_loss_mlp": 1.0440551, + "epoch": 0.1535205848403232, + "flos": 551256339456.0, + "grad_norm": 0.028291975879130113, + "language_loss": 0.99155664, + "learning_rate": 0.0009605265392510703, + "loss": 1.00309205, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 1.09375, + "step": 798, + "time_per_iteration": 2.6558592319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150991, + "balance_loss_mlp": 1.04122281, + "epoch": 0.15371296652558677, + "flos": 536978585088.0, + "grad_norm": 0.02676367025649214, + "language_loss": 1.00762391, + "learning_rate": 0.0009604051237403846, + "loss": 1.01913381, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 1.09667969, + "step": 799, + "time_per_iteration": 2.6129424571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151653, + "balance_loss_mlp": 1.04198015, + "epoch": 0.15390534821085033, + "flos": 396089751552.0, + "grad_norm": 0.02759928767191203, + "language_loss": 0.9523741, + "learning_rate": 0.0009602835294845776, + "loss": 0.96389061, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 1.09570312, + "step": 800, + "time_per_iteration": 2.4865612983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152453, + "balance_loss_mlp": 1.04297161, + "epoch": 0.1540977298961139, + "flos": 536885259264.0, + "grad_norm": 0.0240348205061721, + "language_loss": 0.99338514, + "learning_rate": 0.0009601617565308565, + "loss": 1.00490952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 1.09375, + "step": 801, + "time_per_iteration": 2.646925449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155144, + "balance_loss_mlp": 1.04551864, + "epoch": 0.15429011158137745, + "flos": 725090449920.0, + "grad_norm": 0.022214532903779557, + "language_loss": 0.94821054, + "learning_rate": 0.0009600398049264977, + "loss": 0.95976186, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 1.09521484, + "step": 802, + "time_per_iteration": 3.0287652015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.04627085, + "epoch": 0.154482493266641, + "flos": 621748783104.0, + "grad_norm": 0.025430739734688717, + "language_loss": 1.02679133, + "learning_rate": 0.0009599176747188469, + "loss": 1.03834927, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 1.09423828, + "step": 803, + "time_per_iteration": 2.8240089416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156206, + "balance_loss_mlp": 1.0467242, + "epoch": 0.15467487495190457, + "flos": 526719297024.0, + "grad_norm": 0.024483654101252486, + "language_loss": 0.90705526, + "learning_rate": 0.0009597953659553196, + "loss": 0.91861731, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 1.09375, + "step": 804, + "time_per_iteration": 2.745878219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153494, + "balance_loss_mlp": 1.04386926, + "epoch": 0.15486725663716813, + "flos": 528759730176.0, + "grad_norm": 0.02516296775651391, + "language_loss": 0.97286022, + "learning_rate": 0.0009596728786833997, + "loss": 0.98439509, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 1.09521484, + "step": 805, + "time_per_iteration": 2.6471030712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_mlp": 1.04244983, + "epoch": 0.1550596383224317, + "flos": 1050278799360.0, + "grad_norm": 0.026563720364072098, + "language_loss": 0.9858942, + "learning_rate": 0.0009595502129506415, + "loss": 0.99741489, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 1.09521484, + "step": 806, + "time_per_iteration": 3.3734352588653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115037, + "balance_loss_mlp": 1.04088783, + "epoch": 0.15525202000769528, + "flos": 614836489728.0, + "grad_norm": 0.02624405223250092, + "language_loss": 0.91745955, + "learning_rate": 0.0009594273688046678, + "loss": 0.92896324, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 1.09375, + "step": 807, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153708, + "balance_loss_mlp": 1.04441667, + "epoch": 0.15544440169295884, + "flos": 534102222336.0, + "grad_norm": 0.028049278390969077, + "language_loss": 0.97350299, + "learning_rate": 0.000959304346293171, + "loss": 0.98504007, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 1.09179688, + "step": 808, + "time_per_iteration": 2.7285830974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164275, + "balance_loss_mlp": 1.05464995, + "epoch": 0.1556367833782224, + "flos": 645886325760.0, + "grad_norm": 0.033021349518653896, + "language_loss": 0.99046445, + "learning_rate": 0.0009591811454639125, + "loss": 1.00210714, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 1.09521484, + "step": 809, + "time_per_iteration": 2.842867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155411, + "balance_loss_mlp": 1.04612005, + "epoch": 0.15582916506348596, + "flos": 544952391168.0, + "grad_norm": 0.02421082053858415, + "language_loss": 0.95793635, + "learning_rate": 0.0009590577663647234, + "loss": 0.96949041, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 1.09179688, + "step": 810, + "time_per_iteration": 2.8207406997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158015, + "balance_loss_mlp": 1.04877126, + "epoch": 0.15602154674874952, + "flos": 581214463488.0, + "grad_norm": 0.022734781081273227, + "language_loss": 0.95110512, + "learning_rate": 0.0009589342090435036, + "loss": 0.96268523, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 1.09130859, + "step": 811, + "time_per_iteration": 2.8413872718811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170356, + "balance_loss_mlp": 1.06068361, + "epoch": 0.15621392843401308, + "flos": 536316572160.0, + "grad_norm": 0.026628933906638022, + "language_loss": 0.97807872, + "learning_rate": 0.0009588104735482223, + "loss": 0.98978221, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 1.09570312, + "step": 812, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164587, + "balance_loss_mlp": 1.05524826, + "epoch": 0.15640631011927664, + "flos": 551981478912.0, + "grad_norm": 0.027865461759282353, + "language_loss": 0.94247007, + "learning_rate": 0.0009586865599269177, + "loss": 0.95411587, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 1.09228516, + "step": 813, + "time_per_iteration": 2.655217409133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159004, + "balance_loss_mlp": 1.04985571, + "epoch": 0.1565986918045402, + "flos": 638635657728.0, + "grad_norm": 0.024501009698068087, + "language_loss": 0.98888743, + "learning_rate": 0.0009585624682276977, + "loss": 1.00047755, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 1.09033203, + "step": 814, + "time_per_iteration": 2.7572293281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160029, + "balance_loss_mlp": 1.05073786, + "epoch": 0.15679107348980378, + "flos": 491781250560.0, + "grad_norm": 0.02545428800843787, + "language_loss": 0.97158241, + "learning_rate": 0.0009584381984987386, + "loss": 0.98318267, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 1.09179688, + "step": 815, + "time_per_iteration": 2.554208517074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160766, + "balance_loss_mlp": 1.05185616, + "epoch": 0.15698345517506734, + "flos": 531002277888.0, + "grad_norm": 0.022736041606184667, + "language_loss": 0.98151159, + "learning_rate": 0.0009583137507882864, + "loss": 0.99311924, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 1.08789062, + "step": 816, + "time_per_iteration": 2.6635444164276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158696, + "balance_loss_mlp": 1.04978669, + "epoch": 0.1571758368603309, + "flos": 547077417984.0, + "grad_norm": 0.024009976747476527, + "language_loss": 0.90921289, + "learning_rate": 0.000958189125144656, + "loss": 0.92079985, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 1.08789062, + "step": 817, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156061, + "balance_loss_mlp": 1.04719925, + "epoch": 0.15736821854559446, + "flos": 566743326720.0, + "grad_norm": 0.021547949482456395, + "language_loss": 0.97883654, + "learning_rate": 0.0009580643216162313, + "loss": 0.99039721, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 1.08740234, + "step": 818, + "time_per_iteration": 2.673997640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157698, + "balance_loss_mlp": 1.04888415, + "epoch": 0.15756060023085802, + "flos": 501953943552.0, + "grad_norm": 0.023826624353146583, + "language_loss": 0.90112716, + "learning_rate": 0.0009579393402514652, + "loss": 0.91270417, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 1.08691406, + "step": 819, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156999, + "balance_loss_mlp": 1.04823244, + "epoch": 0.15775298191612158, + "flos": 520271631360.0, + "grad_norm": 0.023927295219635936, + "language_loss": 0.99075627, + "learning_rate": 0.0009578141810988801, + "loss": 1.00232625, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 1.08642578, + "step": 820, + "time_per_iteration": 2.591036558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.04111433, + "epoch": 0.15794536360138514, + "flos": 467087755776.0, + "grad_norm": 0.026283029611425073, + "language_loss": 1.00067806, + "learning_rate": 0.0009576888442070668, + "loss": 1.01217794, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 1.08740234, + "step": 821, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151894, + "balance_loss_mlp": 1.04279363, + "epoch": 0.1581377452866487, + "flos": 518168071680.0, + "grad_norm": 0.02399653039287492, + "language_loss": 1.01290274, + "learning_rate": 0.0009575633296246854, + "loss": 1.02442169, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 1.08984375, + "step": 822, + "time_per_iteration": 2.579575300216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.04312956, + "epoch": 0.15833012697191226, + "flos": 550837373952.0, + "grad_norm": 0.02407632334340799, + "language_loss": 0.91124117, + "learning_rate": 0.0009574376374004652, + "loss": 0.92275965, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 1.0859375, + "step": 823, + "time_per_iteration": 2.661754608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162901, + "balance_loss_mlp": 1.05446815, + "epoch": 0.15852250865717585, + "flos": 488466456576.0, + "grad_norm": 0.026327967105985502, + "language_loss": 0.90841949, + "learning_rate": 0.000957311767583204, + "loss": 0.92004848, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 1.08300781, + "step": 824, + "time_per_iteration": 2.7887372970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156753, + "balance_loss_mlp": 1.04956055, + "epoch": 0.1587148903424394, + "flos": 1312696909824.0, + "grad_norm": 0.010620587901871582, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.8322835, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 1.0703125, + "step": 825, + "time_per_iteration": 4.766167640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151145, + "balance_loss_mlp": 1.04304576, + "epoch": 0.15890727202770297, + "flos": 467832360960.0, + "grad_norm": 0.02959471781097451, + "language_loss": 1.0376749, + "learning_rate": 0.0009570594953650961, + "loss": 1.04918623, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 1.07958984, + "step": 826, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_mlp": 1.04354417, + "epoch": 0.15909965371296653, + "flos": 778606695936.0, + "grad_norm": 0.024366848241159877, + "language_loss": 0.8923949, + "learning_rate": 0.00095693309306219, + "loss": 0.90391278, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 1.08105469, + "step": 827, + "time_per_iteration": 3.1078274250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_mlp": 1.04449332, + "epoch": 0.1592920353982301, + "flos": 1079962950144.0, + "grad_norm": 0.02547465125103231, + "language_loss": 0.98567259, + "learning_rate": 0.0009568065133621244, + "loss": 0.99719906, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 1.08007812, + "step": 828, + "time_per_iteration": 3.3287436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147109, + "balance_loss_mlp": 1.03872418, + "epoch": 0.15948441708349365, + "flos": 726889837056.0, + "grad_norm": 0.026992334830630314, + "language_loss": 0.93815649, + "learning_rate": 0.0009566797563140422, + "loss": 0.94962764, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 1.08251953, + "step": 829, + "time_per_iteration": 2.8641507625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.03788006, + "epoch": 0.1596767987687572, + "flos": 580075087872.0, + "grad_norm": 0.026140449767567974, + "language_loss": 0.96191794, + "learning_rate": 0.0009565528219671547, + "loss": 0.97337818, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 1.08007812, + "step": 830, + "time_per_iteration": 2.9082329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147169, + "balance_loss_mlp": 1.03902268, + "epoch": 0.15986918045402077, + "flos": 530025358848.0, + "grad_norm": 0.02186736495212519, + "language_loss": 0.93771887, + "learning_rate": 0.0009564257103707418, + "loss": 0.94919056, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 1.08007812, + "step": 831, + "time_per_iteration": 4.109540700912476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.04519463, + "epoch": 0.16006156213928435, + "flos": 575669856768.0, + "grad_norm": 0.025156765484562034, + "language_loss": 1.01463771, + "learning_rate": 0.0009562984215741533, + "loss": 1.02617025, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 1.07910156, + "step": 832, + "time_per_iteration": 2.634381055831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148637, + "balance_loss_mlp": 1.0408721, + "epoch": 0.1602539438245479, + "flos": 516674858496.0, + "grad_norm": 0.023022886756030446, + "language_loss": 0.90665066, + "learning_rate": 0.0009561709556268065, + "loss": 0.91813707, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 1.07617188, + "step": 833, + "time_per_iteration": 2.7094552516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115539, + "balance_loss_mlp": 1.04752922, + "epoch": 0.16044632550981147, + "flos": 622161017856.0, + "grad_norm": 0.02456985500743924, + "language_loss": 1.0306673, + "learning_rate": 0.0009560433125781884, + "loss": 1.04222107, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 1.07714844, + "step": 834, + "time_per_iteration": 2.7217955589294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_mlp": 1.04794765, + "epoch": 0.16063870719507503, + "flos": 562127975424.0, + "grad_norm": 0.02550250825542428, + "language_loss": 1.02622008, + "learning_rate": 0.0009559154924778544, + "loss": 1.03778291, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 1.08203125, + "step": 835, + "time_per_iteration": 4.0438151359558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153381, + "balance_loss_mlp": 1.04509139, + "epoch": 0.1608310888803386, + "flos": 806560590336.0, + "grad_norm": 0.023331498233936678, + "language_loss": 0.93980491, + "learning_rate": 0.0009557874953754284, + "loss": 0.95133871, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 1.08154297, + "step": 836, + "time_per_iteration": 3.0253541469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155161, + "balance_loss_mlp": 1.04739583, + "epoch": 0.16102347056560215, + "flos": 601694108160.0, + "grad_norm": 0.024039154316001603, + "language_loss": 0.9449209, + "learning_rate": 0.0009556593213206038, + "loss": 0.95647246, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 1.07617188, + "step": 837, + "time_per_iteration": 2.815293788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148071, + "balance_loss_mlp": 1.04049647, + "epoch": 0.1612158522508657, + "flos": 554614794240.0, + "grad_norm": 0.024490980939479982, + "language_loss": 0.96443379, + "learning_rate": 0.0009555309703631414, + "loss": 0.9759146, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 1.07421875, + "step": 838, + "time_per_iteration": 2.7353601455688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148397, + "balance_loss_mlp": 1.0406791, + "epoch": 0.16140823393612927, + "flos": 557017797120.0, + "grad_norm": 0.026558461299776022, + "language_loss": 0.98485982, + "learning_rate": 0.0009554024425528722, + "loss": 0.99634379, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 1.07568359, + "step": 839, + "time_per_iteration": 2.801539182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146915, + "balance_loss_mlp": 1.03924477, + "epoch": 0.16160061562139286, + "flos": 544908730368.0, + "grad_norm": 0.023933605454050468, + "language_loss": 0.96992832, + "learning_rate": 0.0009552737379396948, + "loss": 0.98139745, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 1.07519531, + "step": 840, + "time_per_iteration": 2.613037586212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148515, + "balance_loss_mlp": 1.04122651, + "epoch": 0.16179299730665642, + "flos": 605006900736.0, + "grad_norm": 0.020652206840645122, + "language_loss": 0.95695615, + "learning_rate": 0.0009551448565735767, + "loss": 0.96844131, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 1.07128906, + "step": 841, + "time_per_iteration": 2.779979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149052, + "balance_loss_mlp": 1.04128659, + "epoch": 0.16198537899191998, + "flos": 788551077888.0, + "grad_norm": 0.02358864683094414, + "language_loss": 0.96423578, + "learning_rate": 0.0009550157985045543, + "loss": 0.97572625, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 1.07617188, + "step": 842, + "time_per_iteration": 3.0352344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148245, + "balance_loss_mlp": 1.04086173, + "epoch": 0.16217776067718354, + "flos": 520829584896.0, + "grad_norm": 0.02127918945612936, + "language_loss": 0.95624614, + "learning_rate": 0.0009548865637827321, + "loss": 0.96772861, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 1.07226562, + "step": 843, + "time_per_iteration": 2.695211172103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.04027128, + "epoch": 0.1623701423624471, + "flos": 506254388736.0, + "grad_norm": 0.02427958482397641, + "language_loss": 0.99469078, + "learning_rate": 0.0009547571524582838, + "loss": 1.00617111, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 1.07617188, + "step": 844, + "time_per_iteration": 2.586859941482544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_mlp": 1.03842914, + "epoch": 0.16256252404771065, + "flos": 498157057536.0, + "grad_norm": 0.025657026114593633, + "language_loss": 1.02873135, + "learning_rate": 0.0009546275645814512, + "loss": 1.04018748, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 1.0703125, + "step": 845, + "time_per_iteration": 2.735323190689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147597, + "balance_loss_mlp": 1.04040384, + "epoch": 0.16275490573297421, + "flos": 503286701568.0, + "grad_norm": 0.024743383464961046, + "language_loss": 1.00377154, + "learning_rate": 0.0009544978002025446, + "loss": 1.01524746, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 1.0703125, + "step": 846, + "time_per_iteration": 2.5876121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_mlp": 1.04189885, + "epoch": 0.16294728741823777, + "flos": 508353945600.0, + "grad_norm": 0.020876938588178177, + "language_loss": 0.94877481, + "learning_rate": 0.0009543678593719434, + "loss": 0.9602648, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 1.06933594, + "step": 847, + "time_per_iteration": 2.69250750541687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159847, + "balance_loss_mlp": 1.05274892, + "epoch": 0.16313966910350133, + "flos": 510756948480.0, + "grad_norm": 0.020936629725758764, + "language_loss": 0.95534647, + "learning_rate": 0.0009542377421400945, + "loss": 0.96694493, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 1.06933594, + "step": 848, + "time_per_iteration": 2.7832183837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146796, + "balance_loss_mlp": 1.03965068, + "epoch": 0.16333205078876492, + "flos": 545056450560.0, + "grad_norm": 0.023544058946573278, + "language_loss": 0.94486761, + "learning_rate": 0.0009541074485575145, + "loss": 0.95633554, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 1.06982422, + "step": 849, + "time_per_iteration": 2.7163026332855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147161, + "balance_loss_mlp": 1.03996801, + "epoch": 0.16352443247402848, + "flos": 508711785984.0, + "grad_norm": 0.023080110816121054, + "language_loss": 1.00550437, + "learning_rate": 0.0009539769786747874, + "loss": 1.01697588, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 1.0703125, + "step": 850, + "time_per_iteration": 2.5918350219726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152854, + "balance_loss_mlp": 1.04547, + "epoch": 0.16371681415929204, + "flos": 543222134784.0, + "grad_norm": 0.022593715242085626, + "language_loss": 0.90895152, + "learning_rate": 0.0009538463325425665, + "loss": 0.92048007, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 1.07226562, + "step": 851, + "time_per_iteration": 2.701662063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146583, + "balance_loss_mlp": 1.03939056, + "epoch": 0.1639091958445556, + "flos": 521760841728.0, + "grad_norm": 0.025319624949764974, + "language_loss": 0.95562863, + "learning_rate": 0.0009537155102115728, + "loss": 0.96709442, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 1.0703125, + "step": 852, + "time_per_iteration": 2.577416181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.03871727, + "epoch": 0.16410157752981916, + "flos": 548482034688.0, + "grad_norm": 0.022217218078565786, + "language_loss": 0.92332971, + "learning_rate": 0.0009535845117325961, + "loss": 0.93478549, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 1.06689453, + "step": 853, + "time_per_iteration": 2.643528699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148166, + "balance_loss_mlp": 1.04135406, + "epoch": 0.16429395921508272, + "flos": 584025698304.0, + "grad_norm": 0.02024018106959617, + "language_loss": 1.00128078, + "learning_rate": 0.0009534533371564946, + "loss": 1.01276231, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 1.06640625, + "step": 854, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150543, + "balance_loss_mlp": 1.04377949, + "epoch": 0.16448634090034628, + "flos": 531961732608.0, + "grad_norm": 0.02843561601072028, + "language_loss": 1.00094676, + "learning_rate": 0.0009533219865341949, + "loss": 1.01245213, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 1.06591797, + "step": 855, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.05014503, + "epoch": 0.16467872258560984, + "flos": 492960284160.0, + "grad_norm": 0.026495144396752456, + "language_loss": 0.95923662, + "learning_rate": 0.0009531904599166916, + "loss": 0.97080612, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 1.06640625, + "step": 856, + "time_per_iteration": 2.638528823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.04101861, + "epoch": 0.16487110427087343, + "flos": 507259505664.0, + "grad_norm": 0.02303677132947941, + "language_loss": 0.95950538, + "learning_rate": 0.0009530587573550478, + "loss": 0.97098505, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 1.06787109, + "step": 857, + "time_per_iteration": 2.5788354873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.04592896, + "epoch": 0.16506348595613698, + "flos": 1436108714496.0, + "grad_norm": 0.011861304780107247, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75470984, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 1.0546875, + "step": 858, + "time_per_iteration": 5.003005027770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153597, + "balance_loss_mlp": 1.04673755, + "epoch": 0.16525586764140054, + "flos": 478089647616.0, + "grad_norm": 0.02595402254221991, + "language_loss": 0.98057735, + "learning_rate": 0.0009527948246039337, + "loss": 0.99211335, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 1.06689453, + "step": 859, + "time_per_iteration": 2.541255474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152748, + "balance_loss_mlp": 1.04622293, + "epoch": 0.1654482493266641, + "flos": 882540518400.0, + "grad_norm": 0.024187417777422206, + "language_loss": 0.96476752, + "learning_rate": 0.000952662594516931, + "loss": 0.97629499, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 1.06347656, + "step": 860, + "time_per_iteration": 3.102233409881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154678, + "balance_loss_mlp": 1.04791439, + "epoch": 0.16564063101192766, + "flos": 628105124352.0, + "grad_norm": 0.02242324391324738, + "language_loss": 0.93166292, + "learning_rate": 0.0009525301886907234, + "loss": 0.94320977, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 1.06591797, + "step": 861, + "time_per_iteration": 2.871971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151309, + "balance_loss_mlp": 1.04487896, + "epoch": 0.16583301269719122, + "flos": 562592603136.0, + "grad_norm": 0.02248996903194516, + "language_loss": 0.97140592, + "learning_rate": 0.0009523976071767155, + "loss": 0.98291898, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 1.0625, + "step": 862, + "time_per_iteration": 2.653031349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146763, + "balance_loss_mlp": 1.04038036, + "epoch": 0.16602539438245478, + "flos": 568983873024.0, + "grad_norm": 0.020794335354585358, + "language_loss": 0.9646408, + "learning_rate": 0.00095226485002638, + "loss": 0.97610843, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 1.06201172, + "step": 863, + "time_per_iteration": 2.7685163021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147042, + "balance_loss_mlp": 1.04075551, + "epoch": 0.16621777606771834, + "flos": 576021692928.0, + "grad_norm": 0.021581021962121343, + "language_loss": 0.96560466, + "learning_rate": 0.0009521319172912576, + "loss": 0.9770751, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 1.06103516, + "step": 864, + "time_per_iteration": 2.762233257293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149511, + "balance_loss_mlp": 1.0432713, + "epoch": 0.16641015775298193, + "flos": 515597882880.0, + "grad_norm": 0.029880870913045234, + "language_loss": 1.0375855, + "learning_rate": 0.0009519988090229579, + "loss": 1.04908061, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 1.06054688, + "step": 865, + "time_per_iteration": 2.7156929969787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148426, + "balance_loss_mlp": 1.04199588, + "epoch": 0.1666025394382455, + "flos": 622849227264.0, + "grad_norm": 0.023088954173990716, + "language_loss": 0.96669209, + "learning_rate": 0.0009518655252731576, + "loss": 0.9781763, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 1.0625, + "step": 866, + "time_per_iteration": 2.76474928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147261, + "balance_loss_mlp": 1.04102135, + "epoch": 0.16679492112350905, + "flos": 549932313600.0, + "grad_norm": 0.021458749489738967, + "language_loss": 0.98467255, + "learning_rate": 0.0009517320660936022, + "loss": 0.99614513, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 1.06054688, + "step": 867, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151692, + "balance_loss_mlp": 1.04545259, + "epoch": 0.1669873028087726, + "flos": 666865526784.0, + "grad_norm": 0.02209258354681387, + "language_loss": 0.92114806, + "learning_rate": 0.0009515984315361051, + "loss": 0.93266487, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 1.06054688, + "step": 868, + "time_per_iteration": 2.845388412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.04563451, + "epoch": 0.16717968449403617, + "flos": 539603168256.0, + "grad_norm": 0.02501334283432316, + "language_loss": 0.95751995, + "learning_rate": 0.000951464621652548, + "loss": 0.96903574, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 1.05761719, + "step": 869, + "time_per_iteration": 2.623375415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148167, + "balance_loss_mlp": 1.04216599, + "epoch": 0.16737206617929973, + "flos": 531278252544.0, + "grad_norm": 0.02062860382438808, + "language_loss": 0.87610328, + "learning_rate": 0.0009513306364948804, + "loss": 0.88758498, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 1.05810547, + "step": 870, + "time_per_iteration": 2.792346239089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148065, + "balance_loss_mlp": 1.04206407, + "epoch": 0.1675644478645633, + "flos": 481756277760.0, + "grad_norm": 0.023236257285911367, + "language_loss": 0.98118269, + "learning_rate": 0.0009511964761151197, + "loss": 0.99266338, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 1.05810547, + "step": 871, + "time_per_iteration": 2.572923183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152601, + "balance_loss_mlp": 1.04669595, + "epoch": 0.16775682954982685, + "flos": 495541206528.0, + "grad_norm": 0.026661505796453877, + "language_loss": 0.99311042, + "learning_rate": 0.0009510621405653521, + "loss": 1.00463641, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 1.05712891, + "step": 872, + "time_per_iteration": 2.6296472549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_mlp": 1.04484987, + "epoch": 0.1679492112350904, + "flos": 753404912640.0, + "grad_norm": 0.029291148216183213, + "language_loss": 0.93300939, + "learning_rate": 0.0009509276298977309, + "loss": 0.94451261, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 1.05273438, + "step": 873, + "time_per_iteration": 3.0177366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150817, + "balance_loss_mlp": 1.04543638, + "epoch": 0.168141592920354, + "flos": 1137731977728.0, + "grad_norm": 0.021155110884158303, + "language_loss": 0.9134444, + "learning_rate": 0.0009507929441644778, + "loss": 0.92495263, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 1.05175781, + "step": 874, + "time_per_iteration": 3.53277325630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160399, + "balance_loss_mlp": 1.05501771, + "epoch": 0.16833397460561755, + "flos": 633553677312.0, + "grad_norm": 0.025508723945600786, + "language_loss": 0.94342184, + "learning_rate": 0.0009506580834178826, + "loss": 0.95502585, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 1.05175781, + "step": 875, + "time_per_iteration": 2.763296365737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151031, + "balance_loss_mlp": 1.04560196, + "epoch": 0.1685263562908811, + "flos": 542542657536.0, + "grad_norm": 0.0234395143242784, + "language_loss": 1.00066125, + "learning_rate": 0.0009505230477103028, + "loss": 1.01217151, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 1.05224609, + "step": 876, + "time_per_iteration": 2.7256453037261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143495, + "balance_loss_mlp": 1.03801847, + "epoch": 0.16871873797614467, + "flos": 620485155840.0, + "grad_norm": 0.02951425183806971, + "language_loss": 0.91949958, + "learning_rate": 0.0009503878370941641, + "loss": 0.93093449, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 1.05273438, + "step": 877, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143733, + "balance_loss_mlp": 1.038257, + "epoch": 0.16891111966140823, + "flos": 607455565824.0, + "grad_norm": 0.02526909046796152, + "language_loss": 0.99137431, + "learning_rate": 0.0009502524516219595, + "loss": 1.00281167, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 1.05273438, + "step": 878, + "time_per_iteration": 2.7107326984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145725, + "balance_loss_mlp": 1.04005778, + "epoch": 0.1691035013466718, + "flos": 553405561344.0, + "grad_norm": 0.023246247090994255, + "language_loss": 0.99022686, + "learning_rate": 0.0009501168913462506, + "loss": 1.00168419, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 1.0546875, + "step": 879, + "time_per_iteration": 2.654356002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04866791, + "epoch": 0.16929588303193535, + "flos": 1479305822208.0, + "grad_norm": 0.014844444469597292, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.802755, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 1.046875, + "step": 880, + "time_per_iteration": 4.877387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114571, + "balance_loss_mlp": 1.04042399, + "epoch": 0.1694882647171989, + "flos": 927846641664.0, + "grad_norm": 0.023879743421000837, + "language_loss": 0.93963408, + "learning_rate": 0.0009498452465949042, + "loss": 0.95109117, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 1.05078125, + "step": 881, + "time_per_iteration": 3.241151809692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0447762, + "epoch": 0.1696806464024625, + "flos": 547151278080.0, + "grad_norm": 0.02293023114251512, + "language_loss": 0.98854458, + "learning_rate": 0.0009497091622247285, + "loss": 1.0000447, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 1.05029297, + "step": 882, + "time_per_iteration": 2.720453977584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145786, + "balance_loss_mlp": 1.0406431, + "epoch": 0.16987302808772606, + "flos": 530294602752.0, + "grad_norm": 0.02459483675822623, + "language_loss": 1.0302248, + "learning_rate": 0.0009495729032619723, + "loss": 1.04168272, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 1.04931641, + "step": 883, + "time_per_iteration": 2.717176675796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151842, + "balance_loss_mlp": 1.04731977, + "epoch": 0.17006540977298962, + "flos": 756478660608.0, + "grad_norm": 0.02507713686866634, + "language_loss": 0.9295364, + "learning_rate": 0.0009494364697595354, + "loss": 0.94105482, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 1.04589844, + "step": 884, + "time_per_iteration": 2.924898147583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157567, + "balance_loss_mlp": 1.05271089, + "epoch": 0.17025779145825318, + "flos": 559874694144.0, + "grad_norm": 0.025110060032482954, + "language_loss": 0.98774076, + "learning_rate": 0.0009492998617703867, + "loss": 0.99931645, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 1.04833984, + "step": 885, + "time_per_iteration": 2.6759417057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.05104423, + "epoch": 0.17045017314351674, + "flos": 513216347136.0, + "grad_norm": 0.0280627140127875, + "language_loss": 0.96898842, + "learning_rate": 0.0009491630793475619, + "loss": 0.98054218, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 1.04492188, + "step": 886, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149096, + "balance_loss_mlp": 1.04452574, + "epoch": 0.1706425548287803, + "flos": 510012343296.0, + "grad_norm": 0.023090423796267925, + "language_loss": 0.94873035, + "learning_rate": 0.0009490261225441643, + "loss": 0.96022129, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 1.04638672, + "step": 887, + "time_per_iteration": 2.960139513015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_mlp": 1.04508829, + "epoch": 0.17083493651404386, + "flos": 718714642944.0, + "grad_norm": 0.024954435208077393, + "language_loss": 0.98478651, + "learning_rate": 0.0009488889914133656, + "loss": 0.99628592, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 1.04833984, + "step": 888, + "time_per_iteration": 3.0498712062835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_mlp": 1.04649353, + "epoch": 0.17102731819930742, + "flos": 560200333824.0, + "grad_norm": 0.020862133880352407, + "language_loss": 0.97394216, + "learning_rate": 0.0009487516860084047, + "loss": 0.98545229, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 1.046875, + "step": 889, + "time_per_iteration": 2.799579381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115955, + "balance_loss_mlp": 1.0542171, + "epoch": 0.17121969988457098, + "flos": 495764788224.0, + "grad_norm": 0.030159167385703775, + "language_loss": 0.99659365, + "learning_rate": 0.0009486142063825884, + "loss": 1.0081892, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 1.05126953, + "step": 890, + "time_per_iteration": 2.5897767543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05718231, + "epoch": 0.17141208156983456, + "flos": 1552105941504.0, + "grad_norm": 0.012289453069715352, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73586774, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 1.03515625, + "step": 891, + "time_per_iteration": 4.971697807312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05527556, + "epoch": 0.17160446325509812, + "flos": 620700005376.0, + "grad_norm": 0.02677753623279009, + "language_loss": 1.00227833, + "learning_rate": 0.0009483387246819542, + "loss": 1.01388383, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 1.05078125, + "step": 892, + "time_per_iteration": 2.7142419815063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153152, + "balance_loss_mlp": 1.04977417, + "epoch": 0.17179684494036168, + "flos": 1384693300224.0, + "grad_norm": 0.011012484205567044, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.8343873, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 1.03515625, + "step": 893, + "time_per_iteration": 4.678752183914185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159751, + "balance_loss_mlp": 1.05446541, + "epoch": 0.17198922662562524, + "flos": 493641762816.0, + "grad_norm": 0.02464509578240857, + "language_loss": 0.9638195, + "learning_rate": 0.0009480625467392688, + "loss": 0.97541702, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 1.05175781, + "step": 894, + "time_per_iteration": 2.6579103469848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158279, + "balance_loss_mlp": 1.05490112, + "epoch": 0.1721816083108888, + "flos": 1461485689344.0, + "grad_norm": 0.014844728137103481, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79152954, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 1.03515625, + "step": 895, + "time_per_iteration": 4.754615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157074, + "balance_loss_mlp": 1.0523603, + "epoch": 0.17237398999615236, + "flos": 529204892160.0, + "grad_norm": 0.024157534092911288, + "language_loss": 0.95005947, + "learning_rate": 0.0009477856729834196, + "loss": 0.96163023, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 1.046875, + "step": 896, + "time_per_iteration": 2.7640984058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.05742288, + "epoch": 0.17256637168141592, + "flos": 605026366464.0, + "grad_norm": 0.02447501108745492, + "language_loss": 0.9782356, + "learning_rate": 0.0009476469753098809, + "loss": 0.98985219, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 1.04394531, + "step": 897, + "time_per_iteration": 2.7016282081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153769, + "balance_loss_mlp": 1.04957986, + "epoch": 0.17275875336667948, + "flos": 510693821952.0, + "grad_norm": 0.025419887327313116, + "language_loss": 0.94868481, + "learning_rate": 0.0009475081038443738, + "loss": 0.96022242, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 1.04345703, + "step": 898, + "time_per_iteration": 2.5731348991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148609, + "balance_loss_mlp": 1.0446589, + "epoch": 0.17295113505194307, + "flos": 666500955648.0, + "grad_norm": 0.02623291269769982, + "language_loss": 0.95752573, + "learning_rate": 0.0009473690586408124, + "loss": 0.96901178, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 1.04101562, + "step": 899, + "time_per_iteration": 2.8549156188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146417, + "balance_loss_mlp": 1.04227531, + "epoch": 0.17314351673720663, + "flos": 556431645696.0, + "grad_norm": 0.022300666942289, + "language_loss": 0.94826102, + "learning_rate": 0.0009472298397531792, + "loss": 0.9597252, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 1.04296875, + "step": 900, + "time_per_iteration": 2.7165167331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145486, + "balance_loss_mlp": 1.04124928, + "epoch": 0.17333589842247019, + "flos": 504606724608.0, + "grad_norm": 0.023477361471443404, + "language_loss": 0.95443118, + "learning_rate": 0.0009470904472355235, + "loss": 0.96588612, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 1.04394531, + "step": 901, + "time_per_iteration": 2.668320655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_mlp": 1.03967023, + "epoch": 0.17352828010773375, + "flos": 557350167552.0, + "grad_norm": 0.02470997420275152, + "language_loss": 0.90534914, + "learning_rate": 0.0009469508811419626, + "loss": 0.91678727, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 1.04296875, + "step": 902, + "time_per_iteration": 2.714174747467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_mlp": 1.05331421, + "epoch": 0.1737206617929973, + "flos": 1557791537664.0, + "grad_norm": 0.011695515468407039, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7276957, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 1.02539062, + "step": 903, + "time_per_iteration": 4.783574104309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146888, + "balance_loss_mlp": 1.04308009, + "epoch": 0.17391304347826086, + "flos": 517755836928.0, + "grad_norm": 0.027522671456014093, + "language_loss": 0.94518518, + "learning_rate": 0.0009466712284439292, + "loss": 0.95665407, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 1.03955078, + "step": 904, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011486, + "balance_loss_mlp": 1.04503071, + "epoch": 0.17410542516352442, + "flos": 542160622080.0, + "grad_norm": 0.027186859166075866, + "language_loss": 0.99262786, + "learning_rate": 0.0009465311419480276, + "loss": 1.00411391, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 1.03710938, + "step": 905, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153491, + "balance_loss_mlp": 1.05011249, + "epoch": 0.17429780684878798, + "flos": 625081041408.0, + "grad_norm": 0.028950662808853365, + "language_loss": 0.96674442, + "learning_rate": 0.0009463908820933622, + "loss": 0.97827929, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 1.03515625, + "step": 906, + "time_per_iteration": 2.8291828632354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151914, + "balance_loss_mlp": 1.04844034, + "epoch": 0.17449018853405157, + "flos": 576848890368.0, + "grad_norm": 0.03002954803612974, + "language_loss": 0.90420532, + "learning_rate": 0.0009462504489343868, + "loss": 0.91572446, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 1.03613281, + "step": 907, + "time_per_iteration": 2.8554108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_mlp": 1.04341269, + "epoch": 0.17468257021931513, + "flos": 534772967424.0, + "grad_norm": 0.024073731406752365, + "language_loss": 1.01002121, + "learning_rate": 0.0009461098425256222, + "loss": 1.02149189, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 1.03808594, + "step": 908, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114306, + "balance_loss_mlp": 1.03930068, + "epoch": 0.1748749519045787, + "flos": 541808785920.0, + "grad_norm": 0.02493910110608304, + "language_loss": 0.93412566, + "learning_rate": 0.0009459690629216567, + "loss": 0.94555628, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 1.0390625, + "step": 909, + "time_per_iteration": 2.670389413833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150977, + "balance_loss_mlp": 1.04688334, + "epoch": 0.17506733358984225, + "flos": 499626802176.0, + "grad_norm": 0.02402970341263653, + "language_loss": 0.96272469, + "learning_rate": 0.0009458281101771457, + "loss": 0.97423446, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 1.04248047, + "step": 910, + "time_per_iteration": 2.6256320476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153015, + "balance_loss_mlp": 1.04906452, + "epoch": 0.1752597152751058, + "flos": 624132320256.0, + "grad_norm": 0.023679811966199643, + "language_loss": 0.91450173, + "learning_rate": 0.0009456869843468122, + "loss": 0.92603183, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 1.04101562, + "step": 911, + "time_per_iteration": 2.863004207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158204, + "balance_loss_mlp": 1.05434883, + "epoch": 0.17545209696036937, + "flos": 521993155584.0, + "grad_norm": 0.029813530713564303, + "language_loss": 0.92364156, + "learning_rate": 0.0009455456854854459, + "loss": 0.93522358, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 1.04003906, + "step": 912, + "time_per_iteration": 2.616231918334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_mlp": 1.04612815, + "epoch": 0.17564447864563293, + "flos": 462945764352.0, + "grad_norm": 0.02810445184103091, + "language_loss": 0.92624664, + "learning_rate": 0.0009454042136479039, + "loss": 0.93774742, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 1.04101562, + "step": 913, + "time_per_iteration": 2.5944247245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.05199766, + "epoch": 0.1758368603308965, + "flos": 481617289728.0, + "grad_norm": 0.02706355326928303, + "language_loss": 0.91841793, + "learning_rate": 0.0009452625688891103, + "loss": 0.92997456, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 1.03808594, + "step": 914, + "time_per_iteration": 2.580941915512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144051, + "balance_loss_mlp": 1.04200745, + "epoch": 0.17602924201616005, + "flos": 1482084856320.0, + "grad_norm": 0.009713749524187035, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79878789, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 1.02148438, + "step": 915, + "time_per_iteration": 4.592097997665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148996, + "balance_loss_mlp": 1.04523647, + "epoch": 0.17622162370142364, + "flos": 603470026752.0, + "grad_norm": 0.02797967110469985, + "language_loss": 1.03421283, + "learning_rate": 0.0009449787608278015, + "loss": 1.0457027, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 1.0390625, + "step": 916, + "time_per_iteration": 2.755580425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_mlp": 1.04677713, + "epoch": 0.1764140053866872, + "flos": 443605495296.0, + "grad_norm": 0.024189441248888145, + "language_loss": 1.00777316, + "learning_rate": 0.0009448365976354704, + "loss": 1.01927423, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 1.03466797, + "step": 917, + "time_per_iteration": 2.4922571182250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_mlp": 1.04567707, + "epoch": 0.17660638707195075, + "flos": 501591373824.0, + "grad_norm": 0.028333637349232343, + "language_loss": 1.01507974, + "learning_rate": 0.0009446942617422558, + "loss": 1.02657032, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 1.03515625, + "step": 918, + "time_per_iteration": 2.574998378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148191, + "balance_loss_mlp": 1.0448128, + "epoch": 0.17679876875721431, + "flos": 539983202304.0, + "grad_norm": 0.02432410226762854, + "language_loss": 0.94564992, + "learning_rate": 0.0009445517532034176, + "loss": 0.9571318, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 1.03515625, + "step": 919, + "time_per_iteration": 2.7170355319976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153425, + "balance_loss_mlp": 1.05009484, + "epoch": 0.17699115044247787, + "flos": 498715011072.0, + "grad_norm": 0.026165935935680888, + "language_loss": 0.99032271, + "learning_rate": 0.0009444090720742824, + "loss": 1.00185692, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 1.03466797, + "step": 920, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149157, + "balance_loss_mlp": 1.04587448, + "epoch": 0.17718353212774143, + "flos": 663915303936.0, + "grad_norm": 0.025722324934358026, + "language_loss": 0.98290348, + "learning_rate": 0.0009442662184102439, + "loss": 0.99439508, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 1.03417969, + "step": 921, + "time_per_iteration": 2.7612035274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145605, + "balance_loss_mlp": 1.04251313, + "epoch": 0.177375913813005, + "flos": 583847778816.0, + "grad_norm": 0.021564117555322487, + "language_loss": 0.93569565, + "learning_rate": 0.000944123192266763, + "loss": 0.94715166, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 1.03222656, + "step": 922, + "time_per_iteration": 2.8110268115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141792, + "balance_loss_mlp": 1.03865182, + "epoch": 0.17756829549826855, + "flos": 553683537408.0, + "grad_norm": 0.021487036209533367, + "language_loss": 0.92858881, + "learning_rate": 0.0009439799936993671, + "loss": 0.94000673, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 1.03271484, + "step": 923, + "time_per_iteration": 2.7440245151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142202, + "balance_loss_mlp": 1.03901482, + "epoch": 0.17776067718353214, + "flos": 557371634688.0, + "grad_norm": 0.02463154633112553, + "language_loss": 0.97990632, + "learning_rate": 0.0009438366227636511, + "loss": 0.99132836, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 1.03320312, + "step": 924, + "time_per_iteration": 2.7032759189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140208, + "balance_loss_mlp": 1.03721154, + "epoch": 0.1779530588687957, + "flos": 659651788800.0, + "grad_norm": 0.022941473179093813, + "language_loss": 0.94988692, + "learning_rate": 0.0009436930795152763, + "loss": 0.96128899, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 1.03125, + "step": 925, + "time_per_iteration": 2.854522943496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143555, + "balance_loss_mlp": 1.04084456, + "epoch": 0.17814544055405926, + "flos": 645671476224.0, + "grad_norm": 0.02421412975678805, + "language_loss": 0.95479, + "learning_rate": 0.0009435493640099713, + "loss": 0.9662255, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 1.02832031, + "step": 926, + "time_per_iteration": 2.8268251419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143389, + "balance_loss_mlp": 1.04077399, + "epoch": 0.17833782223932282, + "flos": 461884251648.0, + "grad_norm": 0.0252062590806445, + "language_loss": 0.94177145, + "learning_rate": 0.0009434054763035314, + "loss": 0.95320535, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 1.02734375, + "step": 927, + "time_per_iteration": 2.629499673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139685, + "balance_loss_mlp": 1.03706956, + "epoch": 0.17853020392458638, + "flos": 760852965888.0, + "grad_norm": 0.02122720378042075, + "language_loss": 0.93181551, + "learning_rate": 0.0009432614164518185, + "loss": 0.94321233, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 1.02734375, + "step": 928, + "time_per_iteration": 2.9364700317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140803, + "balance_loss_mlp": 1.03818727, + "epoch": 0.17872258560984994, + "flos": 784055248896.0, + "grad_norm": 0.023477252169520995, + "language_loss": 0.93520033, + "learning_rate": 0.000943117184510762, + "loss": 0.94660836, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 1.02734375, + "step": 929, + "time_per_iteration": 3.07600474357605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150169, + "balance_loss_mlp": 1.04831696, + "epoch": 0.1789149672951135, + "flos": 1463031295488.0, + "grad_norm": 0.013755703560815407, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7994014, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 1.01953125, + "step": 930, + "time_per_iteration": 5.029282808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153323, + "balance_loss_mlp": 1.05099344, + "epoch": 0.17910734898037706, + "flos": 504930362880.0, + "grad_norm": 0.023999213273897636, + "language_loss": 0.96652937, + "learning_rate": 0.0009428282045846674, + "loss": 0.97806263, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 1.02441406, + "step": 931, + "time_per_iteration": 2.7112410068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145421, + "balance_loss_mlp": 1.04275823, + "epoch": 0.17929973066564064, + "flos": 747669651456.0, + "grad_norm": 0.02006943819739268, + "language_loss": 0.96385491, + "learning_rate": 0.0009426834567118214, + "loss": 0.97530913, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 1.02783203, + "step": 932, + "time_per_iteration": 3.0711913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143693, + "balance_loss_mlp": 1.04098177, + "epoch": 0.1794921123509042, + "flos": 714572651520.0, + "grad_norm": 0.021210123960592832, + "language_loss": 0.89608383, + "learning_rate": 0.0009425385369740155, + "loss": 0.90752071, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 1.02832031, + "step": 933, + "time_per_iteration": 3.059857130050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114727, + "balance_loss_mlp": 1.0451318, + "epoch": 0.17968449403616776, + "flos": 634361409024.0, + "grad_norm": 0.02299955090486112, + "language_loss": 0.96636283, + "learning_rate": 0.0009423934454275125, + "loss": 0.97783554, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 1.02246094, + "step": 934, + "time_per_iteration": 2.85917592048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146917, + "balance_loss_mlp": 1.04477859, + "epoch": 0.17987687572143132, + "flos": 537378084864.0, + "grad_norm": 0.02461268142415081, + "language_loss": 1.01075852, + "learning_rate": 0.0009422481821286418, + "loss": 1.02222764, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 1.02246094, + "step": 935, + "time_per_iteration": 2.7314486503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150005, + "balance_loss_mlp": 1.04777098, + "epoch": 0.18006925740669488, + "flos": 539119074816.0, + "grad_norm": 0.026258801194945027, + "language_loss": 0.98970592, + "learning_rate": 0.0009421027471337998, + "loss": 1.00120604, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 1.0234375, + "step": 936, + "time_per_iteration": 2.6354496479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151337, + "balance_loss_mlp": 1.04891205, + "epoch": 0.18026163909195844, + "flos": 540534425088.0, + "grad_norm": 0.029056123283387615, + "language_loss": 0.94782555, + "learning_rate": 0.0009419571404994493, + "loss": 0.9593389, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 1.02539062, + "step": 937, + "time_per_iteration": 2.6368348598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_mlp": 1.04649317, + "epoch": 0.180454020777222, + "flos": 501682698240.0, + "grad_norm": 0.026973093946582868, + "language_loss": 1.00715971, + "learning_rate": 0.00094181136228212, + "loss": 1.01864934, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 1.02587891, + "step": 938, + "time_per_iteration": 2.710451602935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145832, + "balance_loss_mlp": 1.043455, + "epoch": 0.18064640246248556, + "flos": 500006836224.0, + "grad_norm": 0.02510488837562242, + "language_loss": 0.93535352, + "learning_rate": 0.0009416654125384077, + "loss": 0.9468118, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 1.02490234, + "step": 939, + "time_per_iteration": 2.728480577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145905, + "balance_loss_mlp": 1.04424286, + "epoch": 0.18083878414774912, + "flos": 1522290808320.0, + "grad_norm": 0.01070150853185005, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80918276, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 1.01757812, + "step": 940, + "time_per_iteration": 4.915560007095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145419, + "balance_loss_mlp": 1.04318535, + "epoch": 0.1810311658330127, + "flos": 728665755648.0, + "grad_norm": 0.023936590350452012, + "language_loss": 0.92724693, + "learning_rate": 0.000941372998698552, + "loss": 0.93870103, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 1.0234375, + "step": 941, + "time_per_iteration": 2.993441343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140689, + "balance_loss_mlp": 1.0385505, + "epoch": 0.18122354751827627, + "flos": 566044383744.0, + "grad_norm": 0.025062658148163358, + "language_loss": 0.94270039, + "learning_rate": 0.0009412265347159336, + "loss": 0.95410728, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 1.02246094, + "step": 942, + "time_per_iteration": 2.731416702270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140669, + "balance_loss_mlp": 1.03848326, + "epoch": 0.18141592920353983, + "flos": 520317293568.0, + "grad_norm": 0.024682729806918415, + "language_loss": 0.94559634, + "learning_rate": 0.0009410798994339829, + "loss": 0.95700312, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 1.02294922, + "step": 943, + "time_per_iteration": 2.6001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.03650522, + "epoch": 0.1816083108888034, + "flos": 513476858880.0, + "grad_norm": 0.022579221317186333, + "language_loss": 0.95589852, + "learning_rate": 0.000940933092909628, + "loss": 0.96728498, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 1.02246094, + "step": 944, + "time_per_iteration": 2.6360957622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_mlp": 1.04550409, + "epoch": 0.18180069257406695, + "flos": 493372518912.0, + "grad_norm": 0.02569410792888805, + "language_loss": 0.9276287, + "learning_rate": 0.0009407861151998649, + "loss": 0.93910229, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 1.01953125, + "step": 945, + "time_per_iteration": 2.6910903453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147749, + "balance_loss_mlp": 1.04608703, + "epoch": 0.1819930742593305, + "flos": 571230423552.0, + "grad_norm": 0.024877151530798884, + "language_loss": 0.95025092, + "learning_rate": 0.0009406389663617552, + "loss": 0.96172833, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 1.01757812, + "step": 946, + "time_per_iteration": 2.689232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_mlp": 1.03669131, + "epoch": 0.18218545594459407, + "flos": 607110460416.0, + "grad_norm": 0.026141117268158143, + "language_loss": 0.96229172, + "learning_rate": 0.000940491646452427, + "loss": 0.97367907, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 1.02148438, + "step": 947, + "time_per_iteration": 2.720996618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136776, + "balance_loss_mlp": 1.03473294, + "epoch": 0.18237783762985763, + "flos": 549738931200.0, + "grad_norm": 0.02114848591843324, + "language_loss": 0.99382234, + "learning_rate": 0.000940344155529075, + "loss": 1.00519001, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 1.02148438, + "step": 948, + "time_per_iteration": 2.655764102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136656, + "balance_loss_mlp": 1.03489935, + "epoch": 0.1825702193151212, + "flos": 451674628608.0, + "grad_norm": 0.027816765537183038, + "language_loss": 0.98392528, + "learning_rate": 0.0009401964936489605, + "loss": 0.99529195, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 1.01855469, + "step": 949, + "time_per_iteration": 2.5372273921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_mlp": 1.03615081, + "epoch": 0.18276260100038477, + "flos": 590384040960.0, + "grad_norm": 0.023066854335363023, + "language_loss": 0.93237805, + "learning_rate": 0.0009400486608694108, + "loss": 0.94375616, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 1.01757812, + "step": 950, + "time_per_iteration": 2.7370681762695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139002, + "balance_loss_mlp": 1.03719783, + "epoch": 0.18295498268564833, + "flos": 788709531648.0, + "grad_norm": 0.02337801281240106, + "language_loss": 0.97100747, + "learning_rate": 0.0009399006572478195, + "loss": 0.98239744, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 1.01904297, + "step": 951, + "time_per_iteration": 3.1136744022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144566, + "balance_loss_mlp": 1.04276168, + "epoch": 0.1831473643709119, + "flos": 579225696768.0, + "grad_norm": 0.024500893588447415, + "language_loss": 0.99522519, + "learning_rate": 0.0009397524828416468, + "loss": 1.00667083, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 1.01904297, + "step": 952, + "time_per_iteration": 2.680551767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.03664696, + "epoch": 0.18333974605617545, + "flos": 567963293184.0, + "grad_norm": 0.023361368133084506, + "language_loss": 1.04812968, + "learning_rate": 0.0009396041377084192, + "loss": 1.05951309, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 1.01806641, + "step": 953, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136097, + "balance_loss_mlp": 1.03443527, + "epoch": 0.183532127741439, + "flos": 528069519360.0, + "grad_norm": 0.02324700647994909, + "language_loss": 0.98137838, + "learning_rate": 0.0009394556219057295, + "loss": 0.99273932, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 1.01757812, + "step": 954, + "time_per_iteration": 2.6928489208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147999, + "balance_loss_mlp": 1.04671907, + "epoch": 0.18372450942670257, + "flos": 595643940864.0, + "grad_norm": 0.02338261009959255, + "language_loss": 0.93879586, + "learning_rate": 0.0009393069354912362, + "loss": 0.95027584, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 1.01367188, + "step": 955, + "time_per_iteration": 2.7496042251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.05067647, + "epoch": 0.18391689111196613, + "flos": 646283824128.0, + "grad_norm": 0.029421035614033756, + "language_loss": 0.90626895, + "learning_rate": 0.0009391580785226649, + "loss": 0.91778857, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 1.01367188, + "step": 956, + "time_per_iteration": 2.9440600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.05253601, + "epoch": 0.18410927279722972, + "flos": 1460391975936.0, + "grad_norm": 0.020211591247266292, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80492932, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 1.0, + "step": 957, + "time_per_iteration": 4.738964796066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138037, + "balance_loss_mlp": 1.03623211, + "epoch": 0.18430165448249328, + "flos": 660003624960.0, + "grad_norm": 0.026926680065899915, + "language_loss": 0.95339954, + "learning_rate": 0.0009388598531545196, + "loss": 0.96477991, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 1.01904297, + "step": 958, + "time_per_iteration": 2.859509229660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138629, + "balance_loss_mlp": 1.03687191, + "epoch": 0.18449403616775684, + "flos": 518949606912.0, + "grad_norm": 0.029778126611616895, + "language_loss": 0.94583583, + "learning_rate": 0.000938710484870727, + "loss": 0.9572221, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 1.01855469, + "step": 959, + "time_per_iteration": 2.565548896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137101, + "balance_loss_mlp": 1.03543901, + "epoch": 0.1846864178530204, + "flos": 553824526848.0, + "grad_norm": 0.027283874554685776, + "language_loss": 0.94945395, + "learning_rate": 0.0009385609462644189, + "loss": 0.96082497, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 1.01757812, + "step": 960, + "time_per_iteration": 2.676379919052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138569, + "balance_loss_mlp": 1.03709817, + "epoch": 0.18487879953828396, + "flos": 467115953664.0, + "grad_norm": 0.025693285519799033, + "language_loss": 0.96468461, + "learning_rate": 0.0009384112373936514, + "loss": 0.97607034, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 1.015625, + "step": 961, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154728, + "balance_loss_mlp": 1.05325735, + "epoch": 0.18507118122354752, + "flos": 649683211776.0, + "grad_norm": 0.02725538915325764, + "language_loss": 1.0098747, + "learning_rate": 0.0009382613583165467, + "loss": 1.02142203, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 1.015625, + "step": 962, + "time_per_iteration": 2.8268754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116263, + "balance_loss_mlp": 1.06125438, + "epoch": 0.18526356290881107, + "flos": 627922475520.0, + "grad_norm": 0.027998512126097927, + "language_loss": 0.99849832, + "learning_rate": 0.0009381113090912928, + "loss": 1.01012468, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 1.01464844, + "step": 963, + "time_per_iteration": 2.7762861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147698, + "balance_loss_mlp": 1.04679894, + "epoch": 0.18545594459407463, + "flos": 433645650432.0, + "grad_norm": 0.027008272304904758, + "language_loss": 0.98634118, + "learning_rate": 0.000937961089776144, + "loss": 0.99781811, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 1.00976562, + "step": 964, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149635, + "balance_loss_mlp": 1.04844999, + "epoch": 0.1856483262793382, + "flos": 750426491904.0, + "grad_norm": 0.028502333826765886, + "language_loss": 0.91998804, + "learning_rate": 0.0009378107004294208, + "loss": 0.93148446, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 1.01269531, + "step": 965, + "time_per_iteration": 2.964561939239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_mlp": 1.05057883, + "epoch": 0.18584070796460178, + "flos": 531401777664.0, + "grad_norm": 0.02451376704559663, + "language_loss": 1.00210857, + "learning_rate": 0.0009376601411095096, + "loss": 1.01362348, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 1.00976562, + "step": 966, + "time_per_iteration": 2.6664164066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150482, + "balance_loss_mlp": 1.04953575, + "epoch": 0.18603308964986534, + "flos": 484083419136.0, + "grad_norm": 0.02282308899195351, + "language_loss": 0.93174511, + "learning_rate": 0.0009375094118748622, + "loss": 0.94324994, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 1.01025391, + "step": 967, + "time_per_iteration": 2.544952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142823, + "balance_loss_mlp": 1.041924, + "epoch": 0.1862254713351289, + "flos": 802681112064.0, + "grad_norm": 0.02495680742184495, + "language_loss": 1.00251484, + "learning_rate": 0.0009373585127839976, + "loss": 1.01394308, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 1.00976562, + "step": 968, + "time_per_iteration": 2.973095417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142113, + "balance_loss_mlp": 1.0413574, + "epoch": 0.18641785302039246, + "flos": 479290148352.0, + "grad_norm": 0.02509872783632802, + "language_loss": 0.9944787, + "learning_rate": 0.0009372074438954994, + "loss": 1.00589979, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 1.00830078, + "step": 969, + "time_per_iteration": 2.5303025245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142663, + "balance_loss_mlp": 1.04181159, + "epoch": 0.18661023470565602, + "flos": 389779072512.0, + "grad_norm": 0.02439046514561532, + "language_loss": 1.00939226, + "learning_rate": 0.0009370562052680181, + "loss": 1.02081895, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 1.00927734, + "step": 970, + "time_per_iteration": 2.5023443698883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.04929316, + "epoch": 0.18680261639091958, + "flos": 565775139840.0, + "grad_norm": 0.02213336285369191, + "language_loss": 0.95379293, + "learning_rate": 0.0009369047969602695, + "loss": 0.96529102, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 1.00585938, + "step": 971, + "time_per_iteration": 2.722823143005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154865, + "balance_loss_mlp": 1.05420506, + "epoch": 0.18699499807618314, + "flos": 480230137344.0, + "grad_norm": 0.029574405329312194, + "language_loss": 0.9913702, + "learning_rate": 0.0009367532190310357, + "loss": 1.00291884, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 1.00732422, + "step": 972, + "time_per_iteration": 2.633387327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.0490092, + "epoch": 0.1871873797614467, + "flos": 554328086016.0, + "grad_norm": 0.02905569815438633, + "language_loss": 0.99535728, + "learning_rate": 0.0009366014715391644, + "loss": 1.00685072, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 1.00390625, + "step": 973, + "time_per_iteration": 2.6549065113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153264, + "balance_loss_mlp": 1.05293763, + "epoch": 0.18737976144671029, + "flos": 553952781312.0, + "grad_norm": 0.023481989115367276, + "language_loss": 0.9123525, + "learning_rate": 0.0009364495545435693, + "loss": 0.92388517, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 1.00390625, + "step": 974, + "time_per_iteration": 4.409714221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_mlp": 1.05479944, + "epoch": 0.18757214313197385, + "flos": 503247770112.0, + "grad_norm": 0.022955013749569684, + "language_loss": 0.97297812, + "learning_rate": 0.0009362974681032297, + "loss": 0.98452938, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 1.00390625, + "step": 975, + "time_per_iteration": 2.61857533454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153706, + "balance_loss_mlp": 1.05352271, + "epoch": 0.1877645248172374, + "flos": 676291613184.0, + "grad_norm": 0.028784531937469084, + "language_loss": 0.98011422, + "learning_rate": 0.0009361452122771907, + "loss": 0.9916513, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 1.00244141, + "step": 976, + "time_per_iteration": 2.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.04923177, + "epoch": 0.18795690650250096, + "flos": 405862944768.0, + "grad_norm": 0.029616845561456457, + "language_loss": 0.95658362, + "learning_rate": 0.0009359927871245635, + "loss": 0.9680773, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 1.00195312, + "step": 977, + "time_per_iteration": 2.563232183456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149302, + "balance_loss_mlp": 1.04916573, + "epoch": 0.18814928818776452, + "flos": 639063355392.0, + "grad_norm": 0.027239481801034963, + "language_loss": 0.98439831, + "learning_rate": 0.0009358401927045246, + "loss": 0.99589127, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 1.00195312, + "step": 978, + "time_per_iteration": 2.8147568702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.04498518, + "epoch": 0.18834166987302808, + "flos": 1140115514880.0, + "grad_norm": 0.022094320674951175, + "language_loss": 0.96123868, + "learning_rate": 0.0009356874290763166, + "loss": 0.9726885, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 1.00048828, + "step": 979, + "time_per_iteration": 3.4719691276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149894, + "balance_loss_mlp": 1.04971051, + "epoch": 0.18853405155829164, + "flos": 505815957504.0, + "grad_norm": 0.02560863383472628, + "language_loss": 0.98637187, + "learning_rate": 0.0009355344962992474, + "loss": 0.99787074, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 1.00244141, + "step": 980, + "time_per_iteration": 2.6199324131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139646, + "balance_loss_mlp": 1.03931963, + "epoch": 0.1887264332435552, + "flos": 609370472448.0, + "grad_norm": 0.02150131271194909, + "language_loss": 0.97900265, + "learning_rate": 0.0009353813944326908, + "loss": 0.99039912, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 1.00390625, + "step": 981, + "time_per_iteration": 2.8862478733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143203, + "balance_loss_mlp": 1.04287672, + "epoch": 0.1889188149288188, + "flos": 553592212992.0, + "grad_norm": 0.027403519760576756, + "language_loss": 0.92598587, + "learning_rate": 0.0009352281235360863, + "loss": 0.93741786, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 1.00390625, + "step": 982, + "time_per_iteration": 2.680797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142003, + "balance_loss_mlp": 1.04167616, + "epoch": 0.18911119661408235, + "flos": 419469954048.0, + "grad_norm": 0.02481781093748577, + "language_loss": 0.92531025, + "learning_rate": 0.0009350746836689389, + "loss": 0.93673027, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 1.00390625, + "step": 983, + "time_per_iteration": 2.5687928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152649, + "balance_loss_mlp": 1.05289459, + "epoch": 0.1893035782993459, + "flos": 1485317784576.0, + "grad_norm": 0.01747927461324531, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82591867, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.99804688, + "step": 984, + "time_per_iteration": 4.978898048400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115218, + "balance_loss_mlp": 1.05237782, + "epoch": 0.18949595998460947, + "flos": 509456391168.0, + "grad_norm": 0.033971943902626214, + "language_loss": 0.94133711, + "learning_rate": 0.0009347672972613634, + "loss": 0.95285892, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.99853516, + "step": 985, + "time_per_iteration": 2.5850014686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153583, + "balance_loss_mlp": 1.05382824, + "epoch": 0.18968834166987303, + "flos": 532192045056.0, + "grad_norm": 0.027626772825507382, + "language_loss": 0.93152702, + "learning_rate": 0.0009346133508402735, + "loss": 0.9430629, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.99804688, + "step": 986, + "time_per_iteration": 2.7262227535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.04658782, + "epoch": 0.1898807233551366, + "flos": 500753442816.0, + "grad_norm": 0.02768975875157221, + "language_loss": 0.95335174, + "learning_rate": 0.0009344592356873166, + "loss": 0.96481234, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.99511719, + "step": 987, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149829, + "balance_loss_mlp": 1.05002666, + "epoch": 0.19007310504040015, + "flos": 603359236608.0, + "grad_norm": 0.02899497531058058, + "language_loss": 0.87347138, + "learning_rate": 0.0009343049518623255, + "loss": 0.88496965, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.99853516, + "step": 988, + "time_per_iteration": 2.726668119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143975, + "balance_loss_mlp": 1.04407787, + "epoch": 0.1902654867256637, + "flos": 602764353024.0, + "grad_norm": 0.022945627178248204, + "language_loss": 0.90576518, + "learning_rate": 0.0009341504994251985, + "loss": 0.91720492, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.99951172, + "step": 989, + "time_per_iteration": 2.8518989086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.05247498, + "epoch": 0.19045786841092727, + "flos": 1579231363584.0, + "grad_norm": 0.011944448483625032, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74672347, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.99414062, + "step": 990, + "time_per_iteration": 5.084089517593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144398, + "balance_loss_mlp": 1.04445326, + "epoch": 0.19065025009619085, + "flos": 683054184960.0, + "grad_norm": 0.025253455013724026, + "language_loss": 0.88680583, + "learning_rate": 0.0009338410889544574, + "loss": 0.8982498, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 1.0, + "step": 991, + "time_per_iteration": 3.007277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_mlp": 1.03949153, + "epoch": 0.1908426317814544, + "flos": 603441828864.0, + "grad_norm": 0.02514183514150974, + "language_loss": 0.96243769, + "learning_rate": 0.000933686131040967, + "loss": 0.97383535, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 1.00341797, + "step": 992, + "time_per_iteration": 2.7673017978668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_mlp": 1.04441845, + "epoch": 0.19103501346671797, + "flos": 587433818112.0, + "grad_norm": 0.025095383977303525, + "language_loss": 0.99126339, + "learning_rate": 0.0009335310047555883, + "loss": 1.00270796, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 1.00097656, + "step": 993, + "time_per_iteration": 2.782841920852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145602, + "balance_loss_mlp": 1.04565716, + "epoch": 0.19122739515198153, + "flos": 546834370560.0, + "grad_norm": 0.0365250692916995, + "language_loss": 0.97246122, + "learning_rate": 0.0009333757101585467, + "loss": 0.98391724, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 1.0, + "step": 994, + "time_per_iteration": 2.6937174797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142239, + "balance_loss_mlp": 1.04229414, + "epoch": 0.1914197768372451, + "flos": 522549107712.0, + "grad_norm": 0.02399514581888075, + "language_loss": 1.00362575, + "learning_rate": 0.0009332202473101329, + "loss": 1.01504803, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 1.0, + "step": 995, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137582, + "balance_loss_mlp": 1.03763652, + "epoch": 0.19161215852250865, + "flos": 612387824640.0, + "grad_norm": 0.024864495797513732, + "language_loss": 0.91319168, + "learning_rate": 0.0009330646162707028, + "loss": 0.92456746, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 1.0, + "step": 996, + "time_per_iteration": 2.7450180053710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113947, + "balance_loss_mlp": 1.03962064, + "epoch": 0.1918045402077722, + "flos": 848182619136.0, + "grad_norm": 0.02592603597590215, + "language_loss": 0.92579019, + "learning_rate": 0.0009329088171006779, + "loss": 0.93718487, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.99902344, + "step": 997, + "time_per_iteration": 3.1890194416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_mlp": 1.04457617, + "epoch": 0.19199692189303577, + "flos": 466892371968.0, + "grad_norm": 0.027577096255712943, + "language_loss": 0.95194477, + "learning_rate": 0.0009327528498605446, + "loss": 0.96338999, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 1.0, + "step": 998, + "time_per_iteration": 2.6845622062683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141712, + "balance_loss_mlp": 1.04143262, + "epoch": 0.19218930357829936, + "flos": 532613011968.0, + "grad_norm": 0.026795980657526523, + "language_loss": 0.98209792, + "learning_rate": 0.0009325967146108548, + "loss": 0.99351501, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 1.00341797, + "step": 999, + "time_per_iteration": 2.690363883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145933, + "balance_loss_mlp": 1.04589295, + "epoch": 0.19238168526356292, + "flos": 602727422976.0, + "grad_norm": 0.025877996038880184, + "language_loss": 0.97816348, + "learning_rate": 0.0009324404114122258, + "loss": 0.98962283, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 1.00097656, + "step": 1000, + "time_per_iteration": 2.717535972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139683, + "balance_loss_mlp": 1.03969073, + "epoch": 0.19257406694882648, + "flos": 573154062336.0, + "grad_norm": 0.0251308575536182, + "language_loss": 0.95425117, + "learning_rate": 0.0009322839403253397, + "loss": 0.96564806, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 1.00048828, + "step": 1001, + "time_per_iteration": 2.8128621578216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147687, + "balance_loss_mlp": 1.04793251, + "epoch": 0.19276644863409004, + "flos": 803156473344.0, + "grad_norm": 0.02827819499351052, + "language_loss": 0.93752921, + "learning_rate": 0.0009321273014109439, + "loss": 0.94900608, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.99804688, + "step": 1002, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115103, + "balance_loss_mlp": 1.05127609, + "epoch": 0.1929588303193536, + "flos": 564479311872.0, + "grad_norm": 0.02425681225612504, + "language_loss": 0.92063946, + "learning_rate": 0.0009319704947298513, + "loss": 0.93214977, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.99804688, + "step": 1003, + "time_per_iteration": 2.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148634, + "balance_loss_mlp": 1.04887998, + "epoch": 0.19315121200461716, + "flos": 627987603456.0, + "grad_norm": 0.023688885680104285, + "language_loss": 0.95116329, + "learning_rate": 0.0009318135203429393, + "loss": 0.96264958, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.99804688, + "step": 1004, + "time_per_iteration": 2.7953245639801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146221, + "balance_loss_mlp": 1.04646707, + "epoch": 0.19334359368988072, + "flos": 518583034368.0, + "grad_norm": 0.02448547542723696, + "language_loss": 0.95706153, + "learning_rate": 0.0009316563783111511, + "loss": 0.9685238, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.99804688, + "step": 1005, + "time_per_iteration": 2.7417562007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141812, + "balance_loss_mlp": 1.04224837, + "epoch": 0.19353597537514428, + "flos": 695399568384.0, + "grad_norm": 0.022656832097962477, + "language_loss": 0.91614294, + "learning_rate": 0.0009314990686954943, + "loss": 0.9275611, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.99609375, + "step": 1006, + "time_per_iteration": 2.921147584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143701, + "balance_loss_mlp": 1.04413795, + "epoch": 0.19372835706040784, + "flos": 1212199226880.0, + "grad_norm": 0.0213605480211332, + "language_loss": 0.89449364, + "learning_rate": 0.000931341591557042, + "loss": 0.90593064, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.99609375, + "step": 1007, + "time_per_iteration": 3.6934237480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142723, + "balance_loss_mlp": 1.04292154, + "epoch": 0.19392073874567142, + "flos": 521684980224.0, + "grad_norm": 0.02492230683936131, + "language_loss": 0.9970367, + "learning_rate": 0.0009311839469569325, + "loss": 1.00846386, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.99853516, + "step": 1008, + "time_per_iteration": 2.66283917427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141437, + "balance_loss_mlp": 1.04187346, + "epoch": 0.19411312043093498, + "flos": 589910681088.0, + "grad_norm": 0.028572464719479444, + "language_loss": 0.9835515, + "learning_rate": 0.0009310261349563687, + "loss": 0.99496591, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.99609375, + "step": 1009, + "time_per_iteration": 2.6913864612579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139912, + "balance_loss_mlp": 1.04034853, + "epoch": 0.19430550211619854, + "flos": 580571916288.0, + "grad_norm": 0.022224830980977262, + "language_loss": 0.9288035, + "learning_rate": 0.0009308681556166186, + "loss": 0.94020259, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.99609375, + "step": 1010, + "time_per_iteration": 2.8937342166900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_mlp": 1.04234338, + "epoch": 0.1944978838014621, + "flos": 622245611520.0, + "grad_norm": 0.028831874511777204, + "language_loss": 1.01060331, + "learning_rate": 0.0009307100089990152, + "loss": 1.02202237, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.99609375, + "step": 1011, + "time_per_iteration": 2.7086822986602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114452, + "balance_loss_mlp": 1.04495597, + "epoch": 0.19469026548672566, + "flos": 599814130176.0, + "grad_norm": 0.02434118582542042, + "language_loss": 0.95591187, + "learning_rate": 0.0009305516951649568, + "loss": 0.96735704, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.99609375, + "step": 1012, + "time_per_iteration": 2.7046425342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114359, + "balance_loss_mlp": 1.04402685, + "epoch": 0.19488264717198922, + "flos": 553247107584.0, + "grad_norm": 0.020712874248618226, + "language_loss": 0.93779677, + "learning_rate": 0.0009303932141759057, + "loss": 0.94923264, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.7684950828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145994, + "balance_loss_mlp": 1.0468123, + "epoch": 0.19507502885725278, + "flos": 667312690176.0, + "grad_norm": 0.029421944235057496, + "language_loss": 0.94045115, + "learning_rate": 0.0009302345660933902, + "loss": 0.95191121, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.9921875, + "step": 1014, + "time_per_iteration": 2.8242082595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.04442382, + "epoch": 0.19526741054251634, + "flos": 672327541248.0, + "grad_norm": 0.024449615989116238, + "language_loss": 0.93477654, + "learning_rate": 0.0009300757509790026, + "loss": 0.94621253, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.9921875, + "step": 1015, + "time_per_iteration": 2.840658664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144964, + "balance_loss_mlp": 1.04578233, + "epoch": 0.19545979222777993, + "flos": 448146986496.0, + "grad_norm": 0.028637929544829934, + "language_loss": 1.02226353, + "learning_rate": 0.0009299167688944005, + "loss": 1.0337131, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.9921875, + "step": 1016, + "time_per_iteration": 2.505427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114266, + "balance_loss_mlp": 1.04376352, + "epoch": 0.1956521739130435, + "flos": 570168910848.0, + "grad_norm": 0.02609870742448671, + "language_loss": 0.93148959, + "learning_rate": 0.0009297576199013063, + "loss": 0.94291621, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.98925781, + "step": 1017, + "time_per_iteration": 2.7357168197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155182, + "balance_loss_mlp": 1.05752563, + "epoch": 0.19584455559830705, + "flos": 1458880571904.0, + "grad_norm": 0.02028337436206496, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74157315, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.9765625, + "step": 1018, + "time_per_iteration": 5.09963059425354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.04962921, + "epoch": 0.1960369372835706, + "flos": 1594481307648.0, + "grad_norm": 0.015251553743586253, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80573392, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.97460938, + "step": 1019, + "time_per_iteration": 6.03454852104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146546, + "balance_loss_mlp": 1.0477457, + "epoch": 0.19622931896883417, + "flos": 617252954112.0, + "grad_norm": 0.02445318741287071, + "language_loss": 0.94190967, + "learning_rate": 0.0009292791720892659, + "loss": 0.9533751, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.98828125, + "step": 1020, + "time_per_iteration": 2.8369834423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147421, + "balance_loss_mlp": 1.0486201, + "epoch": 0.19642170065409773, + "flos": 467207278080.0, + "grad_norm": 0.027280190942869837, + "language_loss": 0.98824823, + "learning_rate": 0.0009291193560807218, + "loss": 0.99972242, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.98828125, + "step": 1021, + "time_per_iteration": 2.5833048820495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.0458802, + "epoch": 0.19661408233936128, + "flos": 516288093696.0, + "grad_norm": 0.025303886608753337, + "language_loss": 0.95740455, + "learning_rate": 0.0009289593734732688, + "loss": 0.96885145, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.98828125, + "step": 1022, + "time_per_iteration": 2.5913774967193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149525, + "balance_loss_mlp": 1.05058122, + "epoch": 0.19680646402462484, + "flos": 393493366272.0, + "grad_norm": 0.0253763529676381, + "language_loss": 1.01103711, + "learning_rate": 0.0009287992243290175, + "loss": 1.02253246, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.98974609, + "step": 1023, + "time_per_iteration": 2.4793736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115501, + "balance_loss_mlp": 1.05635238, + "epoch": 0.19699884570988843, + "flos": 627623032320.0, + "grad_norm": 0.02508480994731895, + "language_loss": 0.99886519, + "learning_rate": 0.0009286389087101435, + "loss": 1.01041532, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.98681641, + "step": 1024, + "time_per_iteration": 2.7772202491760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153446, + "balance_loss_mlp": 1.05483615, + "epoch": 0.197191227395152, + "flos": 559073693184.0, + "grad_norm": 0.02445444816711275, + "language_loss": 0.98426372, + "learning_rate": 0.0009284784266788864, + "loss": 0.99579823, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.98632812, + "step": 1025, + "time_per_iteration": 2.6955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150264, + "balance_loss_mlp": 1.05165374, + "epoch": 0.19738360908041555, + "flos": 666249176064.0, + "grad_norm": 0.021666801749132464, + "language_loss": 0.99231869, + "learning_rate": 0.0009283177782975512, + "loss": 1.00382137, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.98632812, + "step": 1026, + "time_per_iteration": 2.9886229038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05529749, + "epoch": 0.1975759907656791, + "flos": 523510563840.0, + "grad_norm": 0.025961932589349316, + "language_loss": 0.98509014, + "learning_rate": 0.000928156963628507, + "loss": 0.99662918, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.98632812, + "step": 1027, + "time_per_iteration": 2.586740493774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149439, + "balance_loss_mlp": 1.05097175, + "epoch": 0.19776837245094267, + "flos": 463484252160.0, + "grad_norm": 0.02550253779434718, + "language_loss": 0.96135926, + "learning_rate": 0.0009279959827341877, + "loss": 0.97285366, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.98486328, + "step": 1028, + "time_per_iteration": 2.723517894744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146754, + "balance_loss_mlp": 1.04852605, + "epoch": 0.19796075413620623, + "flos": 504057503232.0, + "grad_norm": 0.02160335630411572, + "language_loss": 0.96627682, + "learning_rate": 0.0009278348356770915, + "loss": 0.97774434, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.98242188, + "step": 1029, + "time_per_iteration": 2.566802501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144801, + "balance_loss_mlp": 1.04666746, + "epoch": 0.1981531358214698, + "flos": 508570796544.0, + "grad_norm": 0.024261507948164947, + "language_loss": 0.9528529, + "learning_rate": 0.0009276735225197814, + "loss": 0.96430099, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.98144531, + "step": 1030, + "time_per_iteration": 2.6009340286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145205, + "balance_loss_mlp": 1.04702377, + "epoch": 0.19834551750673335, + "flos": 532639208448.0, + "grad_norm": 0.023062563394134136, + "language_loss": 0.95906407, + "learning_rate": 0.0009275120433248847, + "loss": 0.97051609, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.98193359, + "step": 1031, + "time_per_iteration": 2.684858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145757, + "balance_loss_mlp": 1.0477196, + "epoch": 0.1985378991919969, + "flos": 776969765376.0, + "grad_norm": 0.02469129884935611, + "language_loss": 0.94986421, + "learning_rate": 0.0009273503981550931, + "loss": 0.96132183, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.98046875, + "step": 1032, + "time_per_iteration": 3.058094024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.04737103, + "epoch": 0.1987302808772605, + "flos": 435191256576.0, + "grad_norm": 0.025952536265860523, + "language_loss": 0.96777844, + "learning_rate": 0.0009271885870731626, + "loss": 0.9792316, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.97949219, + "step": 1033, + "time_per_iteration": 2.493664503097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153962, + "balance_loss_mlp": 1.05592442, + "epoch": 0.19892266256252406, + "flos": 554653725696.0, + "grad_norm": 0.029222795446194067, + "language_loss": 1.0035603, + "learning_rate": 0.0009270266101419143, + "loss": 1.01509976, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.98046875, + "step": 1034, + "time_per_iteration": 2.626612901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145521, + "balance_loss_mlp": 1.04748368, + "epoch": 0.19911504424778761, + "flos": 550948164096.0, + "grad_norm": 0.02425528851980561, + "language_loss": 0.92802572, + "learning_rate": 0.0009268644674242328, + "loss": 0.9394809, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.98046875, + "step": 1035, + "time_per_iteration": 2.683253288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148174, + "balance_loss_mlp": 1.04994512, + "epoch": 0.19930742593305117, + "flos": 519312176640.0, + "grad_norm": 0.02646778626346152, + "language_loss": 0.91577774, + "learning_rate": 0.0009267021589830678, + "loss": 0.9272595, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.98242188, + "step": 1036, + "time_per_iteration": 2.7614338397979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218948, + "balance_loss_mlp": 1.11824036, + "epoch": 0.19949980761831473, + "flos": 1512637863936.0, + "grad_norm": 0.02467753292442409, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78846025, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 1.0078125, + "step": 1037, + "time_per_iteration": 4.962339878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114988, + "balance_loss_mlp": 1.05184233, + "epoch": 0.1996921893035783, + "flos": 699439501824.0, + "grad_norm": 0.02757683731024766, + "language_loss": 1.02362621, + "learning_rate": 0.000926377045182406, + "loss": 1.03512502, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.98046875, + "step": 1038, + "time_per_iteration": 2.916594982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155504, + "balance_loss_mlp": 1.05727601, + "epoch": 0.19988457098884185, + "flos": 728394510336.0, + "grad_norm": 0.024851830352508646, + "language_loss": 0.97729039, + "learning_rate": 0.0009262142399491296, + "loss": 0.98884547, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.98242188, + "step": 1039, + "time_per_iteration": 3.0976781845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156606, + "balance_loss_mlp": 1.05837739, + "epoch": 0.2000769526741054, + "flos": 561624416256.0, + "grad_norm": 0.025662568358030838, + "language_loss": 0.98388815, + "learning_rate": 0.0009260512692448105, + "loss": 0.99545419, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.98242188, + "step": 1040, + "time_per_iteration": 2.715479850769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.05308211, + "epoch": 0.200269334359369, + "flos": 573164795904.0, + "grad_norm": 0.022253887646478135, + "language_loss": 0.93097693, + "learning_rate": 0.000925888133132719, + "loss": 0.9424901, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.98242188, + "step": 1041, + "time_per_iteration": 2.7987864017486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011912, + "balance_loss_mlp": 1.0923996, + "epoch": 0.20046171604463256, + "flos": 1489152875520.0, + "grad_norm": 0.020655335232781416, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80801636, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.98828125, + "step": 1042, + "time_per_iteration": 4.944507360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154531, + "balance_loss_mlp": 1.05644536, + "epoch": 0.20065409772989612, + "flos": 497577636864.0, + "grad_norm": 0.02609736880654102, + "language_loss": 0.92129564, + "learning_rate": 0.0009255613649386244, + "loss": 0.932841, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.98095703, + "step": 1043, + "time_per_iteration": 2.6478612422943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157191, + "balance_loss_mlp": 1.05915368, + "epoch": 0.20084647941515968, + "flos": 580463127552.0, + "grad_norm": 0.02650777474930283, + "language_loss": 0.87469566, + "learning_rate": 0.0009253977329834838, + "loss": 0.88626754, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.98046875, + "step": 1044, + "time_per_iteration": 2.7641594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161195, + "balance_loss_mlp": 1.06315744, + "epoch": 0.20103886110042324, + "flos": 643287939072.0, + "grad_norm": 0.030624079602620518, + "language_loss": 0.9713465, + "learning_rate": 0.0009252339358742965, + "loss": 0.98295844, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.98046875, + "step": 1045, + "time_per_iteration": 2.811687707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.0594964, + "epoch": 0.2012312427856868, + "flos": 442969678848.0, + "grad_norm": 0.023268596270985206, + "language_loss": 0.93283701, + "learning_rate": 0.000925069973674654, + "loss": 0.94440854, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.9765625, + "step": 1046, + "time_per_iteration": 2.6709671020507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157527, + "balance_loss_mlp": 1.05948889, + "epoch": 0.20142362447095036, + "flos": 555472190976.0, + "grad_norm": 0.022730221646095148, + "language_loss": 0.96496689, + "learning_rate": 0.000924905846448212, + "loss": 0.97654217, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.98046875, + "step": 1047, + "time_per_iteration": 2.7338547706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115317, + "balance_loss_mlp": 1.05522716, + "epoch": 0.20161600615621392, + "flos": 671554738176.0, + "grad_norm": 0.026697286803692055, + "language_loss": 0.96143991, + "learning_rate": 0.0009247415542586906, + "loss": 0.97297156, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.97949219, + "step": 1048, + "time_per_iteration": 2.849416494369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149865, + "balance_loss_mlp": 1.05216146, + "epoch": 0.2018083878414775, + "flos": 574306899456.0, + "grad_norm": 0.021371049275305663, + "language_loss": 0.91504782, + "learning_rate": 0.0009245770971698735, + "loss": 0.92654645, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.97705078, + "step": 1049, + "time_per_iteration": 2.8751590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151512, + "balance_loss_mlp": 1.05376041, + "epoch": 0.20200076952674106, + "flos": 426794482176.0, + "grad_norm": 0.027360075371486055, + "language_loss": 0.97835737, + "learning_rate": 0.0009244124752456087, + "loss": 0.98987252, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.97753906, + "step": 1050, + "time_per_iteration": 2.4985499382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153257, + "balance_loss_mlp": 1.05531442, + "epoch": 0.20219315121200462, + "flos": 537684258816.0, + "grad_norm": 0.025856302906645603, + "language_loss": 0.95370412, + "learning_rate": 0.0009242476885498081, + "loss": 0.96523666, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.97949219, + "step": 1051, + "time_per_iteration": 2.7127723693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150827, + "balance_loss_mlp": 1.05297983, + "epoch": 0.20238553289726818, + "flos": 478834252800.0, + "grad_norm": 0.02631802181941096, + "language_loss": 0.90995431, + "learning_rate": 0.0009240827371464474, + "loss": 0.92146254, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.97851562, + "step": 1052, + "time_per_iteration": 2.527918577194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144335, + "balance_loss_mlp": 1.04667878, + "epoch": 0.20257791458253174, + "flos": 1153846049280.0, + "grad_norm": 0.025276400477213575, + "language_loss": 0.92167991, + "learning_rate": 0.0009239176210995666, + "loss": 0.93312329, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.9765625, + "step": 1053, + "time_per_iteration": 3.4556469917297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144677, + "balance_loss_mlp": 1.04682982, + "epoch": 0.2027702962677953, + "flos": 668148619776.0, + "grad_norm": 0.025342755763179396, + "language_loss": 1.04358864, + "learning_rate": 0.0009237523404732695, + "loss": 1.05503547, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.97851562, + "step": 1054, + "time_per_iteration": 2.894198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144665, + "balance_loss_mlp": 1.04676986, + "epoch": 0.20296267795305886, + "flos": 642452009472.0, + "grad_norm": 0.02468028394334187, + "language_loss": 0.94787639, + "learning_rate": 0.0009235868953317235, + "loss": 0.95932305, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.97900391, + "step": 1055, + "time_per_iteration": 2.812633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148717, + "balance_loss_mlp": 1.05082273, + "epoch": 0.20315505963832242, + "flos": 932129622528.0, + "grad_norm": 0.02533903757078053, + "language_loss": 0.93907225, + "learning_rate": 0.0009234212857391602, + "loss": 0.95055938, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.97900391, + "step": 1056, + "time_per_iteration": 3.2061142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147493, + "balance_loss_mlp": 1.0496459, + "epoch": 0.20334744132358598, + "flos": 563287543296.0, + "grad_norm": 0.019686870604104637, + "language_loss": 0.97330248, + "learning_rate": 0.000923255511759875, + "loss": 0.98477745, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.97851562, + "step": 1057, + "time_per_iteration": 2.7639002799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150039, + "balance_loss_mlp": 1.05219197, + "epoch": 0.20353982300884957, + "flos": 645428428800.0, + "grad_norm": 0.023252811049323967, + "language_loss": 0.95256209, + "learning_rate": 0.000923089573458227, + "loss": 0.96406245, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.97851562, + "step": 1058, + "time_per_iteration": 2.857612133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114962, + "balance_loss_mlp": 1.05177307, + "epoch": 0.20373220469411313, + "flos": 652705293312.0, + "grad_norm": 0.02395962669603635, + "language_loss": 0.93332446, + "learning_rate": 0.0009229234708986392, + "loss": 0.94482064, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.97851562, + "step": 1059, + "time_per_iteration": 2.877995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150688, + "balance_loss_mlp": 1.05436707, + "epoch": 0.2039245863793767, + "flos": 1440396973056.0, + "grad_norm": 0.013896761524226428, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82817578, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.96289062, + "step": 1060, + "time_per_iteration": 4.659267902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142187, + "balance_loss_mlp": 1.04434025, + "epoch": 0.20411696806464025, + "flos": 598127534592.0, + "grad_norm": 0.026599581611848343, + "language_loss": 0.93894625, + "learning_rate": 0.0009225907732636548, + "loss": 0.95036817, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.97851562, + "step": 1061, + "time_per_iteration": 2.7480902671813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115027, + "balance_loss_mlp": 1.05242312, + "epoch": 0.2043093497499038, + "flos": 574897053696.0, + "grad_norm": 0.026136319737411078, + "language_loss": 0.96460152, + "learning_rate": 0.0009224241783174227, + "loss": 0.97610414, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.97851562, + "step": 1062, + "time_per_iteration": 2.676877021789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146738, + "balance_loss_mlp": 1.04874802, + "epoch": 0.20450173143516737, + "flos": 631523977728.0, + "grad_norm": 0.02709710709634581, + "language_loss": 0.94472104, + "learning_rate": 0.0009222574193715802, + "loss": 0.95618844, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.97998047, + "step": 1063, + "time_per_iteration": 2.7604472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141026, + "balance_loss_mlp": 1.04298854, + "epoch": 0.20469411312043093, + "flos": 575146831872.0, + "grad_norm": 0.022769515120839894, + "language_loss": 0.95189404, + "learning_rate": 0.000922090496490869, + "loss": 0.96330428, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.98046875, + "step": 1064, + "time_per_iteration": 2.728154182434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141583, + "balance_loss_mlp": 1.04383183, + "epoch": 0.20488649480569449, + "flos": 638279818752.0, + "grad_norm": 0.022393105289594414, + "language_loss": 0.97629392, + "learning_rate": 0.0009219234097400937, + "loss": 0.9877097, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.97753906, + "step": 1065, + "time_per_iteration": 2.889946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.03989744, + "epoch": 0.20507887649095807, + "flos": 977437747200.0, + "grad_norm": 0.024872828726298618, + "language_loss": 0.9305777, + "learning_rate": 0.0009217561591841237, + "loss": 0.94195515, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.97851562, + "step": 1066, + "time_per_iteration": 3.296248435974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144611, + "balance_loss_mlp": 1.04681206, + "epoch": 0.20527125817622163, + "flos": 487155165696.0, + "grad_norm": 0.024567371957878288, + "language_loss": 0.90358436, + "learning_rate": 0.0009215887448878913, + "loss": 0.91503048, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.97802734, + "step": 1067, + "time_per_iteration": 2.5662190914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137303, + "balance_loss_mlp": 1.03945625, + "epoch": 0.2054636398614852, + "flos": 528210508800.0, + "grad_norm": 0.02249486638659544, + "language_loss": 0.94470721, + "learning_rate": 0.0009214211669163922, + "loss": 0.9560802, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.97851562, + "step": 1068, + "time_per_iteration": 2.6912589073181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139468, + "balance_loss_mlp": 1.04162145, + "epoch": 0.20565602154674875, + "flos": 559323471360.0, + "grad_norm": 0.022635174506508055, + "language_loss": 1.02501464, + "learning_rate": 0.0009212534253346862, + "loss": 1.03640926, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.97851562, + "step": 1069, + "time_per_iteration": 2.708683490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135123, + "balance_loss_mlp": 1.03746641, + "epoch": 0.2058484032320123, + "flos": 505221073920.0, + "grad_norm": 0.02479403914192968, + "language_loss": 0.95383358, + "learning_rate": 0.0009210855202078964, + "loss": 0.96518481, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.9765625, + "step": 1070, + "time_per_iteration": 2.6434948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132203, + "balance_loss_mlp": 1.03478527, + "epoch": 0.20604078491727587, + "flos": 434047151616.0, + "grad_norm": 0.024632817960327506, + "language_loss": 0.96572351, + "learning_rate": 0.0009209174516012091, + "loss": 0.97704554, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.97412109, + "step": 1071, + "time_per_iteration": 2.4891347885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148822, + "balance_loss_mlp": 1.05130851, + "epoch": 0.20623316660253943, + "flos": 609874031616.0, + "grad_norm": 0.024395492192686875, + "language_loss": 0.97482872, + "learning_rate": 0.0009207492195798747, + "loss": 0.98631692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.97509766, + "step": 1072, + "time_per_iteration": 2.758575201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152495, + "balance_loss_mlp": 1.05502975, + "epoch": 0.206425548287803, + "flos": 481393708032.0, + "grad_norm": 0.027205333287948934, + "language_loss": 0.9402262, + "learning_rate": 0.0009205808242092061, + "loss": 0.95175123, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.97460938, + "step": 1073, + "time_per_iteration": 2.6534366607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152896, + "balance_loss_mlp": 1.05562115, + "epoch": 0.20661792997306658, + "flos": 951122784768.0, + "grad_norm": 0.02943422736446298, + "language_loss": 0.93147469, + "learning_rate": 0.0009204122655545808, + "loss": 0.94300359, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.97265625, + "step": 1074, + "time_per_iteration": 3.317518949508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149199, + "balance_loss_mlp": 1.05201948, + "epoch": 0.20681031165833014, + "flos": 604616133120.0, + "grad_norm": 0.024855118115069977, + "language_loss": 0.88961834, + "learning_rate": 0.0009202435436814388, + "loss": 0.90111029, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.97167969, + "step": 1075, + "time_per_iteration": 2.6815345287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142912, + "balance_loss_mlp": 1.04563749, + "epoch": 0.2070026933435937, + "flos": 710265475584.0, + "grad_norm": 0.027130222852878607, + "language_loss": 0.99239773, + "learning_rate": 0.0009200746586552836, + "loss": 1.00382686, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.97265625, + "step": 1076, + "time_per_iteration": 2.9578917026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141976, + "balance_loss_mlp": 1.04451025, + "epoch": 0.20719507502885726, + "flos": 831254085120.0, + "grad_norm": 0.023090334700176834, + "language_loss": 0.92780054, + "learning_rate": 0.0009199056105416825, + "loss": 0.93922031, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.97460938, + "step": 1077, + "time_per_iteration": 3.0944156646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140475, + "balance_loss_mlp": 1.04324794, + "epoch": 0.20738745671412082, + "flos": 639499785216.0, + "grad_norm": 0.023914471883828003, + "language_loss": 0.96186948, + "learning_rate": 0.0009197363994062654, + "loss": 0.97327423, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.97216797, + "step": 1078, + "time_per_iteration": 2.8147799968719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142489, + "balance_loss_mlp": 1.04521394, + "epoch": 0.20757983839938438, + "flos": 686983328256.0, + "grad_norm": 0.02237329029547868, + "language_loss": 0.90686679, + "learning_rate": 0.0009195670253147262, + "loss": 0.91829169, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.97265625, + "step": 1079, + "time_per_iteration": 2.994058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141016, + "balance_loss_mlp": 1.04383624, + "epoch": 0.20777222008464794, + "flos": 520317293568.0, + "grad_norm": 0.026634413874044322, + "language_loss": 0.92195654, + "learning_rate": 0.0009193974883328216, + "loss": 0.93336666, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.97167969, + "step": 1080, + "time_per_iteration": 2.6506502628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140462, + "balance_loss_mlp": 1.04333031, + "epoch": 0.2079646017699115, + "flos": 512469740544.0, + "grad_norm": 0.025261028079588584, + "language_loss": 0.97185814, + "learning_rate": 0.0009192277885263718, + "loss": 0.98326278, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.97119141, + "step": 1081, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143678, + "balance_loss_mlp": 1.04640269, + "epoch": 0.20815698345517505, + "flos": 933467109888.0, + "grad_norm": 0.02363260569338726, + "language_loss": 0.9496327, + "learning_rate": 0.0009190579259612602, + "loss": 0.96106946, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.97265625, + "step": 1082, + "time_per_iteration": 3.2829811573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150642, + "balance_loss_mlp": 1.05336761, + "epoch": 0.20834936514043864, + "flos": 633553677312.0, + "grad_norm": 0.02436625118168465, + "language_loss": 0.97094011, + "learning_rate": 0.000918887900703433, + "loss": 0.98244655, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.97265625, + "step": 1083, + "time_per_iteration": 2.779474973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147642, + "balance_loss_mlp": 1.05079603, + "epoch": 0.2085417468257022, + "flos": 395243088384.0, + "grad_norm": 0.027448171988374206, + "language_loss": 0.98109657, + "learning_rate": 0.0009187177128188999, + "loss": 0.99257296, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.96826172, + "step": 1084, + "time_per_iteration": 2.487755298614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156746, + "balance_loss_mlp": 1.06118774, + "epoch": 0.20873412851096576, + "flos": 1405195138560.0, + "grad_norm": 0.014888537960634525, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78313285, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.95507812, + "step": 1085, + "time_per_iteration": 4.917901515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146704, + "balance_loss_mlp": 1.04981041, + "epoch": 0.20892651019622932, + "flos": 448761335808.0, + "grad_norm": 0.0275038267286557, + "language_loss": 0.93389261, + "learning_rate": 0.000918376849434071, + "loss": 0.94535965, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.96875, + "step": 1086, + "time_per_iteration": 2.5117850303649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153188, + "balance_loss_mlp": 1.05629456, + "epoch": 0.20911889188149288, + "flos": 494080194048.0, + "grad_norm": 0.034273062806107445, + "language_loss": 1.02428699, + "learning_rate": 0.0009182061740661098, + "loss": 1.03581882, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.96875, + "step": 1087, + "time_per_iteration": 2.5270984172821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154258, + "balance_loss_mlp": 1.05736482, + "epoch": 0.20931127356675644, + "flos": 842748802560.0, + "grad_norm": 0.02361505883443172, + "language_loss": 0.92997056, + "learning_rate": 0.0009180353363361127, + "loss": 0.94151306, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.96875, + "step": 1088, + "time_per_iteration": 3.1549112796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154015, + "balance_loss_mlp": 1.05688298, + "epoch": 0.20950365525202, + "flos": 758523823104.0, + "grad_norm": 0.028384526527587387, + "language_loss": 0.93851304, + "learning_rate": 0.0009178643363104044, + "loss": 0.95005322, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.97119141, + "step": 1089, + "time_per_iteration": 4.693684339523315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.05159688, + "epoch": 0.20969603693728356, + "flos": 473491760640.0, + "grad_norm": 0.03411348227976855, + "language_loss": 1.04663801, + "learning_rate": 0.0009176931740553735, + "loss": 1.05812478, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.97070312, + "step": 1090, + "time_per_iteration": 2.5203866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146066, + "balance_loss_mlp": 1.04917288, + "epoch": 0.20988841862254715, + "flos": 978627514368.0, + "grad_norm": 0.027482857176328385, + "language_loss": 0.92998403, + "learning_rate": 0.0009175218496374708, + "loss": 0.94144469, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.96875, + "step": 1091, + "time_per_iteration": 3.362614870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.05544364, + "epoch": 0.2100808003078107, + "flos": 1094818123776.0, + "grad_norm": 0.028049590852478556, + "language_loss": 0.96363866, + "learning_rate": 0.0009173503631232103, + "loss": 0.97516203, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.96875, + "step": 1092, + "time_per_iteration": 3.359970808029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150696, + "balance_loss_mlp": 1.05399334, + "epoch": 0.21027318199307427, + "flos": 1014559217664.0, + "grad_norm": 0.03210489869185377, + "language_loss": 0.94109344, + "learning_rate": 0.0009171787145791691, + "loss": 0.95260036, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.96679688, + "step": 1093, + "time_per_iteration": 3.2180042266845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150028, + "balance_loss_mlp": 1.05323017, + "epoch": 0.21046556367833782, + "flos": 522412121088.0, + "grad_norm": 0.02762257246471406, + "language_loss": 0.92679179, + "learning_rate": 0.000917006904071987, + "loss": 0.93829209, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.96777344, + "step": 1094, + "time_per_iteration": 2.5961859226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152841, + "balance_loss_mlp": 1.0559479, + "epoch": 0.21065794536360138, + "flos": 604839714816.0, + "grad_norm": 0.02570597393175465, + "language_loss": 0.97250223, + "learning_rate": 0.0009168349316683669, + "loss": 0.98403066, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.96875, + "step": 1095, + "time_per_iteration": 2.7164759635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153125, + "balance_loss_mlp": 1.05642295, + "epoch": 0.21085032704886494, + "flos": 604557735936.0, + "grad_norm": 0.022711755724658188, + "language_loss": 0.91088736, + "learning_rate": 0.0009166627974350741, + "loss": 0.92241859, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.96679688, + "step": 1096, + "time_per_iteration": 2.8912341594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05739498, + "epoch": 0.2110427087341285, + "flos": 638831041536.0, + "grad_norm": 0.027939519002465243, + "language_loss": 1.01164758, + "learning_rate": 0.0009164905014389373, + "loss": 1.02318668, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.96484375, + "step": 1097, + "time_per_iteration": 2.758725881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115008, + "balance_loss_mlp": 1.05356789, + "epoch": 0.21123509041939206, + "flos": 523929529344.0, + "grad_norm": 0.027217895626849283, + "language_loss": 0.96537346, + "learning_rate": 0.0009163180437468476, + "loss": 0.97687429, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.96484375, + "step": 1098, + "time_per_iteration": 2.6157684326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011531, + "balance_loss_mlp": 1.05658853, + "epoch": 0.21142747210465565, + "flos": 452193650688.0, + "grad_norm": 0.025540912808389868, + "language_loss": 0.94842321, + "learning_rate": 0.000916145424425759, + "loss": 0.9599542, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.96484375, + "step": 1099, + "time_per_iteration": 2.6368908882141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157233, + "balance_loss_mlp": 1.06081605, + "epoch": 0.2116198537899192, + "flos": 877625723904.0, + "grad_norm": 0.02885196772961066, + "language_loss": 1.02573156, + "learning_rate": 0.0009159726435426885, + "loss": 1.03730392, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.96386719, + "step": 1100, + "time_per_iteration": 3.0916907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011557, + "balance_loss_mlp": 1.05909276, + "epoch": 0.21181223547518277, + "flos": 524674134528.0, + "grad_norm": 0.025603473018395394, + "language_loss": 0.99936807, + "learning_rate": 0.0009157997011647154, + "loss": 1.01092505, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.96582031, + "step": 1101, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152722, + "balance_loss_mlp": 1.05630529, + "epoch": 0.21200461716044633, + "flos": 573425307648.0, + "grad_norm": 0.02306433427515447, + "language_loss": 0.93708789, + "learning_rate": 0.0009156265973589817, + "loss": 0.94861513, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.96386719, + "step": 1102, + "time_per_iteration": 2.786557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148247, + "balance_loss_mlp": 1.05187845, + "epoch": 0.2121969988457099, + "flos": 546174359040.0, + "grad_norm": 0.023119673851329285, + "language_loss": 0.9826746, + "learning_rate": 0.0009154533321926926, + "loss": 0.99415696, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.96337891, + "step": 1103, + "time_per_iteration": 2.6500911712646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150448, + "balance_loss_mlp": 1.05393636, + "epoch": 0.21238938053097345, + "flos": 845353920000.0, + "grad_norm": 0.02523726215492747, + "language_loss": 0.96587884, + "learning_rate": 0.0009152799057331156, + "loss": 0.97738338, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.96484375, + "step": 1104, + "time_per_iteration": 3.1080517768859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148697, + "balance_loss_mlp": 1.05213737, + "epoch": 0.212581762216237, + "flos": 447141869568.0, + "grad_norm": 0.026678256955328494, + "language_loss": 1.00256824, + "learning_rate": 0.0009151063180475805, + "loss": 1.01405525, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.96533203, + "step": 1105, + "time_per_iteration": 2.530207633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153737, + "balance_loss_mlp": 1.05703473, + "epoch": 0.21277414390150057, + "flos": 515385034752.0, + "grad_norm": 0.026680614248996183, + "language_loss": 0.9432478, + "learning_rate": 0.0009149325692034803, + "loss": 0.95478517, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.96679688, + "step": 1106, + "time_per_iteration": 2.576834201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159119, + "balance_loss_mlp": 1.06413269, + "epoch": 0.21296652558676413, + "flos": 1488512329728.0, + "grad_norm": 0.01358013302766655, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80362546, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.94921875, + "step": 1107, + "time_per_iteration": 4.821696996688843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156765, + "balance_loss_mlp": 1.06006265, + "epoch": 0.21315890727202771, + "flos": 847450748928.0, + "grad_norm": 0.031460519319247274, + "language_loss": 0.96369046, + "learning_rate": 0.0009145845883094678, + "loss": 0.97525811, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.96679688, + "step": 1108, + "time_per_iteration": 3.029548168182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159379, + "balance_loss_mlp": 1.06267655, + "epoch": 0.21335128895729127, + "flos": 630555790848.0, + "grad_norm": 0.028067626854192333, + "language_loss": 0.95182431, + "learning_rate": 0.000914410356394654, + "loss": 0.96341801, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.96679688, + "step": 1109, + "time_per_iteration": 2.737241268157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.06352139, + "epoch": 0.21354367064255483, + "flos": 712284441600.0, + "grad_norm": 0.023599510024272945, + "language_loss": 0.92540836, + "learning_rate": 0.0009142359635914709, + "loss": 0.93701446, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.97070312, + "step": 1110, + "time_per_iteration": 3.0267913341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.0645479, + "epoch": 0.2137360523278184, + "flos": 457210503168.0, + "grad_norm": 0.02473497568188501, + "language_loss": 0.9156003, + "learning_rate": 0.0009140614099676245, + "loss": 0.92721474, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.96875, + "step": 1111, + "time_per_iteration": 2.5756866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164266, + "balance_loss_mlp": 1.06727743, + "epoch": 0.21392843401308195, + "flos": 667265026560.0, + "grad_norm": 0.025344438139363285, + "language_loss": 0.90291333, + "learning_rate": 0.0009138866955908821, + "loss": 0.91455603, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.96972656, + "step": 1112, + "time_per_iteration": 2.9406254291534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_mlp": 1.06319368, + "epoch": 0.2141208156983455, + "flos": 750361363968.0, + "grad_norm": 0.02581510235299489, + "language_loss": 0.89949894, + "learning_rate": 0.0009137118205290738, + "loss": 0.91109931, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.96826172, + "step": 1113, + "time_per_iteration": 2.966989278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162558, + "balance_loss_mlp": 1.06547356, + "epoch": 0.21431319738360907, + "flos": 420010443264.0, + "grad_norm": 0.024953242249854055, + "language_loss": 1.00419319, + "learning_rate": 0.0009135367848500924, + "loss": 1.01581883, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.97070312, + "step": 1114, + "time_per_iteration": 2.4954934120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161216, + "balance_loss_mlp": 1.06456113, + "epoch": 0.21450557906887263, + "flos": 610238602752.0, + "grad_norm": 0.030213425802119154, + "language_loss": 0.9839642, + "learning_rate": 0.0009133615886218927, + "loss": 0.99557638, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.96630859, + "step": 1115, + "time_per_iteration": 2.71352219581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152764, + "balance_loss_mlp": 1.05625272, + "epoch": 0.21469796075413622, + "flos": 562974638592.0, + "grad_norm": 0.027635545182738433, + "language_loss": 0.99806535, + "learning_rate": 0.0009131862319124917, + "loss": 1.00959289, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.96484375, + "step": 1116, + "time_per_iteration": 2.630807876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153717, + "balance_loss_mlp": 1.05720496, + "epoch": 0.21489034243939978, + "flos": 595737266688.0, + "grad_norm": 0.024806539819872384, + "language_loss": 0.94489264, + "learning_rate": 0.0009130107147899691, + "loss": 0.95642984, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.96484375, + "step": 1117, + "time_per_iteration": 2.7123875617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154765, + "balance_loss_mlp": 1.05825305, + "epoch": 0.21508272412466334, + "flos": 442850156544.0, + "grad_norm": 0.024517194331867692, + "language_loss": 0.93784142, + "learning_rate": 0.0009128350373224665, + "loss": 0.9493891, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.96484375, + "step": 1118, + "time_per_iteration": 2.5384151935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169045, + "balance_loss_mlp": 1.07348633, + "epoch": 0.2152751058099269, + "flos": 1499232242688.0, + "grad_norm": 0.019396990855708212, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82625473, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.95507812, + "step": 1119, + "time_per_iteration": 4.644891262054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156688, + "balance_loss_mlp": 1.05989027, + "epoch": 0.21546748749519046, + "flos": 494991985152.0, + "grad_norm": 0.030440112014221473, + "language_loss": 0.9407053, + "learning_rate": 0.0009124832016254005, + "loss": 0.95227218, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.96777344, + "step": 1120, + "time_per_iteration": 2.588834285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163526, + "balance_loss_mlp": 1.06691861, + "epoch": 0.21565986918045402, + "flos": 635694167040.0, + "grad_norm": 0.030206495794058562, + "language_loss": 0.96966755, + "learning_rate": 0.0009123070435324316, + "loss": 0.98130286, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.96582031, + "step": 1121, + "time_per_iteration": 2.786072015762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170601, + "balance_loss_mlp": 1.07542419, + "epoch": 0.21585225086571758, + "flos": 1586798939136.0, + "grad_norm": 0.013013152417503263, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.79046386, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.95117188, + "step": 1122, + "time_per_iteration": 4.946362733840942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.0685885, + "epoch": 0.21604463255098114, + "flos": 685322202624.0, + "grad_norm": 0.027822137906457534, + "language_loss": 0.94040322, + "learning_rate": 0.0009119542471995752, + "loss": 0.95205426, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.96484375, + "step": 1123, + "time_per_iteration": 2.8613343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162625, + "balance_loss_mlp": 1.0660181, + "epoch": 0.2162370142362447, + "flos": 782307528192.0, + "grad_norm": 0.029561600436113455, + "language_loss": 0.90709835, + "learning_rate": 0.0009117776090966554, + "loss": 0.9187246, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.96582031, + "step": 1124, + "time_per_iteration": 2.9557414054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170148, + "balance_loss_mlp": 1.07344532, + "epoch": 0.21642939592150828, + "flos": 1003761441792.0, + "grad_norm": 0.032145354222626064, + "language_loss": 0.98171163, + "learning_rate": 0.0009116008111274899, + "loss": 0.99341309, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.96679688, + "step": 1125, + "time_per_iteration": 3.253286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175423, + "balance_loss_mlp": 1.0798645, + "epoch": 0.21662177760677184, + "flos": 1485762220032.0, + "grad_norm": 0.016361962696647775, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80282342, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.95507812, + "step": 1126, + "time_per_iteration": 4.832986831665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168208, + "balance_loss_mlp": 1.07150567, + "epoch": 0.2168141592920354, + "flos": 888859929600.0, + "grad_norm": 0.027606671666099106, + "language_loss": 0.94760346, + "learning_rate": 0.0009112467358650396, + "loss": 0.9592855, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.96679688, + "step": 1127, + "time_per_iteration": 3.1373836994171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164208, + "balance_loss_mlp": 1.06741047, + "epoch": 0.21700654097729896, + "flos": 547084148736.0, + "grad_norm": 0.025712027239217825, + "language_loss": 0.95734817, + "learning_rate": 0.0009110694587092192, + "loss": 0.96899021, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.96777344, + "step": 1128, + "time_per_iteration": 2.752166986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162506, + "balance_loss_mlp": 1.06580317, + "epoch": 0.21719892266256252, + "flos": 510535368192.0, + "grad_norm": 0.02739880514200537, + "language_loss": 0.95310479, + "learning_rate": 0.0009108920219620815, + "loss": 0.96472991, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.96679688, + "step": 1129, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164125, + "balance_loss_mlp": 1.06742299, + "epoch": 0.21739130434782608, + "flos": 544461566976.0, + "grad_norm": 0.023064586598143682, + "language_loss": 0.97784394, + "learning_rate": 0.0009107144256925133, + "loss": 0.9894852, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.96679688, + "step": 1130, + "time_per_iteration": 2.73559308052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165938, + "balance_loss_mlp": 1.06923568, + "epoch": 0.21758368603308964, + "flos": 617982096384.0, + "grad_norm": 0.027176951765382908, + "language_loss": 0.9233678, + "learning_rate": 0.0009105366699694638, + "loss": 0.93502718, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.96679688, + "step": 1131, + "time_per_iteration": 2.7653839588165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166281, + "balance_loss_mlp": 1.06957853, + "epoch": 0.2177760677183532, + "flos": 636334712832.0, + "grad_norm": 0.021107298895209785, + "language_loss": 0.91459304, + "learning_rate": 0.0009103587548619439, + "loss": 0.92625588, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.96679688, + "step": 1132, + "time_per_iteration": 2.8519365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160184, + "balance_loss_mlp": 1.06367195, + "epoch": 0.2179684494036168, + "flos": 533596661760.0, + "grad_norm": 0.022551614427290693, + "language_loss": 0.95995569, + "learning_rate": 0.0009101806804390261, + "loss": 0.97155756, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.96484375, + "step": 1133, + "time_per_iteration": 2.8218026161193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163206, + "balance_loss_mlp": 1.06664658, + "epoch": 0.21816083108888035, + "flos": 476181471744.0, + "grad_norm": 0.0250418684782295, + "language_loss": 1.00355339, + "learning_rate": 0.0009100024467698453, + "loss": 1.01518536, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.96533203, + "step": 1134, + "time_per_iteration": 2.5639142990112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167151, + "balance_loss_mlp": 1.07059181, + "epoch": 0.2183532127741439, + "flos": 578546219520.0, + "grad_norm": 0.029194142239697657, + "language_loss": 0.95151818, + "learning_rate": 0.0009098240539235981, + "loss": 0.96318972, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.96533203, + "step": 1135, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162362, + "balance_loss_mlp": 1.06565976, + "epoch": 0.21854559445940747, + "flos": 595279369728.0, + "grad_norm": 0.022714398939090653, + "language_loss": 0.96190184, + "learning_rate": 0.0009096455019695423, + "loss": 0.9735254, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.96679688, + "step": 1136, + "time_per_iteration": 2.829479217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166866, + "balance_loss_mlp": 1.06997275, + "epoch": 0.21873797614467103, + "flos": 409549040640.0, + "grad_norm": 0.027737994351600712, + "language_loss": 1.01424551, + "learning_rate": 0.000909466790976998, + "loss": 1.02591419, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.96875, + "step": 1137, + "time_per_iteration": 2.4491164684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165251, + "balance_loss_mlp": 1.06869149, + "epoch": 0.21893035782993459, + "flos": 895654702080.0, + "grad_norm": 0.022710058353260835, + "language_loss": 0.90594929, + "learning_rate": 0.0009092879210153473, + "loss": 0.91760182, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.96533203, + "step": 1138, + "time_per_iteration": 3.155076503753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168627, + "balance_loss_mlp": 1.07192433, + "epoch": 0.21912273951519814, + "flos": 468568233984.0, + "grad_norm": 0.024281064631586205, + "language_loss": 0.97427768, + "learning_rate": 0.0009091088921540333, + "loss": 0.98596388, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.96679688, + "step": 1139, + "time_per_iteration": 2.5309600830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172859, + "balance_loss_mlp": 1.07711029, + "epoch": 0.2193151212004617, + "flos": 1535177407488.0, + "grad_norm": 0.009496329971255709, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76681536, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.95703125, + "step": 1140, + "time_per_iteration": 4.911335229873657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172401, + "balance_loss_mlp": 1.07569873, + "epoch": 0.2195075028857253, + "flos": 592274752512.0, + "grad_norm": 0.033335232647672346, + "language_loss": 0.95078719, + "learning_rate": 0.0009087503580104985, + "loss": 0.96251118, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.96679688, + "step": 1141, + "time_per_iteration": 2.7083888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169701, + "balance_loss_mlp": 1.07295096, + "epoch": 0.21969988457098885, + "flos": 637517749248.0, + "grad_norm": 0.02859165000671714, + "language_loss": 0.90439236, + "learning_rate": 0.0009085708528674728, + "loss": 0.91608942, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.96728516, + "step": 1142, + "time_per_iteration": 2.786891222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162201, + "balance_loss_mlp": 1.06549823, + "epoch": 0.2198922662562524, + "flos": 913859598336.0, + "grad_norm": 0.0328462843269242, + "language_loss": 0.98848528, + "learning_rate": 0.0009083911891031745, + "loss": 1.00010729, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.96679688, + "step": 1143, + "time_per_iteration": 3.1019930839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116483, + "balance_loss_mlp": 1.06793654, + "epoch": 0.22008464794151597, + "flos": 824494241280.0, + "grad_norm": 0.023913565571636344, + "language_loss": 1.01496291, + "learning_rate": 0.0009082113667873553, + "loss": 1.02661121, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.96875, + "step": 1144, + "time_per_iteration": 3.104292869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170855, + "balance_loss_mlp": 1.07405746, + "epoch": 0.22027702962677953, + "flos": 460618622976.0, + "grad_norm": 0.029355186834356364, + "language_loss": 1.00543249, + "learning_rate": 0.0009080313859898283, + "loss": 1.0171411, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.96777344, + "step": 1145, + "time_per_iteration": 2.552457332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170139, + "balance_loss_mlp": 1.07343698, + "epoch": 0.2204694113120431, + "flos": 532287372288.0, + "grad_norm": 0.025362278251747628, + "language_loss": 1.01871562, + "learning_rate": 0.0009078512467804684, + "loss": 1.03041708, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.96679688, + "step": 1146, + "time_per_iteration": 2.6138763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170493, + "balance_loss_mlp": 1.07379043, + "epoch": 0.22066179299730665, + "flos": 523686481920.0, + "grad_norm": 0.02553067563602684, + "language_loss": 1.00136042, + "learning_rate": 0.0009076709492292119, + "loss": 1.01306534, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.96679688, + "step": 1147, + "time_per_iteration": 2.6107985973358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163104, + "balance_loss_mlp": 1.0664016, + "epoch": 0.2208541746825702, + "flos": 547505115648.0, + "grad_norm": 0.02505349531569444, + "language_loss": 0.99364072, + "learning_rate": 0.0009074904934060562, + "loss": 1.00527167, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.96679688, + "step": 1148, + "time_per_iteration": 2.680250644683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166008, + "balance_loss_mlp": 1.06873322, + "epoch": 0.22104655636783377, + "flos": 710059358208.0, + "grad_norm": 0.023468083856487864, + "language_loss": 0.93112767, + "learning_rate": 0.0009073098793810607, + "loss": 0.94278765, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.97265625, + "step": 1149, + "time_per_iteration": 2.9064676761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165673, + "balance_loss_mlp": 1.06882739, + "epoch": 0.22123893805309736, + "flos": 585964073472.0, + "grad_norm": 0.028202445852463846, + "language_loss": 0.98436809, + "learning_rate": 0.000907129107224346, + "loss": 0.99602491, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.96826172, + "step": 1150, + "time_per_iteration": 2.670436382293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165906, + "balance_loss_mlp": 1.06901312, + "epoch": 0.22143131973836092, + "flos": 493250995200.0, + "grad_norm": 0.02267098136900654, + "language_loss": 0.95673937, + "learning_rate": 0.0009069481770060939, + "loss": 0.96839839, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.96875, + "step": 1151, + "time_per_iteration": 2.650136947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167632, + "balance_loss_mlp": 1.07092977, + "epoch": 0.22162370142362448, + "flos": 1081467623424.0, + "grad_norm": 0.023887201965423828, + "language_loss": 0.92357147, + "learning_rate": 0.000906767088796548, + "loss": 0.93524778, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.96679688, + "step": 1152, + "time_per_iteration": 3.4331767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174048, + "balance_loss_mlp": 1.07734585, + "epoch": 0.22181608310888803, + "flos": 493511506944.0, + "grad_norm": 0.021211000774135545, + "language_loss": 0.94297695, + "learning_rate": 0.0009065858426660127, + "loss": 0.9547174, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.96679688, + "step": 1153, + "time_per_iteration": 2.6492207050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171336, + "balance_loss_mlp": 1.07458591, + "epoch": 0.2220084647941516, + "flos": 725324765184.0, + "grad_norm": 0.02806046891368227, + "language_loss": 0.95655924, + "learning_rate": 0.0009064044386848543, + "loss": 0.96827257, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.96728516, + "step": 1154, + "time_per_iteration": 2.9135258197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116775, + "balance_loss_mlp": 1.07090425, + "epoch": 0.22220084647941515, + "flos": 490244376576.0, + "grad_norm": 0.029776005734579798, + "language_loss": 1.00600004, + "learning_rate": 0.0009062228769234997, + "loss": 1.01767755, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.96826172, + "step": 1155, + "time_per_iteration": 2.597781181335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171214, + "balance_loss_mlp": 1.07451141, + "epoch": 0.2223932281646787, + "flos": 537295492608.0, + "grad_norm": 0.030445586519746, + "language_loss": 0.93354964, + "learning_rate": 0.0009060411574524376, + "loss": 0.94526184, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.96679688, + "step": 1156, + "time_per_iteration": 2.7325634956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168314, + "balance_loss_mlp": 1.07151604, + "epoch": 0.22258560984994227, + "flos": 932967553536.0, + "grad_norm": 0.0275078677514356, + "language_loss": 0.98614538, + "learning_rate": 0.0009058592803422178, + "loss": 0.99782854, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.96777344, + "step": 1157, + "time_per_iteration": 3.156981945037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169861, + "balance_loss_mlp": 1.0739212, + "epoch": 0.22277799153520586, + "flos": 1202395286016.0, + "grad_norm": 0.00950920896526599, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79880148, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.95898438, + "step": 1158, + "time_per_iteration": 4.7935662269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.07421494, + "epoch": 0.22297037322046942, + "flos": 502316513280.0, + "grad_norm": 0.05502374006765337, + "language_loss": 0.97024429, + "learning_rate": 0.00090549505348681, + "loss": 0.98195159, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.96484375, + "step": 1159, + "time_per_iteration": 2.579418659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167768, + "balance_loss_mlp": 1.07135153, + "epoch": 0.22316275490573298, + "flos": 754112587776.0, + "grad_norm": 0.025312842068973822, + "language_loss": 0.9244132, + "learning_rate": 0.0009053127038830275, + "loss": 0.93609083, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.96386719, + "step": 1160, + "time_per_iteration": 2.970240592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169788, + "balance_loss_mlp": 1.07346714, + "epoch": 0.22335513659099654, + "flos": 515804000256.0, + "grad_norm": 0.02702757021011719, + "language_loss": 0.97474223, + "learning_rate": 0.000905130196922898, + "loss": 0.98644012, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.96289062, + "step": 1161, + "time_per_iteration": 2.558567762374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175493, + "balance_loss_mlp": 1.07917213, + "epoch": 0.2235475182762601, + "flos": 485507501568.0, + "grad_norm": 0.024760780359754056, + "language_loss": 0.947945, + "learning_rate": 0.0009049475326772769, + "loss": 0.95969993, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.96289062, + "step": 1162, + "time_per_iteration": 2.5948867797851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168008, + "balance_loss_mlp": 1.0716871, + "epoch": 0.22373989996152366, + "flos": 471067290624.0, + "grad_norm": 0.0243609738761747, + "language_loss": 0.92091036, + "learning_rate": 0.0009047647112170811, + "loss": 0.93259048, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.96289062, + "step": 1163, + "time_per_iteration": 2.7958250045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165002, + "balance_loss_mlp": 1.06868088, + "epoch": 0.22393228164678722, + "flos": 1273017807360.0, + "grad_norm": 0.0269563070164892, + "language_loss": 0.98098505, + "learning_rate": 0.0009045817326132876, + "loss": 0.99263507, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.96289062, + "step": 1164, + "time_per_iteration": 3.64853835105896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165462, + "balance_loss_mlp": 1.06914091, + "epoch": 0.22412466333205078, + "flos": 597467523072.0, + "grad_norm": 0.02771003139242203, + "language_loss": 0.94602239, + "learning_rate": 0.0009043985969369357, + "loss": 0.95767695, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.96289062, + "step": 1165, + "time_per_iteration": 2.8231425285339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175209, + "balance_loss_mlp": 1.07860184, + "epoch": 0.22431704501731436, + "flos": 609630984192.0, + "grad_norm": 0.02516811505749033, + "language_loss": 0.93514198, + "learning_rate": 0.0009042153042591245, + "loss": 0.94689411, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.96582031, + "step": 1166, + "time_per_iteration": 2.755671501159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174563, + "balance_loss_mlp": 1.07819414, + "epoch": 0.22450942670257792, + "flos": 908106872832.0, + "grad_norm": 0.024247493396408124, + "language_loss": 0.93277276, + "learning_rate": 0.0009040318546510146, + "loss": 0.94451833, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.96337891, + "step": 1167, + "time_per_iteration": 3.126707077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174214, + "balance_loss_mlp": 1.07770181, + "epoch": 0.22470180838784148, + "flos": 566380756992.0, + "grad_norm": 0.02335770706345326, + "language_loss": 0.94522464, + "learning_rate": 0.0009038482481838275, + "loss": 0.95696682, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.96484375, + "step": 1168, + "time_per_iteration": 2.6482362747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171371, + "balance_loss_mlp": 1.07485878, + "epoch": 0.22489419007310504, + "flos": 835917100032.0, + "grad_norm": 0.021740410096357694, + "language_loss": 0.9467479, + "learning_rate": 0.0009036644849288455, + "loss": 0.95846164, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.96484375, + "step": 1169, + "time_per_iteration": 3.0959203243255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168691, + "balance_loss_mlp": 1.07217908, + "epoch": 0.2250865717583686, + "flos": 582138989568.0, + "grad_norm": 0.028400846177611044, + "language_loss": 0.95971251, + "learning_rate": 0.0009034805649574118, + "loss": 0.97139943, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.96484375, + "step": 1170, + "time_per_iteration": 2.65209698677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171761, + "balance_loss_mlp": 1.07515407, + "epoch": 0.22527895344363216, + "flos": 601670639616.0, + "grad_norm": 0.021879369323455276, + "language_loss": 0.92857611, + "learning_rate": 0.0009032964883409308, + "loss": 0.94029367, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.96582031, + "step": 1171, + "time_per_iteration": 2.8586626052856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175461, + "balance_loss_mlp": 1.07990265, + "epoch": 0.22547133512889572, + "flos": 1443731959296.0, + "grad_norm": 0.011387534292379292, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74225998, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.95507812, + "step": 1172, + "time_per_iteration": 4.9882895946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171774, + "balance_loss_mlp": 1.07526255, + "epoch": 0.22566371681415928, + "flos": 491585866752.0, + "grad_norm": 0.025801800464723818, + "language_loss": 0.97062689, + "learning_rate": 0.0009029278654587462, + "loss": 0.98234463, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.96484375, + "step": 1173, + "time_per_iteration": 2.595419406890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171429, + "balance_loss_mlp": 1.07491696, + "epoch": 0.22585609849942284, + "flos": 605751505920.0, + "grad_norm": 0.02576863859493135, + "language_loss": 0.92400688, + "learning_rate": 0.0009027433193361548, + "loss": 0.93572116, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.96484375, + "step": 1174, + "time_per_iteration": 2.738267183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117476, + "balance_loss_mlp": 1.07824779, + "epoch": 0.22604848018468643, + "flos": 636727481856.0, + "grad_norm": 0.028952390928102957, + "language_loss": 0.97668821, + "learning_rate": 0.00090255861685474, + "loss": 0.98843575, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.96484375, + "step": 1175, + "time_per_iteration": 2.7286014556884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117152, + "balance_loss_mlp": 1.07481766, + "epoch": 0.22624086186995, + "flos": 480844486656.0, + "grad_norm": 0.027877026454804697, + "language_loss": 1.02366519, + "learning_rate": 0.0009023737580862095, + "loss": 1.03538048, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.96679688, + "step": 1176, + "time_per_iteration": 2.553281307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170774, + "balance_loss_mlp": 1.07388091, + "epoch": 0.22643324355521355, + "flos": 496806835200.0, + "grad_norm": 0.02249634447584531, + "language_loss": 0.90840948, + "learning_rate": 0.0009021887431023321, + "loss": 0.92011726, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.96875, + "step": 1177, + "time_per_iteration": 2.5862364768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172189, + "balance_loss_mlp": 1.07539093, + "epoch": 0.2266256252404771, + "flos": 562683927552.0, + "grad_norm": 0.02041789434880362, + "language_loss": 0.95725513, + "learning_rate": 0.0009020035719749369, + "loss": 0.96897697, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.96777344, + "step": 1178, + "time_per_iteration": 2.7553560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176357, + "balance_loss_mlp": 1.0796541, + "epoch": 0.22681800692574067, + "flos": 581032541184.0, + "grad_norm": 0.026733278329428435, + "language_loss": 0.89533567, + "learning_rate": 0.0009018182447759136, + "loss": 0.90709925, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.96679688, + "step": 1179, + "time_per_iteration": 3.012024402618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175508, + "balance_loss_mlp": 1.07904434, + "epoch": 0.22701038861100423, + "flos": 741465033216.0, + "grad_norm": 0.025064804828048133, + "language_loss": 0.90941453, + "learning_rate": 0.0009016327615772126, + "loss": 0.92116958, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.96435547, + "step": 1180, + "time_per_iteration": 2.969684600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172378, + "balance_loss_mlp": 1.07577109, + "epoch": 0.2272027702962678, + "flos": 578305173504.0, + "grad_norm": 0.036813558231106436, + "language_loss": 1.00164366, + "learning_rate": 0.0009014471224508451, + "loss": 1.01336741, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.96582031, + "step": 1181, + "time_per_iteration": 2.664487361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173976, + "balance_loss_mlp": 1.0774641, + "epoch": 0.22739515198153135, + "flos": 545290765824.0, + "grad_norm": 0.028585613124224512, + "language_loss": 0.95647848, + "learning_rate": 0.0009012613274688823, + "loss": 0.96821827, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.96484375, + "step": 1182, + "time_per_iteration": 2.647608518600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177492, + "balance_loss_mlp": 1.08078945, + "epoch": 0.22758753366679493, + "flos": 441091702272.0, + "grad_norm": 0.02755397132508441, + "language_loss": 1.00651419, + "learning_rate": 0.0009010753767034565, + "loss": 1.01828909, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.96679688, + "step": 1183, + "time_per_iteration": 2.528580904006958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176952, + "balance_loss_mlp": 1.08053601, + "epoch": 0.2277799153520585, + "flos": 730823709696.0, + "grad_norm": 0.024484618665474616, + "language_loss": 0.90051508, + "learning_rate": 0.0009008892702267599, + "loss": 0.91228461, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.96386719, + "step": 1184, + "time_per_iteration": 2.990344285964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_mlp": 1.08100891, + "epoch": 0.22797229703732205, + "flos": 527913067008.0, + "grad_norm": 0.030622621699729128, + "language_loss": 1.01022232, + "learning_rate": 0.0009007030081110457, + "loss": 1.02199566, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.96289062, + "step": 1185, + "time_per_iteration": 2.5795140266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172726, + "balance_loss_mlp": 1.07592821, + "epoch": 0.2281646787225856, + "flos": 536520688128.0, + "grad_norm": 0.026616575931436976, + "language_loss": 0.93079567, + "learning_rate": 0.000900516590428627, + "loss": 0.942523, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.96777344, + "step": 1186, + "time_per_iteration": 2.6647558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.07628405, + "epoch": 0.22835706040784917, + "flos": 542477529600.0, + "grad_norm": 0.02522496809839962, + "language_loss": 0.99033505, + "learning_rate": 0.0009003300172518778, + "loss": 1.00206637, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.96826172, + "step": 1187, + "time_per_iteration": 2.7046303749084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177624, + "balance_loss_mlp": 1.08073056, + "epoch": 0.22854944209311273, + "flos": 792004859904.0, + "grad_norm": 0.026332453075710083, + "language_loss": 0.94325852, + "learning_rate": 0.0009001432886532321, + "loss": 0.95503473, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.96875, + "step": 1188, + "time_per_iteration": 2.9583094120025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179036, + "balance_loss_mlp": 1.08233392, + "epoch": 0.2287418237783763, + "flos": 470215898112.0, + "grad_norm": 0.025775869396212594, + "language_loss": 0.97465944, + "learning_rate": 0.0008999564047051843, + "loss": 0.98644984, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.96679688, + "step": 1189, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178388, + "balance_loss_mlp": 1.08154237, + "epoch": 0.22893420546363985, + "flos": 469004663808.0, + "grad_norm": 0.023763579929190374, + "language_loss": 0.94691694, + "learning_rate": 0.0008997693654802894, + "loss": 0.95870078, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.96826172, + "step": 1190, + "time_per_iteration": 2.6276731491088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178257, + "balance_loss_mlp": 1.08145857, + "epoch": 0.22912658714890344, + "flos": 627401452032.0, + "grad_norm": 0.023724149848154047, + "language_loss": 0.95182133, + "learning_rate": 0.0008995821710511625, + "loss": 0.96360391, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.96777344, + "step": 1191, + "time_per_iteration": 2.756840705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117993, + "balance_loss_mlp": 1.08308399, + "epoch": 0.229318968834167, + "flos": 504020573184.0, + "grad_norm": 0.024708694220473774, + "language_loss": 0.93247074, + "learning_rate": 0.0008993948214904786, + "loss": 0.94427001, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.96826172, + "step": 1192, + "time_per_iteration": 2.577340602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190514, + "balance_loss_mlp": 1.09533691, + "epoch": 0.22951135051943056, + "flos": 1377713877504.0, + "grad_norm": 0.021264094300491608, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79612726, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.95117188, + "step": 1193, + "time_per_iteration": 4.850237607955933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179442, + "balance_loss_mlp": 1.08316851, + "epoch": 0.22970373220469412, + "flos": 645549952512.0, + "grad_norm": 0.02667568465905087, + "language_loss": 0.92540175, + "learning_rate": 0.0008990196572654427, + "loss": 0.93719625, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.96240234, + "step": 1194, + "time_per_iteration": 2.8638381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180858, + "balance_loss_mlp": 1.08453715, + "epoch": 0.22989611388995768, + "flos": 501272464896.0, + "grad_norm": 0.02416134539694475, + "language_loss": 0.95937514, + "learning_rate": 0.0008988318427467426, + "loss": 0.97118378, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.96289062, + "step": 1195, + "time_per_iteration": 2.7063868045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182741, + "balance_loss_mlp": 1.08589542, + "epoch": 0.23008849557522124, + "flos": 1098333030912.0, + "grad_norm": 0.02922856270819412, + "language_loss": 0.9667449, + "learning_rate": 0.0008986438733877887, + "loss": 0.97857237, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.96826172, + "step": 1196, + "time_per_iteration": 3.4508113861083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.08043683, + "epoch": 0.2302808772604848, + "flos": 684992560128.0, + "grad_norm": 0.022228440588834414, + "language_loss": 0.91545051, + "learning_rate": 0.0008984557492615576, + "loss": 0.92721808, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.96289062, + "step": 1197, + "time_per_iteration": 2.93611741065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08269298, + "epoch": 0.23047325894574835, + "flos": 529960230912.0, + "grad_norm": 0.026499525382426087, + "language_loss": 0.99148774, + "learning_rate": 0.0008982674704410854, + "loss": 1.0032779, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.96289062, + "step": 1198, + "time_per_iteration": 2.7032008171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180823, + "balance_loss_mlp": 1.08450174, + "epoch": 0.23066564063101191, + "flos": 684126431232.0, + "grad_norm": 0.025326379221325218, + "language_loss": 0.86113322, + "learning_rate": 0.0008980790369994682, + "loss": 0.87294143, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.96289062, + "step": 1199, + "time_per_iteration": 2.9629056453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173246, + "balance_loss_mlp": 1.07673466, + "epoch": 0.2308580223162755, + "flos": 559631646720.0, + "grad_norm": 0.02469990042405053, + "language_loss": 0.95889735, + "learning_rate": 0.000897890449009863, + "loss": 0.97062981, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.96484375, + "step": 1200, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178191, + "balance_loss_mlp": 1.08167911, + "epoch": 0.23105040400153906, + "flos": 556729087488.0, + "grad_norm": 0.021551459012756572, + "language_loss": 0.97633696, + "learning_rate": 0.0008977017065454853, + "loss": 0.98811877, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.96484375, + "step": 1201, + "time_per_iteration": 2.6586263179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176954, + "balance_loss_mlp": 1.08048964, + "epoch": 0.23124278568680262, + "flos": 706049624064.0, + "grad_norm": 0.025666519973580538, + "language_loss": 0.89963996, + "learning_rate": 0.0008975128096796121, + "loss": 0.9114095, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.96435547, + "step": 1202, + "time_per_iteration": 2.8599958419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175929, + "balance_loss_mlp": 1.07989419, + "epoch": 0.23143516737206618, + "flos": 613968359424.0, + "grad_norm": 0.02791489713026627, + "language_loss": 0.96485001, + "learning_rate": 0.0008973237584855794, + "loss": 0.97660929, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.95996094, + "step": 1203, + "time_per_iteration": 2.8814125061035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117493, + "balance_loss_mlp": 1.07903779, + "epoch": 0.23162754905732974, + "flos": 390095980032.0, + "grad_norm": 0.02381480195735972, + "language_loss": 0.91340852, + "learning_rate": 0.0008971345530367832, + "loss": 0.92515785, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.95849609, + "step": 1204, + "time_per_iteration": 2.513951301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176133, + "balance_loss_mlp": 1.08024144, + "epoch": 0.2318199307425933, + "flos": 668969086464.0, + "grad_norm": 0.024943516104182908, + "language_loss": 0.94778013, + "learning_rate": 0.0008969451934066799, + "loss": 0.95954144, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.95849609, + "step": 1205, + "time_per_iteration": 2.80454421043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173068, + "balance_loss_mlp": 1.07712853, + "epoch": 0.23201231242785686, + "flos": 667627596288.0, + "grad_norm": 0.029617322009159303, + "language_loss": 0.92493355, + "learning_rate": 0.0008967556796687854, + "loss": 0.93666422, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.95898438, + "step": 1206, + "time_per_iteration": 2.89932918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173146, + "balance_loss_mlp": 1.07720602, + "epoch": 0.23220469411312042, + "flos": 750094121472.0, + "grad_norm": 0.024264467100448908, + "language_loss": 0.94343531, + "learning_rate": 0.0008965660118966752, + "loss": 0.95516682, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.95898438, + "step": 1207, + "time_per_iteration": 2.9768385887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08307481, + "epoch": 0.232397075798384, + "flos": 668261411328.0, + "grad_norm": 0.02512248807118796, + "language_loss": 0.97498, + "learning_rate": 0.0008963761901639851, + "loss": 0.98677015, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.95898438, + "step": 1208, + "time_per_iteration": 2.8175342082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177539, + "balance_loss_mlp": 1.081599, + "epoch": 0.23258945748364757, + "flos": 611345777664.0, + "grad_norm": 0.025244332610569246, + "language_loss": 0.93465042, + "learning_rate": 0.0008961862145444103, + "loss": 0.9464258, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.95898438, + "step": 1209, + "time_per_iteration": 2.707583427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117491, + "balance_loss_mlp": 1.07901847, + "epoch": 0.23278183916891113, + "flos": 490672074240.0, + "grad_norm": 0.025133767455437463, + "language_loss": 0.96175104, + "learning_rate": 0.0008959960851117059, + "loss": 0.97350019, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.95849609, + "step": 1210, + "time_per_iteration": 2.5783777236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174943, + "balance_loss_mlp": 1.07895589, + "epoch": 0.23297422085417469, + "flos": 512673856512.0, + "grad_norm": 0.027877077505007057, + "language_loss": 0.94183683, + "learning_rate": 0.0008958058019396868, + "loss": 0.95358628, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.95947266, + "step": 1211, + "time_per_iteration": 2.7695388793945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118178, + "balance_loss_mlp": 1.08560216, + "epoch": 0.23316660253943824, + "flos": 547531312128.0, + "grad_norm": 0.0259067341075638, + "language_loss": 0.95459378, + "learning_rate": 0.0008956153651022274, + "loss": 0.96641153, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.96142578, + "step": 1212, + "time_per_iteration": 2.7088377475738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.08181643, + "epoch": 0.2333589842247018, + "flos": 511288705536.0, + "grad_norm": 0.023917692799316066, + "language_loss": 0.93208623, + "learning_rate": 0.0008954247746732618, + "loss": 0.94386959, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.96484375, + "step": 1213, + "time_per_iteration": 2.6319668292999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172909, + "balance_loss_mlp": 1.0766834, + "epoch": 0.23355136590996536, + "flos": 664406128128.0, + "grad_norm": 0.02356648487739955, + "language_loss": 0.98858505, + "learning_rate": 0.0008952340307267837, + "loss": 1.00031424, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.96191406, + "step": 1214, + "time_per_iteration": 2.891026735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172334, + "balance_loss_mlp": 1.07629859, + "epoch": 0.23374374759522892, + "flos": 509465123328.0, + "grad_norm": 0.027978905734491046, + "language_loss": 0.94424212, + "learning_rate": 0.0008950431333368468, + "loss": 0.95596552, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.95996094, + "step": 1215, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173288, + "balance_loss_mlp": 1.07730114, + "epoch": 0.2339361292804925, + "flos": 1296428209152.0, + "grad_norm": 0.026145796218117214, + "language_loss": 0.94705772, + "learning_rate": 0.0008948520825775634, + "loss": 0.95879066, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.95947266, + "step": 1216, + "time_per_iteration": 3.6343605518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174216, + "balance_loss_mlp": 1.07808566, + "epoch": 0.23412851096575607, + "flos": 707176264704.0, + "grad_norm": 0.02578801546488365, + "language_loss": 0.93516719, + "learning_rate": 0.0008946608785231067, + "loss": 0.94690937, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.9609375, + "step": 1217, + "time_per_iteration": 2.8923676013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174557, + "balance_loss_mlp": 1.07842624, + "epoch": 0.23432089265101963, + "flos": 439174794240.0, + "grad_norm": 0.024987781095147748, + "language_loss": 0.94467312, + "learning_rate": 0.0008944695212477084, + "loss": 0.95641869, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.9609375, + "step": 1218, + "time_per_iteration": 2.47641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176273, + "balance_loss_mlp": 1.08028615, + "epoch": 0.2345132743362832, + "flos": 481914731520.0, + "grad_norm": 0.02187031641141441, + "language_loss": 0.9320662, + "learning_rate": 0.0008942780108256599, + "loss": 0.94382894, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.95947266, + "step": 1219, + "time_per_iteration": 2.585204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176397, + "balance_loss_mlp": 1.07993269, + "epoch": 0.23470565602154675, + "flos": 412340809728.0, + "grad_norm": 0.02314471919225668, + "language_loss": 0.95930934, + "learning_rate": 0.0008940863473313121, + "loss": 0.97107327, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.96435547, + "step": 1220, + "time_per_iteration": 2.461904764175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174627, + "balance_loss_mlp": 1.07811534, + "epoch": 0.2348980377068103, + "flos": 546499998720.0, + "grad_norm": 0.029389735884218435, + "language_loss": 0.99771547, + "learning_rate": 0.0008938945308390756, + "loss": 1.00946164, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.96484375, + "step": 1221, + "time_per_iteration": 2.6403567790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179462, + "balance_loss_mlp": 1.08295047, + "epoch": 0.23509041939207387, + "flos": 576842159616.0, + "grad_norm": 0.023502241620232074, + "language_loss": 0.96374851, + "learning_rate": 0.00089370256142342, + "loss": 0.97554314, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.96484375, + "step": 1222, + "time_per_iteration": 2.7148585319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178637, + "balance_loss_mlp": 1.08198178, + "epoch": 0.23528280107733743, + "flos": 589947611136.0, + "grad_norm": 0.022852016666186668, + "language_loss": 0.93682569, + "learning_rate": 0.0008935104391588746, + "loss": 0.94861209, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.96630859, + "step": 1223, + "time_per_iteration": 2.7302677631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179799, + "balance_loss_mlp": 1.08338237, + "epoch": 0.235475182762601, + "flos": 824856811008.0, + "grad_norm": 0.02091323276417278, + "language_loss": 0.91087663, + "learning_rate": 0.0008933181641200276, + "loss": 0.9226746, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.96386719, + "step": 1224, + "time_per_iteration": 3.120337724685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183017, + "balance_loss_mlp": 1.08650565, + "epoch": 0.23566756444786457, + "flos": 681366862848.0, + "grad_norm": 0.027323039985709546, + "language_loss": 0.94355077, + "learning_rate": 0.0008931257363815271, + "loss": 0.95538092, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.96484375, + "step": 1225, + "time_per_iteration": 2.893202543258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178928, + "balance_loss_mlp": 1.08251154, + "epoch": 0.23585994613312813, + "flos": 703134329856.0, + "grad_norm": 0.022860929740297704, + "language_loss": 0.96590424, + "learning_rate": 0.0008929331560180798, + "loss": 0.97769356, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.96386719, + "step": 1226, + "time_per_iteration": 2.913858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176734, + "balance_loss_mlp": 1.08017468, + "epoch": 0.2360523278183917, + "flos": 525195158016.0, + "grad_norm": 0.02227272458953822, + "language_loss": 0.99194574, + "learning_rate": 0.0008927404231044525, + "loss": 1.00371313, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.96533203, + "step": 1227, + "time_per_iteration": 2.7194507122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175869, + "balance_loss_mlp": 1.07921374, + "epoch": 0.23624470950365525, + "flos": 525442934784.0, + "grad_norm": 0.02071878597098496, + "language_loss": 0.89412713, + "learning_rate": 0.0008925475377154703, + "loss": 0.90588582, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.96630859, + "step": 1228, + "time_per_iteration": 2.742506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175669, + "balance_loss_mlp": 1.07896686, + "epoch": 0.2364370911889188, + "flos": 597960348672.0, + "grad_norm": 0.023166098266421232, + "language_loss": 0.90900964, + "learning_rate": 0.0008923544999260183, + "loss": 0.92076635, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.96679688, + "step": 1229, + "time_per_iteration": 2.809842109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177841, + "balance_loss_mlp": 1.08113885, + "epoch": 0.23662947287418237, + "flos": 758171986944.0, + "grad_norm": 0.02725464196132968, + "language_loss": 1.00227833, + "learning_rate": 0.00089216130981104, + "loss": 1.0140568, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.96679688, + "step": 1230, + "time_per_iteration": 3.0096282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178297, + "balance_loss_mlp": 1.08159423, + "epoch": 0.23682185455944593, + "flos": 547207673856.0, + "grad_norm": 0.024713012089740163, + "language_loss": 0.91807795, + "learning_rate": 0.000891967967445539, + "loss": 0.92986089, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.96679688, + "step": 1231, + "time_per_iteration": 2.7001702785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185987, + "balance_loss_mlp": 1.08928442, + "epoch": 0.2370142362447095, + "flos": 663522534912.0, + "grad_norm": 0.02265672956199411, + "language_loss": 0.96654546, + "learning_rate": 0.0008917744729045772, + "loss": 0.97840536, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.96679688, + "step": 1232, + "time_per_iteration": 2.8703036308288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184505, + "balance_loss_mlp": 1.08789778, + "epoch": 0.23720661792997308, + "flos": 684911969280.0, + "grad_norm": 0.02632145570598456, + "language_loss": 0.93737417, + "learning_rate": 0.0008915808262632757, + "loss": 0.94921923, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.96582031, + "step": 1233, + "time_per_iteration": 2.839534044265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185605, + "balance_loss_mlp": 1.08928347, + "epoch": 0.23739899961523664, + "flos": 560022414336.0, + "grad_norm": 0.027552675935845497, + "language_loss": 1.01508975, + "learning_rate": 0.0008913870275968148, + "loss": 1.02694583, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.96289062, + "step": 1234, + "time_per_iteration": 2.7176129817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182161, + "balance_loss_mlp": 1.08545852, + "epoch": 0.2375913813005002, + "flos": 891163602432.0, + "grad_norm": 0.02404650352203449, + "language_loss": 0.9583261, + "learning_rate": 0.0008911930769804342, + "loss": 0.97014773, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.96679688, + "step": 1235, + "time_per_iteration": 3.244257688522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179697, + "balance_loss_mlp": 1.08289862, + "epoch": 0.23778376298576376, + "flos": 642365414400.0, + "grad_norm": 0.020226791074773265, + "language_loss": 0.99461335, + "learning_rate": 0.0008909989744894318, + "loss": 1.00641024, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.96777344, + "step": 1236, + "time_per_iteration": 2.8618855476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179012, + "balance_loss_mlp": 1.08230948, + "epoch": 0.23797614467102732, + "flos": 617945166336.0, + "grad_norm": 0.025060145140963254, + "language_loss": 0.91887248, + "learning_rate": 0.0008908047201991649, + "loss": 0.93066257, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.96679688, + "step": 1237, + "time_per_iteration": 2.7335665225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177715, + "balance_loss_mlp": 1.08120298, + "epoch": 0.23816852635629088, + "flos": 625463076864.0, + "grad_norm": 0.02188809519195417, + "language_loss": 0.92642158, + "learning_rate": 0.0008906103141850502, + "loss": 0.93819869, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.96484375, + "step": 1238, + "time_per_iteration": 2.9244723320007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178141, + "balance_loss_mlp": 1.0816294, + "epoch": 0.23836090804155444, + "flos": 522440318976.0, + "grad_norm": 0.025638098136730073, + "language_loss": 0.97356987, + "learning_rate": 0.0008904157565225621, + "loss": 0.98535126, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.96484375, + "step": 1239, + "time_per_iteration": 2.6046018600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186867, + "balance_loss_mlp": 1.09059334, + "epoch": 0.238553289726818, + "flos": 1155854281728.0, + "grad_norm": 0.0279922632366243, + "language_loss": 0.91224372, + "learning_rate": 0.000890221047287235, + "loss": 0.92411238, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.96240234, + "step": 1240, + "time_per_iteration": 3.503387928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.09512079, + "epoch": 0.23874567141208156, + "flos": 500909895168.0, + "grad_norm": 0.02294407067471098, + "language_loss": 0.98687088, + "learning_rate": 0.0008900261865546615, + "loss": 0.99878532, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.96289062, + "step": 1241, + "time_per_iteration": 2.6329948902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188291, + "balance_loss_mlp": 1.09197009, + "epoch": 0.23893805309734514, + "flos": 558049110528.0, + "grad_norm": 0.02727719764566138, + "language_loss": 0.96105886, + "learning_rate": 0.0008898311744004936, + "loss": 0.97294176, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.96289062, + "step": 1242, + "time_per_iteration": 2.6852729320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011866, + "balance_loss_mlp": 1.0902791, + "epoch": 0.2391304347826087, + "flos": 550316350464.0, + "grad_norm": 0.023767912183342704, + "language_loss": 0.95555472, + "learning_rate": 0.0008896360109004414, + "loss": 0.9674207, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.96289062, + "step": 1243, + "time_per_iteration": 2.6607675552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181953, + "balance_loss_mlp": 1.08558464, + "epoch": 0.23932281646787226, + "flos": 517078361088.0, + "grad_norm": 0.022492500831292953, + "language_loss": 0.92156398, + "learning_rate": 0.0008894406961302742, + "loss": 0.93338358, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.96337891, + "step": 1244, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180796, + "balance_loss_mlp": 1.0844276, + "epoch": 0.23951519815313582, + "flos": 745001407488.0, + "grad_norm": 0.0220414301985699, + "language_loss": 0.9171226, + "learning_rate": 0.0008892452301658201, + "loss": 0.92893052, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.96337891, + "step": 1245, + "time_per_iteration": 2.987859010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189345, + "balance_loss_mlp": 1.09302354, + "epoch": 0.23970757983839938, + "flos": 555174749184.0, + "grad_norm": 0.02624868476300941, + "language_loss": 0.92775297, + "learning_rate": 0.0008890496130829653, + "loss": 0.93964636, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.96289062, + "step": 1246, + "time_per_iteration": 2.7285211086273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011891, + "balance_loss_mlp": 1.09287417, + "epoch": 0.23989996152366294, + "flos": 481617289728.0, + "grad_norm": 0.024405638758005322, + "language_loss": 0.93939734, + "learning_rate": 0.0008888538449576555, + "loss": 0.95128834, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.96191406, + "step": 1247, + "time_per_iteration": 2.603447675704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181648, + "balance_loss_mlp": 1.08532703, + "epoch": 0.2400923432089265, + "flos": 486280304640.0, + "grad_norm": 0.02551404288502155, + "language_loss": 0.9456799, + "learning_rate": 0.0008886579258658944, + "loss": 0.9574964, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.96289062, + "step": 1248, + "time_per_iteration": 2.6195995807647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183672, + "balance_loss_mlp": 1.08735096, + "epoch": 0.24028472489419006, + "flos": 624792331776.0, + "grad_norm": 0.02192042043345247, + "language_loss": 0.93244678, + "learning_rate": 0.0008884618558837446, + "loss": 0.94428349, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.96289062, + "step": 1249, + "time_per_iteration": 2.830350399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187022, + "balance_loss_mlp": 1.09113026, + "epoch": 0.24047710657945365, + "flos": 602808013824.0, + "grad_norm": 0.023766863499936387, + "language_loss": 0.96457344, + "learning_rate": 0.0008882656350873273, + "loss": 0.97644365, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.95849609, + "step": 1250, + "time_per_iteration": 2.8691956996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119127, + "balance_loss_mlp": 1.09547377, + "epoch": 0.2406694882647172, + "flos": 843000582144.0, + "grad_norm": 0.03001641023469985, + "language_loss": 1.00300837, + "learning_rate": 0.0008880692635528219, + "loss": 1.01492119, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.95751953, + "step": 1251, + "time_per_iteration": 3.066152572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187351, + "balance_loss_mlp": 1.09155416, + "epoch": 0.24086186994998077, + "flos": 528134647296.0, + "grad_norm": 0.026461260661865858, + "language_loss": 0.98557454, + "learning_rate": 0.0008878727413564669, + "loss": 0.99744809, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.95751953, + "step": 1252, + "time_per_iteration": 2.7665653228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.11519623, + "epoch": 0.24105425163524433, + "flos": 1341459262464.0, + "grad_norm": 0.018061169603452644, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81344825, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.93945312, + "step": 1253, + "time_per_iteration": 4.899695634841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182732, + "balance_loss_mlp": 1.08679259, + "epoch": 0.24124663332050789, + "flos": 615227257344.0, + "grad_norm": 0.02599071752574661, + "language_loss": 0.90657973, + "learning_rate": 0.0008874792452834528, + "loss": 0.91840708, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.95898438, + "step": 1254, + "time_per_iteration": 2.7407760620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179855, + "balance_loss_mlp": 1.08401072, + "epoch": 0.24143901500577145, + "flos": 576592381440.0, + "grad_norm": 0.0285281411485809, + "language_loss": 0.99380314, + "learning_rate": 0.0008872822715595626, + "loss": 1.00560164, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.95800781, + "step": 1255, + "time_per_iteration": 2.7094287872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176059, + "balance_loss_mlp": 1.08007157, + "epoch": 0.241631396691035, + "flos": 496146823680.0, + "grad_norm": 0.026934202036951318, + "language_loss": 0.98012596, + "learning_rate": 0.0008870851474793598, + "loss": 0.9918865, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.95947266, + "step": 1256, + "time_per_iteration": 2.5717930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180992, + "balance_loss_mlp": 1.08500445, + "epoch": 0.24182377837629856, + "flos": 637396225536.0, + "grad_norm": 0.02721147411023071, + "language_loss": 0.97604549, + "learning_rate": 0.0008868878731193752, + "loss": 0.98785543, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.95947266, + "step": 1257, + "time_per_iteration": 2.835613965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180736, + "balance_loss_mlp": 1.08460534, + "epoch": 0.24201616006156215, + "flos": 516349218816.0, + "grad_norm": 0.023847715865297152, + "language_loss": 0.9613235, + "learning_rate": 0.0008866904485561973, + "loss": 0.97313088, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.9609375, + "step": 1258, + "time_per_iteration": 2.697693347930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182815, + "balance_loss_mlp": 1.08682752, + "epoch": 0.2422085417468257, + "flos": 616378093056.0, + "grad_norm": 0.023106527532664196, + "language_loss": 0.92363685, + "learning_rate": 0.000886492873866473, + "loss": 0.93546498, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.95947266, + "step": 1259, + "time_per_iteration": 2.8120577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118033, + "balance_loss_mlp": 1.08424771, + "epoch": 0.24240092343208927, + "flos": 586912794624.0, + "grad_norm": 0.025402415625288076, + "language_loss": 0.9586736, + "learning_rate": 0.000886295149126908, + "loss": 0.97047698, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.96044922, + "step": 1260, + "time_per_iteration": 2.7276840209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184073, + "balance_loss_mlp": 1.08813286, + "epoch": 0.24259330511735283, + "flos": 763570874880.0, + "grad_norm": 0.0207328591517146, + "language_loss": 0.94417751, + "learning_rate": 0.0008860972744142655, + "loss": 0.95601827, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.95898438, + "step": 1261, + "time_per_iteration": 2.898794412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184052, + "balance_loss_mlp": 1.08816016, + "epoch": 0.2427856868026164, + "flos": 628133322240.0, + "grad_norm": 0.02409331705070074, + "language_loss": 0.89591467, + "learning_rate": 0.0008858992498053671, + "loss": 0.90775526, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.95849609, + "step": 1262, + "time_per_iteration": 2.8477351665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183746, + "balance_loss_mlp": 1.08952332, + "epoch": 0.24297806848787995, + "flos": 1514919343104.0, + "grad_norm": 0.012580587939111834, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77772498, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.94140625, + "step": 1263, + "time_per_iteration": 4.826787710189819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180613, + "balance_loss_mlp": 1.0848639, + "epoch": 0.2431704501731435, + "flos": 543072413184.0, + "grad_norm": 0.025826560533695943, + "language_loss": 0.92586392, + "learning_rate": 0.0008855027512063817, + "loss": 0.93767005, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.95703125, + "step": 1264, + "time_per_iteration": 2.722557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179682, + "balance_loss_mlp": 1.08364689, + "epoch": 0.24336283185840707, + "flos": 524878250496.0, + "grad_norm": 0.025894380889017608, + "language_loss": 0.95614499, + "learning_rate": 0.0008853042773702292, + "loss": 0.96794176, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.95996094, + "step": 1265, + "time_per_iteration": 2.7258307933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118145, + "balance_loss_mlp": 1.0855577, + "epoch": 0.24355521354367063, + "flos": 538205282304.0, + "grad_norm": 0.022817154468993458, + "language_loss": 0.98287719, + "learning_rate": 0.0008851056539456896, + "loss": 0.99469173, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.95849609, + "step": 1266, + "time_per_iteration": 2.6970114707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182961, + "balance_loss_mlp": 1.08692622, + "epoch": 0.24374759522893422, + "flos": 932108155392.0, + "grad_norm": 0.024066297062525326, + "language_loss": 0.9148944, + "learning_rate": 0.0008849068810098755, + "loss": 0.92672402, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.95996094, + "step": 1267, + "time_per_iteration": 3.326692819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118368, + "balance_loss_mlp": 1.08764458, + "epoch": 0.24393997691419778, + "flos": 428685193728.0, + "grad_norm": 0.027357648838687767, + "language_loss": 0.94001949, + "learning_rate": 0.0008847079586399575, + "loss": 0.95185632, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.95996094, + "step": 1268, + "time_per_iteration": 2.466787099838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180763, + "balance_loss_mlp": 1.08482289, + "epoch": 0.24413235859946134, + "flos": 579942104064.0, + "grad_norm": 0.026150492080556795, + "language_loss": 0.95411992, + "learning_rate": 0.0008845088869131641, + "loss": 0.96592754, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.95898438, + "step": 1269, + "time_per_iteration": 2.7016899585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175832, + "balance_loss_mlp": 1.07989287, + "epoch": 0.2443247402847249, + "flos": 530900219904.0, + "grad_norm": 0.025309414349457434, + "language_loss": 0.98951483, + "learning_rate": 0.0008843096659067818, + "loss": 1.00127316, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.95898438, + "step": 1270, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179701, + "balance_loss_mlp": 1.08366621, + "epoch": 0.24451712196998845, + "flos": 697624651776.0, + "grad_norm": 0.020400222299851913, + "language_loss": 0.92813951, + "learning_rate": 0.000884110295698155, + "loss": 0.93993652, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.95996094, + "step": 1271, + "time_per_iteration": 2.945749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180344, + "balance_loss_mlp": 1.08435643, + "epoch": 0.24470950365525201, + "flos": 530863289856.0, + "grad_norm": 0.02434814436965663, + "language_loss": 0.97428346, + "learning_rate": 0.0008839107763646861, + "loss": 0.98608696, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.95947266, + "step": 1272, + "time_per_iteration": 2.5816495418548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182389, + "balance_loss_mlp": 1.08630657, + "epoch": 0.24490188534051557, + "flos": 492347936256.0, + "grad_norm": 0.027277570267404832, + "language_loss": 1.00778949, + "learning_rate": 0.0008837111079838353, + "loss": 1.0196135, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.96044922, + "step": 1273, + "time_per_iteration": 2.675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182001, + "balance_loss_mlp": 1.08587062, + "epoch": 0.24509426702577913, + "flos": 475111226880.0, + "grad_norm": 0.024851656777491255, + "language_loss": 0.98025054, + "learning_rate": 0.000883511290633121, + "loss": 0.99207056, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.9609375, + "step": 1274, + "time_per_iteration": 2.5230517387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183988, + "balance_loss_mlp": 1.08747613, + "epoch": 0.24528664871104272, + "flos": 551647107072.0, + "grad_norm": 0.02070792437524093, + "language_loss": 1.00507927, + "learning_rate": 0.000883311324390119, + "loss": 1.01691914, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.96484375, + "step": 1275, + "time_per_iteration": 2.690488338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.08887982, + "epoch": 0.24547903039630628, + "flos": 827335675392.0, + "grad_norm": 0.02978995697497926, + "language_loss": 0.95172417, + "learning_rate": 0.0008831112093324629, + "loss": 0.96357232, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.95898438, + "step": 1276, + "time_per_iteration": 3.0883522033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184816, + "balance_loss_mlp": 1.08839917, + "epoch": 0.24567141208156984, + "flos": 592693718016.0, + "grad_norm": 0.026400385967418116, + "language_loss": 0.99731994, + "learning_rate": 0.0008829109455378444, + "loss": 1.00916803, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.96386719, + "step": 1277, + "time_per_iteration": 2.670658588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184585, + "balance_loss_mlp": 1.08812118, + "epoch": 0.2458637937668334, + "flos": 548929198080.0, + "grad_norm": 0.022333419000210953, + "language_loss": 0.95654261, + "learning_rate": 0.000882710533084013, + "loss": 0.96838844, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.96435547, + "step": 1278, + "time_per_iteration": 2.641019344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189057, + "balance_loss_mlp": 1.09244978, + "epoch": 0.24605617545209696, + "flos": 516911175168.0, + "grad_norm": 0.022487969609205835, + "language_loss": 0.97332817, + "learning_rate": 0.0008825099720487755, + "loss": 0.98521876, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.96582031, + "step": 1279, + "time_per_iteration": 2.626079559326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193596, + "balance_loss_mlp": 1.09880066, + "epoch": 0.24624855713736052, + "flos": 1515058331136.0, + "grad_norm": 0.0162275920205478, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76454735, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.94726562, + "step": 1280, + "time_per_iteration": 4.846211671829224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118811, + "balance_loss_mlp": 1.09350586, + "epoch": 0.24644093882262408, + "flos": 1530746706432.0, + "grad_norm": 0.013716798372908724, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79132223, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.9453125, + "step": 1281, + "time_per_iteration": 4.781409025192261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189694, + "balance_loss_mlp": 1.09351575, + "epoch": 0.24663332050788764, + "flos": 660348730368.0, + "grad_norm": 0.028995521048395968, + "language_loss": 0.998649, + "learning_rate": 0.0008819073982335619, + "loss": 1.01054597, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.96142578, + "step": 1282, + "time_per_iteration": 2.873255729675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187163, + "balance_loss_mlp": 1.09098482, + "epoch": 0.24682570219315123, + "flos": 542805170688.0, + "grad_norm": 0.0289675073475646, + "language_loss": 0.92590028, + "learning_rate": 0.0008817062436519235, + "loss": 0.93777192, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.96142578, + "step": 1283, + "time_per_iteration": 2.6918435096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08852112, + "epoch": 0.24701808387841478, + "flos": 441658387968.0, + "grad_norm": 0.027350099061339322, + "language_loss": 1.00939846, + "learning_rate": 0.0008815049408787788, + "loss": 1.02124548, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.96142578, + "step": 1284, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190183, + "balance_loss_mlp": 1.09443462, + "epoch": 0.24721046556367834, + "flos": 469032861696.0, + "grad_norm": 0.028209143321693456, + "language_loss": 0.95635927, + "learning_rate": 0.0008813034899922805, + "loss": 0.96826112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.95703125, + "step": 1285, + "time_per_iteration": 2.5152530670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193087, + "balance_loss_mlp": 1.09729075, + "epoch": 0.2474028472489419, + "flos": 505407725568.0, + "grad_norm": 0.027111907557838905, + "language_loss": 1.01196301, + "learning_rate": 0.0008811018910706387, + "loss": 1.02389383, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.95751953, + "step": 1286, + "time_per_iteration": 2.5593316555023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_mlp": 1.09255612, + "epoch": 0.24759522893420546, + "flos": 480955276800.0, + "grad_norm": 0.03276846828627927, + "language_loss": 0.9498859, + "learning_rate": 0.0008809001441921211, + "loss": 0.96176893, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.95703125, + "step": 1287, + "time_per_iteration": 2.7347421646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181619, + "balance_loss_mlp": 1.08567917, + "epoch": 0.24778761061946902, + "flos": 534753501696.0, + "grad_norm": 0.025262665654883373, + "language_loss": 0.97019696, + "learning_rate": 0.0008806982494350528, + "loss": 0.98201311, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.95898438, + "step": 1288, + "time_per_iteration": 2.6499245166778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181206, + "balance_loss_mlp": 1.08526671, + "epoch": 0.24797999230473258, + "flos": 560942937600.0, + "grad_norm": 0.021558514258727474, + "language_loss": 0.9849534, + "learning_rate": 0.0008804962068778161, + "loss": 0.99676538, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.95898438, + "step": 1289, + "time_per_iteration": 2.852257490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186476, + "balance_loss_mlp": 1.09053683, + "epoch": 0.24817237398999614, + "flos": 625480541184.0, + "grad_norm": 0.024913990838324927, + "language_loss": 0.90269625, + "learning_rate": 0.0008802940165988511, + "loss": 0.91456103, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.95898438, + "step": 1290, + "time_per_iteration": 2.846277952194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181135, + "balance_loss_mlp": 1.08471859, + "epoch": 0.2483647556752597, + "flos": 613484265984.0, + "grad_norm": 0.02310813532639645, + "language_loss": 0.96774852, + "learning_rate": 0.000880091678676655, + "loss": 0.97955984, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.96386719, + "step": 1291, + "time_per_iteration": 2.8085777759552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180122, + "balance_loss_mlp": 1.0837059, + "epoch": 0.2485571373605233, + "flos": 584687711232.0, + "grad_norm": 0.021422688776258386, + "language_loss": 0.9855839, + "learning_rate": 0.0008798891931897821, + "loss": 0.99738514, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.96386719, + "step": 1292, + "time_per_iteration": 2.7361133098602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183371, + "balance_loss_mlp": 1.08704984, + "epoch": 0.24874951904578685, + "flos": 495736590336.0, + "grad_norm": 0.02424073807687162, + "language_loss": 0.92916596, + "learning_rate": 0.0008796865602168447, + "loss": 0.94099975, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.96289062, + "step": 1293, + "time_per_iteration": 2.5220131874084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186197, + "balance_loss_mlp": 1.09025729, + "epoch": 0.2489419007310504, + "flos": 457173573120.0, + "grad_norm": 0.023099031146870112, + "language_loss": 0.94818902, + "learning_rate": 0.0008794837798365115, + "loss": 0.96005094, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.95898438, + "step": 1294, + "time_per_iteration": 2.6338109970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187707, + "balance_loss_mlp": 1.09191012, + "epoch": 0.24913428241631397, + "flos": 486565011456.0, + "grad_norm": 0.02215078033303108, + "language_loss": 0.96107936, + "learning_rate": 0.0008792808521275089, + "loss": 0.97295642, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.95751953, + "step": 1295, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182687, + "balance_loss_mlp": 1.0869385, + "epoch": 0.24932666410157753, + "flos": 519917793792.0, + "grad_norm": 0.022601932216391857, + "language_loss": 0.96075213, + "learning_rate": 0.0008790777771686206, + "loss": 0.972579, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.95703125, + "step": 1296, + "time_per_iteration": 2.5746819972991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.08610308, + "epoch": 0.2495190457868411, + "flos": 473556888576.0, + "grad_norm": 0.022656020732285023, + "language_loss": 0.93397439, + "learning_rate": 0.0008788745550386872, + "loss": 0.94579285, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.95703125, + "step": 1297, + "time_per_iteration": 2.55985689163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177725, + "balance_loss_mlp": 1.0820719, + "epoch": 0.24971142747210465, + "flos": 747198292992.0, + "grad_norm": 0.023996141347128058, + "language_loss": 0.88372529, + "learning_rate": 0.0008786711858166063, + "loss": 0.89550251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.95605469, + "step": 1298, + "time_per_iteration": 2.9357082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179743, + "balance_loss_mlp": 1.08399367, + "epoch": 0.2499038091573682, + "flos": 750901853184.0, + "grad_norm": 0.025666304870509565, + "language_loss": 0.93355387, + "learning_rate": 0.0008784676695813332, + "loss": 0.9453513, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.95703125, + "step": 1299, + "time_per_iteration": 2.939739942550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187708, + "balance_loss_mlp": 1.09186363, + "epoch": 0.2500961908426318, + "flos": 746342897664.0, + "grad_norm": 0.02448521774653795, + "language_loss": 0.94308037, + "learning_rate": 0.0008782640064118796, + "loss": 0.95495749, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.95800781, + "step": 1300, + "time_per_iteration": 2.882838249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223068, + "balance_loss_mlp": 1.12808228, + "epoch": 0.2502885725278953, + "flos": 1420523672064.0, + "grad_norm": 0.019515623701574104, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77407825, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.94921875, + "step": 1301, + "time_per_iteration": 5.002445220947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180814, + "balance_loss_mlp": 1.08520806, + "epoch": 0.2504809542131589, + "flos": 516231697920.0, + "grad_norm": 0.028413107884204602, + "language_loss": 0.96116567, + "learning_rate": 0.0008778562395867648, + "loss": 0.97297382, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.95556641, + "step": 1302, + "time_per_iteration": 2.6463139057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183586, + "balance_loss_mlp": 1.08783746, + "epoch": 0.25067333589842244, + "flos": 526851554304.0, + "grad_norm": 0.024791221234372676, + "language_loss": 0.9191972, + "learning_rate": 0.0008776521360894127, + "loss": 0.93103302, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.95703125, + "step": 1303, + "time_per_iteration": 2.60622239112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203766, + "balance_loss_mlp": 1.10897064, + "epoch": 0.25086571758368603, + "flos": 1477157326848.0, + "grad_norm": 0.014632010139538269, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80165827, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.94726562, + "step": 1304, + "time_per_iteration": 4.810328006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188508, + "balance_loss_mlp": 1.09285462, + "epoch": 0.2510580992689496, + "flos": 529402277376.0, + "grad_norm": 0.027485922989720333, + "language_loss": 0.99458921, + "learning_rate": 0.0008772434893213186, + "loss": 1.00647426, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.95605469, + "step": 1305, + "time_per_iteration": 2.6031458377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.09155023, + "epoch": 0.25125048095421315, + "flos": 518465513472.0, + "grad_norm": 0.0302061265456268, + "language_loss": 0.93206942, + "learning_rate": 0.0008770389462092276, + "loss": 0.94393957, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.95410156, + "step": 1306, + "time_per_iteration": 2.636845827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118174, + "balance_loss_mlp": 1.0858953, + "epoch": 0.25144286263947674, + "flos": 621674923008.0, + "grad_norm": 0.026354631998576704, + "language_loss": 0.96568018, + "learning_rate": 0.0008768342567176357, + "loss": 0.97749758, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.95800781, + "step": 1307, + "time_per_iteration": 2.797346591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187952, + "balance_loss_mlp": 1.09220326, + "epoch": 0.25163524432474027, + "flos": 504865234944.0, + "grad_norm": 0.024318536510777332, + "language_loss": 0.99895847, + "learning_rate": 0.0008766294209260107, + "loss": 1.01083803, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.95703125, + "step": 1308, + "time_per_iteration": 2.648099184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180717, + "balance_loss_mlp": 1.0850637, + "epoch": 0.25182762601000386, + "flos": 510079472640.0, + "grad_norm": 0.027727924866539442, + "language_loss": 1.0231359, + "learning_rate": 0.0008764244389138767, + "loss": 1.0349431, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.95605469, + "step": 1309, + "time_per_iteration": 2.575963258743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.08396196, + "epoch": 0.2520200076952674, + "flos": 635097282048.0, + "grad_norm": 0.028356059247082867, + "language_loss": 0.93336231, + "learning_rate": 0.000876219310760815, + "loss": 0.94515896, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.95654297, + "step": 1310, + "time_per_iteration": 2.8647706508636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189244, + "balance_loss_mlp": 1.09330475, + "epoch": 0.252212389380531, + "flos": 495651996672.0, + "grad_norm": 0.024396868749396446, + "language_loss": 0.91954494, + "learning_rate": 0.0008760140365464631, + "loss": 0.93143737, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.95898438, + "step": 1311, + "time_per_iteration": 2.592453718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180261, + "balance_loss_mlp": 1.08451247, + "epoch": 0.2524047710657945, + "flos": 491529470976.0, + "grad_norm": 0.026197758988141227, + "language_loss": 0.97483641, + "learning_rate": 0.0008758086163505156, + "loss": 0.98663902, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.95703125, + "step": 1312, + "time_per_iteration": 2.56319260597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181231, + "balance_loss_mlp": 1.08548176, + "epoch": 0.2525971527510581, + "flos": 648612966912.0, + "grad_norm": 0.0242630752619845, + "language_loss": 0.98733318, + "learning_rate": 0.0008756030502527239, + "loss": 0.99914545, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.95703125, + "step": 1313, + "time_per_iteration": 2.858691930770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180546, + "balance_loss_mlp": 1.08455837, + "epoch": 0.2527895344363217, + "flos": 570373026816.0, + "grad_norm": 0.025539383487616106, + "language_loss": 0.99746555, + "learning_rate": 0.0008753973383328954, + "loss": 1.00927103, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.95947266, + "step": 1314, + "time_per_iteration": 2.6683549880981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180137, + "balance_loss_mlp": 1.0841974, + "epoch": 0.2529819161215852, + "flos": 515068127232.0, + "grad_norm": 0.027266475314614652, + "language_loss": 0.95154297, + "learning_rate": 0.0008751914806708952, + "loss": 0.96334434, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.95898438, + "step": 1315, + "time_per_iteration": 2.6008012294769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178852, + "balance_loss_mlp": 1.08310342, + "epoch": 0.2531742978068488, + "flos": 532350498816.0, + "grad_norm": 0.02508848621911812, + "language_loss": 0.91122246, + "learning_rate": 0.0008749854773466439, + "loss": 0.92301095, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.95703125, + "step": 1316, + "time_per_iteration": 2.6595401763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193828, + "balance_loss_mlp": 1.09822178, + "epoch": 0.25336667949211233, + "flos": 597747500544.0, + "grad_norm": 0.027675397486347803, + "language_loss": 0.92894816, + "learning_rate": 0.0008747793284401192, + "loss": 0.9408865, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.95556641, + "step": 1317, + "time_per_iteration": 2.6975109577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187696, + "balance_loss_mlp": 1.09175622, + "epoch": 0.2535590611773759, + "flos": 603255177216.0, + "grad_norm": 0.02603186041930466, + "language_loss": 0.95462376, + "learning_rate": 0.0008745730340313551, + "loss": 0.96650076, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.95898438, + "step": 1318, + "time_per_iteration": 2.805327892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187328, + "balance_loss_mlp": 1.0915786, + "epoch": 0.25375144286263945, + "flos": 496322741760.0, + "grad_norm": 0.027049333310240738, + "language_loss": 0.95645851, + "learning_rate": 0.0008743665942004422, + "loss": 0.96833169, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.95703125, + "step": 1319, + "time_per_iteration": 2.6340737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185781, + "balance_loss_mlp": 1.0896982, + "epoch": 0.25394382454790304, + "flos": 513476858880.0, + "grad_norm": 0.02784781206620994, + "language_loss": 1.02473438, + "learning_rate": 0.0008741600090275277, + "loss": 1.03659225, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.96044922, + "step": 1320, + "time_per_iteration": 2.573155641555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183427, + "balance_loss_mlp": 1.08763099, + "epoch": 0.25413620623316663, + "flos": 960855045120.0, + "grad_norm": 0.03323105604734599, + "language_loss": 0.94160318, + "learning_rate": 0.0008739532785928151, + "loss": 0.95343745, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.95751953, + "step": 1321, + "time_per_iteration": 3.470245122909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190819, + "balance_loss_mlp": 1.09659576, + "epoch": 0.25432858791843016, + "flos": 1580648715264.0, + "grad_norm": 0.017424496497570757, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76084399, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.94140625, + "step": 1322, + "time_per_iteration": 4.8549723625183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184096, + "balance_loss_mlp": 1.08806074, + "epoch": 0.25452096960369375, + "flos": 584893828608.0, + "grad_norm": 0.025099574916072127, + "language_loss": 0.94150972, + "learning_rate": 0.0008735393822590908, + "loss": 0.95335066, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.95996094, + "step": 1323, + "time_per_iteration": 2.6771461963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187145, + "balance_loss_mlp": 1.0910151, + "epoch": 0.2547133512889573, + "flos": 509641041408.0, + "grad_norm": 0.024104352127734364, + "language_loss": 0.95373654, + "learning_rate": 0.0008733322165207681, + "loss": 0.965608, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.9609375, + "step": 1324, + "time_per_iteration": 2.671187400817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191608, + "balance_loss_mlp": 1.09590697, + "epoch": 0.25490573297422087, + "flos": 784035783168.0, + "grad_norm": 0.02719192919889817, + "language_loss": 0.93181324, + "learning_rate": 0.0008731249058420247, + "loss": 0.94372928, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.95654297, + "step": 1325, + "time_per_iteration": 3.0272371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189078, + "balance_loss_mlp": 1.09332883, + "epoch": 0.2550981146594844, + "flos": 510952332288.0, + "grad_norm": 0.024872253546531747, + "language_loss": 1.00651383, + "learning_rate": 0.0008729174503033459, + "loss": 1.0184046, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.95703125, + "step": 1326, + "time_per_iteration": 2.6320900917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187412, + "balance_loss_mlp": 1.09166288, + "epoch": 0.255290496344748, + "flos": 677930545152.0, + "grad_norm": 0.02807770436691079, + "language_loss": 0.93655276, + "learning_rate": 0.0008727098499852728, + "loss": 0.9484269, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.95703125, + "step": 1327, + "time_per_iteration": 2.8246335983276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187202, + "balance_loss_mlp": 1.09116733, + "epoch": 0.2554828780300115, + "flos": 538984816128.0, + "grad_norm": 0.02304152562423393, + "language_loss": 0.97811985, + "learning_rate": 0.0008725021049684034, + "loss": 0.9899919, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.95996094, + "step": 1328, + "time_per_iteration": 2.783276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.08924699, + "epoch": 0.2556752597152751, + "flos": 825622883328.0, + "grad_norm": 0.024322773499976656, + "language_loss": 0.90949428, + "learning_rate": 0.000872294215333391, + "loss": 0.92134333, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.95605469, + "step": 1329, + "time_per_iteration": 3.1658623218536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184378, + "balance_loss_mlp": 1.08867729, + "epoch": 0.2558676414005387, + "flos": 571890435072.0, + "grad_norm": 0.026114012927401953, + "language_loss": 0.91800833, + "learning_rate": 0.0008720861811609457, + "loss": 0.92985213, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.95654297, + "step": 1330, + "time_per_iteration": 2.725680112838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185897, + "balance_loss_mlp": 1.09024334, + "epoch": 0.2560600230858022, + "flos": 487748047872.0, + "grad_norm": 0.02457760145285043, + "language_loss": 0.93800515, + "learning_rate": 0.0008718780025318338, + "loss": 0.94986409, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.95605469, + "step": 1331, + "time_per_iteration": 2.730424404144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08904529, + "epoch": 0.2562524047710658, + "flos": 514119406080.0, + "grad_norm": 0.027688932662206074, + "language_loss": 0.94349414, + "learning_rate": 0.0008716696795268771, + "loss": 0.9553411, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.95605469, + "step": 1332, + "time_per_iteration": 2.6572844982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183855, + "balance_loss_mlp": 1.0881542, + "epoch": 0.25644478645632934, + "flos": 636109129728.0, + "grad_norm": 0.025705757243887913, + "language_loss": 0.96553451, + "learning_rate": 0.0008714612122269538, + "loss": 0.97737306, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.95654297, + "step": 1333, + "time_per_iteration": 2.867598295211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184376, + "balance_loss_mlp": 1.0888176, + "epoch": 0.25663716814159293, + "flos": 437544594432.0, + "grad_norm": 0.025955971973603553, + "language_loss": 1.00358891, + "learning_rate": 0.0008712526007129982, + "loss": 1.01543272, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.95507812, + "step": 1334, + "time_per_iteration": 2.516052484512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186528, + "balance_loss_mlp": 1.0908742, + "epoch": 0.25682954982685646, + "flos": 499242765312.0, + "grad_norm": 0.021880143416013124, + "language_loss": 0.98599482, + "learning_rate": 0.0008710438450660003, + "loss": 0.99786019, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.95605469, + "step": 1335, + "time_per_iteration": 2.659489870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184319, + "balance_loss_mlp": 1.08861768, + "epoch": 0.25702193151212005, + "flos": 458627854848.0, + "grad_norm": 0.028869593177541276, + "language_loss": 0.98979777, + "learning_rate": 0.0008708349453670064, + "loss": 1.00164104, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.95654297, + "step": 1336, + "time_per_iteration": 2.5267841815948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185282, + "balance_loss_mlp": 1.08953345, + "epoch": 0.2572143131973836, + "flos": 599403896832.0, + "grad_norm": 0.021342480544698176, + "language_loss": 0.99445975, + "learning_rate": 0.0008706259016971185, + "loss": 1.00631261, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.95703125, + "step": 1337, + "time_per_iteration": 2.7561397552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118469, + "balance_loss_mlp": 1.08884537, + "epoch": 0.25740669488264717, + "flos": 699526096896.0, + "grad_norm": 0.032203199948080075, + "language_loss": 0.96320713, + "learning_rate": 0.0008704167141374944, + "loss": 0.97505397, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.95800781, + "step": 1338, + "time_per_iteration": 2.7987895011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118993, + "balance_loss_mlp": 1.09432399, + "epoch": 0.25759907656791076, + "flos": 503378025984.0, + "grad_norm": 0.024717846020590344, + "language_loss": 0.97755861, + "learning_rate": 0.0008702073827693482, + "loss": 0.98945785, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.95556641, + "step": 1339, + "time_per_iteration": 2.694470167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186155, + "balance_loss_mlp": 1.0904057, + "epoch": 0.2577914582531743, + "flos": 775241510400.0, + "grad_norm": 0.025036220674882887, + "language_loss": 0.97113985, + "learning_rate": 0.0008699979076739494, + "loss": 0.98300135, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.95703125, + "step": 1340, + "time_per_iteration": 2.962740421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184068, + "balance_loss_mlp": 1.08836627, + "epoch": 0.2579838399384379, + "flos": 460609890816.0, + "grad_norm": 0.026880962232798965, + "language_loss": 0.99139833, + "learning_rate": 0.0008697882889326234, + "loss": 1.00323892, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.95654297, + "step": 1341, + "time_per_iteration": 2.517382860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185483, + "balance_loss_mlp": 1.08987677, + "epoch": 0.2581762216237014, + "flos": 570262236672.0, + "grad_norm": 0.0242955377416103, + "language_loss": 0.96170259, + "learning_rate": 0.0008695785266267515, + "loss": 0.97355735, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.95556641, + "step": 1342, + "time_per_iteration": 2.6961281299591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118536, + "balance_loss_mlp": 1.08961082, + "epoch": 0.258368603308965, + "flos": 605386934784.0, + "grad_norm": 0.023671890991135848, + "language_loss": 0.9337616, + "learning_rate": 0.0008693686208377704, + "loss": 0.94561517, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.95703125, + "step": 1343, + "time_per_iteration": 2.8561604022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184784, + "balance_loss_mlp": 1.08908272, + "epoch": 0.2585609849942285, + "flos": 492486924288.0, + "grad_norm": 0.022133881226187983, + "language_loss": 0.96849036, + "learning_rate": 0.0008691585716471733, + "loss": 0.98033822, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.95654297, + "step": 1344, + "time_per_iteration": 2.6443324089050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185279, + "balance_loss_mlp": 1.08952987, + "epoch": 0.2587533666794921, + "flos": 641957182464.0, + "grad_norm": 0.02305984249039353, + "language_loss": 0.94482636, + "learning_rate": 0.0008689483791365079, + "loss": 0.95667922, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.95703125, + "step": 1345, + "time_per_iteration": 2.8541483879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185515, + "balance_loss_mlp": 1.08976638, + "epoch": 0.2589457483647557, + "flos": 577994996736.0, + "grad_norm": 0.022382124417400225, + "language_loss": 0.97831523, + "learning_rate": 0.0008687380433873786, + "loss": 0.99017042, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.95703125, + "step": 1346, + "time_per_iteration": 2.8148868083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186141, + "balance_loss_mlp": 1.09048796, + "epoch": 0.25913813005001923, + "flos": 536466293760.0, + "grad_norm": 0.024690786073415343, + "language_loss": 0.93800229, + "learning_rate": 0.0008685275644814448, + "loss": 0.94986367, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.95605469, + "step": 1347, + "time_per_iteration": 2.6872267723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188569, + "balance_loss_mlp": 1.0930109, + "epoch": 0.2593305117352828, + "flos": 722346344448.0, + "grad_norm": 0.028015192621825148, + "language_loss": 0.944291, + "learning_rate": 0.0008683169425004216, + "loss": 0.95617664, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.95507812, + "step": 1348, + "time_per_iteration": 2.9036293029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187318, + "balance_loss_mlp": 1.09171176, + "epoch": 0.25952289342054635, + "flos": 711355186176.0, + "grad_norm": 0.028695706473352366, + "language_loss": 0.9867608, + "learning_rate": 0.0008681061775260799, + "loss": 0.99863392, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.95556641, + "step": 1349, + "time_per_iteration": 2.8635356426239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185365, + "balance_loss_mlp": 1.08942509, + "epoch": 0.25971527510580994, + "flos": 456849934848.0, + "grad_norm": 0.028158951385379896, + "language_loss": 1.01652539, + "learning_rate": 0.0008678952696402458, + "loss": 1.02837896, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.95898438, + "step": 1350, + "time_per_iteration": 2.4997899532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184224, + "balance_loss_mlp": 1.08847523, + "epoch": 0.25990765679107347, + "flos": 613753509888.0, + "grad_norm": 0.022929201317296435, + "language_loss": 0.944794, + "learning_rate": 0.000867684218924801, + "loss": 0.95663619, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.95703125, + "step": 1351, + "time_per_iteration": 2.8553221225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190399, + "balance_loss_mlp": 1.09655762, + "epoch": 0.26010003847633706, + "flos": 1541404219392.0, + "grad_norm": 0.011373150433568688, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80137491, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.9375, + "step": 1352, + "time_per_iteration": 4.894901752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185829, + "balance_loss_mlp": 1.0900805, + "epoch": 0.2602924201616006, + "flos": 717544341504.0, + "grad_norm": 0.021521520095987904, + "language_loss": 0.9327749, + "learning_rate": 0.0008672616893328834, + "loss": 0.94463313, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.95703125, + "step": 1353, + "time_per_iteration": 2.9336133003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181557, + "balance_loss_mlp": 1.08571243, + "epoch": 0.2604848018468642, + "flos": 644685825024.0, + "grad_norm": 0.026147354827328006, + "language_loss": 0.99375951, + "learning_rate": 0.0008670502106204512, + "loss": 1.00557506, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.95800781, + "step": 1354, + "time_per_iteration": 2.828476667404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182712, + "balance_loss_mlp": 1.08677256, + "epoch": 0.26067718353212777, + "flos": 518037815808.0, + "grad_norm": 0.024264679119450936, + "language_loss": 0.92830276, + "learning_rate": 0.0008668385894064892, + "loss": 0.94012988, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.95898438, + "step": 1355, + "time_per_iteration": 2.627603054046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183025, + "balance_loss_mlp": 1.08708537, + "epoch": 0.2608695652173913, + "flos": 824224997376.0, + "grad_norm": 0.021603697394371835, + "language_loss": 0.98353279, + "learning_rate": 0.0008666268257731562, + "loss": 0.995363, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.95898438, + "step": 1356, + "time_per_iteration": 3.104410409927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185288, + "balance_loss_mlp": 1.0894438, + "epoch": 0.2610619469026549, + "flos": 1009449039360.0, + "grad_norm": 0.029063247039842262, + "language_loss": 0.98633218, + "learning_rate": 0.0008664149198026662, + "loss": 0.99818504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.95800781, + "step": 1357, + "time_per_iteration": 3.2552602291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184981, + "balance_loss_mlp": 1.08932745, + "epoch": 0.2612543285879184, + "flos": 537825248256.0, + "grad_norm": 0.02677910773484977, + "language_loss": 0.99748302, + "learning_rate": 0.0008662028715772883, + "loss": 1.00933278, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.95605469, + "step": 1358, + "time_per_iteration": 2.6044809818267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186466, + "balance_loss_mlp": 1.09095597, + "epoch": 0.261446710273182, + "flos": 520438817280.0, + "grad_norm": 0.024887857022763207, + "language_loss": 0.95091379, + "learning_rate": 0.0008659906811793467, + "loss": 0.96277845, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.95458984, + "step": 1359, + "time_per_iteration": 2.660039186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118844, + "balance_loss_mlp": 1.09297669, + "epoch": 0.26163909195844554, + "flos": 584399001600.0, + "grad_norm": 0.02478490455868915, + "language_loss": 0.99414921, + "learning_rate": 0.0008657783486912215, + "loss": 1.00603366, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.95410156, + "step": 1360, + "time_per_iteration": 2.710707187652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189735, + "balance_loss_mlp": 1.09412944, + "epoch": 0.2618314736437091, + "flos": 960368223744.0, + "grad_norm": 0.025390417969386195, + "language_loss": 0.99146813, + "learning_rate": 0.0008655658741953472, + "loss": 1.00336552, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.95556641, + "step": 1361, + "time_per_iteration": 3.2610023021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187461, + "balance_loss_mlp": 1.0919987, + "epoch": 0.26202385532897265, + "flos": 575902170624.0, + "grad_norm": 0.01965876060868175, + "language_loss": 0.95685869, + "learning_rate": 0.0008653532577742136, + "loss": 0.96873331, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.95410156, + "step": 1362, + "time_per_iteration": 2.753920793533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190509, + "balance_loss_mlp": 1.09509337, + "epoch": 0.26221623701423624, + "flos": 446397264384.0, + "grad_norm": 0.024702919408059576, + "language_loss": 0.95440364, + "learning_rate": 0.0008651404995103659, + "loss": 0.96630871, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.95361328, + "step": 1363, + "time_per_iteration": 2.532839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184254, + "balance_loss_mlp": 1.088696, + "epoch": 0.26240861869949983, + "flos": 536755003392.0, + "grad_norm": 0.021936659097783043, + "language_loss": 0.95658946, + "learning_rate": 0.0008649275994864041, + "loss": 0.96843195, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.95507812, + "step": 1364, + "time_per_iteration": 2.6723499298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182727, + "balance_loss_mlp": 1.08735919, + "epoch": 0.26260100038476336, + "flos": 566487544320.0, + "grad_norm": 0.02057443182875544, + "language_loss": 0.93747735, + "learning_rate": 0.0008647145577849834, + "loss": 0.94930464, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.953125, + "step": 1365, + "time_per_iteration": 2.817335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184888, + "balance_loss_mlp": 1.089378, + "epoch": 0.26279338207002695, + "flos": 614320195584.0, + "grad_norm": 0.02000370099851243, + "language_loss": 0.90110707, + "learning_rate": 0.0008645013744888139, + "loss": 0.912956, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.95458984, + "step": 1366, + "time_per_iteration": 2.889956474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190369, + "balance_loss_mlp": 1.09452498, + "epoch": 0.2629857637552905, + "flos": 523944992256.0, + "grad_norm": 0.02433762343961203, + "language_loss": 0.96272296, + "learning_rate": 0.0008642880496806607, + "loss": 0.97462666, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.95800781, + "step": 1367, + "time_per_iteration": 2.7868857383728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186128, + "balance_loss_mlp": 1.09028387, + "epoch": 0.26317814544055407, + "flos": 535654559232.0, + "grad_norm": 0.022945771924384736, + "language_loss": 0.9318915, + "learning_rate": 0.0008640745834433437, + "loss": 0.94375277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.95800781, + "step": 1368, + "time_per_iteration": 2.7556509971618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182695, + "balance_loss_mlp": 1.08718467, + "epoch": 0.2633705271258176, + "flos": 556779479040.0, + "grad_norm": 0.024336346931206027, + "language_loss": 0.96858466, + "learning_rate": 0.000863860975859738, + "loss": 0.98041165, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.95458984, + "step": 1369, + "time_per_iteration": 2.9069716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184914, + "balance_loss_mlp": 1.08945167, + "epoch": 0.2635629088110812, + "flos": 553461957120.0, + "grad_norm": 0.02843668952404612, + "language_loss": 1.00276971, + "learning_rate": 0.0008636472270127733, + "loss": 1.01461875, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.95410156, + "step": 1370, + "time_per_iteration": 2.626201868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185086, + "balance_loss_mlp": 1.08952749, + "epoch": 0.2637552904963448, + "flos": 456915062784.0, + "grad_norm": 0.02826867423240315, + "language_loss": 1.01819849, + "learning_rate": 0.0008634333369854345, + "loss": 1.03004944, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.95507812, + "step": 1371, + "time_per_iteration": 2.5906460285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183664, + "balance_loss_mlp": 1.08820105, + "epoch": 0.2639476721816083, + "flos": 614259070464.0, + "grad_norm": 0.024066040008067748, + "language_loss": 0.95210433, + "learning_rate": 0.0008632193058607608, + "loss": 0.96394098, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.95410156, + "step": 1372, + "time_per_iteration": 2.7260935306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180244, + "balance_loss_mlp": 1.08487642, + "epoch": 0.2641400538668719, + "flos": 573025807872.0, + "grad_norm": 0.02730663798923432, + "language_loss": 0.93146777, + "learning_rate": 0.0008630051337218466, + "loss": 0.94327021, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.953125, + "step": 1373, + "time_per_iteration": 2.7155323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193282, + "balance_loss_mlp": 1.09777129, + "epoch": 0.2643324355521354, + "flos": 583339490304.0, + "grad_norm": 0.02802871933703498, + "language_loss": 0.91373825, + "learning_rate": 0.0008627908206518409, + "loss": 0.9256711, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.95458984, + "step": 1374, + "time_per_iteration": 2.7118475437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189674, + "balance_loss_mlp": 1.09621429, + "epoch": 0.264524817237399, + "flos": 1548025075200.0, + "grad_norm": 0.008601814223210932, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76340932, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.93359375, + "step": 1375, + "time_per_iteration": 4.9838175773620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192464, + "balance_loss_mlp": 1.09709656, + "epoch": 0.26471719892266254, + "flos": 519042932736.0, + "grad_norm": 0.024634755338573868, + "language_loss": 0.99606347, + "learning_rate": 0.0008623617720514241, + "loss": 1.0079881, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.953125, + "step": 1376, + "time_per_iteration": 2.5836029052734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191563, + "balance_loss_mlp": 1.09586143, + "epoch": 0.26490958060792613, + "flos": 518205001728.0, + "grad_norm": 0.02740625444526412, + "language_loss": 0.95827538, + "learning_rate": 0.0008621470366875848, + "loss": 0.97019094, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.95654297, + "step": 1377, + "time_per_iteration": 2.574557304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190438, + "balance_loss_mlp": 1.09507096, + "epoch": 0.26510196229318966, + "flos": 597682372608.0, + "grad_norm": 0.02552910213335578, + "language_loss": 0.96441573, + "learning_rate": 0.0008619321607257966, + "loss": 0.97632015, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.953125, + "step": 1378, + "time_per_iteration": 2.680574655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187734, + "balance_loss_mlp": 1.09227157, + "epoch": 0.26529434397845325, + "flos": 687052459008.0, + "grad_norm": 0.024630390251990656, + "language_loss": 0.90670931, + "learning_rate": 0.000861717144249482, + "loss": 0.91858661, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.95410156, + "step": 1379, + "time_per_iteration": 2.8311944007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181951, + "balance_loss_mlp": 1.08672631, + "epoch": 0.26548672566371684, + "flos": 425259609600.0, + "grad_norm": 0.02240925569996582, + "language_loss": 0.98143864, + "learning_rate": 0.0008615019873421175, + "loss": 0.99325812, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.95166016, + "step": 1380, + "time_per_iteration": 2.472280263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182344, + "balance_loss_mlp": 1.08716714, + "epoch": 0.26567910734898037, + "flos": 490849993728.0, + "grad_norm": 0.024166031959674275, + "language_loss": 0.9586165, + "learning_rate": 0.0008612866900872349, + "loss": 0.97043991, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.95117188, + "step": 1381, + "time_per_iteration": 2.5671043395996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181037, + "balance_loss_mlp": 1.08586013, + "epoch": 0.26587148903424396, + "flos": 535228862976.0, + "grad_norm": 0.024625622440273682, + "language_loss": 0.97316492, + "learning_rate": 0.0008610712525684197, + "loss": 0.98497522, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.95117188, + "step": 1382, + "time_per_iteration": 2.6394782066345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179642, + "balance_loss_mlp": 1.08446515, + "epoch": 0.2660638707195075, + "flos": 1019055046656.0, + "grad_norm": 0.02944222863828147, + "language_loss": 0.96464765, + "learning_rate": 0.0008608556748693121, + "loss": 0.97644401, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.95117188, + "step": 1383, + "time_per_iteration": 3.2514846324920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184353, + "balance_loss_mlp": 1.08941519, + "epoch": 0.2662562524047711, + "flos": 525062900736.0, + "grad_norm": 0.024003921212174706, + "language_loss": 0.95956504, + "learning_rate": 0.000860639957073607, + "loss": 0.97140861, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.94873047, + "step": 1384, + "time_per_iteration": 2.6759448051452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190743, + "balance_loss_mlp": 1.09594798, + "epoch": 0.2664486340900346, + "flos": 553479421440.0, + "grad_norm": 0.02584009515603871, + "language_loss": 0.97059226, + "learning_rate": 0.0008604240992650534, + "loss": 0.98249966, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.94726562, + "step": 1385, + "time_per_iteration": 2.6880476474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187786, + "balance_loss_mlp": 1.09260905, + "epoch": 0.2666410157752982, + "flos": 471208280064.0, + "grad_norm": 0.023709316387392747, + "language_loss": 0.98021734, + "learning_rate": 0.0008602081015274545, + "loss": 0.99209523, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.95117188, + "step": 1386, + "time_per_iteration": 2.71233868598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187602, + "balance_loss_mlp": 1.0924257, + "epoch": 0.2668333974605617, + "flos": 571015574016.0, + "grad_norm": 0.021121239598078063, + "language_loss": 0.90840185, + "learning_rate": 0.0008599919639446684, + "loss": 0.92027789, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.95117188, + "step": 1387, + "time_per_iteration": 2.6656363010406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183674, + "balance_loss_mlp": 1.08840239, + "epoch": 0.2670257791458253, + "flos": 399895369728.0, + "grad_norm": 0.029257146370583235, + "language_loss": 0.92911923, + "learning_rate": 0.000859775686600607, + "loss": 0.940956, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.95214844, + "step": 1388, + "time_per_iteration": 2.5366902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186225, + "balance_loss_mlp": 1.09104884, + "epoch": 0.2672181608310889, + "flos": 516891709440.0, + "grad_norm": 0.02488439836403737, + "language_loss": 0.94369394, + "learning_rate": 0.0008595592695792367, + "loss": 0.95555621, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.95117188, + "step": 1389, + "time_per_iteration": 2.6710469722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184466, + "balance_loss_mlp": 1.08928883, + "epoch": 0.26741054251635243, + "flos": 508525134336.0, + "grad_norm": 0.024055725628873734, + "language_loss": 0.99442971, + "learning_rate": 0.0008593427129645778, + "loss": 1.00627434, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.95117188, + "step": 1390, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184607, + "balance_loss_mlp": 1.08919191, + "epoch": 0.267602924201616, + "flos": 577808345088.0, + "grad_norm": 0.025635319637122064, + "language_loss": 0.93523198, + "learning_rate": 0.0008591260168407052, + "loss": 0.94707805, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.95361328, + "step": 1391, + "time_per_iteration": 2.766150712966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118642, + "balance_loss_mlp": 1.09095728, + "epoch": 0.26779530588687955, + "flos": 524999774208.0, + "grad_norm": 0.02196829508666122, + "language_loss": 0.92168128, + "learning_rate": 0.0008589091812917479, + "loss": 0.93354547, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.95410156, + "step": 1392, + "time_per_iteration": 2.6208953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119079, + "balance_loss_mlp": 1.09580445, + "epoch": 0.26798768757214314, + "flos": 557827530240.0, + "grad_norm": 0.02442636530887492, + "language_loss": 0.95854455, + "learning_rate": 0.0008586922064018887, + "loss": 0.97045243, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.94921875, + "step": 1393, + "time_per_iteration": 2.6643927097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190751, + "balance_loss_mlp": 1.09581244, + "epoch": 0.2681800692574067, + "flos": 932094693888.0, + "grad_norm": 0.0254733622090453, + "language_loss": 0.99184585, + "learning_rate": 0.0008584750922553651, + "loss": 1.00375342, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.94873047, + "step": 1394, + "time_per_iteration": 3.1305503845214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192347, + "balance_loss_mlp": 1.09712303, + "epoch": 0.26837245094267026, + "flos": 702317865984.0, + "grad_norm": 0.023340973249423663, + "language_loss": 0.92753315, + "learning_rate": 0.0008582578389364677, + "loss": 0.93945664, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.95166016, + "step": 1395, + "time_per_iteration": 2.8527095317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184756, + "balance_loss_mlp": 1.08953142, + "epoch": 0.26856483262793385, + "flos": 594393775104.0, + "grad_norm": 0.020526468408011762, + "language_loss": 1.00206113, + "learning_rate": 0.0008580404465295422, + "loss": 1.01390874, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.95166016, + "step": 1396, + "time_per_iteration": 2.784592866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184595, + "balance_loss_mlp": 1.08922791, + "epoch": 0.2687572143131974, + "flos": 715588502016.0, + "grad_norm": 0.024818089102904728, + "language_loss": 0.9790895, + "learning_rate": 0.0008578229151189876, + "loss": 0.99093544, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.953125, + "step": 1397, + "time_per_iteration": 2.901818037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185216, + "balance_loss_mlp": 1.0896579, + "epoch": 0.26894959599846097, + "flos": 468670291968.0, + "grad_norm": 0.028086023154021946, + "language_loss": 0.91012216, + "learning_rate": 0.0008576052447892573, + "loss": 0.92197436, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.95507812, + "step": 1398, + "time_per_iteration": 2.5849812030792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09082139, + "epoch": 0.2691419776837245, + "flos": 469629746688.0, + "grad_norm": 0.022530608820729603, + "language_loss": 0.95147502, + "learning_rate": 0.000857387435624858, + "loss": 0.96333838, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.95458984, + "step": 1399, + "time_per_iteration": 2.5274569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011908, + "balance_loss_mlp": 1.09567106, + "epoch": 0.2693343593689881, + "flos": 939284963328.0, + "grad_norm": 0.02095039568010189, + "language_loss": 0.95472848, + "learning_rate": 0.0008571694877103513, + "loss": 0.96663648, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.95068359, + "step": 1400, + "time_per_iteration": 3.2558727264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190554, + "balance_loss_mlp": 1.09542465, + "epoch": 0.2695267410542516, + "flos": 578793996288.0, + "grad_norm": 0.0241215692671091, + "language_loss": 0.95762217, + "learning_rate": 0.0008569514011303515, + "loss": 0.96952766, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.95068359, + "step": 1401, + "time_per_iteration": 2.8175997734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193641, + "balance_loss_mlp": 1.09846401, + "epoch": 0.2697191227395152, + "flos": 557964516864.0, + "grad_norm": 0.02413892998134183, + "language_loss": 0.96554017, + "learning_rate": 0.0008567331759695277, + "loss": 0.97747654, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.95117188, + "step": 1402, + "time_per_iteration": 2.7052927017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192424, + "balance_loss_mlp": 1.09729552, + "epoch": 0.26991150442477874, + "flos": 530314068480.0, + "grad_norm": 0.024237100625486396, + "language_loss": 0.97319567, + "learning_rate": 0.0008565148123126023, + "loss": 0.98511994, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.95068359, + "step": 1403, + "time_per_iteration": 2.6399028301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187922, + "balance_loss_mlp": 1.09274554, + "epoch": 0.2701038861100423, + "flos": 533086371840.0, + "grad_norm": 0.021620674049761555, + "language_loss": 0.93398714, + "learning_rate": 0.0008562963102443516, + "loss": 0.94586635, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.95117188, + "step": 1404, + "time_per_iteration": 2.6793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185578, + "balance_loss_mlp": 1.09035325, + "epoch": 0.2702962677953059, + "flos": 736504576512.0, + "grad_norm": 0.026106257639691363, + "language_loss": 0.94497591, + "learning_rate": 0.0008560776698496056, + "loss": 0.95683169, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.95166016, + "step": 1405, + "time_per_iteration": 2.8884029388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186883, + "balance_loss_mlp": 1.09170628, + "epoch": 0.27048864948056944, + "flos": 576000225792.0, + "grad_norm": 0.025611862530653208, + "language_loss": 0.95929742, + "learning_rate": 0.0008558588912132481, + "loss": 0.97116625, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.95117188, + "step": 1406, + "time_per_iteration": 2.8396451473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190124, + "balance_loss_mlp": 1.09666443, + "epoch": 0.27068103116583303, + "flos": 1426910212608.0, + "grad_norm": 0.014531874927713828, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77649117, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.93359375, + "step": 1407, + "time_per_iteration": 4.898139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119097, + "balance_loss_mlp": 1.09603214, + "epoch": 0.27087341285109656, + "flos": 533031977472.0, + "grad_norm": 0.024689522623330563, + "language_loss": 0.90804136, + "learning_rate": 0.0008554209195555016, + "loss": 0.91995108, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.94873047, + "step": 1408, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189645, + "balance_loss_mlp": 1.09446859, + "epoch": 0.27106579453636015, + "flos": 582464629248.0, + "grad_norm": 0.0247795195650599, + "language_loss": 0.98232609, + "learning_rate": 0.0008552017267041483, + "loss": 0.99422252, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.95117188, + "step": 1409, + "time_per_iteration": 2.6904594898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118886, + "balance_loss_mlp": 1.09368336, + "epoch": 0.2712581762216237, + "flos": 507880585728.0, + "grad_norm": 0.024309295256612126, + "language_loss": 0.90687084, + "learning_rate": 0.0008549823959512549, + "loss": 0.91875941, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.95117188, + "step": 1410, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189943, + "balance_loss_mlp": 1.09481394, + "epoch": 0.27145055790688727, + "flos": 999142087680.0, + "grad_norm": 0.023895808714677214, + "language_loss": 0.95848304, + "learning_rate": 0.0008547629273819728, + "loss": 0.97038245, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.95068359, + "step": 1411, + "time_per_iteration": 3.36985182762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186274, + "balance_loss_mlp": 1.09109735, + "epoch": 0.2716429395921508, + "flos": 547728697344.0, + "grad_norm": 0.02712613780862537, + "language_loss": 0.93229926, + "learning_rate": 0.0008545433210815074, + "loss": 0.94416201, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.95117188, + "step": 1412, + "time_per_iteration": 2.601452350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182035, + "balance_loss_mlp": 1.08685839, + "epoch": 0.2718353212774144, + "flos": 574310902272.0, + "grad_norm": 0.02439507328911507, + "language_loss": 0.95137858, + "learning_rate": 0.0008543235771351176, + "loss": 0.96319902, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.95117188, + "step": 1413, + "time_per_iteration": 2.7132034301757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197126, + "balance_loss_mlp": 1.10209203, + "epoch": 0.272027702962678, + "flos": 645584881152.0, + "grad_norm": 0.02257567173785872, + "language_loss": 0.91220462, + "learning_rate": 0.0008541036956281154, + "loss": 0.92417586, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.94970703, + "step": 1414, + "time_per_iteration": 2.871951103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187874, + "balance_loss_mlp": 1.09284067, + "epoch": 0.2722200846479415, + "flos": 654995504640.0, + "grad_norm": 0.026411231013774135, + "language_loss": 0.93374348, + "learning_rate": 0.0008538836766458665, + "loss": 0.94562221, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.94970703, + "step": 1415, + "time_per_iteration": 2.8673384189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183666, + "balance_loss_mlp": 1.08868039, + "epoch": 0.2724124663332051, + "flos": 580778033664.0, + "grad_norm": 0.027862690716265133, + "language_loss": 0.96171892, + "learning_rate": 0.0008536635202737897, + "loss": 0.97355556, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.94921875, + "step": 1416, + "time_per_iteration": 2.7829935550689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183251, + "balance_loss_mlp": 1.08831298, + "epoch": 0.2726048480184686, + "flos": 538467795456.0, + "grad_norm": 0.025077003090708358, + "language_loss": 0.93469489, + "learning_rate": 0.0008534432265973573, + "loss": 0.94652736, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.94873047, + "step": 1417, + "time_per_iteration": 2.593364715576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183107, + "balance_loss_mlp": 1.08793056, + "epoch": 0.2727972297037322, + "flos": 997548817920.0, + "grad_norm": 0.025553987949566613, + "language_loss": 0.99255168, + "learning_rate": 0.000853222795702095, + "loss": 1.00438273, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.95117188, + "step": 1418, + "time_per_iteration": 3.387162685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119173, + "balance_loss_mlp": 1.09712589, + "epoch": 0.27298961138899575, + "flos": 607334042112.0, + "grad_norm": 0.02541700118612174, + "language_loss": 0.93465757, + "learning_rate": 0.0008530022276735813, + "loss": 0.94657481, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.9453125, + "step": 1419, + "time_per_iteration": 2.7426016330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.0965513, + "epoch": 0.27318199307425933, + "flos": 530396660736.0, + "grad_norm": 0.025702548257077976, + "language_loss": 0.9374572, + "learning_rate": 0.0008527815225974489, + "loss": 0.94937015, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.94677734, + "step": 1420, + "time_per_iteration": 2.6544342041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118326, + "balance_loss_mlp": 1.08865511, + "epoch": 0.2733743747595229, + "flos": 409911610368.0, + "grad_norm": 0.028874111022423956, + "language_loss": 0.99327809, + "learning_rate": 0.0008525606805593829, + "loss": 1.00511074, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.9453125, + "step": 1421, + "time_per_iteration": 2.4215376377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182106, + "balance_loss_mlp": 1.08721578, + "epoch": 0.27356675644478645, + "flos": 517228082688.0, + "grad_norm": 0.026406413504372096, + "language_loss": 0.92442018, + "learning_rate": 0.0008523397016451213, + "loss": 0.93624127, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.94824219, + "step": 1422, + "time_per_iteration": 2.5680603981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184812, + "balance_loss_mlp": 1.09011269, + "epoch": 0.27375913813005004, + "flos": 1054058221056.0, + "grad_norm": 0.02228341429952914, + "language_loss": 0.94973963, + "learning_rate": 0.0008521185859404564, + "loss": 0.96158779, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.94628906, + "step": 1423, + "time_per_iteration": 3.37345814704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179884, + "balance_loss_mlp": 1.08485043, + "epoch": 0.27395151981531357, + "flos": 626003566080.0, + "grad_norm": 0.02387683630357993, + "language_loss": 0.97909242, + "learning_rate": 0.0008518973335312326, + "loss": 0.99089128, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.94970703, + "step": 1424, + "time_per_iteration": 2.8314859867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184763, + "balance_loss_mlp": 1.08982456, + "epoch": 0.27414390150057716, + "flos": 551414793216.0, + "grad_norm": 0.028545098094769822, + "language_loss": 0.95577884, + "learning_rate": 0.0008516759445033477, + "loss": 0.96762645, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.94873047, + "step": 1425, + "time_per_iteration": 2.6086578369140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.08705389, + "epoch": 0.2743362831858407, + "flos": 540951389184.0, + "grad_norm": 0.02677358847245462, + "language_loss": 0.96958816, + "learning_rate": 0.0008514544189427526, + "loss": 0.9814086, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.94921875, + "step": 1426, + "time_per_iteration": 2.6927483081817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.09713852, + "epoch": 0.2745286648711043, + "flos": 469545153024.0, + "grad_norm": 0.025998263163597202, + "language_loss": 0.95807564, + "learning_rate": 0.0008512327569354511, + "loss": 0.96999258, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.94482422, + "step": 1427, + "time_per_iteration": 2.5617682933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119268, + "balance_loss_mlp": 1.09764659, + "epoch": 0.2747210465563678, + "flos": 473871794688.0, + "grad_norm": 0.02733358796633043, + "language_loss": 0.93333006, + "learning_rate": 0.0008510109585675001, + "loss": 0.94525683, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.94970703, + "step": 1428, + "time_per_iteration": 2.7269434928894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205208, + "balance_loss_mlp": 1.11193848, + "epoch": 0.2749134282416314, + "flos": 1318056866304.0, + "grad_norm": 0.019809968329655446, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82358551, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.93164062, + "step": 1429, + "time_per_iteration": 4.731899738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190948, + "balance_loss_mlp": 1.0958662, + "epoch": 0.275105809926895, + "flos": 972531684864.0, + "grad_norm": 0.03147414200634365, + "language_loss": 0.91184711, + "learning_rate": 0.0008505669530941415, + "loss": 0.92375666, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.95019531, + "step": 1430, + "time_per_iteration": 3.3260724544525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189047, + "balance_loss_mlp": 1.09387004, + "epoch": 0.2752981916121585, + "flos": 528368962560.0, + "grad_norm": 0.025580193945061114, + "language_loss": 0.95012403, + "learning_rate": 0.000850344746161112, + "loss": 0.96201456, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.95117188, + "step": 1431, + "time_per_iteration": 2.5820231437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186021, + "balance_loss_mlp": 1.09093964, + "epoch": 0.2754905732974221, + "flos": 454598654976.0, + "grad_norm": 0.024219881250434897, + "language_loss": 0.962569, + "learning_rate": 0.0008501224032121894, + "loss": 0.97442919, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.95019531, + "step": 1432, + "time_per_iteration": 2.501572847366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188894, + "balance_loss_mlp": 1.09362173, + "epoch": 0.27568295498268564, + "flos": 498508893696.0, + "grad_norm": 0.02427263624604226, + "language_loss": 0.90960014, + "learning_rate": 0.0008498999243336946, + "loss": 0.921489, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.95214844, + "step": 1433, + "time_per_iteration": 2.6212003231048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192375, + "balance_loss_mlp": 1.09715116, + "epoch": 0.2758753366679492, + "flos": 609416134656.0, + "grad_norm": 0.024278981864862804, + "language_loss": 0.95570171, + "learning_rate": 0.0008496773096120021, + "loss": 0.9676255, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.95166016, + "step": 1434, + "time_per_iteration": 2.804689407348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118926, + "balance_loss_mlp": 1.09370184, + "epoch": 0.27606771835321275, + "flos": 741436835328.0, + "grad_norm": 0.025697024392157108, + "language_loss": 0.95037985, + "learning_rate": 0.0008494545591335381, + "loss": 0.96227252, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.95507812, + "step": 1435, + "time_per_iteration": 2.9329347610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195816, + "balance_loss_mlp": 1.10068655, + "epoch": 0.27626010003847634, + "flos": 555748165632.0, + "grad_norm": 0.0206290639721941, + "language_loss": 0.927001, + "learning_rate": 0.0008492316729847823, + "loss": 0.93895912, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.95068359, + "step": 1436, + "time_per_iteration": 2.820913553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09245288, + "epoch": 0.2764524817237399, + "flos": 543695494656.0, + "grad_norm": 0.02424730092158954, + "language_loss": 0.88914406, + "learning_rate": 0.0008490086512522664, + "loss": 0.90102232, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.953125, + "step": 1437, + "time_per_iteration": 2.7454309463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186593, + "balance_loss_mlp": 1.09127319, + "epoch": 0.27664486340900346, + "flos": 407128573440.0, + "grad_norm": 0.024912305575595636, + "language_loss": 0.99286187, + "learning_rate": 0.0008487854940225755, + "loss": 1.00472784, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.95263672, + "step": 1438, + "time_per_iteration": 2.4809510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183239, + "balance_loss_mlp": 1.08834839, + "epoch": 0.27683724509426705, + "flos": 523156726272.0, + "grad_norm": 0.025259333782437998, + "language_loss": 0.98154646, + "learning_rate": 0.0008485622013823466, + "loss": 0.99337876, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.94824219, + "step": 1439, + "time_per_iteration": 2.65401554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183688, + "balance_loss_mlp": 1.08865404, + "epoch": 0.2770296267795306, + "flos": 536409897984.0, + "grad_norm": 0.02898674716386243, + "language_loss": 0.9318651, + "learning_rate": 0.00084833877341827, + "loss": 0.94370198, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.94970703, + "step": 1440, + "time_per_iteration": 2.6294455528259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192537, + "balance_loss_mlp": 1.09755075, + "epoch": 0.27722200846479417, + "flos": 488970015744.0, + "grad_norm": 0.027244615130064133, + "language_loss": 0.90653217, + "learning_rate": 0.000848115210217088, + "loss": 0.91845751, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.94921875, + "step": 1441, + "time_per_iteration": 2.5394957065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118987, + "balance_loss_mlp": 1.09493196, + "epoch": 0.2774143901500577, + "flos": 619443108864.0, + "grad_norm": 0.024388639686817183, + "language_loss": 0.9228884, + "learning_rate": 0.0008478915118655952, + "loss": 0.93478709, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.94873047, + "step": 1442, + "time_per_iteration": 2.7634968757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119119, + "balance_loss_mlp": 1.0962522, + "epoch": 0.2776067718353213, + "flos": 514844545536.0, + "grad_norm": 0.021441164984372, + "language_loss": 0.94525409, + "learning_rate": 0.0008476676784506393, + "loss": 0.95716596, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.94873047, + "step": 1443, + "time_per_iteration": 2.6474499702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.09678042, + "epoch": 0.2777991535205848, + "flos": 1006040919552.0, + "grad_norm": 0.026818715625153876, + "language_loss": 0.93016809, + "learning_rate": 0.0008474437100591201, + "loss": 0.94208288, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.94628906, + "step": 1444, + "time_per_iteration": 3.311842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189789, + "balance_loss_mlp": 1.09494591, + "epoch": 0.2779915352058484, + "flos": 551375861760.0, + "grad_norm": 0.021641305677188864, + "language_loss": 0.95129728, + "learning_rate": 0.0008472196067779898, + "loss": 0.96319526, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.94775391, + "step": 1445, + "time_per_iteration": 2.667910575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186263, + "balance_loss_mlp": 1.091277, + "epoch": 0.278183916891112, + "flos": 875215990272.0, + "grad_norm": 0.030449834007814664, + "language_loss": 0.98351109, + "learning_rate": 0.0008469953686942531, + "loss": 0.99537361, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.94921875, + "step": 1446, + "time_per_iteration": 3.100473403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187264, + "balance_loss_mlp": 1.09246826, + "epoch": 0.2783762985763755, + "flos": 625195834368.0, + "grad_norm": 0.025904191205549917, + "language_loss": 0.93646944, + "learning_rate": 0.0008467709958949668, + "loss": 0.94834208, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.94726562, + "step": 1447, + "time_per_iteration": 2.7201731204986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09333074, + "epoch": 0.2785686802616391, + "flos": 582911792640.0, + "grad_norm": 0.026760771702797625, + "language_loss": 0.94447374, + "learning_rate": 0.0008465464884672403, + "loss": 0.9563536, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.94580078, + "step": 1448, + "time_per_iteration": 2.7300403118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118631, + "balance_loss_mlp": 1.09180129, + "epoch": 0.27876106194690264, + "flos": 588538991616.0, + "grad_norm": 0.0212290178255441, + "language_loss": 0.93077391, + "learning_rate": 0.0008463218464982348, + "loss": 0.94263697, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.94433594, + "step": 1449, + "time_per_iteration": 2.86130952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190148, + "balance_loss_mlp": 1.09520972, + "epoch": 0.27895344363216623, + "flos": 877430340096.0, + "grad_norm": 0.02756647509109648, + "language_loss": 0.96903402, + "learning_rate": 0.0008460970700751645, + "loss": 0.98093557, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.94873047, + "step": 1450, + "time_per_iteration": 3.069391965866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188227, + "balance_loss_mlp": 1.0932883, + "epoch": 0.27914582531742976, + "flos": 605035098624.0, + "grad_norm": 0.025261876769304706, + "language_loss": 0.97766632, + "learning_rate": 0.000845872159285295, + "loss": 0.98954856, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.94873047, + "step": 1451, + "time_per_iteration": 2.748164653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197098, + "balance_loss_mlp": 1.10325623, + "epoch": 0.27933820700269335, + "flos": 1501130411520.0, + "grad_norm": 0.012982305827020523, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78963947, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.9375, + "step": 1452, + "time_per_iteration": 4.906180143356323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198876, + "balance_loss_mlp": 1.10408044, + "epoch": 0.2795305886879569, + "flos": 1033517451264.0, + "grad_norm": 0.027093914793319178, + "language_loss": 0.95323974, + "learning_rate": 0.0008454219349544836, + "loss": 0.9652285, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.94726562, + "step": 1453, + "time_per_iteration": 3.333178758621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194793, + "balance_loss_mlp": 1.10014069, + "epoch": 0.27972297037322047, + "flos": 608226367488.0, + "grad_norm": 0.025225525542022995, + "language_loss": 0.8972255, + "learning_rate": 0.000845196621588334, + "loss": 0.90917349, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.94580078, + "step": 1454, + "time_per_iteration": 2.7425026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191631, + "balance_loss_mlp": 1.09697926, + "epoch": 0.27991535205848406, + "flos": 631560907776.0, + "grad_norm": 0.023908777965609074, + "language_loss": 0.86623406, + "learning_rate": 0.0008449711742049706, + "loss": 0.87815034, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.94580078, + "step": 1455, + "time_per_iteration": 2.8148674964904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188728, + "balance_loss_mlp": 1.09369469, + "epoch": 0.2801077337437476, + "flos": 550353280512.0, + "grad_norm": 0.02989232443782136, + "language_loss": 0.94001353, + "learning_rate": 0.0008447455928919196, + "loss": 0.95190072, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.94970703, + "step": 1456, + "time_per_iteration": 2.6030025482177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186748, + "balance_loss_mlp": 1.09166706, + "epoch": 0.2803001154290112, + "flos": 487741317120.0, + "grad_norm": 0.023726139763527557, + "language_loss": 0.95883709, + "learning_rate": 0.0008445198777367595, + "loss": 0.97070462, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.95019531, + "step": 1457, + "time_per_iteration": 2.598212718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188426, + "balance_loss_mlp": 1.09344053, + "epoch": 0.2804924971142747, + "flos": 523091598336.0, + "grad_norm": 0.027291046925092925, + "language_loss": 0.9210875, + "learning_rate": 0.0008442940288271208, + "loss": 0.93297172, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.94921875, + "step": 1458, + "time_per_iteration": 2.617572069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189438, + "balance_loss_mlp": 1.09473801, + "epoch": 0.2806848787995383, + "flos": 528849053184.0, + "grad_norm": 0.02378106137707509, + "language_loss": 0.95258486, + "learning_rate": 0.0008440680462506856, + "loss": 0.96447927, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.94628906, + "step": 1459, + "time_per_iteration": 2.7465641498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191591, + "balance_loss_mlp": 1.09660506, + "epoch": 0.2808772604848018, + "flos": 486484420608.0, + "grad_norm": 0.02248739277997059, + "language_loss": 0.9351486, + "learning_rate": 0.0008438419300951883, + "loss": 0.94706452, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.94921875, + "step": 1460, + "time_per_iteration": 2.6331160068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188162, + "balance_loss_mlp": 1.09303284, + "epoch": 0.2810696421700654, + "flos": 619339049472.0, + "grad_norm": 0.024684272432392865, + "language_loss": 0.96464884, + "learning_rate": 0.0008436156804484148, + "loss": 0.97653049, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.95068359, + "step": 1461, + "time_per_iteration": 2.7740418910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188616, + "balance_loss_mlp": 1.09358263, + "epoch": 0.28126202385532895, + "flos": 455686364160.0, + "grad_norm": 0.026728942288464865, + "language_loss": 0.99464989, + "learning_rate": 0.0008433892973982031, + "loss": 1.00653601, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.94970703, + "step": 1462, + "time_per_iteration": 2.5151000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188441, + "balance_loss_mlp": 1.09345496, + "epoch": 0.28145440554059253, + "flos": 531738150912.0, + "grad_norm": 0.02863032020985732, + "language_loss": 0.95777607, + "learning_rate": 0.0008431627810324431, + "loss": 0.96966046, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.94921875, + "step": 1463, + "time_per_iteration": 2.64477801322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.09298646, + "epoch": 0.2816467872258561, + "flos": 453163838976.0, + "grad_norm": 0.025052425157320847, + "language_loss": 0.90961307, + "learning_rate": 0.000842936131439076, + "loss": 0.92149282, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.94921875, + "step": 1464, + "time_per_iteration": 2.5910096168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186267, + "balance_loss_mlp": 1.09147155, + "epoch": 0.28183916891111965, + "flos": 473704608768.0, + "grad_norm": 0.02627501463847235, + "language_loss": 0.97073281, + "learning_rate": 0.0008427093487060951, + "loss": 0.98259544, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.94726562, + "step": 1465, + "time_per_iteration": 2.6250505447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187944, + "balance_loss_mlp": 1.09300542, + "epoch": 0.28203155059638324, + "flos": 558188098560.0, + "grad_norm": 0.02108937585301408, + "language_loss": 0.91709232, + "learning_rate": 0.000842482432921545, + "loss": 0.92897177, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.94873047, + "step": 1466, + "time_per_iteration": 2.809101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.09139562, + "epoch": 0.28222393228164677, + "flos": 417878685696.0, + "grad_norm": 0.025824876793605126, + "language_loss": 0.96517414, + "learning_rate": 0.0008422553841735225, + "loss": 0.97703695, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.94824219, + "step": 1467, + "time_per_iteration": 2.468773365020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184331, + "balance_loss_mlp": 1.08963072, + "epoch": 0.28241631396691036, + "flos": 606040215552.0, + "grad_norm": 0.02479925640814435, + "language_loss": 0.92490911, + "learning_rate": 0.0008420282025501757, + "loss": 0.93675244, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.94628906, + "step": 1468, + "time_per_iteration": 2.7617123126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184258, + "balance_loss_mlp": 1.08960581, + "epoch": 0.2826086956521739, + "flos": 574050390528.0, + "grad_norm": 0.023359152371130017, + "language_loss": 0.93868291, + "learning_rate": 0.0008418008881397043, + "loss": 0.95052546, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.94580078, + "step": 1469, + "time_per_iteration": 2.681727886199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185359, + "balance_loss_mlp": 1.09056342, + "epoch": 0.2828010773374375, + "flos": 844318603776.0, + "grad_norm": 0.02469333041166596, + "language_loss": 0.92646587, + "learning_rate": 0.0008415734410303595, + "loss": 0.93831944, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.94726562, + "step": 1470, + "time_per_iteration": 3.1949617862701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186089, + "balance_loss_mlp": 1.09124613, + "epoch": 0.28299345902270107, + "flos": 543771356160.0, + "grad_norm": 0.022743934694793657, + "language_loss": 0.98454034, + "learning_rate": 0.0008413458613104444, + "loss": 0.99640119, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.94775391, + "step": 1471, + "time_per_iteration": 2.679994583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184615, + "balance_loss_mlp": 1.08972394, + "epoch": 0.2831858407079646, + "flos": 572754562560.0, + "grad_norm": 0.02381851847695354, + "language_loss": 0.91435039, + "learning_rate": 0.0008411181490683129, + "loss": 0.92619658, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.94824219, + "step": 1472, + "time_per_iteration": 2.7178077697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186226, + "balance_loss_mlp": 1.09152639, + "epoch": 0.2833782223932282, + "flos": 765170875392.0, + "grad_norm": 0.023393787071714342, + "language_loss": 0.92628008, + "learning_rate": 0.0008408903043923707, + "loss": 0.9381423, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.94628906, + "step": 1473, + "time_per_iteration": 3.0261785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184462, + "balance_loss_mlp": 1.0899055, + "epoch": 0.2835706040784917, + "flos": 540087261696.0, + "grad_norm": 0.026141956799832673, + "language_loss": 0.93214488, + "learning_rate": 0.0008406623273710754, + "loss": 0.94398952, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.94482422, + "step": 1474, + "time_per_iteration": 2.62430739402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118759, + "balance_loss_mlp": 1.09312844, + "epoch": 0.2837629857637553, + "flos": 531653557248.0, + "grad_norm": 0.026627011980012938, + "language_loss": 0.91140723, + "learning_rate": 0.0008404342180929351, + "loss": 0.9232831, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.94384766, + "step": 1475, + "time_per_iteration": 2.6201882362365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191029, + "balance_loss_mlp": 1.09666264, + "epoch": 0.28395536744901884, + "flos": 541109842944.0, + "grad_norm": 0.026942213566754976, + "language_loss": 0.91036892, + "learning_rate": 0.00084020597664651, + "loss": 0.92227924, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.94287109, + "step": 1476, + "time_per_iteration": 2.792515516281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191806, + "balance_loss_mlp": 1.09743977, + "epoch": 0.2841477491342824, + "flos": 574801726464.0, + "grad_norm": 0.0281069748307863, + "language_loss": 0.94561875, + "learning_rate": 0.0008399776031204111, + "loss": 0.95753682, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.94287109, + "step": 1477, + "time_per_iteration": 2.7592930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189206, + "balance_loss_mlp": 1.09479237, + "epoch": 0.28434013081954596, + "flos": 573138599424.0, + "grad_norm": 0.025578880464706598, + "language_loss": 0.90985346, + "learning_rate": 0.0008397490976033009, + "loss": 0.92174542, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.94335938, + "step": 1478, + "time_per_iteration": 2.72312331199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193047, + "balance_loss_mlp": 1.10015869, + "epoch": 0.28453251250480954, + "flos": 1556673629184.0, + "grad_norm": 0.009281527310597816, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.7907269, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.92773438, + "step": 1479, + "time_per_iteration": 4.714428901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188304, + "balance_loss_mlp": 1.0943675, + "epoch": 0.28472489419007313, + "flos": 750426491904.0, + "grad_norm": 0.023822673694276757, + "language_loss": 0.93367732, + "learning_rate": 0.0008392916909509525, + "loss": 0.94556034, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.93847656, + "step": 1480, + "time_per_iteration": 3.0365796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183623, + "balance_loss_mlp": 1.08930516, + "epoch": 0.28491727587533666, + "flos": 491138703360.0, + "grad_norm": 0.028675048847138535, + "language_loss": 0.94468164, + "learning_rate": 0.0008390627899932954, + "loss": 0.95651788, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.94238281, + "step": 1481, + "time_per_iteration": 2.562316656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187714, + "balance_loss_mlp": 1.09353888, + "epoch": 0.28510965756060025, + "flos": 730359081984.0, + "grad_norm": 0.028797322451775676, + "language_loss": 0.96514452, + "learning_rate": 0.000838833757399789, + "loss": 0.97702163, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.94091797, + "step": 1482, + "time_per_iteration": 2.955920696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189825, + "balance_loss_mlp": 1.09593546, + "epoch": 0.2853020392458638, + "flos": 552669688320.0, + "grad_norm": 0.027781834693451857, + "language_loss": 0.92148101, + "learning_rate": 0.0008386045932593515, + "loss": 0.93337923, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.93798828, + "step": 1483, + "time_per_iteration": 2.6609442234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185409, + "balance_loss_mlp": 1.09151959, + "epoch": 0.28549442093112737, + "flos": 756096625152.0, + "grad_norm": 0.023489805753692042, + "language_loss": 0.9365592, + "learning_rate": 0.0008383752976609525, + "loss": 0.94841331, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.93798828, + "step": 1484, + "time_per_iteration": 2.914872646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.09480286, + "epoch": 0.2856868026163909, + "flos": 539703224832.0, + "grad_norm": 0.026354969281760218, + "language_loss": 0.9020288, + "learning_rate": 0.0008381458706936123, + "loss": 0.91391522, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.9375, + "step": 1485, + "time_per_iteration": 2.7100982666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190691, + "balance_loss_mlp": 1.09675431, + "epoch": 0.2858791843016545, + "flos": 584920025088.0, + "grad_norm": 0.026556247425645045, + "language_loss": 0.97539783, + "learning_rate": 0.0008379163124464025, + "loss": 0.98730469, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.93847656, + "step": 1486, + "time_per_iteration": 2.7065536975860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192022, + "balance_loss_mlp": 1.0979898, + "epoch": 0.286071565986918, + "flos": 646051510272.0, + "grad_norm": 0.03147840332437955, + "language_loss": 0.84533966, + "learning_rate": 0.0008376866230084452, + "loss": 0.85725987, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.93945312, + "step": 1487, + "time_per_iteration": 2.818673849105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186798, + "balance_loss_mlp": 1.09295619, + "epoch": 0.2862639476721816, + "flos": 492330471936.0, + "grad_norm": 0.02612625436823832, + "language_loss": 0.963471, + "learning_rate": 0.000837456802468914, + "loss": 0.975339, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.9375, + "step": 1488, + "time_per_iteration": 2.5766210556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185712, + "balance_loss_mlp": 1.09187043, + "epoch": 0.2864563293574452, + "flos": 522744491520.0, + "grad_norm": 0.023875595461199783, + "language_loss": 0.96454561, + "learning_rate": 0.0008372268509170331, + "loss": 0.9764027, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.9375, + "step": 1489, + "time_per_iteration": 2.7241337299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117946, + "balance_loss_mlp": 1.08537972, + "epoch": 0.2866487110427087, + "flos": 548256451584.0, + "grad_norm": 0.022999113981848278, + "language_loss": 0.93815279, + "learning_rate": 0.0008369967684420779, + "loss": 0.94994742, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.93994141, + "step": 1490, + "time_per_iteration": 2.7358930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180309, + "balance_loss_mlp": 1.08656251, + "epoch": 0.2868410927279723, + "flos": 483217290240.0, + "grad_norm": 0.024118055050044187, + "language_loss": 0.93676293, + "learning_rate": 0.0008367665551333736, + "loss": 0.94856608, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.93652344, + "step": 1491, + "time_per_iteration": 2.6094913482666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181201, + "balance_loss_mlp": 1.08731139, + "epoch": 0.28703347441323585, + "flos": 726136499712.0, + "grad_norm": 0.03204326630579906, + "language_loss": 0.96034807, + "learning_rate": 0.0008365362110802977, + "loss": 0.9721601, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.93798828, + "step": 1492, + "time_per_iteration": 2.862281322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180339, + "balance_loss_mlp": 1.08630645, + "epoch": 0.28722585609849943, + "flos": 636213189120.0, + "grad_norm": 0.024948941988181064, + "language_loss": 0.92257547, + "learning_rate": 0.0008363057363722773, + "loss": 0.93437886, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.93945312, + "step": 1493, + "time_per_iteration": 2.8364765644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.08695745, + "epoch": 0.28741823778376296, + "flos": 511251775488.0, + "grad_norm": 0.026788978355157977, + "language_loss": 0.94388151, + "learning_rate": 0.0008360751310987906, + "loss": 0.9556905, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.93847656, + "step": 1494, + "time_per_iteration": 2.5825915336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.09244919, + "epoch": 0.28761061946902655, + "flos": 604931039232.0, + "grad_norm": 0.023099591474152015, + "language_loss": 0.92881125, + "learning_rate": 0.0008358443953493666, + "loss": 0.94067132, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.93457031, + "step": 1495, + "time_per_iteration": 2.8426852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190116, + "balance_loss_mlp": 1.09617913, + "epoch": 0.28780300115429014, + "flos": 408059830272.0, + "grad_norm": 0.026469370193436835, + "language_loss": 0.97524667, + "learning_rate": 0.0008356135292135851, + "loss": 0.98714793, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.93847656, + "step": 1496, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187979, + "balance_loss_mlp": 1.09356499, + "epoch": 0.28799538283955367, + "flos": 375744365568.0, + "grad_norm": 0.028081335314896084, + "language_loss": 1.02447343, + "learning_rate": 0.0008353825327810758, + "loss": 1.03635335, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.94335938, + "step": 1497, + "time_per_iteration": 2.4137980937957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188393, + "balance_loss_mlp": 1.09416974, + "epoch": 0.28818776452481726, + "flos": 593019357696.0, + "grad_norm": 0.027570910872340922, + "language_loss": 0.91214752, + "learning_rate": 0.00083515140614152, + "loss": 0.9240315, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.94140625, + "step": 1498, + "time_per_iteration": 2.7084319591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188877, + "balance_loss_mlp": 1.0943675, + "epoch": 0.2883801462100808, + "flos": 536103724032.0, + "grad_norm": 0.024692508476740448, + "language_loss": 0.97239816, + "learning_rate": 0.0008349201493846485, + "loss": 0.9842869, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.94433594, + "step": 1499, + "time_per_iteration": 2.6401236057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190398, + "balance_loss_mlp": 1.09617448, + "epoch": 0.2885725278953444, + "flos": 481076800512.0, + "grad_norm": 0.026282906035864008, + "language_loss": 0.98523659, + "learning_rate": 0.0008346887626002432, + "loss": 0.99714065, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.94140625, + "step": 1500, + "time_per_iteration": 2.52458119392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.09863722, + "epoch": 0.2887649095806079, + "flos": 465029858304.0, + "grad_norm": 0.024051725112114657, + "language_loss": 0.95880306, + "learning_rate": 0.000834457245878137, + "loss": 0.970734, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.94384766, + "step": 1501, + "time_per_iteration": 2.629535436630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192018, + "balance_loss_mlp": 1.09765196, + "epoch": 0.2889572912658715, + "flos": 932639912448.0, + "grad_norm": 0.02596355901590014, + "language_loss": 0.90450358, + "learning_rate": 0.000834225599308212, + "loss": 0.9164238, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.94287109, + "step": 1502, + "time_per_iteration": 3.2340567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189718, + "balance_loss_mlp": 1.09568572, + "epoch": 0.28914967295113503, + "flos": 571256620032.0, + "grad_norm": 0.02412179831144176, + "language_loss": 0.9487462, + "learning_rate": 0.0008339938229804016, + "loss": 0.96064335, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.93945312, + "step": 1503, + "time_per_iteration": 2.710339069366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193321, + "balance_loss_mlp": 1.10081482, + "epoch": 0.2893420546363986, + "flos": 1489872010752.0, + "grad_norm": 0.01509287591883609, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76628143, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.92382812, + "step": 1504, + "time_per_iteration": 4.937675714492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189832, + "balance_loss_mlp": 1.09579968, + "epoch": 0.2895344363216622, + "flos": 471182083584.0, + "grad_norm": 0.02978733186062401, + "language_loss": 0.95586789, + "learning_rate": 0.0008335298814111094, + "loss": 0.96776623, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.93945312, + "step": 1505, + "time_per_iteration": 2.5757808685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.10075009, + "epoch": 0.28972681800692573, + "flos": 649340107776.0, + "grad_norm": 0.024998045510076724, + "language_loss": 0.95390272, + "learning_rate": 0.0008332977163497455, + "loss": 0.96585107, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.93994141, + "step": 1506, + "time_per_iteration": 2.8062288761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190367, + "balance_loss_mlp": 1.09638238, + "epoch": 0.2899191996921893, + "flos": 573305785344.0, + "grad_norm": 0.023440576211443395, + "language_loss": 0.92864263, + "learning_rate": 0.0008330654218907325, + "loss": 0.94054627, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.93896484, + "step": 1507, + "time_per_iteration": 2.6871397495269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195663, + "balance_loss_mlp": 1.10158336, + "epoch": 0.29011158137745285, + "flos": 662636940288.0, + "grad_norm": 0.026311762315396375, + "language_loss": 0.90949756, + "learning_rate": 0.0008328329981242548, + "loss": 0.92145419, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.93994141, + "step": 1508, + "time_per_iteration": 2.870436906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189885, + "balance_loss_mlp": 1.09585261, + "epoch": 0.29030396306271644, + "flos": 537402279936.0, + "grad_norm": 0.02293974263799261, + "language_loss": 0.95641714, + "learning_rate": 0.0008326004451405475, + "loss": 0.96831596, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.93945312, + "step": 1509, + "time_per_iteration": 2.7639336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191857, + "balance_loss_mlp": 1.09815872, + "epoch": 0.29049634474798, + "flos": 512955835392.0, + "grad_norm": 0.025710607890434264, + "language_loss": 0.93112034, + "learning_rate": 0.0008323677630298957, + "loss": 0.94303894, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.93603516, + "step": 1510, + "time_per_iteration": 2.561455726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118953, + "balance_loss_mlp": 1.09592652, + "epoch": 0.29068872643324356, + "flos": 614982208512.0, + "grad_norm": 0.023671610956976636, + "language_loss": 0.92362118, + "learning_rate": 0.0008321349518826345, + "loss": 0.93551642, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.93505859, + "step": 1511, + "time_per_iteration": 2.807711362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191488, + "balance_loss_mlp": 1.09736073, + "epoch": 0.2908811081185071, + "flos": 547468185600.0, + "grad_norm": 0.029262624151918007, + "language_loss": 1.03824317, + "learning_rate": 0.0008319020117891491, + "loss": 1.05015802, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.94042969, + "step": 1512, + "time_per_iteration": 2.626357316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192195, + "balance_loss_mlp": 1.09840155, + "epoch": 0.2910734898037707, + "flos": 605901227520.0, + "grad_norm": 0.026098769068304807, + "language_loss": 0.96355087, + "learning_rate": 0.0008316689428398751, + "loss": 0.97547281, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.93701172, + "step": 1513, + "time_per_iteration": 2.6982998847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190959, + "balance_loss_mlp": 1.09721279, + "epoch": 0.29126587148903427, + "flos": 575835041280.0, + "grad_norm": 0.02240755749123148, + "language_loss": 0.95587385, + "learning_rate": 0.0008314357451252979, + "loss": 0.96778345, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.93652344, + "step": 1514, + "time_per_iteration": 2.7506277561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185358, + "balance_loss_mlp": 1.09170711, + "epoch": 0.2914582531742978, + "flos": 572133482496.0, + "grad_norm": 0.030106635879309524, + "language_loss": 0.98758858, + "learning_rate": 0.0008312024187359527, + "loss": 0.99944222, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.93554688, + "step": 1515, + "time_per_iteration": 2.6389546394348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186161, + "balance_loss_mlp": 1.09265339, + "epoch": 0.2916506348595614, + "flos": 732302186496.0, + "grad_norm": 0.023105382424412787, + "language_loss": 0.95643955, + "learning_rate": 0.000830968963762425, + "loss": 0.96830118, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.93408203, + "step": 1516, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183995, + "balance_loss_mlp": 1.09048688, + "epoch": 0.2918430165448249, + "flos": 511466625024.0, + "grad_norm": 0.027481799845478876, + "language_loss": 0.92072952, + "learning_rate": 0.0008307353802953497, + "loss": 0.93256938, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.93408203, + "step": 1517, + "time_per_iteration": 2.6852073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188929, + "balance_loss_mlp": 1.09546912, + "epoch": 0.2920353982300885, + "flos": 631606569984.0, + "grad_norm": 0.024841994736450757, + "language_loss": 0.95207542, + "learning_rate": 0.0008305016684254125, + "loss": 0.9639647, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.93359375, + "step": 1518, + "time_per_iteration": 2.78326678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185623, + "balance_loss_mlp": 1.0920676, + "epoch": 0.29222777991535204, + "flos": 502670350848.0, + "grad_norm": 0.02442081482663903, + "language_loss": 0.96402657, + "learning_rate": 0.0008302678282433479, + "loss": 0.97588277, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.93457031, + "step": 1519, + "time_per_iteration": 2.580885887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186077, + "balance_loss_mlp": 1.09261727, + "epoch": 0.2924201616006156, + "flos": 487841373696.0, + "grad_norm": 0.025531334181834578, + "language_loss": 0.92434102, + "learning_rate": 0.0008300338598399411, + "loss": 0.93620181, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.93359375, + "step": 1520, + "time_per_iteration": 2.60040020942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182574, + "balance_loss_mlp": 1.08911419, + "epoch": 0.2926125432858792, + "flos": 477410170368.0, + "grad_norm": 0.025034871095789283, + "language_loss": 1.04410791, + "learning_rate": 0.0008297997633060263, + "loss": 1.05593348, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.93359375, + "step": 1521, + "time_per_iteration": 2.5479507446289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184296, + "balance_loss_mlp": 1.09083581, + "epoch": 0.29280492497114274, + "flos": 677867418624.0, + "grad_norm": 0.023158831925944874, + "language_loss": 0.93757105, + "learning_rate": 0.0008295655387324883, + "loss": 0.94941401, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.93359375, + "step": 1522, + "time_per_iteration": 2.80924916267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184597, + "balance_loss_mlp": 1.09113646, + "epoch": 0.29299730665640633, + "flos": 459344262144.0, + "grad_norm": 0.024881330364852117, + "language_loss": 0.95369709, + "learning_rate": 0.0008293311862102609, + "loss": 0.96554303, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.93359375, + "step": 1523, + "time_per_iteration": 2.5006909370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183918, + "balance_loss_mlp": 1.09055364, + "epoch": 0.29318968834166986, + "flos": 447495707136.0, + "grad_norm": 0.027757525537519354, + "language_loss": 0.99242002, + "learning_rate": 0.0008290967058303275, + "loss": 1.00425935, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.93261719, + "step": 1524, + "time_per_iteration": 2.472071409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.09098816, + "epoch": 0.29338207002693345, + "flos": 451255663104.0, + "grad_norm": 0.024483324027042522, + "language_loss": 0.93697757, + "learning_rate": 0.0008288620976837219, + "loss": 0.9488225, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.93408203, + "step": 1525, + "time_per_iteration": 2.486726760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183678, + "balance_loss_mlp": 1.08997941, + "epoch": 0.293574451712197, + "flos": 503284700160.0, + "grad_norm": 0.025672010983446535, + "language_loss": 0.92014909, + "learning_rate": 0.000828627361861527, + "loss": 0.93198591, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.93603516, + "step": 1526, + "time_per_iteration": 2.557725429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183155, + "balance_loss_mlp": 1.089504, + "epoch": 0.29376683339746057, + "flos": 697683048960.0, + "grad_norm": 0.028193197708561973, + "language_loss": 0.94158876, + "learning_rate": 0.0008283924984548752, + "loss": 0.95342028, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.93554688, + "step": 1527, + "time_per_iteration": 2.866138219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182528, + "balance_loss_mlp": 1.08882964, + "epoch": 0.2939592150827241, + "flos": 479541927936.0, + "grad_norm": 0.024215116577050826, + "language_loss": 0.92182994, + "learning_rate": 0.0008281575075549485, + "loss": 0.93365526, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.93603516, + "step": 1528, + "time_per_iteration": 2.5585758686065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202408, + "balance_loss_mlp": 1.1108551, + "epoch": 0.2941515967679877, + "flos": 1488386803200.0, + "grad_norm": 0.02007823063587109, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78555101, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.9140625, + "step": 1529, + "time_per_iteration": 4.658870697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186281, + "balance_loss_mlp": 1.09267783, + "epoch": 0.2943439784532513, + "flos": 675399287808.0, + "grad_norm": 0.027761434636537758, + "language_loss": 0.99164081, + "learning_rate": 0.0008276871436402469, + "loss": 1.00350356, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.93505859, + "step": 1530, + "time_per_iteration": 2.897517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182983, + "balance_loss_mlp": 1.08909357, + "epoch": 0.2945363601385148, + "flos": 577382648832.0, + "grad_norm": 0.025208295044921922, + "language_loss": 0.95561033, + "learning_rate": 0.000827451770808083, + "loss": 0.96744013, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.93798828, + "step": 1531, + "time_per_iteration": 2.667419910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183127, + "balance_loss_mlp": 1.08923733, + "epoch": 0.2947287418237784, + "flos": 481617289728.0, + "grad_norm": 0.0238323033403859, + "language_loss": 0.92856085, + "learning_rate": 0.0008272162708478674, + "loss": 0.94039214, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.93798828, + "step": 1532, + "time_per_iteration": 2.532593250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190087, + "balance_loss_mlp": 1.09638822, + "epoch": 0.2949211235090419, + "flos": 559260344832.0, + "grad_norm": 0.023856250691152107, + "language_loss": 0.9573307, + "learning_rate": 0.000826980643851029, + "loss": 0.96923155, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.93603516, + "step": 1533, + "time_per_iteration": 2.648393154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190115, + "balance_loss_mlp": 1.09665465, + "epoch": 0.2951135051943055, + "flos": 484856222208.0, + "grad_norm": 0.02761517479674983, + "language_loss": 0.9290787, + "learning_rate": 0.0008267448899090464, + "loss": 0.94097984, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.93359375, + "step": 1534, + "time_per_iteration": 2.5158579349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185677, + "balance_loss_mlp": 1.09226477, + "epoch": 0.29530588687956905, + "flos": 551421523968.0, + "grad_norm": 0.024001584155810263, + "language_loss": 0.90244222, + "learning_rate": 0.0008265090091134473, + "loss": 0.91429895, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.93310547, + "step": 1535, + "time_per_iteration": 2.8246946334838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185762, + "balance_loss_mlp": 1.09234965, + "epoch": 0.29549826856483263, + "flos": 674309577216.0, + "grad_norm": 0.021562014940098434, + "language_loss": 0.8727591, + "learning_rate": 0.0008262730015558088, + "loss": 0.88461667, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.93310547, + "step": 1536, + "time_per_iteration": 2.8568825721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189062, + "balance_loss_mlp": 1.09560144, + "epoch": 0.29569065025009617, + "flos": 766135059456.0, + "grad_norm": 0.0253531059084562, + "language_loss": 0.89567208, + "learning_rate": 0.0008260368673277574, + "loss": 0.90756267, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.93359375, + "step": 1537, + "time_per_iteration": 3.1248908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181656, + "balance_loss_mlp": 1.08781409, + "epoch": 0.29588303193535975, + "flos": 544830867456.0, + "grad_norm": 0.02589470547450269, + "language_loss": 0.93808746, + "learning_rate": 0.0008258006065209682, + "loss": 0.94990402, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.9375, + "step": 1538, + "time_per_iteration": 2.7405824661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.0892235, + "epoch": 0.29607541362062334, + "flos": 598144998912.0, + "grad_norm": 0.02499469713889481, + "language_loss": 0.9045589, + "learning_rate": 0.0008255642192271657, + "loss": 0.91638815, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.93603516, + "step": 1539, + "time_per_iteration": 2.7654454708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183976, + "balance_loss_mlp": 1.09032559, + "epoch": 0.29626779530588687, + "flos": 611037602304.0, + "grad_norm": 0.024707919738005703, + "language_loss": 0.92616487, + "learning_rate": 0.0008253277055381241, + "loss": 0.93800461, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.93554688, + "step": 1540, + "time_per_iteration": 2.803755760192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186228, + "balance_loss_mlp": 1.09252918, + "epoch": 0.29646017699115046, + "flos": 868957704192.0, + "grad_norm": 0.02707124240628881, + "language_loss": 0.95315254, + "learning_rate": 0.0008250910655456658, + "loss": 0.96501482, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.93603516, + "step": 1541, + "time_per_iteration": 3.11143159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181572, + "balance_loss_mlp": 1.08787382, + "epoch": 0.296652558676414, + "flos": 496880695296.0, + "grad_norm": 0.02670504880571787, + "language_loss": 0.9343757, + "learning_rate": 0.0008248542993416625, + "loss": 0.94619143, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.93603516, + "step": 1542, + "time_per_iteration": 2.5893712043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181697, + "balance_loss_mlp": 1.08790362, + "epoch": 0.2968449403616776, + "flos": 572626308096.0, + "grad_norm": 0.02711797813063544, + "language_loss": 0.9310621, + "learning_rate": 0.0008246174070180352, + "loss": 0.94287908, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.93701172, + "step": 1543, + "time_per_iteration": 2.677011489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189648, + "balance_loss_mlp": 1.09614003, + "epoch": 0.2970373220469411, + "flos": 795650022912.0, + "grad_norm": 0.029629985597633038, + "language_loss": 0.9263432, + "learning_rate": 0.0008243803886667537, + "loss": 0.93823969, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.93408203, + "step": 1544, + "time_per_iteration": 3.1022729873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188285, + "balance_loss_mlp": 1.09472907, + "epoch": 0.2972297037322047, + "flos": 662248174080.0, + "grad_norm": 0.0271995559284498, + "language_loss": 0.89610922, + "learning_rate": 0.0008241432443798364, + "loss": 0.90799212, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.93457031, + "step": 1545, + "time_per_iteration": 2.8079423904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181998, + "balance_loss_mlp": 1.08868086, + "epoch": 0.29742208541746823, + "flos": 598231593984.0, + "grad_norm": 0.02196679377417612, + "language_loss": 0.91743886, + "learning_rate": 0.0008239059742493512, + "loss": 0.92925882, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.93212891, + "step": 1546, + "time_per_iteration": 2.703385353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182095, + "balance_loss_mlp": 1.08868301, + "epoch": 0.2976144671027318, + "flos": 771338563584.0, + "grad_norm": 0.02555387631372138, + "language_loss": 0.94145298, + "learning_rate": 0.0008236685783674142, + "loss": 0.95327395, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.93310547, + "step": 1547, + "time_per_iteration": 3.0583412647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221115, + "balance_loss_mlp": 1.12822723, + "epoch": 0.2978068487879954, + "flos": 1487911441920.0, + "grad_norm": 0.023679675459363107, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77442312, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.92773438, + "step": 1548, + "time_per_iteration": 4.846614360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192812, + "balance_loss_mlp": 1.09925652, + "epoch": 0.29799923047325894, + "flos": 476329191936.0, + "grad_norm": 0.02691026692614136, + "language_loss": 0.91868371, + "learning_rate": 0.0008231934097178955, + "loss": 0.93061185, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.93457031, + "step": 1549, + "time_per_iteration": 2.600588798522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189437, + "balance_loss_mlp": 1.09573877, + "epoch": 0.2981916121585225, + "flos": 761167872000.0, + "grad_norm": 0.02304182660847759, + "language_loss": 0.93441629, + "learning_rate": 0.0008229556371347903, + "loss": 0.94631064, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.93603516, + "step": 1550, + "time_per_iteration": 2.9500393867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196641, + "balance_loss_mlp": 1.10256064, + "epoch": 0.29838399384378606, + "flos": 876516547584.0, + "grad_norm": 0.029531977965095095, + "language_loss": 0.90478379, + "learning_rate": 0.0008227177391691874, + "loss": 0.91675019, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.93994141, + "step": 1551, + "time_per_iteration": 3.117060422897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192501, + "balance_loss_mlp": 1.09870708, + "epoch": 0.29857637552904964, + "flos": 580751837184.0, + "grad_norm": 0.026349497602305087, + "language_loss": 0.9813534, + "learning_rate": 0.0008224797159134463, + "loss": 0.99327838, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.93701172, + "step": 1552, + "time_per_iteration": 2.694382429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185823, + "balance_loss_mlp": 1.09212494, + "epoch": 0.2987687572143132, + "flos": 837807811584.0, + "grad_norm": 0.022207279660822626, + "language_loss": 0.8985877, + "learning_rate": 0.0008222415674599765, + "loss": 0.91044593, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.93603516, + "step": 1553, + "time_per_iteration": 3.074347972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186024, + "balance_loss_mlp": 1.09203923, + "epoch": 0.29896113889957676, + "flos": 568167409152.0, + "grad_norm": 0.026892838709900748, + "language_loss": 0.93768913, + "learning_rate": 0.0008220032939012349, + "loss": 0.94954944, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.93896484, + "step": 1554, + "time_per_iteration": 2.6793601512908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190641, + "balance_loss_mlp": 1.0965606, + "epoch": 0.29915352058484035, + "flos": 499835647488.0, + "grad_norm": 0.021647779244158522, + "language_loss": 0.95223451, + "learning_rate": 0.0008217648953297277, + "loss": 0.96414095, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.93994141, + "step": 1555, + "time_per_iteration": 2.836775779724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189405, + "balance_loss_mlp": 1.09546852, + "epoch": 0.2993459022701039, + "flos": 593214741504.0, + "grad_norm": 0.03843372955580003, + "language_loss": 0.88026905, + "learning_rate": 0.0008215263718380095, + "loss": 0.89216304, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.93847656, + "step": 1556, + "time_per_iteration": 2.6840782165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192028, + "balance_loss_mlp": 1.09790027, + "epoch": 0.29953828395536747, + "flos": 573472971264.0, + "grad_norm": 0.02697506762846426, + "language_loss": 0.95771539, + "learning_rate": 0.0008212877235186833, + "loss": 0.96963573, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.94042969, + "step": 1557, + "time_per_iteration": 2.649303674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216583, + "balance_loss_mlp": 1.12350464, + "epoch": 0.299730665640631, + "flos": 1508083637760.0, + "grad_norm": 0.01733611069553414, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78954148, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.9296875, + "step": 1558, + "time_per_iteration": 4.920740365982056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191809, + "balance_loss_mlp": 1.09772909, + "epoch": 0.2999230473258946, + "flos": 514807615488.0, + "grad_norm": 0.03091345134541536, + "language_loss": 0.92723, + "learning_rate": 0.0008208100527678611, + "loss": 0.93914807, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.93994141, + "step": 1559, + "time_per_iteration": 2.628755807876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191055, + "balance_loss_mlp": 1.09692788, + "epoch": 0.3001154290111581, + "flos": 835853973504.0, + "grad_norm": 0.03027255896835194, + "language_loss": 0.86836946, + "learning_rate": 0.0008205710305218135, + "loss": 0.88028002, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.94042969, + "step": 1560, + "time_per_iteration": 3.0076475143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188346, + "balance_loss_mlp": 1.09431422, + "epoch": 0.3003078106964217, + "flos": 557945051136.0, + "grad_norm": 0.023845762720508586, + "language_loss": 0.96495396, + "learning_rate": 0.0008203318838190541, + "loss": 0.9768374, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.93945312, + "step": 1561, + "time_per_iteration": 2.7329952716827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118952, + "balance_loss_mlp": 1.09548759, + "epoch": 0.30050019238168524, + "flos": 527168461824.0, + "grad_norm": 0.030147848994798797, + "language_loss": 0.95915771, + "learning_rate": 0.0008200926127524281, + "loss": 0.97105289, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.93945312, + "step": 1562, + "time_per_iteration": 2.625941753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186113, + "balance_loss_mlp": 1.09217656, + "epoch": 0.3006925740669488, + "flos": 578936987136.0, + "grad_norm": 0.02860364820877459, + "language_loss": 0.92538679, + "learning_rate": 0.0008198532174148289, + "loss": 0.93724799, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.93847656, + "step": 1563, + "time_per_iteration": 2.725884199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207901, + "balance_loss_mlp": 1.11539459, + "epoch": 0.3008849557522124, + "flos": 1493610499584.0, + "grad_norm": 0.014785027254047896, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8189407, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.92382812, + "step": 1564, + "time_per_iteration": 4.830730438232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.10398376, + "epoch": 0.30107733743747594, + "flos": 510824077824.0, + "grad_norm": 0.03423038852538926, + "language_loss": 0.994165, + "learning_rate": 0.0008193740542985244, + "loss": 1.00614524, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.93945312, + "step": 1565, + "time_per_iteration": 2.578756809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194051, + "balance_loss_mlp": 1.10020983, + "epoch": 0.30126971912273953, + "flos": 588820970496.0, + "grad_norm": 0.027351016206119898, + "language_loss": 0.95914042, + "learning_rate": 0.0008191342867058467, + "loss": 0.97108096, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.9375, + "step": 1566, + "time_per_iteration": 2.7046890258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192822, + "balance_loss_mlp": 1.09898102, + "epoch": 0.30146210080800306, + "flos": 603220248576.0, + "grad_norm": 0.029722715632080093, + "language_loss": 0.93181753, + "learning_rate": 0.0008188943952142509, + "loss": 0.94374579, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.9375, + "step": 1567, + "time_per_iteration": 2.7784945964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189204, + "balance_loss_mlp": 1.09588659, + "epoch": 0.30165448249326665, + "flos": 919286684160.0, + "grad_norm": 0.02698998287866622, + "language_loss": 0.91980577, + "learning_rate": 0.0008186543799168711, + "loss": 0.93169785, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.93212891, + "step": 1568, + "time_per_iteration": 3.1082897186279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188766, + "balance_loss_mlp": 1.09530556, + "epoch": 0.3018468641785302, + "flos": 778630164480.0, + "grad_norm": 0.02791954193910651, + "language_loss": 0.98386627, + "learning_rate": 0.0008184142409068892, + "loss": 0.99575394, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.93359375, + "step": 1569, + "time_per_iteration": 3.0047945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187793, + "balance_loss_mlp": 1.09433293, + "epoch": 0.30203924586379377, + "flos": 523389040128.0, + "grad_norm": 0.023468489537567368, + "language_loss": 0.94207543, + "learning_rate": 0.000818173978277536, + "loss": 0.95395339, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.93359375, + "step": 1570, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119455, + "balance_loss_mlp": 1.10094678, + "epoch": 0.3022316275490573, + "flos": 525649052160.0, + "grad_norm": 0.028721303316250762, + "language_loss": 0.92132497, + "learning_rate": 0.000817933592122089, + "loss": 0.93327045, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.93505859, + "step": 1571, + "time_per_iteration": 2.683819055557251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119426, + "balance_loss_mlp": 1.10037029, + "epoch": 0.3024240092343209, + "flos": 480872684544.0, + "grad_norm": 0.028034832338571278, + "language_loss": 0.93476671, + "learning_rate": 0.0008176930825338749, + "loss": 0.94670928, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.93798828, + "step": 1572, + "time_per_iteration": 2.5472469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.09605432, + "epoch": 0.3026163909195845, + "flos": 688430879232.0, + "grad_norm": 0.025848261804373458, + "language_loss": 0.98155606, + "learning_rate": 0.0008174524496062679, + "loss": 0.9934541, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.93652344, + "step": 1573, + "time_per_iteration": 2.90840482711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.0922308, + "epoch": 0.302808772604848, + "flos": 544086262272.0, + "grad_norm": 0.023993082839652336, + "language_loss": 0.9423182, + "learning_rate": 0.0008172116934326894, + "loss": 0.95417649, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.93505859, + "step": 1574, + "time_per_iteration": 2.735853433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197529, + "balance_loss_mlp": 1.10349655, + "epoch": 0.3030011542901116, + "flos": 476051215872.0, + "grad_norm": 0.025758910941944917, + "language_loss": 0.96492219, + "learning_rate": 0.0008169708141066097, + "loss": 0.97689748, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.93945312, + "step": 1575, + "time_per_iteration": 2.5468080043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195411, + "balance_loss_mlp": 1.10123575, + "epoch": 0.30319353597537513, + "flos": 482472685056.0, + "grad_norm": 0.02368764088299644, + "language_loss": 0.97863203, + "learning_rate": 0.0008167298117215465, + "loss": 0.99058616, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.94091797, + "step": 1576, + "time_per_iteration": 2.5703070163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191699, + "balance_loss_mlp": 1.09747636, + "epoch": 0.3033859176606387, + "flos": 706112750592.0, + "grad_norm": 0.02517452757559557, + "language_loss": 0.96809077, + "learning_rate": 0.0008164886863710649, + "loss": 0.98000777, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.94140625, + "step": 1577, + "time_per_iteration": 2.9235777854919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194461, + "balance_loss_mlp": 1.09990454, + "epoch": 0.30357829934590225, + "flos": 766108862976.0, + "grad_norm": 0.022389524212240816, + "language_loss": 0.93041158, + "learning_rate": 0.0008162474381487783, + "loss": 0.94235623, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.94482422, + "step": 1578, + "time_per_iteration": 3.0875654220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198648, + "balance_loss_mlp": 1.10399556, + "epoch": 0.30377068103116583, + "flos": 533448941568.0, + "grad_norm": 0.026496061930467673, + "language_loss": 0.94202471, + "learning_rate": 0.0008160060671483475, + "loss": 0.9540112, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.94580078, + "step": 1579, + "time_per_iteration": 2.69014048576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198759, + "balance_loss_mlp": 1.10415483, + "epoch": 0.3039630627164294, + "flos": 511223577600.0, + "grad_norm": 0.03174839578716906, + "language_loss": 0.93386602, + "learning_rate": 0.0008157645734634809, + "loss": 0.94585359, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.9453125, + "step": 1580, + "time_per_iteration": 2.602752923965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221184, + "balance_loss_mlp": 1.12791443, + "epoch": 0.30415544440169295, + "flos": 1509188084736.0, + "grad_norm": 0.0221653057193215, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78117669, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.93164062, + "step": 1581, + "time_per_iteration": 4.895219802856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.10334778, + "epoch": 0.30434782608695654, + "flos": 1461787133952.0, + "grad_norm": 0.012004742936218659, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74410546, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.92578125, + "step": 1582, + "time_per_iteration": 4.860503196716309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199297, + "balance_loss_mlp": 1.10526431, + "epoch": 0.3045402077722201, + "flos": 483534197760.0, + "grad_norm": 0.030796945736395555, + "language_loss": 0.93027633, + "learning_rate": 0.000815039357240067, + "loss": 0.94226933, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.93945312, + "step": 1583, + "time_per_iteration": 2.6209895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200124, + "balance_loss_mlp": 1.10613978, + "epoch": 0.30473258945748366, + "flos": 544626751488.0, + "grad_norm": 0.03019985050023197, + "language_loss": 0.95277119, + "learning_rate": 0.0008147973737554952, + "loss": 0.9647724, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.93896484, + "step": 1584, + "time_per_iteration": 2.7421703338623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194047, + "balance_loss_mlp": 1.10039604, + "epoch": 0.3049249711427472, + "flos": 568121746944.0, + "grad_norm": 0.05356410902969654, + "language_loss": 0.96138752, + "learning_rate": 0.000814555268055744, + "loss": 0.97332799, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.93554688, + "step": 1585, + "time_per_iteration": 2.632770299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191549, + "balance_loss_mlp": 1.09804094, + "epoch": 0.3051173528280108, + "flos": 529289485824.0, + "grad_norm": 0.02648444030223836, + "language_loss": 0.96492249, + "learning_rate": 0.0008143130402348073, + "loss": 0.97683799, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.93408203, + "step": 1586, + "time_per_iteration": 2.67673659324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201208, + "balance_loss_mlp": 1.10746217, + "epoch": 0.3053097345132743, + "flos": 587599002624.0, + "grad_norm": 0.026229801397330138, + "language_loss": 0.86860031, + "learning_rate": 0.0008140706903867265, + "loss": 0.88061237, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.93652344, + "step": 1587, + "time_per_iteration": 2.800891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198977, + "balance_loss_mlp": 1.10518289, + "epoch": 0.3055021161985379, + "flos": 608200171008.0, + "grad_norm": 0.031935519152889405, + "language_loss": 1.00360334, + "learning_rate": 0.0008138282186055897, + "loss": 1.01559317, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.93701172, + "step": 1588, + "time_per_iteration": 2.735144853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119001, + "balance_loss_mlp": 1.09645426, + "epoch": 0.3056944978838015, + "flos": 574962181632.0, + "grad_norm": 0.02354328369726863, + "language_loss": 0.90634608, + "learning_rate": 0.0008135856249855331, + "loss": 0.91824615, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.93457031, + "step": 1589, + "time_per_iteration": 2.676589012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193478, + "balance_loss_mlp": 1.0996846, + "epoch": 0.305886879569065, + "flos": 635071085568.0, + "grad_norm": 0.031037281782467684, + "language_loss": 0.99387443, + "learning_rate": 0.0008133429096207398, + "loss": 1.00580931, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.93701172, + "step": 1590, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232346, + "balance_loss_mlp": 1.14117432, + "epoch": 0.3060792612543286, + "flos": 1372131065856.0, + "grad_norm": 0.03086145734446917, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76544607, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.91015625, + "step": 1591, + "time_per_iteration": 4.945107460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194266, + "balance_loss_mlp": 1.10051942, + "epoch": 0.30627164293959214, + "flos": 519618350592.0, + "grad_norm": 0.024964882972055902, + "language_loss": 0.95062864, + "learning_rate": 0.0008128571140339123, + "loss": 0.96257126, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.93652344, + "step": 1592, + "time_per_iteration": 2.6392171382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201642, + "balance_loss_mlp": 1.10780036, + "epoch": 0.3064640246248557, + "flos": 456533027328.0, + "grad_norm": 0.029487227531667784, + "language_loss": 0.98122042, + "learning_rate": 0.0008126140340004805, + "loss": 0.9932369, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.9375, + "step": 1593, + "time_per_iteration": 2.504150629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199461, + "balance_loss_mlp": 1.10561943, + "epoch": 0.30665640631011926, + "flos": 851608203264.0, + "grad_norm": 0.026956571268616787, + "language_loss": 0.91923594, + "learning_rate": 0.0008123708325995172, + "loss": 0.93123049, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.9375, + "step": 1594, + "time_per_iteration": 3.184525489807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190831, + "balance_loss_mlp": 1.09713268, + "epoch": 0.30684878799538284, + "flos": 759615535104.0, + "grad_norm": 0.022474213305982697, + "language_loss": 0.88990366, + "learning_rate": 0.0008121275099254414, + "loss": 0.90181196, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.93603516, + "step": 1595, + "time_per_iteration": 2.892902374267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200579, + "balance_loss_mlp": 1.10668933, + "epoch": 0.3070411696806464, + "flos": 518595769344.0, + "grad_norm": 0.025855927391394404, + "language_loss": 0.96650064, + "learning_rate": 0.0008118840660727194, + "loss": 0.97850645, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.93798828, + "step": 1596, + "time_per_iteration": 2.696312665939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191708, + "balance_loss_mlp": 1.09805715, + "epoch": 0.30723355136590996, + "flos": 845790349824.0, + "grad_norm": 0.023513083336694603, + "language_loss": 0.94521677, + "learning_rate": 0.0008116405011358644, + "loss": 0.95713389, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.93554688, + "step": 1597, + "time_per_iteration": 3.1500890254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118938, + "balance_loss_mlp": 1.09572959, + "epoch": 0.30742593305117355, + "flos": 467079023616.0, + "grad_norm": 0.024597056369147573, + "language_loss": 0.89059556, + "learning_rate": 0.0008113968152094369, + "loss": 0.90248942, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.93554688, + "step": 1598, + "time_per_iteration": 2.502336263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191327, + "balance_loss_mlp": 1.09781969, + "epoch": 0.3076183147364371, + "flos": 687816529920.0, + "grad_norm": 0.025330429780868927, + "language_loss": 0.90385377, + "learning_rate": 0.0008111530083880438, + "loss": 0.91576707, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.93408203, + "step": 1599, + "time_per_iteration": 2.8846051692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192126, + "balance_loss_mlp": 1.09847498, + "epoch": 0.30781069642170067, + "flos": 615179593728.0, + "grad_norm": 0.02627563558110635, + "language_loss": 0.95310938, + "learning_rate": 0.0008109090807663399, + "loss": 0.96503073, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.93554688, + "step": 1600, + "time_per_iteration": 2.8132736682891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119763, + "balance_loss_mlp": 1.10402679, + "epoch": 0.3080030781069642, + "flos": 591508680192.0, + "grad_norm": 0.027223292643472258, + "language_loss": 0.96310741, + "learning_rate": 0.0008106650324390257, + "loss": 0.97508371, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.93505859, + "step": 1601, + "time_per_iteration": 2.8477296829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188215, + "balance_loss_mlp": 1.0948981, + "epoch": 0.3081954597922278, + "flos": 563691045888.0, + "grad_norm": 0.027322987260225157, + "language_loss": 0.89918464, + "learning_rate": 0.0008104208635008493, + "loss": 0.91106677, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.93212891, + "step": 1602, + "time_per_iteration": 2.6639676094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192245, + "balance_loss_mlp": 1.09859383, + "epoch": 0.3083878414774913, + "flos": 448761335808.0, + "grad_norm": 0.031035394068971153, + "language_loss": 0.93496901, + "learning_rate": 0.0008101765740466058, + "loss": 0.94689143, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.93554688, + "step": 1603, + "time_per_iteration": 2.4892899990081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.09465039, + "epoch": 0.3085802231627549, + "flos": 494544821760.0, + "grad_norm": 0.029709960428380106, + "language_loss": 0.93853128, + "learning_rate": 0.0008099321641711364, + "loss": 0.95041513, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.93652344, + "step": 1604, + "time_per_iteration": 2.638798952102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011875, + "balance_loss_mlp": 1.09380174, + "epoch": 0.3087726048480185, + "flos": 488690038272.0, + "grad_norm": 0.02367908107469003, + "language_loss": 0.91951108, + "learning_rate": 0.0008096876339693295, + "loss": 0.93138611, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.93603516, + "step": 1605, + "time_per_iteration": 2.6115643978118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189736, + "balance_loss_mlp": 1.09603786, + "epoch": 0.308964986533282, + "flos": 731887223808.0, + "grad_norm": 0.029121548764615916, + "language_loss": 0.90058184, + "learning_rate": 0.0008094429835361206, + "loss": 0.91247922, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.93603516, + "step": 1606, + "time_per_iteration": 2.9361119270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185725, + "balance_loss_mlp": 1.09226441, + "epoch": 0.3091573682185456, + "flos": 606515576832.0, + "grad_norm": 0.024539043330914945, + "language_loss": 0.94318593, + "learning_rate": 0.0008091982129664908, + "loss": 0.95504314, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.93359375, + "step": 1607, + "time_per_iteration": 2.750641345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191863, + "balance_loss_mlp": 1.09821212, + "epoch": 0.30934974990380915, + "flos": 461306832384.0, + "grad_norm": 0.02635007664096696, + "language_loss": 0.92281848, + "learning_rate": 0.0008089533223554687, + "loss": 0.93473709, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.93554688, + "step": 1608, + "time_per_iteration": 2.733422040939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187457, + "balance_loss_mlp": 1.09380579, + "epoch": 0.30954213158907273, + "flos": 554567130624.0, + "grad_norm": 0.025571984513822792, + "language_loss": 0.94345558, + "learning_rate": 0.0008087083117981294, + "loss": 0.95533013, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.93554688, + "step": 1609, + "time_per_iteration": 2.919583797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189683, + "balance_loss_mlp": 1.09665251, + "epoch": 0.30973451327433627, + "flos": 554113236480.0, + "grad_norm": 0.028700236773969223, + "language_loss": 0.98730469, + "learning_rate": 0.0008084631813895943, + "loss": 0.99920154, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.92919922, + "step": 1610, + "time_per_iteration": 2.7721197605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192773, + "balance_loss_mlp": 1.09955156, + "epoch": 0.30992689495959985, + "flos": 566762792448.0, + "grad_norm": 0.027612542910463767, + "language_loss": 0.93469882, + "learning_rate": 0.0008082179312250315, + "loss": 0.94662654, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.93115234, + "step": 1611, + "time_per_iteration": 2.658564805984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219437, + "balance_loss_mlp": 1.12769318, + "epoch": 0.3101192766448634, + "flos": 1445560270848.0, + "grad_norm": 0.021240149379623804, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81075287, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.91601562, + "step": 1612, + "time_per_iteration": 4.8431174755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.13497162, + "epoch": 0.31031165833012697, + "flos": 1535127742464.0, + "grad_norm": 0.019393089292119553, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77856624, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.921875, + "step": 1613, + "time_per_iteration": 5.043596029281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191397, + "balance_loss_mlp": 1.09850931, + "epoch": 0.31050404001539056, + "flos": 993632409600.0, + "grad_norm": 0.029090005547288914, + "language_loss": 0.90590245, + "learning_rate": 0.0008074814631475545, + "loss": 0.91781646, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.92773438, + "step": 1614, + "time_per_iteration": 3.3308844566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011972, + "balance_loss_mlp": 1.10450339, + "epoch": 0.3106964217006541, + "flos": 446972682240.0, + "grad_norm": 0.029174032275502568, + "language_loss": 0.8959738, + "learning_rate": 0.0008072357349114907, + "loss": 0.90794587, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.92578125, + "step": 1615, + "time_per_iteration": 2.660557746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194484, + "balance_loss_mlp": 1.10169172, + "epoch": 0.3108888033859177, + "flos": 511494822912.0, + "grad_norm": 0.027617375290548026, + "language_loss": 0.9836188, + "learning_rate": 0.0008069898873959363, + "loss": 0.99556363, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.92675781, + "step": 1616, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203555, + "balance_loss_mlp": 1.11076295, + "epoch": 0.3110811850711812, + "flos": 521778306048.0, + "grad_norm": 0.027380341091067188, + "language_loss": 0.94434142, + "learning_rate": 0.0008067439206963375, + "loss": 0.95637697, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.92675781, + "step": 1617, + "time_per_iteration": 2.6584017276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120371, + "balance_loss_mlp": 1.11082232, + "epoch": 0.3112735667564448, + "flos": 687729934848.0, + "grad_norm": 0.029016410329411102, + "language_loss": 0.95023614, + "learning_rate": 0.0008064978349081873, + "loss": 0.96227324, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.92773438, + "step": 1618, + "time_per_iteration": 2.911677122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199948, + "balance_loss_mlp": 1.10720289, + "epoch": 0.31146594844170833, + "flos": 534165348864.0, + "grad_norm": 0.025439718165996668, + "language_loss": 0.95660365, + "learning_rate": 0.0008062516301270245, + "loss": 0.96860307, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.92626953, + "step": 1619, + "time_per_iteration": 2.669111490249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196196, + "balance_loss_mlp": 1.10388064, + "epoch": 0.3116583301269719, + "flos": 680841836544.0, + "grad_norm": 0.024218225399572888, + "language_loss": 0.96279341, + "learning_rate": 0.0008060053064484343, + "loss": 0.97475541, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.921875, + "step": 1620, + "time_per_iteration": 2.924476385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189886, + "balance_loss_mlp": 1.09733212, + "epoch": 0.31185071181223545, + "flos": 587329758720.0, + "grad_norm": 0.02529679167102671, + "language_loss": 0.92711556, + "learning_rate": 0.0008057588639680482, + "loss": 0.93901443, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.92431641, + "step": 1621, + "time_per_iteration": 2.74631667137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119125, + "balance_loss_mlp": 1.09817135, + "epoch": 0.31204309349749904, + "flos": 726657523200.0, + "grad_norm": 0.03522846239796161, + "language_loss": 0.93884659, + "learning_rate": 0.0008055123027815434, + "loss": 0.95075905, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.9296875, + "step": 1622, + "time_per_iteration": 2.90444016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189249, + "balance_loss_mlp": 1.09631383, + "epoch": 0.3122354751827626, + "flos": 577894940160.0, + "grad_norm": 0.026492717763192643, + "language_loss": 0.93252558, + "learning_rate": 0.0008052656229846436, + "loss": 0.94441813, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.92822266, + "step": 1623, + "time_per_iteration": 2.680220603942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09519064, + "epoch": 0.31242785686802615, + "flos": 577028811264.0, + "grad_norm": 0.026617450345468772, + "language_loss": 1.00026262, + "learning_rate": 0.0008050188246731182, + "loss": 1.01214242, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.92675781, + "step": 1624, + "time_per_iteration": 2.6526694297790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190099, + "balance_loss_mlp": 1.09711611, + "epoch": 0.31262023855328974, + "flos": 738195901440.0, + "grad_norm": 0.023806346866415393, + "language_loss": 0.9048847, + "learning_rate": 0.0008047719079427834, + "loss": 0.91678566, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.92871094, + "step": 1625, + "time_per_iteration": 3.0077152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119944, + "balance_loss_mlp": 1.108078, + "epoch": 0.3128126202385533, + "flos": 1562591539200.0, + "grad_norm": 0.020013754894949238, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.7555114, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.91210938, + "step": 1626, + "time_per_iteration": 4.793031215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194528, + "balance_loss_mlp": 1.10111523, + "epoch": 0.31300500192381686, + "flos": 515942988288.0, + "grad_norm": 0.023349922932092686, + "language_loss": 0.95821261, + "learning_rate": 0.0008042777196091757, + "loss": 0.97015792, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.93310547, + "step": 1627, + "time_per_iteration": 2.679588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196127, + "balance_loss_mlp": 1.10281038, + "epoch": 0.3131973836090804, + "flos": 527661287424.0, + "grad_norm": 0.026058472156191805, + "language_loss": 0.91163933, + "learning_rate": 0.0008040304481977643, + "loss": 0.92360055, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.93212891, + "step": 1628, + "time_per_iteration": 2.6339213848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.11335361, + "epoch": 0.313389765294344, + "flos": 824209534464.0, + "grad_norm": 0.028324849871922998, + "language_loss": 0.96729648, + "learning_rate": 0.0008037830587512649, + "loss": 0.97936368, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.93261719, + "step": 1629, + "time_per_iteration": 3.052304744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191904, + "balance_loss_mlp": 1.09896827, + "epoch": 0.31358214697960757, + "flos": 394702599168.0, + "grad_norm": 0.026724204555937114, + "language_loss": 0.89292234, + "learning_rate": 0.0008035355513657224, + "loss": 0.90484136, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.92822266, + "step": 1630, + "time_per_iteration": 2.470526695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198859, + "balance_loss_mlp": 1.1059711, + "epoch": 0.3137745286648711, + "flos": 573097666560.0, + "grad_norm": 0.025006494531642755, + "language_loss": 1.00651205, + "learning_rate": 0.0008032879261372279, + "loss": 1.01850057, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.92773438, + "step": 1631, + "time_per_iteration": 2.7967746257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194023, + "balance_loss_mlp": 1.10418701, + "epoch": 0.3139669103501347, + "flos": 1501629241344.0, + "grad_norm": 0.01894627505164378, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80829865, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.89648438, + "step": 1632, + "time_per_iteration": 5.690793991088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187718, + "balance_loss_mlp": 1.09478259, + "epoch": 0.3141592920353982, + "flos": 526358728704.0, + "grad_norm": 0.023739615719740217, + "language_loss": 0.94780874, + "learning_rate": 0.0008027923225359748, + "loss": 0.95968592, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.92822266, + "step": 1633, + "time_per_iteration": 2.619640827178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182027, + "balance_loss_mlp": 1.08894837, + "epoch": 0.3143516737206618, + "flos": 594387044352.0, + "grad_norm": 0.024020227962995952, + "language_loss": 0.97166598, + "learning_rate": 0.0008025443443556267, + "loss": 0.98348624, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.9296875, + "step": 1634, + "time_per_iteration": 2.7105367183685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187192, + "balance_loss_mlp": 1.09397042, + "epoch": 0.31454405540592534, + "flos": 649679208960.0, + "grad_norm": 0.024579905610689918, + "language_loss": 0.95561564, + "learning_rate": 0.000802296248717147, + "loss": 0.96748757, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.93115234, + "step": 1635, + "time_per_iteration": 2.954427480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189389, + "balance_loss_mlp": 1.09616756, + "epoch": 0.3147364370911889, + "flos": 644069474304.0, + "grad_norm": 0.026460377875643523, + "language_loss": 0.89723325, + "learning_rate": 0.0008020480357168554, + "loss": 0.90912724, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.93115234, + "step": 1636, + "time_per_iteration": 2.7983195781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118902, + "balance_loss_mlp": 1.09575093, + "epoch": 0.31492881877645246, + "flos": 472821015552.0, + "grad_norm": 0.024118652497695542, + "language_loss": 0.95980144, + "learning_rate": 0.0008017997054511165, + "loss": 0.97169161, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.93164062, + "step": 1637, + "time_per_iteration": 2.543381690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188761, + "balance_loss_mlp": 1.09544361, + "epoch": 0.31512120046171604, + "flos": 630629650944.0, + "grad_norm": 0.026442486928658162, + "language_loss": 0.94192296, + "learning_rate": 0.0008015512580163407, + "loss": 0.95381057, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.93212891, + "step": 1638, + "time_per_iteration": 2.8069217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189537, + "balance_loss_mlp": 1.09645832, + "epoch": 0.31531358214697963, + "flos": 705053239296.0, + "grad_norm": 0.0247809696854931, + "language_loss": 0.89687169, + "learning_rate": 0.0008013026935089838, + "loss": 0.9087671, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.9296875, + "step": 1639, + "time_per_iteration": 2.8575150966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189099, + "balance_loss_mlp": 1.09592521, + "epoch": 0.31550596383224316, + "flos": 573631425024.0, + "grad_norm": 0.026868409426578303, + "language_loss": 0.92173505, + "learning_rate": 0.0008010540120255472, + "loss": 0.93362606, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.93066406, + "step": 1640, + "time_per_iteration": 2.6781005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118909, + "balance_loss_mlp": 1.09591639, + "epoch": 0.31569834551750675, + "flos": 659512800768.0, + "grad_norm": 0.03030176261580671, + "language_loss": 0.95734656, + "learning_rate": 0.0008008052136625774, + "loss": 0.96923745, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.93066406, + "step": 1641, + "time_per_iteration": 2.8858654499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192627, + "balance_loss_mlp": 1.09950101, + "epoch": 0.3158907272027703, + "flos": 567403338240.0, + "grad_norm": 0.026165343030711524, + "language_loss": 0.94310361, + "learning_rate": 0.0008005562985166666, + "loss": 0.9550299, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.93017578, + "step": 1642, + "time_per_iteration": 2.7097506523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193912, + "balance_loss_mlp": 1.10102403, + "epoch": 0.31608310888803387, + "flos": 537972968448.0, + "grad_norm": 0.020568762002796243, + "language_loss": 0.9172346, + "learning_rate": 0.0008003072666844524, + "loss": 0.92917377, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.92773438, + "step": 1643, + "time_per_iteration": 2.6982197761535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194419, + "balance_loss_mlp": 1.10181749, + "epoch": 0.3162754905732974, + "flos": 487639259136.0, + "grad_norm": 0.02816029335024998, + "language_loss": 0.90344775, + "learning_rate": 0.0008000581182626173, + "loss": 0.91539198, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.92480469, + "step": 1644, + "time_per_iteration": 2.546762466430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193569, + "balance_loss_mlp": 1.10048997, + "epoch": 0.316467872258561, + "flos": 531095603712.0, + "grad_norm": 0.024394566764596542, + "language_loss": 0.93082815, + "learning_rate": 0.0007998088533478894, + "loss": 0.94276381, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.9296875, + "step": 1645, + "time_per_iteration": 2.6320817470550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188922, + "balance_loss_mlp": 1.09622455, + "epoch": 0.3166602539438245, + "flos": 444413227008.0, + "grad_norm": 0.029455070645316363, + "language_loss": 0.9479661, + "learning_rate": 0.000799559472037042, + "loss": 0.95985526, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.92578125, + "step": 1646, + "time_per_iteration": 2.535414457321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187182, + "balance_loss_mlp": 1.09458041, + "epoch": 0.3168526356290881, + "flos": 647102289408.0, + "grad_norm": 0.02168302123393663, + "language_loss": 0.94649625, + "learning_rate": 0.0007993099744268932, + "loss": 0.95836812, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.92480469, + "step": 1647, + "time_per_iteration": 2.912095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182437, + "balance_loss_mlp": 1.08988261, + "epoch": 0.3170450173143517, + "flos": 587257900032.0, + "grad_norm": 0.023943172344495993, + "language_loss": 0.96008313, + "learning_rate": 0.000799060360614307, + "loss": 0.97190744, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.92431641, + "step": 1648, + "time_per_iteration": 2.6763339042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.09482586, + "epoch": 0.3172373989996152, + "flos": 828573106176.0, + "grad_norm": 0.025050943971751935, + "language_loss": 0.91967106, + "learning_rate": 0.0007988106306961917, + "loss": 0.93154484, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.92431641, + "step": 1649, + "time_per_iteration": 3.1265392303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183645, + "balance_loss_mlp": 1.09151971, + "epoch": 0.3174297806848788, + "flos": 528434090496.0, + "grad_norm": 0.026893421102733506, + "language_loss": 0.92866611, + "learning_rate": 0.0007985607847695014, + "loss": 0.94050252, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.91992188, + "step": 1650, + "time_per_iteration": 2.640529155731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184032, + "balance_loss_mlp": 1.09152567, + "epoch": 0.31762216237014235, + "flos": 714481327104.0, + "grad_norm": 0.024008942139765378, + "language_loss": 0.9102264, + "learning_rate": 0.0007983108229312345, + "loss": 0.92206669, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.92382812, + "step": 1651, + "time_per_iteration": 2.890881299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183971, + "balance_loss_mlp": 1.09170341, + "epoch": 0.31781454405540593, + "flos": 484799826432.0, + "grad_norm": 0.027702532543066302, + "language_loss": 0.9509185, + "learning_rate": 0.0007980607452784351, + "loss": 0.96275818, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.92138672, + "step": 1652, + "time_per_iteration": 2.5693578720092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118418, + "balance_loss_mlp": 1.09186423, + "epoch": 0.31800692574066947, + "flos": 549804059136.0, + "grad_norm": 0.028510736103347943, + "language_loss": 0.99507928, + "learning_rate": 0.0007978105519081919, + "loss": 1.00692105, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.921875, + "step": 1653, + "time_per_iteration": 2.674062967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181597, + "balance_loss_mlp": 1.08947253, + "epoch": 0.31819930742593305, + "flos": 517916292096.0, + "grad_norm": 0.029899238666621586, + "language_loss": 0.96953475, + "learning_rate": 0.0007975602429176385, + "loss": 0.98135078, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.91992188, + "step": 1654, + "time_per_iteration": 2.595107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011812, + "balance_loss_mlp": 1.08907461, + "epoch": 0.31839168911119664, + "flos": 456969457152.0, + "grad_norm": 0.02327460697487094, + "language_loss": 0.90136862, + "learning_rate": 0.0007973098184039536, + "loss": 0.91318059, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.91992188, + "step": 1655, + "time_per_iteration": 2.654873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184047, + "balance_loss_mlp": 1.09192252, + "epoch": 0.3185840707964602, + "flos": 627295391232.0, + "grad_norm": 0.025652000789891626, + "language_loss": 0.955365, + "learning_rate": 0.0007970592784643602, + "loss": 0.96720552, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.91992188, + "step": 1656, + "time_per_iteration": 2.8485612869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183486, + "balance_loss_mlp": 1.09107482, + "epoch": 0.31877645248172376, + "flos": 568540712448.0, + "grad_norm": 0.02977939264047221, + "language_loss": 0.94253254, + "learning_rate": 0.0007968086231961272, + "loss": 0.9543674, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.92285156, + "step": 1657, + "time_per_iteration": 2.6949312686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182357, + "balance_loss_mlp": 1.09004128, + "epoch": 0.3189688341669873, + "flos": 490552551936.0, + "grad_norm": 0.03598298081414456, + "language_loss": 0.95643866, + "learning_rate": 0.0007965578526965671, + "loss": 0.96826226, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.921875, + "step": 1658, + "time_per_iteration": 2.5717341899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182583, + "balance_loss_mlp": 1.09012401, + "epoch": 0.3191612158522509, + "flos": 577380647424.0, + "grad_norm": 0.02594626841132509, + "language_loss": 0.93226576, + "learning_rate": 0.0007963069670630377, + "loss": 0.94409156, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.92333984, + "step": 1659, + "time_per_iteration": 2.7431960105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187517, + "balance_loss_mlp": 1.09486747, + "epoch": 0.3193535975375144, + "flos": 539192934912.0, + "grad_norm": 0.026552556196046555, + "language_loss": 0.97412628, + "learning_rate": 0.0007960559663929416, + "loss": 0.98600149, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.92529297, + "step": 1660, + "time_per_iteration": 2.631037473678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09382606, + "epoch": 0.319545979222778, + "flos": 735627714048.0, + "grad_norm": 0.022912970149823363, + "language_loss": 0.94840437, + "learning_rate": 0.0007958048507837259, + "loss": 0.96026772, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.92382812, + "step": 1661, + "time_per_iteration": 2.925752878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191424, + "balance_loss_mlp": 1.09872651, + "epoch": 0.31973836090804153, + "flos": 765767760384.0, + "grad_norm": 0.030797304976158044, + "language_loss": 0.98320282, + "learning_rate": 0.0007955536203328822, + "loss": 0.99511707, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.92578125, + "step": 1662, + "time_per_iteration": 2.9076955318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187513, + "balance_loss_mlp": 1.09486389, + "epoch": 0.3199307425933051, + "flos": 561741937152.0, + "grad_norm": 0.02511010738984868, + "language_loss": 0.90468192, + "learning_rate": 0.0007953022751379469, + "loss": 0.91655713, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.92529297, + "step": 1663, + "time_per_iteration": 2.7703394889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188156, + "balance_loss_mlp": 1.09564936, + "epoch": 0.3201231242785687, + "flos": 752671041024.0, + "grad_norm": 0.029121282383782986, + "language_loss": 0.92101777, + "learning_rate": 0.000795050815296501, + "loss": 0.93289936, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.92382812, + "step": 1664, + "time_per_iteration": 2.966632843017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188504, + "balance_loss_mlp": 1.0960933, + "epoch": 0.32031550596383224, + "flos": 497384254464.0, + "grad_norm": 0.02307975398987516, + "language_loss": 1.00050378, + "learning_rate": 0.0007947992409061695, + "loss": 1.01238883, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.92285156, + "step": 1665, + "time_per_iteration": 2.6264171600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193124, + "balance_loss_mlp": 1.10080826, + "epoch": 0.3205078876490958, + "flos": 732874876416.0, + "grad_norm": 0.02454331261307917, + "language_loss": 0.93550396, + "learning_rate": 0.0007945475520646226, + "loss": 0.9474352, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.921875, + "step": 1666, + "time_per_iteration": 2.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191587, + "balance_loss_mlp": 1.09941399, + "epoch": 0.32070026933435936, + "flos": 550474804224.0, + "grad_norm": 0.02796219722650757, + "language_loss": 0.9429689, + "learning_rate": 0.0007942957488695743, + "loss": 0.95488477, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.92041016, + "step": 1667, + "time_per_iteration": 2.621396780014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186724, + "balance_loss_mlp": 1.09421742, + "epoch": 0.32089265101962294, + "flos": 746684000256.0, + "grad_norm": 0.022875326013334737, + "language_loss": 0.87680244, + "learning_rate": 0.0007940438314187833, + "loss": 0.88866973, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.92382812, + "step": 1668, + "time_per_iteration": 3.0475997924804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187112, + "balance_loss_mlp": 1.0947485, + "epoch": 0.3210850327048865, + "flos": 495196101120.0, + "grad_norm": 0.03400858364934581, + "language_loss": 0.88502395, + "learning_rate": 0.0007937917998100529, + "loss": 0.89689511, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.92236328, + "step": 1669, + "time_per_iteration": 2.6158430576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188853, + "balance_loss_mlp": 1.09658515, + "epoch": 0.32127741439015006, + "flos": 531673022976.0, + "grad_norm": 0.029937804889017615, + "language_loss": 0.92354518, + "learning_rate": 0.0007935396541412302, + "loss": 0.93543375, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.92138672, + "step": 1670, + "time_per_iteration": 2.6148414611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188159, + "balance_loss_mlp": 1.09589148, + "epoch": 0.3214697960754136, + "flos": 502223187456.0, + "grad_norm": 0.027719397006423088, + "language_loss": 0.94146281, + "learning_rate": 0.0007932873945102068, + "loss": 0.95334446, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.92138672, + "step": 1671, + "time_per_iteration": 2.5756680965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189911, + "balance_loss_mlp": 1.09950256, + "epoch": 0.3216621777606772, + "flos": 1386402089472.0, + "grad_norm": 0.015471737686433536, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76951689, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.90234375, + "step": 1672, + "time_per_iteration": 4.848818778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.08975732, + "epoch": 0.32185455944594077, + "flos": 572635040256.0, + "grad_norm": 0.021338606013939526, + "language_loss": 0.94597888, + "learning_rate": 0.0007927825337533461, + "loss": 0.95779347, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.91552734, + "step": 1673, + "time_per_iteration": 2.6742517948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181114, + "balance_loss_mlp": 1.08975172, + "epoch": 0.3220469411312043, + "flos": 544936928256.0, + "grad_norm": 0.029706455848313437, + "language_loss": 0.9645716, + "learning_rate": 0.0007925299328235131, + "loss": 0.97638273, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.91210938, + "step": 1674, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182375, + "balance_loss_mlp": 1.09101272, + "epoch": 0.3222393228164679, + "flos": 492161284608.0, + "grad_norm": 0.02873592636128419, + "language_loss": 0.969607, + "learning_rate": 0.000792277218323488, + "loss": 0.98143071, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.91210938, + "step": 1675, + "time_per_iteration": 2.589118719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182718, + "balance_loss_mlp": 1.0914042, + "epoch": 0.3224317045017314, + "flos": 491362285056.0, + "grad_norm": 0.026517432951267347, + "language_loss": 0.94174361, + "learning_rate": 0.0007920243903513833, + "loss": 0.95357084, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.91162109, + "step": 1676, + "time_per_iteration": 2.5541775226593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08832622, + "epoch": 0.322624086186995, + "flos": 576870357504.0, + "grad_norm": 0.028460659829427477, + "language_loss": 0.94868386, + "learning_rate": 0.0007917714490053556, + "loss": 0.96047986, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.91113281, + "step": 1677, + "time_per_iteration": 2.685833215713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.10454535, + "epoch": 0.32281646787225854, + "flos": 630571253760.0, + "grad_norm": 0.02861547850998442, + "language_loss": 0.93624204, + "learning_rate": 0.0007915183943836055, + "loss": 0.94820398, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.91503906, + "step": 1678, + "time_per_iteration": 2.8957157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184806, + "balance_loss_mlp": 1.09363461, + "epoch": 0.3230088495575221, + "flos": 782807084544.0, + "grad_norm": 0.029736135795599906, + "language_loss": 0.92990124, + "learning_rate": 0.0007912652265843773, + "loss": 0.94174933, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.91015625, + "step": 1679, + "time_per_iteration": 3.0256145000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187663, + "balance_loss_mlp": 1.09620523, + "epoch": 0.3232012312427857, + "flos": 537200165376.0, + "grad_norm": 0.0299548546326655, + "language_loss": 0.88938797, + "learning_rate": 0.0007910119457059597, + "loss": 0.90126455, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.91308594, + "step": 1680, + "time_per_iteration": 2.7195773124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118719, + "balance_loss_mlp": 1.09601843, + "epoch": 0.32339361292804925, + "flos": 706232272896.0, + "grad_norm": 0.03079987155163935, + "language_loss": 0.89790422, + "learning_rate": 0.0007907585518466849, + "loss": 0.90977609, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.91015625, + "step": 1681, + "time_per_iteration": 2.9635961055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186411, + "balance_loss_mlp": 1.09523988, + "epoch": 0.32358599461331283, + "flos": 453257164800.0, + "grad_norm": 0.027692195030378806, + "language_loss": 0.99450397, + "learning_rate": 0.000790505045104929, + "loss": 1.00636816, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.91015625, + "step": 1682, + "time_per_iteration": 2.5084030628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186896, + "balance_loss_mlp": 1.09553456, + "epoch": 0.32377837629857636, + "flos": 602091606528.0, + "grad_norm": 0.028152445524849662, + "language_loss": 0.96712899, + "learning_rate": 0.0007902514255791125, + "loss": 0.97899795, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.91210938, + "step": 1683, + "time_per_iteration": 2.7732536792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185338, + "balance_loss_mlp": 1.09388101, + "epoch": 0.32397075798383995, + "flos": 808898465280.0, + "grad_norm": 0.02645952871958238, + "language_loss": 0.9579218, + "learning_rate": 0.0007899976933676986, + "loss": 0.9697752, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.91308594, + "step": 1684, + "time_per_iteration": 2.985987424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184012, + "balance_loss_mlp": 1.09274495, + "epoch": 0.3241631396691035, + "flos": 602792550912.0, + "grad_norm": 0.02682215462305332, + "language_loss": 0.96423018, + "learning_rate": 0.0007897438485691955, + "loss": 0.97607034, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.91113281, + "step": 1685, + "time_per_iteration": 2.673083543777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185177, + "balance_loss_mlp": 1.09386301, + "epoch": 0.32435552135436707, + "flos": 475176354816.0, + "grad_norm": 0.030260846574811467, + "language_loss": 0.93327641, + "learning_rate": 0.0007894898912821542, + "loss": 0.9451282, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.91162109, + "step": 1686, + "time_per_iteration": 2.526704788208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181419, + "balance_loss_mlp": 1.09015274, + "epoch": 0.3245479030396306, + "flos": 539219131392.0, + "grad_norm": 0.02519584895765407, + "language_loss": 0.95407552, + "learning_rate": 0.0007892358216051695, + "loss": 0.96588969, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.91113281, + "step": 1687, + "time_per_iteration": 2.718292713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186611, + "balance_loss_mlp": 1.09543955, + "epoch": 0.3247402847248942, + "flos": 548696884224.0, + "grad_norm": 0.02873183694146744, + "language_loss": 1.00761271, + "learning_rate": 0.0007889816396368803, + "loss": 1.0194788, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.91015625, + "step": 1688, + "time_per_iteration": 2.6112852096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179714, + "balance_loss_mlp": 1.08835161, + "epoch": 0.3249326664101578, + "flos": 378992030208.0, + "grad_norm": 0.0263136625306578, + "language_loss": 0.95246112, + "learning_rate": 0.0007887273454759687, + "loss": 0.96425825, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.91210938, + "step": 1689, + "time_per_iteration": 2.466093063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185248, + "balance_loss_mlp": 1.09407663, + "epoch": 0.3251250480954213, + "flos": 529122299904.0, + "grad_norm": 0.02633136368880149, + "language_loss": 0.91763788, + "learning_rate": 0.0007884729392211603, + "loss": 0.92949039, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.91015625, + "step": 1690, + "time_per_iteration": 2.633387804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182102, + "balance_loss_mlp": 1.09054887, + "epoch": 0.3253174297806849, + "flos": 450558721536.0, + "grad_norm": 0.03256384134880849, + "language_loss": 0.96271229, + "learning_rate": 0.0007882184209712245, + "loss": 0.97453332, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.9140625, + "step": 1691, + "time_per_iteration": 2.511629104614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183951, + "balance_loss_mlp": 1.09239864, + "epoch": 0.32550981146594843, + "flos": 705489669120.0, + "grad_norm": 0.02306884235196454, + "language_loss": 0.92818689, + "learning_rate": 0.000787963790824974, + "loss": 0.9400264, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.9140625, + "step": 1692, + "time_per_iteration": 2.953939914703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118506, + "balance_loss_mlp": 1.0935545, + "epoch": 0.325702193151212, + "flos": 393558494208.0, + "grad_norm": 0.026666894987577915, + "language_loss": 0.98025191, + "learning_rate": 0.0007877090488812651, + "loss": 0.9921025, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.91357422, + "step": 1693, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178009, + "balance_loss_mlp": 1.08659911, + "epoch": 0.32589457483647555, + "flos": 578583149568.0, + "grad_norm": 0.029080232987036207, + "language_loss": 0.92532402, + "learning_rate": 0.0007874541952389973, + "loss": 0.93710411, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.91259766, + "step": 1694, + "time_per_iteration": 2.660390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179003, + "balance_loss_mlp": 1.08792675, + "epoch": 0.32608695652173914, + "flos": 499329360384.0, + "grad_norm": 0.023433013698769337, + "language_loss": 0.93903476, + "learning_rate": 0.0007871992299971136, + "loss": 0.9508248, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.90917969, + "step": 1695, + "time_per_iteration": 2.5506269931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179394, + "balance_loss_mlp": 1.08822274, + "epoch": 0.32627933820700267, + "flos": 592300948992.0, + "grad_norm": 0.02355558557065364, + "language_loss": 0.91491008, + "learning_rate": 0.0007869441532546001, + "loss": 0.92670405, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.91015625, + "step": 1696, + "time_per_iteration": 2.7493326663970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177542, + "balance_loss_mlp": 1.08618009, + "epoch": 0.32647171989226625, + "flos": 610273531392.0, + "grad_norm": 0.02705729718991907, + "language_loss": 0.87004846, + "learning_rate": 0.0007866889651104867, + "loss": 0.8818239, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.91210938, + "step": 1697, + "time_per_iteration": 2.7824432849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179221, + "balance_loss_mlp": 1.08785892, + "epoch": 0.32666410157752984, + "flos": 478189704192.0, + "grad_norm": 0.028152017440838794, + "language_loss": 0.94142878, + "learning_rate": 0.000786433665663846, + "loss": 0.95322108, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.91210938, + "step": 1698, + "time_per_iteration": 2.6674411296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187877, + "balance_loss_mlp": 1.09670568, + "epoch": 0.3268564832627934, + "flos": 719693563392.0, + "grad_norm": 0.040459779361444057, + "language_loss": 0.95728016, + "learning_rate": 0.0007861782550137942, + "loss": 0.96915889, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.91015625, + "step": 1699, + "time_per_iteration": 2.923370599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187429, + "balance_loss_mlp": 1.09625793, + "epoch": 0.32704886494805696, + "flos": 770105135616.0, + "grad_norm": 0.025720199745930695, + "language_loss": 0.93479955, + "learning_rate": 0.0007859227332594901, + "loss": 0.94667387, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.91015625, + "step": 1700, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191948, + "balance_loss_mlp": 1.10120583, + "epoch": 0.3272412466333205, + "flos": 851404087296.0, + "grad_norm": 0.0329500691508657, + "language_loss": 0.94768298, + "learning_rate": 0.0007856671005001365, + "loss": 0.95960248, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.90576172, + "step": 1701, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118211, + "balance_loss_mlp": 1.09065294, + "epoch": 0.3274336283185841, + "flos": 833040737280.0, + "grad_norm": 0.029774404200988806, + "language_loss": 0.90405869, + "learning_rate": 0.0007854113568349787, + "loss": 0.91587985, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.91308594, + "step": 1702, + "time_per_iteration": 3.107083559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186026, + "balance_loss_mlp": 1.09471202, + "epoch": 0.3276260100038476, + "flos": 693252347904.0, + "grad_norm": 0.029328613393929583, + "language_loss": 0.89606428, + "learning_rate": 0.0007851555023633052, + "loss": 0.90792453, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.91162109, + "step": 1703, + "time_per_iteration": 2.8335254192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011877, + "balance_loss_mlp": 1.09643364, + "epoch": 0.3278183916891112, + "flos": 436977908736.0, + "grad_norm": 0.03479764223743197, + "language_loss": 0.91987431, + "learning_rate": 0.0007848995371844474, + "loss": 0.93175125, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.91113281, + "step": 1704, + "time_per_iteration": 2.51261043548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118827, + "balance_loss_mlp": 1.09728956, + "epoch": 0.3280107733743748, + "flos": 462016508928.0, + "grad_norm": 0.027955151013136243, + "language_loss": 0.90236068, + "learning_rate": 0.0007846434613977801, + "loss": 0.91424334, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.90820312, + "step": 1705, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185335, + "balance_loss_mlp": 1.09464061, + "epoch": 0.3282031550596383, + "flos": 680528931840.0, + "grad_norm": 0.0285448105624817, + "language_loss": 0.86403298, + "learning_rate": 0.0007843872751027203, + "loss": 0.87588632, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.90527344, + "step": 1706, + "time_per_iteration": 2.7977733612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183945, + "balance_loss_mlp": 1.0931555, + "epoch": 0.3283955367449019, + "flos": 546254949888.0, + "grad_norm": 0.024438576566567966, + "language_loss": 0.93906903, + "learning_rate": 0.0007841309783987287, + "loss": 0.95090854, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.90625, + "step": 1707, + "time_per_iteration": 2.737680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178748, + "balance_loss_mlp": 1.08757639, + "epoch": 0.32858791843016544, + "flos": 482240371200.0, + "grad_norm": 0.027193371904651382, + "language_loss": 0.97315758, + "learning_rate": 0.0007838745713853084, + "loss": 0.98494506, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.91015625, + "step": 1708, + "time_per_iteration": 2.5702459812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189964, + "balance_loss_mlp": 1.09879303, + "epoch": 0.328780300115429, + "flos": 567915629568.0, + "grad_norm": 0.029427091701823335, + "language_loss": 0.93208408, + "learning_rate": 0.0007836180541620053, + "loss": 0.94398379, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.91015625, + "step": 1709, + "time_per_iteration": 2.7365195751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189596, + "balance_loss_mlp": 1.09852052, + "epoch": 0.32897268180069256, + "flos": 476991204864.0, + "grad_norm": 0.02924752300223344, + "language_loss": 0.94609785, + "learning_rate": 0.0007833614268284082, + "loss": 0.95799387, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.90917969, + "step": 1710, + "time_per_iteration": 2.575416326522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186913, + "balance_loss_mlp": 1.09745789, + "epoch": 0.32916506348595614, + "flos": 1580450603520.0, + "grad_norm": 0.014653073497659498, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75296688, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.89257812, + "step": 1711, + "time_per_iteration": 4.8569114208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117837, + "balance_loss_mlp": 1.08681703, + "epoch": 0.3293574451712197, + "flos": 483851105280.0, + "grad_norm": 0.027096123044633498, + "language_loss": 0.8678506, + "learning_rate": 0.0007828478422289016, + "loss": 0.87963432, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.9140625, + "step": 1712, + "time_per_iteration": 2.5748305320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181971, + "balance_loss_mlp": 1.09041798, + "epoch": 0.32954982685648326, + "flos": 623724088320.0, + "grad_norm": 0.027491608740018197, + "language_loss": 0.97854888, + "learning_rate": 0.0007825908851623833, + "loss": 0.99036855, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.9140625, + "step": 1713, + "time_per_iteration": 2.7387707233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180742, + "balance_loss_mlp": 1.0893327, + "epoch": 0.32974220854174685, + "flos": 546070299648.0, + "grad_norm": 0.028986059756107307, + "language_loss": 0.93660253, + "learning_rate": 0.0007823338183843533, + "loss": 0.94840991, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.91259766, + "step": 1714, + "time_per_iteration": 2.7061285972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.10341084, + "epoch": 0.3299345902270104, + "flos": 983822286336.0, + "grad_norm": 0.02918308821255402, + "language_loss": 0.89344442, + "learning_rate": 0.0007820766419946141, + "loss": 0.90539211, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.91210938, + "step": 1715, + "time_per_iteration": 3.2698333263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119133, + "balance_loss_mlp": 1.10206604, + "epoch": 0.33012697191227397, + "flos": 1406901926400.0, + "grad_norm": 0.008988097140154246, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.8086381, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.890625, + "step": 1716, + "time_per_iteration": 4.931420564651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193588, + "balance_loss_mlp": 1.10213029, + "epoch": 0.3303193535975375, + "flos": 506169795072.0, + "grad_norm": 0.03043585823380059, + "language_loss": 0.87317824, + "learning_rate": 0.0007815619607794288, + "loss": 0.88511419, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.91308594, + "step": 1717, + "time_per_iteration": 2.611924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198413, + "balance_loss_mlp": 1.10676467, + "epoch": 0.3305117352828011, + "flos": 939484349952.0, + "grad_norm": 0.029759763631388395, + "language_loss": 0.92828202, + "learning_rate": 0.0007813044561538001, + "loss": 0.94026613, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.91503906, + "step": 1718, + "time_per_iteration": 3.188633680343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186368, + "balance_loss_mlp": 1.09495842, + "epoch": 0.3307041169680646, + "flos": 722793507840.0, + "grad_norm": 0.027827869889066197, + "language_loss": 0.97286105, + "learning_rate": 0.0007810468423160958, + "loss": 0.9847247, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.91259766, + "step": 1719, + "time_per_iteration": 2.8963494300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179653, + "balance_loss_mlp": 1.08829057, + "epoch": 0.3308964986533282, + "flos": 584815965696.0, + "grad_norm": 0.0232486528054596, + "language_loss": 0.89203978, + "learning_rate": 0.0007807891193663306, + "loss": 0.90383637, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.91210938, + "step": 1720, + "time_per_iteration": 2.784005880355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188579, + "balance_loss_mlp": 1.09712148, + "epoch": 0.33108888033859174, + "flos": 474525075456.0, + "grad_norm": 0.03234593548431852, + "language_loss": 0.92577451, + "learning_rate": 0.0007805312874045614, + "loss": 0.93766028, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.91308594, + "step": 1721, + "time_per_iteration": 2.5072579383850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187856, + "balance_loss_mlp": 1.09635103, + "epoch": 0.3312812620238553, + "flos": 386996035584.0, + "grad_norm": 0.030880666413309405, + "language_loss": 0.96009982, + "learning_rate": 0.0007802733465308874, + "loss": 0.97197837, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.91357422, + "step": 1722, + "time_per_iteration": 2.460878372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193288, + "balance_loss_mlp": 1.10173571, + "epoch": 0.3314736437091189, + "flos": 495604333056.0, + "grad_norm": 0.02871647017272099, + "language_loss": 0.9219079, + "learning_rate": 0.0007800152968454501, + "loss": 0.93384075, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.9140625, + "step": 1723, + "time_per_iteration": 2.6537680625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185112, + "balance_loss_mlp": 1.09365499, + "epoch": 0.33166602539438245, + "flos": 654930376704.0, + "grad_norm": 0.0223046700763118, + "language_loss": 0.96869862, + "learning_rate": 0.0007797571384484334, + "loss": 0.98054969, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.91308594, + "step": 1724, + "time_per_iteration": 2.8509135246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180603, + "balance_loss_mlp": 1.08909798, + "epoch": 0.33185840707964603, + "flos": 521834701824.0, + "grad_norm": 0.02731483808063424, + "language_loss": 1.00636935, + "learning_rate": 0.0007794988714400633, + "loss": 1.01817536, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.91357422, + "step": 1725, + "time_per_iteration": 2.5883586406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180377, + "balance_loss_mlp": 1.08901501, + "epoch": 0.33205078876490957, + "flos": 437898432000.0, + "grad_norm": 0.028871117282170154, + "language_loss": 0.94438303, + "learning_rate": 0.0007792404959206079, + "loss": 0.95618677, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.91210938, + "step": 1726, + "time_per_iteration": 2.522392988204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196305, + "balance_loss_mlp": 1.10499096, + "epoch": 0.33224317045017315, + "flos": 770094402048.0, + "grad_norm": 0.026417182809826974, + "language_loss": 0.89548182, + "learning_rate": 0.0007789820119903774, + "loss": 0.90744483, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.91162109, + "step": 1727, + "time_per_iteration": 3.015399217605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119368, + "balance_loss_mlp": 1.10441589, + "epoch": 0.3324355521354367, + "flos": 1469293584384.0, + "grad_norm": 0.009201187704085647, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79686344, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.890625, + "step": 1728, + "time_per_iteration": 4.849627494812012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187682, + "balance_loss_mlp": 1.09641564, + "epoch": 0.3326279338207003, + "flos": 497799217152.0, + "grad_norm": 0.02618775195690524, + "language_loss": 0.91979456, + "learning_rate": 0.0007784647192990428, + "loss": 0.93167138, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.91113281, + "step": 1729, + "time_per_iteration": 2.6944785118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178599, + "balance_loss_mlp": 1.08761811, + "epoch": 0.33282031550596386, + "flos": 637053121536.0, + "grad_norm": 0.02771760173732663, + "language_loss": 0.88792735, + "learning_rate": 0.0007782059107387696, + "loss": 0.89971334, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.90820312, + "step": 1730, + "time_per_iteration": 2.8583710193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179548, + "balance_loss_mlp": 1.0887109, + "epoch": 0.3330126971912274, + "flos": 690721090560.0, + "grad_norm": 0.027739782699759397, + "language_loss": 0.98025161, + "learning_rate": 0.0007779469941693826, + "loss": 0.99204707, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.90673828, + "step": 1731, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184359, + "balance_loss_mlp": 1.09361696, + "epoch": 0.333205078876491, + "flos": 567553059840.0, + "grad_norm": 0.03096728777448764, + "language_loss": 0.86715639, + "learning_rate": 0.0007776879696914029, + "loss": 0.87899995, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.90576172, + "step": 1732, + "time_per_iteration": 2.8331797122955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179804, + "balance_loss_mlp": 1.08906233, + "epoch": 0.3333974605617545, + "flos": 642170030592.0, + "grad_norm": 0.024377484958938406, + "language_loss": 0.95668435, + "learning_rate": 0.000777428837405392, + "loss": 0.96848238, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.90576172, + "step": 1733, + "time_per_iteration": 2.8495984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.087345, + "epoch": 0.3335898422470181, + "flos": 462778578432.0, + "grad_norm": 0.02888991438897714, + "language_loss": 0.96001673, + "learning_rate": 0.0007771695974119544, + "loss": 0.97179955, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.90771484, + "step": 1734, + "time_per_iteration": 2.581843614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193993, + "balance_loss_mlp": 1.10267842, + "epoch": 0.33378222393228163, + "flos": 854336845824.0, + "grad_norm": 0.031032438471150628, + "language_loss": 0.84453082, + "learning_rate": 0.0007769102498117359, + "loss": 0.85647076, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.91162109, + "step": 1735, + "time_per_iteration": 3.092892646789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118579, + "balance_loss_mlp": 1.09471452, + "epoch": 0.3339746056175452, + "flos": 956308824576.0, + "grad_norm": 0.02638013374987503, + "language_loss": 0.87690091, + "learning_rate": 0.000776650794705424, + "loss": 0.88875878, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.90917969, + "step": 1736, + "time_per_iteration": 3.26749587059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188294, + "balance_loss_mlp": 1.09693241, + "epoch": 0.33416698730280875, + "flos": 545894381568.0, + "grad_norm": 0.025194797458818457, + "language_loss": 0.89670336, + "learning_rate": 0.0007763912321937483, + "loss": 0.90858638, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.91210938, + "step": 1737, + "time_per_iteration": 2.680321455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.09522188, + "epoch": 0.33435936898807234, + "flos": 1015875237888.0, + "grad_norm": 0.02847992800895855, + "language_loss": 0.91932124, + "learning_rate": 0.0007761315623774799, + "loss": 0.93118894, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.9140625, + "step": 1738, + "time_per_iteration": 3.3992278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.10014248, + "epoch": 0.3345517506733359, + "flos": 616371362304.0, + "grad_norm": 0.027566762490977777, + "language_loss": 0.97487831, + "learning_rate": 0.0007758717853574313, + "loss": 0.9867962, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.91503906, + "step": 1739, + "time_per_iteration": 2.7331244945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195023, + "balance_loss_mlp": 1.10327947, + "epoch": 0.33474413235859946, + "flos": 495569404416.0, + "grad_norm": 0.027457607023843998, + "language_loss": 0.9961037, + "learning_rate": 0.0007756119012344571, + "loss": 1.00805402, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.91601562, + "step": 1740, + "time_per_iteration": 2.5305063724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189378, + "balance_loss_mlp": 1.09772944, + "epoch": 0.33493651404386304, + "flos": 629487547392.0, + "grad_norm": 0.029043894294382887, + "language_loss": 0.93616855, + "learning_rate": 0.0007753519101094535, + "loss": 0.9480623, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.91503906, + "step": 1741, + "time_per_iteration": 2.7408056259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177762, + "balance_loss_mlp": 1.08630431, + "epoch": 0.3351288957291266, + "flos": 514742487552.0, + "grad_norm": 0.027889242250670986, + "language_loss": 0.95720202, + "learning_rate": 0.0007750918120833575, + "loss": 0.96897966, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.91308594, + "step": 1742, + "time_per_iteration": 2.5787625312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08818376, + "epoch": 0.33532127741439016, + "flos": 648482711040.0, + "grad_norm": 0.029208114264274002, + "language_loss": 0.95614851, + "learning_rate": 0.0007748316072571485, + "loss": 0.96794444, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.91259766, + "step": 1743, + "time_per_iteration": 2.751394033432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178526, + "balance_loss_mlp": 1.08764088, + "epoch": 0.3355136590996537, + "flos": 769788228096.0, + "grad_norm": 0.02678280054581141, + "language_loss": 0.86505532, + "learning_rate": 0.0007745712957318467, + "loss": 0.87684047, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.90722656, + "step": 1744, + "time_per_iteration": 2.9703569412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179715, + "balance_loss_mlp": 1.088925, + "epoch": 0.3357060407849173, + "flos": 596649057792.0, + "grad_norm": 0.023433474800662903, + "language_loss": 0.94101429, + "learning_rate": 0.0007743108776085141, + "loss": 0.95281148, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.90625, + "step": 1745, + "time_per_iteration": 2.7529683113098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184954, + "balance_loss_mlp": 1.09435499, + "epoch": 0.3358984224701808, + "flos": 599801395200.0, + "grad_norm": 0.02538707782704008, + "language_loss": 0.88967884, + "learning_rate": 0.0007740503529882543, + "loss": 0.9015283, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.90429688, + "step": 1746, + "time_per_iteration": 2.79131817817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188552, + "balance_loss_mlp": 1.09780991, + "epoch": 0.3360908041554444, + "flos": 579429812736.0, + "grad_norm": 0.028485119021284356, + "language_loss": 0.99668056, + "learning_rate": 0.0007737897219722114, + "loss": 1.00856614, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.90576172, + "step": 1747, + "time_per_iteration": 2.685925006866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189008, + "balance_loss_mlp": 1.09836173, + "epoch": 0.336283185840708, + "flos": 514620963840.0, + "grad_norm": 0.027318502045144608, + "language_loss": 0.90481317, + "learning_rate": 0.0007735289846615716, + "loss": 0.91670322, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.90478516, + "step": 1748, + "time_per_iteration": 2.62443470954895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189766, + "balance_loss_mlp": 1.09902358, + "epoch": 0.3364755675259715, + "flos": 526013623296.0, + "grad_norm": 0.026723032477842582, + "language_loss": 0.90137696, + "learning_rate": 0.0007732681411575621, + "loss": 0.91327465, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.90576172, + "step": 1749, + "time_per_iteration": 2.646358013153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182694, + "balance_loss_mlp": 1.09209466, + "epoch": 0.3366679492112351, + "flos": 555973748736.0, + "grad_norm": 0.023573972968583972, + "language_loss": 0.93333745, + "learning_rate": 0.0007730071915614514, + "loss": 0.94516432, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.90429688, + "step": 1750, + "time_per_iteration": 2.6758012771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08901942, + "epoch": 0.33686033089649864, + "flos": 428164170240.0, + "grad_norm": 0.030830494146199924, + "language_loss": 0.97502697, + "learning_rate": 0.0007727461359745489, + "loss": 0.98682547, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.90673828, + "step": 1751, + "time_per_iteration": 2.4563541412353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182248, + "balance_loss_mlp": 1.09145832, + "epoch": 0.3370527125817622, + "flos": 542840099328.0, + "grad_norm": 0.023246790346845608, + "language_loss": 0.93729055, + "learning_rate": 0.0007724849744982056, + "loss": 0.94911301, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.90625, + "step": 1752, + "time_per_iteration": 2.668113946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179422, + "balance_loss_mlp": 1.08858418, + "epoch": 0.33724509426702576, + "flos": 543230866944.0, + "grad_norm": 0.02371236203418416, + "language_loss": 0.90932786, + "learning_rate": 0.0007722237072338131, + "loss": 0.92112207, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.90673828, + "step": 1753, + "time_per_iteration": 2.69787335395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.08753431, + "epoch": 0.33743747595228935, + "flos": 473752272384.0, + "grad_norm": 0.029898359882718887, + "language_loss": 0.95709926, + "learning_rate": 0.0007719623342828046, + "loss": 0.96888256, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.90625, + "step": 1754, + "time_per_iteration": 2.4994091987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183652, + "balance_loss_mlp": 1.09295714, + "epoch": 0.33762985763755293, + "flos": 470836978176.0, + "grad_norm": 0.02665869511949433, + "language_loss": 0.93777692, + "learning_rate": 0.000771700855746654, + "loss": 0.94961339, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.90527344, + "step": 1755, + "time_per_iteration": 2.58086895942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178715, + "balance_loss_mlp": 1.08792567, + "epoch": 0.33782223932281646, + "flos": 493250995200.0, + "grad_norm": 0.024252070816233498, + "language_loss": 0.95916575, + "learning_rate": 0.0007714392717268763, + "loss": 0.97095293, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.90625, + "step": 1756, + "time_per_iteration": 2.5631322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180772, + "balance_loss_mlp": 1.08988702, + "epoch": 0.33801462100808005, + "flos": 466017510912.0, + "grad_norm": 0.025388958299120416, + "language_loss": 0.95127004, + "learning_rate": 0.0007711775823250273, + "loss": 0.96307778, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.90722656, + "step": 1757, + "time_per_iteration": 2.5053045749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178431, + "balance_loss_mlp": 1.08754551, + "epoch": 0.3382070026933436, + "flos": 797067374592.0, + "grad_norm": 0.024419621343361942, + "language_loss": 0.92107689, + "learning_rate": 0.0007709157876427039, + "loss": 0.93286121, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.90722656, + "step": 1758, + "time_per_iteration": 3.1007301807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178269, + "balance_loss_mlp": 1.08738351, + "epoch": 0.33839938437860717, + "flos": 509428193280.0, + "grad_norm": 0.024832384176200758, + "language_loss": 0.94253516, + "learning_rate": 0.0007706538877815439, + "loss": 0.95431781, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.90722656, + "step": 1759, + "time_per_iteration": 2.588744640350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178646, + "balance_loss_mlp": 1.0878557, + "epoch": 0.3385917660638707, + "flos": 485273186304.0, + "grad_norm": 0.02369115174437829, + "language_loss": 0.89945841, + "learning_rate": 0.0007703918828432259, + "loss": 0.91124481, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.90625, + "step": 1760, + "time_per_iteration": 2.5859875679016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178403, + "balance_loss_mlp": 1.08770907, + "epoch": 0.3387841477491343, + "flos": 546415405056.0, + "grad_norm": 0.02534991906570622, + "language_loss": 0.96946132, + "learning_rate": 0.000770129772929469, + "loss": 0.9812454, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.90527344, + "step": 1761, + "time_per_iteration": 2.633229970932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117744, + "balance_loss_mlp": 1.08684063, + "epoch": 0.3389765294343978, + "flos": 721063251456.0, + "grad_norm": 0.027907228809642075, + "language_loss": 0.96886694, + "learning_rate": 0.0007698675581420334, + "loss": 0.98064131, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.90429688, + "step": 1762, + "time_per_iteration": 2.8309946060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190138, + "balance_loss_mlp": 1.09987259, + "epoch": 0.3391689111196614, + "flos": 701263084032.0, + "grad_norm": 0.028701846645649853, + "language_loss": 0.87853253, + "learning_rate": 0.0007696052385827199, + "loss": 0.89043397, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.90087891, + "step": 1763, + "time_per_iteration": 2.9673497676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183311, + "balance_loss_mlp": 1.09304607, + "epoch": 0.339361292804925, + "flos": 628248115200.0, + "grad_norm": 0.027144566695111814, + "language_loss": 0.85910845, + "learning_rate": 0.00076934281435337, + "loss": 0.87094158, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.90087891, + "step": 1764, + "time_per_iteration": 2.7069530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011791, + "balance_loss_mlp": 1.08869135, + "epoch": 0.33955367449018853, + "flos": 610794554880.0, + "grad_norm": 0.025973604998757366, + "language_loss": 0.94002628, + "learning_rate": 0.0007690802855558658, + "loss": 0.95181727, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.90234375, + "step": 1765, + "time_per_iteration": 2.8596885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198868, + "balance_loss_mlp": 1.11151123, + "epoch": 0.3397460561754521, + "flos": 1456586357760.0, + "grad_norm": 0.018873382807181687, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77573818, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.87109375, + "step": 1766, + "time_per_iteration": 4.900039434432983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183458, + "balance_loss_mlp": 1.09304976, + "epoch": 0.33993843786071565, + "flos": 488290538496.0, + "grad_norm": 0.033631077459875626, + "language_loss": 1.00266671, + "learning_rate": 0.0007685549146641262, + "loss": 1.01450121, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.90234375, + "step": 1767, + "time_per_iteration": 2.521587610244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176512, + "balance_loss_mlp": 1.08557928, + "epoch": 0.34013081954597923, + "flos": 418232523264.0, + "grad_norm": 0.024531175575557927, + "language_loss": 0.95696396, + "learning_rate": 0.0007682920727738579, + "loss": 0.96872908, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.90771484, + "step": 1768, + "time_per_iteration": 2.4606878757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177177, + "balance_loss_mlp": 1.08614898, + "epoch": 0.34032320123124277, + "flos": 438430189056.0, + "grad_norm": 0.027457130501572214, + "language_loss": 0.93990809, + "learning_rate": 0.000768029126723369, + "loss": 0.95167989, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.90869141, + "step": 1769, + "time_per_iteration": 2.494699478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.09077609, + "epoch": 0.34051558291650635, + "flos": 458543261184.0, + "grad_norm": 0.027949795017340132, + "language_loss": 0.90377855, + "learning_rate": 0.0007677660766147447, + "loss": 0.91559708, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.90917969, + "step": 1770, + "time_per_iteration": 2.5302748680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183578, + "balance_loss_mlp": 1.09469604, + "epoch": 0.3407079646017699, + "flos": 1562137645056.0, + "grad_norm": 0.011444512115251876, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73654521, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.88671875, + "step": 1771, + "time_per_iteration": 4.913311004638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188847, + "balance_loss_mlp": 1.09758055, + "epoch": 0.3409003462870335, + "flos": 493530972672.0, + "grad_norm": 0.032062498304007335, + "language_loss": 0.91194993, + "learning_rate": 0.0007672396646316306, + "loss": 0.92383844, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.91113281, + "step": 1772, + "time_per_iteration": 2.539181709289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.08885825, + "epoch": 0.34109272797229706, + "flos": 809820989952.0, + "grad_norm": 0.028470010979029077, + "language_loss": 0.88439053, + "learning_rate": 0.000766976302961512, + "loss": 0.89618981, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.90917969, + "step": 1773, + "time_per_iteration": 3.006547212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181829, + "balance_loss_mlp": 1.09094357, + "epoch": 0.3412851096575606, + "flos": 471099491328.0, + "grad_norm": 0.02901021255147234, + "language_loss": 0.91066158, + "learning_rate": 0.0007667128376420003, + "loss": 0.92247993, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.90722656, + "step": 1774, + "time_per_iteration": 2.534266233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118318, + "balance_loss_mlp": 1.09253371, + "epoch": 0.3414774913428242, + "flos": 596770581504.0, + "grad_norm": 0.02876896591079206, + "language_loss": 0.92739397, + "learning_rate": 0.0007664492687753817, + "loss": 0.93922579, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.90478516, + "step": 1775, + "time_per_iteration": 2.671475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181574, + "balance_loss_mlp": 1.09102285, + "epoch": 0.3416698730280877, + "flos": 528507950592.0, + "grad_norm": 0.025483549401886952, + "language_loss": 0.89018893, + "learning_rate": 0.000766185596463983, + "loss": 0.90200466, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.90380859, + "step": 1776, + "time_per_iteration": 2.6099884510040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177935, + "balance_loss_mlp": 1.08719325, + "epoch": 0.3418622547133513, + "flos": 876117047808.0, + "grad_norm": 0.026020404961979337, + "language_loss": 0.84743214, + "learning_rate": 0.0007659218208101706, + "loss": 0.8592115, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.90576172, + "step": 1777, + "time_per_iteration": 3.1272366046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118093, + "balance_loss_mlp": 1.08994997, + "epoch": 0.34205463639861483, + "flos": 604876644864.0, + "grad_norm": 0.024068405360429687, + "language_loss": 0.91582745, + "learning_rate": 0.0007656579419163515, + "loss": 0.92763674, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.90820312, + "step": 1778, + "time_per_iteration": 2.7243831157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.0894556, + "epoch": 0.3422470180838784, + "flos": 464714952192.0, + "grad_norm": 0.02739040164484414, + "language_loss": 0.86445272, + "learning_rate": 0.0007653939598849724, + "loss": 0.87625706, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.90820312, + "step": 1779, + "time_per_iteration": 2.4913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180695, + "balance_loss_mlp": 1.09143066, + "epoch": 0.34243939976914195, + "flos": 1589816291328.0, + "grad_norm": 0.01051605552964957, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84060901, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.890625, + "step": 1780, + "time_per_iteration": 4.891184091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176554, + "balance_loss_mlp": 1.085621, + "epoch": 0.34263178145440554, + "flos": 874443187200.0, + "grad_norm": 0.026322112436007235, + "language_loss": 0.88782489, + "learning_rate": 0.000764865686819522, + "loss": 0.89959043, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.90771484, + "step": 1781, + "time_per_iteration": 3.048123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176352, + "balance_loss_mlp": 1.08551466, + "epoch": 0.3428241631396691, + "flos": 507873854976.0, + "grad_norm": 0.024622696081698998, + "language_loss": 0.93515933, + "learning_rate": 0.0007646013959905449, + "loss": 0.94692284, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.90673828, + "step": 1782, + "time_per_iteration": 2.565661907196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176257, + "balance_loss_mlp": 1.08565772, + "epoch": 0.34301654482493266, + "flos": 881524667904.0, + "grad_norm": 0.0252118274748732, + "language_loss": 0.880337, + "learning_rate": 0.0007643370024341949, + "loss": 0.89209956, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.90429688, + "step": 1783, + "time_per_iteration": 3.0695888996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180625, + "balance_loss_mlp": 1.08959711, + "epoch": 0.34320892651019624, + "flos": 432668731392.0, + "grad_norm": 0.024350173092139916, + "language_loss": 0.89407057, + "learning_rate": 0.0007640725062531195, + "loss": 0.90587682, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.90869141, + "step": 1784, + "time_per_iteration": 2.5120832920074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184023, + "balance_loss_mlp": 1.09294736, + "epoch": 0.3434013081954598, + "flos": 464593428480.0, + "grad_norm": 0.02877111448667641, + "language_loss": 0.95969987, + "learning_rate": 0.0007638079075500047, + "loss": 0.97154009, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.90917969, + "step": 1785, + "time_per_iteration": 2.5176198482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194351, + "balance_loss_mlp": 1.10546875, + "epoch": 0.34359368988072336, + "flos": 1560674631168.0, + "grad_norm": 0.01088995253456435, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.7637502, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.88671875, + "step": 1786, + "time_per_iteration": 5.021549463272095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183341, + "balance_loss_mlp": 1.09278917, + "epoch": 0.3437860715659869, + "flos": 496572519936.0, + "grad_norm": 0.024204144242014246, + "language_loss": 0.90540475, + "learning_rate": 0.0007632784029886026, + "loss": 0.91723818, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.90380859, + "step": 1787, + "time_per_iteration": 2.6350793838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178894, + "balance_loss_mlp": 1.08791375, + "epoch": 0.3439784532512505, + "flos": 719608969728.0, + "grad_norm": 0.025958683961259412, + "language_loss": 0.93068433, + "learning_rate": 0.0007630134973358873, + "loss": 0.94247323, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.90820312, + "step": 1788, + "time_per_iteration": 2.93084454536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178793, + "balance_loss_mlp": 1.08785999, + "epoch": 0.34417083493651407, + "flos": 566921246208.0, + "grad_norm": 0.025032512144454056, + "language_loss": 0.92506206, + "learning_rate": 0.0007627484895722763, + "loss": 0.93685007, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.90771484, + "step": 1789, + "time_per_iteration": 2.649689197540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177857, + "balance_loss_mlp": 1.08706772, + "epoch": 0.3443632166217776, + "flos": 797701189632.0, + "grad_norm": 0.027302991531117576, + "language_loss": 0.89870507, + "learning_rate": 0.0007624833798006552, + "loss": 0.9104836, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.90625, + "step": 1790, + "time_per_iteration": 3.0469179153442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117862, + "balance_loss_mlp": 1.08811665, + "epoch": 0.3445555983070412, + "flos": 570392492544.0, + "grad_norm": 0.0288389056738737, + "language_loss": 0.92729777, + "learning_rate": 0.0007622181681239483, + "loss": 0.93908393, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.90332031, + "step": 1791, + "time_per_iteration": 2.6440184116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178949, + "balance_loss_mlp": 1.08849263, + "epoch": 0.3447479799923047, + "flos": 569980257792.0, + "grad_norm": 0.022982775931836206, + "language_loss": 0.91584516, + "learning_rate": 0.0007619528546451202, + "loss": 0.9276346, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.90283203, + "step": 1792, + "time_per_iteration": 2.797133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177091, + "balance_loss_mlp": 1.08673048, + "epoch": 0.3449403616775683, + "flos": 969331683840.0, + "grad_norm": 0.02628926210615307, + "language_loss": 0.90923131, + "learning_rate": 0.0007616874394671745, + "loss": 0.92100227, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.90185547, + "step": 1793, + "time_per_iteration": 3.3191378116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178301, + "balance_loss_mlp": 1.08784556, + "epoch": 0.34513274336283184, + "flos": 569676085248.0, + "grad_norm": 0.03267712320672132, + "language_loss": 0.9558928, + "learning_rate": 0.0007614219226931547, + "loss": 0.96767581, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.90283203, + "step": 1794, + "time_per_iteration": 2.677525043487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178051, + "balance_loss_mlp": 1.0875473, + "epoch": 0.3453251250480954, + "flos": 461858055168.0, + "grad_norm": 0.024689469906648515, + "language_loss": 0.92397773, + "learning_rate": 0.0007611563044261435, + "loss": 0.93575823, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.90332031, + "step": 1795, + "time_per_iteration": 2.5183908939361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178812, + "balance_loss_mlp": 1.08835602, + "epoch": 0.34551750673335896, + "flos": 416519731200.0, + "grad_norm": 0.027710199676415265, + "language_loss": 0.96473086, + "learning_rate": 0.0007608905847692631, + "loss": 0.97651899, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.90283203, + "step": 1796, + "time_per_iteration": 2.4600772857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182482, + "balance_loss_mlp": 1.09212101, + "epoch": 0.34570988841862255, + "flos": 589114409472.0, + "grad_norm": 0.023363368939277738, + "language_loss": 0.92555124, + "learning_rate": 0.0007606247638256749, + "loss": 0.93737608, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.90185547, + "step": 1797, + "time_per_iteration": 2.8326525688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183395, + "balance_loss_mlp": 1.09565735, + "epoch": 0.34590227010388613, + "flos": 1571142764544.0, + "grad_norm": 0.009651567236440416, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79353684, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.875, + "step": 1798, + "time_per_iteration": 4.921091794967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.09259033, + "epoch": 0.34609465178914967, + "flos": 1540928131584.0, + "grad_norm": 0.004186018133500934, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.8050791, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.87890625, + "step": 1799, + "time_per_iteration": 4.76463508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177428, + "balance_loss_mlp": 1.08692396, + "epoch": 0.34628703347441325, + "flos": 610516578816.0, + "grad_norm": 0.027319297321258894, + "language_loss": 0.94778776, + "learning_rate": 0.0007598266943068686, + "loss": 0.95956194, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.90332031, + "step": 1800, + "time_per_iteration": 2.741830348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180421, + "balance_loss_mlp": 1.0898217, + "epoch": 0.3464794151596768, + "flos": 474264563712.0, + "grad_norm": 0.0268607754896097, + "language_loss": 0.91417915, + "learning_rate": 0.0007595604692488507, + "loss": 0.92598337, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.90429688, + "step": 1801, + "time_per_iteration": 2.5253777503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117756, + "balance_loss_mlp": 1.08719921, + "epoch": 0.34667179684494037, + "flos": 606821750784.0, + "grad_norm": 0.0251267071243342, + "language_loss": 0.907076, + "learning_rate": 0.0007592941434205215, + "loss": 0.91885161, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.90185547, + "step": 1802, + "time_per_iteration": 2.7729735374450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175873, + "balance_loss_mlp": 1.0877533, + "epoch": 0.3468641785302039, + "flos": 1568359727616.0, + "grad_norm": 0.004114808875680539, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74746931, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.87890625, + "step": 1803, + "time_per_iteration": 5.036771774291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178076, + "balance_loss_mlp": 1.08776271, + "epoch": 0.3470565602154675, + "flos": 908723223552.0, + "grad_norm": 0.03174792037748739, + "language_loss": 0.90712535, + "learning_rate": 0.0007587611898665566, + "loss": 0.91890609, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.90136719, + "step": 1804, + "time_per_iteration": 3.0725910663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177414, + "balance_loss_mlp": 1.08719671, + "epoch": 0.347248941900731, + "flos": 640059740160.0, + "grad_norm": 0.023310551488003612, + "language_loss": 0.90306699, + "learning_rate": 0.0007584945623478315, + "loss": 0.91484118, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.90039062, + "step": 1805, + "time_per_iteration": 2.8080646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176916, + "balance_loss_mlp": 1.08655512, + "epoch": 0.3474413235859946, + "flos": 848781505536.0, + "grad_norm": 0.027596494202169034, + "language_loss": 0.90514499, + "learning_rate": 0.000758227834472617, + "loss": 0.91691411, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.90185547, + "step": 1806, + "time_per_iteration": 3.0443291664123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179899, + "balance_loss_mlp": 1.08972931, + "epoch": 0.3476337052712582, + "flos": 516696325632.0, + "grad_norm": 0.02724510251762829, + "language_loss": 0.86438924, + "learning_rate": 0.0007579610063444664, + "loss": 0.87618828, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.89990234, + "step": 1807, + "time_per_iteration": 2.716522455215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177066, + "balance_loss_mlp": 1.08694386, + "epoch": 0.34782608695652173, + "flos": 915114493440.0, + "grad_norm": 0.02927822844999151, + "language_loss": 0.96424794, + "learning_rate": 0.0007576940780669712, + "loss": 0.97601861, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.89941406, + "step": 1808, + "time_per_iteration": 3.21464204788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08734941, + "epoch": 0.3480184686417853, + "flos": 775083056640.0, + "grad_norm": 0.026376675364870938, + "language_loss": 0.91835052, + "learning_rate": 0.0007574270497437624, + "loss": 0.93012476, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.89892578, + "step": 1809, + "time_per_iteration": 2.965306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177298, + "balance_loss_mlp": 1.0874145, + "epoch": 0.34821085032704885, + "flos": 578003728896.0, + "grad_norm": 0.024336980271772477, + "language_loss": 0.95592844, + "learning_rate": 0.000757159921478509, + "loss": 0.96770144, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.89697266, + "step": 1810, + "time_per_iteration": 2.781496047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177093, + "balance_loss_mlp": 1.088974, + "epoch": 0.34840323201231244, + "flos": 1528039531008.0, + "grad_norm": 0.007178450494277746, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75627732, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.87890625, + "step": 1811, + "time_per_iteration": 4.719515562057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176704, + "balance_loss_mlp": 1.08691561, + "epoch": 0.34859561369757597, + "flos": 510181530624.0, + "grad_norm": 0.02648580139398905, + "language_loss": 0.96071857, + "learning_rate": 0.0007566253655367423, + "loss": 0.97248554, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.89599609, + "step": 1812, + "time_per_iteration": 2.5699198246002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177921, + "balance_loss_mlp": 1.08822834, + "epoch": 0.34878799538283956, + "flos": 549756395520.0, + "grad_norm": 0.036663453377328174, + "language_loss": 0.96810794, + "learning_rate": 0.000756357938067762, + "loss": 0.97988713, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.89501953, + "step": 1813, + "time_per_iteration": 2.6622092723846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179077, + "balance_loss_mlp": 1.08885992, + "epoch": 0.34898037706810314, + "flos": 985193975808.0, + "grad_norm": 0.026013801782247825, + "language_loss": 0.90032709, + "learning_rate": 0.0007560904110718033, + "loss": 0.91211784, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.90039062, + "step": 1814, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.08639514, + "epoch": 0.3491727587533667, + "flos": 682836607488.0, + "grad_norm": 0.025025787643359835, + "language_loss": 0.91824377, + "learning_rate": 0.0007558227846527297, + "loss": 0.93000984, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.90039062, + "step": 1815, + "time_per_iteration": 2.870858907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176853, + "balance_loss_mlp": 1.08673084, + "epoch": 0.34936514043863026, + "flos": 394889250816.0, + "grad_norm": 0.0291076708707547, + "language_loss": 0.91979998, + "learning_rate": 0.0007555550589144429, + "loss": 0.9315685, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.89941406, + "step": 1816, + "time_per_iteration": 2.4363009929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08739722, + "epoch": 0.3495575221238938, + "flos": 462340147200.0, + "grad_norm": 0.02440335273431038, + "language_loss": 0.92281306, + "learning_rate": 0.000755287233960883, + "loss": 0.9345873, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.8984375, + "step": 1817, + "time_per_iteration": 2.538250207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117706, + "balance_loss_mlp": 1.08693826, + "epoch": 0.3497499038091574, + "flos": 725428824576.0, + "grad_norm": 0.028430093115180927, + "language_loss": 0.88002723, + "learning_rate": 0.0007550193098960292, + "loss": 0.89179784, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.89941406, + "step": 1818, + "time_per_iteration": 2.8685545921325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08411181, + "epoch": 0.3499422854944209, + "flos": 829196187648.0, + "grad_norm": 0.021653398091314287, + "language_loss": 0.92103571, + "learning_rate": 0.0007547512868238988, + "loss": 0.93277991, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.90136719, + "step": 1819, + "time_per_iteration": 3.115814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.092013, + "epoch": 0.3501346671796845, + "flos": 494542820352.0, + "grad_norm": 0.026515438979626053, + "language_loss": 0.9198699, + "learning_rate": 0.0007544831648485473, + "loss": 0.93169028, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.8984375, + "step": 1820, + "time_per_iteration": 2.6666150093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178247, + "balance_loss_mlp": 1.08783865, + "epoch": 0.35032704886494803, + "flos": 579848778240.0, + "grad_norm": 0.026574936148936048, + "language_loss": 0.89372301, + "learning_rate": 0.0007542149440740694, + "loss": 0.90550542, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.90234375, + "step": 1821, + "time_per_iteration": 2.6776442527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178869, + "balance_loss_mlp": 1.08841276, + "epoch": 0.3505194305502116, + "flos": 585831816192.0, + "grad_norm": 0.02674162112947977, + "language_loss": 0.9602831, + "learning_rate": 0.000753946624604597, + "loss": 0.97207189, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.90283203, + "step": 1822, + "time_per_iteration": 2.746363639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175368, + "balance_loss_mlp": 1.08491182, + "epoch": 0.3507118122354752, + "flos": 527978194944.0, + "grad_norm": 0.02703682960411951, + "language_loss": 0.95658362, + "learning_rate": 0.0007536782065443015, + "loss": 0.9683373, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.90283203, + "step": 1823, + "time_per_iteration": 2.5945184230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175188, + "balance_loss_mlp": 1.08458936, + "epoch": 0.35090419392073874, + "flos": 512545602048.0, + "grad_norm": 0.03278557538641046, + "language_loss": 0.86822712, + "learning_rate": 0.0007534096899973919, + "loss": 0.87997901, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.90429688, + "step": 1824, + "time_per_iteration": 2.56933331489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184456, + "balance_loss_mlp": 1.0944289, + "epoch": 0.3510965756060023, + "flos": 565195719168.0, + "grad_norm": 0.023191753507183704, + "language_loss": 0.89392567, + "learning_rate": 0.0007531410750681154, + "loss": 0.90577018, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.8984375, + "step": 1825, + "time_per_iteration": 2.7223169803619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186327, + "balance_loss_mlp": 1.09630024, + "epoch": 0.35128895729126586, + "flos": 1022253046272.0, + "grad_norm": 0.026424599574572643, + "language_loss": 0.93470478, + "learning_rate": 0.0007528723618607575, + "loss": 0.94656801, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.8984375, + "step": 1826, + "time_per_iteration": 3.404395580291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182394, + "balance_loss_mlp": 1.09236717, + "epoch": 0.35148133897652944, + "flos": 589424586240.0, + "grad_norm": 0.02767542011563751, + "language_loss": 0.89242589, + "learning_rate": 0.0007526035504796422, + "loss": 0.90424991, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.8984375, + "step": 1827, + "time_per_iteration": 2.820510149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117853, + "balance_loss_mlp": 1.08850324, + "epoch": 0.351673720661793, + "flos": 496285811712.0, + "grad_norm": 0.02845608163714707, + "language_loss": 0.94670665, + "learning_rate": 0.0007523346410291312, + "loss": 0.95849192, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.8984375, + "step": 1828, + "time_per_iteration": 2.763277053833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177518, + "balance_loss_mlp": 1.08753836, + "epoch": 0.35186610234705656, + "flos": 763998572544.0, + "grad_norm": 0.028566964886064136, + "language_loss": 0.91855693, + "learning_rate": 0.0007520656336136245, + "loss": 0.93033206, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.89794922, + "step": 1829, + "time_per_iteration": 2.9501917362213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179113, + "balance_loss_mlp": 1.08908641, + "epoch": 0.3520584840323201, + "flos": 627388717056.0, + "grad_norm": 0.0235814228834027, + "language_loss": 0.94624627, + "learning_rate": 0.0007517965283375599, + "loss": 0.95803738, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.8984375, + "step": 1830, + "time_per_iteration": 2.8197402954101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08992577, + "epoch": 0.3522508657175837, + "flos": 538448329728.0, + "grad_norm": 0.025024391475303026, + "language_loss": 0.97205818, + "learning_rate": 0.0007515273253054132, + "loss": 0.9838568, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.89746094, + "step": 1831, + "time_per_iteration": 2.6376330852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191124, + "balance_loss_mlp": 1.10109711, + "epoch": 0.35244324740284727, + "flos": 568501780992.0, + "grad_norm": 0.029882616882314406, + "language_loss": 0.9266001, + "learning_rate": 0.0007512580246216988, + "loss": 0.93851131, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.8984375, + "step": 1832, + "time_per_iteration": 2.708432912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179716, + "balance_loss_mlp": 1.08964145, + "epoch": 0.3526356290881108, + "flos": 514054278144.0, + "grad_norm": 0.030813246422457925, + "language_loss": 0.91671479, + "learning_rate": 0.000750988626390968, + "loss": 0.92851192, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.89892578, + "step": 1833, + "time_per_iteration": 2.592047929763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179987, + "balance_loss_mlp": 1.09010315, + "epoch": 0.3528280107733744, + "flos": 596972696064.0, + "grad_norm": 0.024705197674389605, + "language_loss": 0.91622353, + "learning_rate": 0.0007507191307178108, + "loss": 0.9280234, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.89697266, + "step": 1834, + "time_per_iteration": 2.7884535789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176506, + "balance_loss_mlp": 1.08652651, + "epoch": 0.3530203924586379, + "flos": 552298386432.0, + "grad_norm": 0.0302975798262418, + "language_loss": 0.83893424, + "learning_rate": 0.0007504495377068543, + "loss": 0.85069931, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.89794922, + "step": 1835, + "time_per_iteration": 2.7751786708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175764, + "balance_loss_mlp": 1.08573675, + "epoch": 0.3532127741439015, + "flos": 654305293824.0, + "grad_norm": 0.027517554164180617, + "language_loss": 0.90655488, + "learning_rate": 0.0007501798474627642, + "loss": 0.91831255, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.8984375, + "step": 1836, + "time_per_iteration": 2.9638845920562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179149, + "balance_loss_mlp": 1.08926523, + "epoch": 0.35340515582916504, + "flos": 724150460928.0, + "grad_norm": 0.024568481275515953, + "language_loss": 0.91140759, + "learning_rate": 0.0007499100600902433, + "loss": 0.92319906, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.89697266, + "step": 1837, + "time_per_iteration": 2.9948322772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184038, + "balance_loss_mlp": 1.09396327, + "epoch": 0.35359753751442863, + "flos": 595997778432.0, + "grad_norm": 0.031821297821065, + "language_loss": 0.92654896, + "learning_rate": 0.0007496401756940324, + "loss": 0.9383893, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.89892578, + "step": 1838, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176486, + "balance_loss_mlp": 1.08665001, + "epoch": 0.3537899191996922, + "flos": 633805456896.0, + "grad_norm": 0.02718368250353396, + "language_loss": 0.91091663, + "learning_rate": 0.0007493701943789098, + "loss": 0.92268145, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.89648438, + "step": 1839, + "time_per_iteration": 2.779574155807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175825, + "balance_loss_mlp": 1.08608413, + "epoch": 0.35398230088495575, + "flos": 507352831488.0, + "grad_norm": 0.028671493841357993, + "language_loss": 0.91863656, + "learning_rate": 0.000749100116249692, + "loss": 0.93039483, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.89550781, + "step": 1840, + "time_per_iteration": 2.607614755630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189406, + "balance_loss_mlp": 1.09980869, + "epoch": 0.35417468257021933, + "flos": 509046157824.0, + "grad_norm": 0.03229862826848899, + "language_loss": 0.95953786, + "learning_rate": 0.0007488299414112321, + "loss": 0.97143197, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.89404297, + "step": 1841, + "time_per_iteration": 2.566596746444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181321, + "balance_loss_mlp": 1.09210455, + "epoch": 0.35436706425548287, + "flos": 657659019264.0, + "grad_norm": 0.02732135002339032, + "language_loss": 0.86453879, + "learning_rate": 0.0007485596699684215, + "loss": 0.87635195, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.89013672, + "step": 1842, + "time_per_iteration": 2.8111371994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185021, + "balance_loss_mlp": 1.09575689, + "epoch": 0.35455944594074645, + "flos": 653888329728.0, + "grad_norm": 0.026686949506238997, + "language_loss": 0.92940086, + "learning_rate": 0.000748289302026189, + "loss": 0.94125104, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.890625, + "step": 1843, + "time_per_iteration": 2.8244054317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187203, + "balance_loss_mlp": 1.09793901, + "epoch": 0.35475182762601, + "flos": 850010204160.0, + "grad_norm": 0.02649701564047654, + "language_loss": 0.9307664, + "learning_rate": 0.0007480188376895004, + "loss": 0.94263846, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.890625, + "step": 1844, + "time_per_iteration": 3.041001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187935, + "balance_loss_mlp": 1.10115051, + "epoch": 0.3549442093112736, + "flos": 1524775128576.0, + "grad_norm": 0.01173136965559212, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74999273, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.86914062, + "step": 1845, + "time_per_iteration": 4.865761756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183261, + "balance_loss_mlp": 1.09390223, + "epoch": 0.3551365909965371, + "flos": 652714025472.0, + "grad_norm": 0.028658093872898062, + "language_loss": 0.85614175, + "learning_rate": 0.0007474776202528074, + "loss": 0.8679744, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.89160156, + "step": 1846, + "time_per_iteration": 2.9342904090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184977, + "balance_loss_mlp": 1.0954746, + "epoch": 0.3553289726818007, + "flos": 898921832448.0, + "grad_norm": 0.03609141350995601, + "language_loss": 0.89849555, + "learning_rate": 0.000747206867362922, + "loss": 0.91034532, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.89306641, + "step": 1847, + "time_per_iteration": 3.1089484691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185041, + "balance_loss_mlp": 1.09553862, + "epoch": 0.3555213543670643, + "flos": 689733437952.0, + "grad_norm": 0.0286779566522822, + "language_loss": 0.9096849, + "learning_rate": 0.0007469360184988194, + "loss": 0.92153525, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.89306641, + "step": 1848, + "time_per_iteration": 2.820265293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183493, + "balance_loss_mlp": 1.09399033, + "epoch": 0.3557137360523278, + "flos": 539603168256.0, + "grad_norm": 0.02648998316664428, + "language_loss": 0.93967247, + "learning_rate": 0.0007466650737656518, + "loss": 0.95150745, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.89306641, + "step": 1849, + "time_per_iteration": 2.596639394760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183541, + "balance_loss_mlp": 1.09427702, + "epoch": 0.3559061177375914, + "flos": 403153767936.0, + "grad_norm": 0.02765421607491624, + "language_loss": 0.97574586, + "learning_rate": 0.0007463940332686098, + "loss": 0.98758125, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.890625, + "step": 1850, + "time_per_iteration": 2.478158473968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177245, + "balance_loss_mlp": 1.08764756, + "epoch": 0.35609849942285493, + "flos": 697893895680.0, + "grad_norm": 0.023379973164811964, + "language_loss": 0.90857208, + "learning_rate": 0.0007461228971129205, + "loss": 0.92034447, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.89404297, + "step": 1851, + "time_per_iteration": 2.9202487468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179211, + "balance_loss_mlp": 1.08966124, + "epoch": 0.3562908811081185, + "flos": 570001724928.0, + "grad_norm": 0.028863121832353986, + "language_loss": 0.92692959, + "learning_rate": 0.0007458516654038483, + "loss": 0.93872178, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.89355469, + "step": 1852, + "time_per_iteration": 2.658867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179202, + "balance_loss_mlp": 1.08936572, + "epoch": 0.35648326279338205, + "flos": 683609410560.0, + "grad_norm": 0.028040747176241956, + "language_loss": 0.94642723, + "learning_rate": 0.0007455803382466946, + "loss": 0.95821923, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.89648438, + "step": 1853, + "time_per_iteration": 2.86330509185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183408, + "balance_loss_mlp": 1.09376252, + "epoch": 0.35667564447864564, + "flos": 630340941312.0, + "grad_norm": 0.02553826751691769, + "language_loss": 0.94946796, + "learning_rate": 0.0007453089157467979, + "loss": 0.96130198, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.89453125, + "step": 1854, + "time_per_iteration": 2.792577028274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180437, + "balance_loss_mlp": 1.09093451, + "epoch": 0.35686802616390917, + "flos": 815504584704.0, + "grad_norm": 0.02468703395074296, + "language_loss": 0.8986901, + "learning_rate": 0.0007450373980095341, + "loss": 0.91049451, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.89306641, + "step": 1855, + "time_per_iteration": 3.0555014610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182657, + "balance_loss_mlp": 1.09334552, + "epoch": 0.35706040784917276, + "flos": 527205391872.0, + "grad_norm": 0.02890256158864057, + "language_loss": 0.93639445, + "learning_rate": 0.0007447657851403155, + "loss": 0.94822103, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.89111328, + "step": 1856, + "time_per_iteration": 2.589708089828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182935, + "balance_loss_mlp": 1.09367096, + "epoch": 0.35725278953443634, + "flos": 513064624128.0, + "grad_norm": 0.032008561774258475, + "language_loss": 0.88987339, + "learning_rate": 0.0007444940772445915, + "loss": 0.9017027, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.890625, + "step": 1857, + "time_per_iteration": 2.7185556888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180668, + "balance_loss_mlp": 1.09169042, + "epoch": 0.3574451712196999, + "flos": 488492653056.0, + "grad_norm": 0.02708223160327311, + "language_loss": 0.88387084, + "learning_rate": 0.0007442222744278484, + "loss": 0.89567751, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.88769531, + "step": 1858, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182567, + "balance_loss_mlp": 1.09339869, + "epoch": 0.35763755290496346, + "flos": 551821023744.0, + "grad_norm": 0.023402609147138306, + "language_loss": 0.90506786, + "learning_rate": 0.0007439503767956099, + "loss": 0.91689354, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.88964844, + "step": 1859, + "time_per_iteration": 2.7072699069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180801, + "balance_loss_mlp": 1.09249115, + "epoch": 0.357829934590227, + "flos": 1507225514496.0, + "grad_norm": 0.010565166743096084, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80852401, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.88085938, + "step": 1860, + "time_per_iteration": 4.9006147384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177337, + "balance_loss_mlp": 1.08835948, + "epoch": 0.3580223162754906, + "flos": 569841269760.0, + "grad_norm": 0.022894220472823423, + "language_loss": 0.92520916, + "learning_rate": 0.000743406297506922, + "loss": 0.93698251, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.88769531, + "step": 1861, + "time_per_iteration": 2.7065579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09741747, + "epoch": 0.3582146979607541, + "flos": 627760018944.0, + "grad_norm": 0.02759787968542248, + "language_loss": 0.91638815, + "learning_rate": 0.0007431341160617031, + "loss": 0.92825067, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.88623047, + "step": 1862, + "time_per_iteration": 2.9316203594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179684, + "balance_loss_mlp": 1.09089661, + "epoch": 0.3584070796460177, + "flos": 508319016960.0, + "grad_norm": 0.024526236298265516, + "language_loss": 0.95309365, + "learning_rate": 0.0007428618402234491, + "loss": 0.96489048, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.88574219, + "step": 1863, + "time_per_iteration": 2.648061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179939, + "balance_loss_mlp": 1.09129453, + "epoch": 0.3585994613312813, + "flos": 607640216064.0, + "grad_norm": 0.026400757424935653, + "language_loss": 0.88735509, + "learning_rate": 0.0007425894700978668, + "loss": 0.89915442, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.88427734, + "step": 1864, + "time_per_iteration": 2.7512128353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178956, + "balance_loss_mlp": 1.0905509, + "epoch": 0.3587918430165448, + "flos": 1415087675904.0, + "grad_norm": 0.025937088976099313, + "language_loss": 0.86489892, + "learning_rate": 0.0007423170057906996, + "loss": 0.87668848, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.88183594, + "step": 1865, + "time_per_iteration": 3.8491222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181386, + "balance_loss_mlp": 1.0926944, + "epoch": 0.3589842247018084, + "flos": 479513730048.0, + "grad_norm": 0.0296684402619103, + "language_loss": 0.94328964, + "learning_rate": 0.0007420444474077275, + "loss": 0.95510352, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.88476562, + "step": 1866, + "time_per_iteration": 2.5396502017974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.09458029, + "epoch": 0.35917660638707194, + "flos": 505705167360.0, + "grad_norm": 0.030930075238968464, + "language_loss": 0.98337018, + "learning_rate": 0.0007417717950547671, + "loss": 0.99520147, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.88330078, + "step": 1867, + "time_per_iteration": 2.562638759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182945, + "balance_loss_mlp": 1.09654236, + "epoch": 0.3593689880723355, + "flos": 1495481745408.0, + "grad_norm": 0.008554058370081398, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77179551, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.86523438, + "step": 1868, + "time_per_iteration": 4.885401487350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184482, + "balance_loss_mlp": 1.09583843, + "epoch": 0.35956136975759906, + "flos": 529671521280.0, + "grad_norm": 0.02257875970711003, + "language_loss": 0.91369003, + "learning_rate": 0.0007412262088623299, + "loss": 0.92553484, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.88427734, + "step": 1869, + "time_per_iteration": 2.755620241165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184303, + "balance_loss_mlp": 1.09584975, + "epoch": 0.35975375144286265, + "flos": 535999664640.0, + "grad_norm": 0.02945163599469251, + "language_loss": 0.8810817, + "learning_rate": 0.0007409532752346684, + "loss": 0.89292467, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.88232422, + "step": 1870, + "time_per_iteration": 2.6426498889923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09860992, + "epoch": 0.3599461331281262, + "flos": 505928749056.0, + "grad_norm": 0.025692069404306732, + "language_loss": 0.95194697, + "learning_rate": 0.0007406802480606491, + "loss": 0.96382141, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.88623047, + "step": 1871, + "time_per_iteration": 2.6156716346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180117, + "balance_loss_mlp": 1.09123456, + "epoch": 0.36013851481338977, + "flos": 512536869888.0, + "grad_norm": 0.029138864413584674, + "language_loss": 0.9874596, + "learning_rate": 0.0007404071274462707, + "loss": 0.99926078, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.88671875, + "step": 1872, + "time_per_iteration": 2.5790889263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179425, + "balance_loss_mlp": 1.09054244, + "epoch": 0.36033089649865335, + "flos": 548631756288.0, + "grad_norm": 0.029675252163234106, + "language_loss": 0.91584998, + "learning_rate": 0.0007401339134975682, + "loss": 0.92764425, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.88671875, + "step": 1873, + "time_per_iteration": 2.6279983520507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185016, + "balance_loss_mlp": 1.09613371, + "epoch": 0.3605232781839169, + "flos": 459613506048.0, + "grad_norm": 0.030657976300352024, + "language_loss": 0.92556155, + "learning_rate": 0.0007398606063206122, + "loss": 0.93741173, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.88671875, + "step": 1874, + "time_per_iteration": 2.5750958919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178477, + "balance_loss_mlp": 1.0895946, + "epoch": 0.36071565986918047, + "flos": 510563566080.0, + "grad_norm": 0.029863822651947862, + "language_loss": 0.87000763, + "learning_rate": 0.0007395872060215101, + "loss": 0.88179243, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.88671875, + "step": 1875, + "time_per_iteration": 2.599595546722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180043, + "balance_loss_mlp": 1.09101713, + "epoch": 0.360908041554444, + "flos": 560256729600.0, + "grad_norm": 0.02914010843617622, + "language_loss": 0.95866597, + "learning_rate": 0.0007393137127064056, + "loss": 0.97046638, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.88818359, + "step": 1876, + "time_per_iteration": 2.629855155944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179718, + "balance_loss_mlp": 1.09064531, + "epoch": 0.3611004232397076, + "flos": 524878250496.0, + "grad_norm": 0.029199641876594032, + "language_loss": 0.93452048, + "learning_rate": 0.0007390401264814779, + "loss": 0.94631773, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.88867188, + "step": 1877, + "time_per_iteration": 2.6057403087615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182123, + "balance_loss_mlp": 1.0932405, + "epoch": 0.3612928049249711, + "flos": 542032367616.0, + "grad_norm": 0.029384759310162312, + "language_loss": 0.93887711, + "learning_rate": 0.0007387664474529427, + "loss": 0.95069838, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.88671875, + "step": 1878, + "time_per_iteration": 2.612924814224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181149, + "balance_loss_mlp": 1.09207559, + "epoch": 0.3614851866102347, + "flos": 553629143040.0, + "grad_norm": 0.028847856052759763, + "language_loss": 0.99400896, + "learning_rate": 0.0007384926757270518, + "loss": 1.00582051, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.88867188, + "step": 1879, + "time_per_iteration": 2.631417751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183007, + "balance_loss_mlp": 1.09364784, + "epoch": 0.36167756829549824, + "flos": 773426660352.0, + "grad_norm": 0.027790454764264987, + "language_loss": 0.87101346, + "learning_rate": 0.0007382188114100924, + "loss": 0.88284349, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.89160156, + "step": 1880, + "time_per_iteration": 3.0146212577819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182663, + "balance_loss_mlp": 1.09330404, + "epoch": 0.36186994998076183, + "flos": 713187500544.0, + "grad_norm": 0.025874200926848077, + "language_loss": 0.89437282, + "learning_rate": 0.0007379448546083884, + "loss": 0.90619946, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.89160156, + "step": 1881, + "time_per_iteration": 2.9882314205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182414, + "balance_loss_mlp": 1.09305489, + "epoch": 0.3620623316660254, + "flos": 748900351488.0, + "grad_norm": 0.028120122690860328, + "language_loss": 0.95218164, + "learning_rate": 0.0007376708054282992, + "loss": 0.96400583, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.89160156, + "step": 1882, + "time_per_iteration": 2.937251329421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185259, + "balance_loss_mlp": 1.09609008, + "epoch": 0.36225471335128895, + "flos": 483534197760.0, + "grad_norm": 0.025051425069896712, + "language_loss": 0.90089262, + "learning_rate": 0.0007373966639762201, + "loss": 0.91274524, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.88964844, + "step": 1883, + "time_per_iteration": 2.5956366062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189104, + "balance_loss_mlp": 1.09964943, + "epoch": 0.36244709503655254, + "flos": 507910785024.0, + "grad_norm": 0.028814908336841725, + "language_loss": 0.97620124, + "learning_rate": 0.0007371224303585822, + "loss": 0.9880923, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.89257812, + "step": 1884, + "time_per_iteration": 2.5689563751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188454, + "balance_loss_mlp": 1.10205078, + "epoch": 0.36263947672181607, + "flos": 1397052145152.0, + "grad_norm": 0.012535477100621303, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8154552, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.86523438, + "step": 1885, + "time_per_iteration": 4.708393573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184768, + "balance_loss_mlp": 1.09531295, + "epoch": 0.36283185840707965, + "flos": 654522144768.0, + "grad_norm": 0.026882878095346403, + "language_loss": 0.90798199, + "learning_rate": 0.0007365736870525335, + "loss": 0.91982961, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.89257812, + "step": 1886, + "time_per_iteration": 2.8096718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188121, + "balance_loss_mlp": 1.09842801, + "epoch": 0.3630242400923432, + "flos": 489844876800.0, + "grad_norm": 0.028488669634490066, + "language_loss": 0.90766525, + "learning_rate": 0.000736299177577164, + "loss": 0.91954637, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.89501953, + "step": 1887, + "time_per_iteration": 2.5731940269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184527, + "balance_loss_mlp": 1.09488153, + "epoch": 0.3632166217776068, + "flos": 518231198208.0, + "grad_norm": 0.0291282657352475, + "language_loss": 0.90900671, + "learning_rate": 0.0007360245763623174, + "loss": 0.92085195, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.89453125, + "step": 1888, + "time_per_iteration": 2.6255550384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184122, + "balance_loss_mlp": 1.09457171, + "epoch": 0.36340900346287036, + "flos": 647347338240.0, + "grad_norm": 0.024297388169127104, + "language_loss": 0.96519047, + "learning_rate": 0.0007357498835146039, + "loss": 0.97703171, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.89355469, + "step": 1889, + "time_per_iteration": 2.8253488540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183322, + "balance_loss_mlp": 1.09386766, + "epoch": 0.3636013851481339, + "flos": 554410678272.0, + "grad_norm": 0.02538543495771105, + "language_loss": 0.93937147, + "learning_rate": 0.0007354750991406684, + "loss": 0.95120472, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.89257812, + "step": 1890, + "time_per_iteration": 2.692335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182823, + "balance_loss_mlp": 1.09336889, + "epoch": 0.3637937668333975, + "flos": 547691767296.0, + "grad_norm": 0.028084450652072174, + "language_loss": 0.88223994, + "learning_rate": 0.0007352002233471919, + "loss": 0.89406812, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.89257812, + "step": 1891, + "time_per_iteration": 2.620753765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181153, + "balance_loss_mlp": 1.09212756, + "epoch": 0.363986148518661, + "flos": 539210399232.0, + "grad_norm": 0.027970426809957948, + "language_loss": 0.87592262, + "learning_rate": 0.0007349252562408906, + "loss": 0.88773412, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.88818359, + "step": 1892, + "time_per_iteration": 2.6963558197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186893, + "balance_loss_mlp": 1.09762907, + "epoch": 0.3641785302039246, + "flos": 661510299648.0, + "grad_norm": 0.026164868426956554, + "language_loss": 0.89186442, + "learning_rate": 0.0007346501979285158, + "loss": 0.90373337, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.890625, + "step": 1893, + "time_per_iteration": 2.880326747894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_mlp": 1.10150909, + "epoch": 0.36437091188918813, + "flos": 1472082077184.0, + "grad_norm": 0.013556454199407954, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81727207, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.8671875, + "step": 1894, + "time_per_iteration": 4.7823100090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189424, + "balance_loss_mlp": 1.10011292, + "epoch": 0.3645632935744517, + "flos": 598444442112.0, + "grad_norm": 0.028411509484180794, + "language_loss": 0.93676329, + "learning_rate": 0.0007340998081127308, + "loss": 0.94865751, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.89111328, + "step": 1895, + "time_per_iteration": 2.7800211906433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179101, + "balance_loss_mlp": 1.08998048, + "epoch": 0.36475567525971525, + "flos": 600695721984.0, + "grad_norm": 0.025932670803143428, + "language_loss": 0.98669052, + "learning_rate": 0.0007338244768230007, + "loss": 0.99848151, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.88916016, + "step": 1896, + "time_per_iteration": 2.7945594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180722, + "balance_loss_mlp": 1.09169638, + "epoch": 0.36494805694497884, + "flos": 799830945792.0, + "grad_norm": 0.022772977260465788, + "language_loss": 0.94548512, + "learning_rate": 0.0007335490547545578, + "loss": 0.95729244, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.88818359, + "step": 1897, + "time_per_iteration": 3.031527280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182826, + "balance_loss_mlp": 1.09389579, + "epoch": 0.3651404386302424, + "flos": 638477203968.0, + "grad_norm": 0.024439781626348547, + "language_loss": 0.90189934, + "learning_rate": 0.0007332735420143308, + "loss": 0.91372758, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.88720703, + "step": 1898, + "time_per_iteration": 2.743051767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118252, + "balance_loss_mlp": 1.09363747, + "epoch": 0.36533282031550596, + "flos": 492562785792.0, + "grad_norm": 0.03052059755540218, + "language_loss": 0.95941794, + "learning_rate": 0.0007329979387092826, + "loss": 0.97124314, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.88671875, + "step": 1899, + "time_per_iteration": 2.5555779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181449, + "balance_loss_mlp": 1.09247124, + "epoch": 0.36552520200076954, + "flos": 857508648960.0, + "grad_norm": 0.02266050351879182, + "language_loss": 0.89947438, + "learning_rate": 0.0007327222449464124, + "loss": 0.91128886, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.88769531, + "step": 1900, + "time_per_iteration": 3.2362029552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181183, + "balance_loss_mlp": 1.09206235, + "epoch": 0.3657175836860331, + "flos": 484715232768.0, + "grad_norm": 0.026374750280255838, + "language_loss": 0.95288622, + "learning_rate": 0.0007324464608327538, + "loss": 0.96469808, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.88916016, + "step": 1901, + "time_per_iteration": 2.5933730602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179798, + "balance_loss_mlp": 1.09058213, + "epoch": 0.36590996537129666, + "flos": 435721012224.0, + "grad_norm": 0.02685373461110618, + "language_loss": 0.96213037, + "learning_rate": 0.0007321705864753758, + "loss": 0.97392833, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.89013672, + "step": 1902, + "time_per_iteration": 2.6981201171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180605, + "balance_loss_mlp": 1.09124577, + "epoch": 0.3661023470565602, + "flos": 713513140224.0, + "grad_norm": 0.022756571637903334, + "language_loss": 0.91225153, + "learning_rate": 0.0007318946219813823, + "loss": 0.9240576, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.89160156, + "step": 1903, + "time_per_iteration": 2.992624044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183651, + "balance_loss_mlp": 1.09443474, + "epoch": 0.3662947287418238, + "flos": 565822803456.0, + "grad_norm": 0.027935940535232063, + "language_loss": 0.96619356, + "learning_rate": 0.000731618567457912, + "loss": 0.97803003, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.89013672, + "step": 1904, + "time_per_iteration": 2.685476064682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183785, + "balance_loss_mlp": 1.09433067, + "epoch": 0.3664871104270873, + "flos": 791201857536.0, + "grad_norm": 0.029459392082425068, + "language_loss": 0.95166355, + "learning_rate": 0.000731342423012139, + "loss": 0.96350139, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.89257812, + "step": 1905, + "time_per_iteration": 3.0574183464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184501, + "balance_loss_mlp": 1.09480846, + "epoch": 0.3666794921123509, + "flos": 753980330496.0, + "grad_norm": 0.028631588758117728, + "language_loss": 0.89661896, + "learning_rate": 0.0007310661887512722, + "loss": 0.90846401, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.89501953, + "step": 1906, + "time_per_iteration": 3.024423122406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183077, + "balance_loss_mlp": 1.09343171, + "epoch": 0.3668718737976145, + "flos": 524607005184.0, + "grad_norm": 0.02900954708937733, + "language_loss": 0.89823443, + "learning_rate": 0.0007307898647825549, + "loss": 0.91006529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.89453125, + "step": 1907, + "time_per_iteration": 2.6485068798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182186, + "balance_loss_mlp": 1.09277892, + "epoch": 0.367064255482878, + "flos": 573045273600.0, + "grad_norm": 0.031417651983294596, + "language_loss": 0.98967636, + "learning_rate": 0.0007305134512132659, + "loss": 1.00149822, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.89208984, + "step": 1908, + "time_per_iteration": 2.646838903427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180724, + "balance_loss_mlp": 1.09107888, + "epoch": 0.3672566371681416, + "flos": 448053660672.0, + "grad_norm": 0.03289649974011927, + "language_loss": 0.93253779, + "learning_rate": 0.0007302369481507183, + "loss": 0.94434512, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.89453125, + "step": 1909, + "time_per_iteration": 2.562856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_mlp": 1.10011292, + "epoch": 0.36744901885340514, + "flos": 1543364061696.0, + "grad_norm": 0.010877058892954462, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81150377, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.8828125, + "step": 1910, + "time_per_iteration": 4.90735387802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011789, + "balance_loss_mlp": 1.08949292, + "epoch": 0.36764140053866873, + "flos": 564761290752.0, + "grad_norm": 0.024499581587470617, + "language_loss": 0.92626876, + "learning_rate": 0.000729683673975274, + "loss": 0.93805778, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.89208984, + "step": 1911, + "time_per_iteration": 2.6646595001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182116, + "balance_loss_mlp": 1.09285223, + "epoch": 0.36783378222393226, + "flos": 1218650895360.0, + "grad_norm": 0.021973130552363645, + "language_loss": 0.89050859, + "learning_rate": 0.0007294069030771774, + "loss": 0.90232974, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.890625, + "step": 1912, + "time_per_iteration": 3.6834843158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189865, + "balance_loss_mlp": 1.10021913, + "epoch": 0.36802616390919585, + "flos": 499720128000.0, + "grad_norm": 0.028676866730684987, + "language_loss": 0.97328013, + "learning_rate": 0.0007291300431154224, + "loss": 0.98517883, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.89453125, + "step": 1913, + "time_per_iteration": 2.587052822113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195931, + "balance_loss_mlp": 1.10838318, + "epoch": 0.36821854559445943, + "flos": 1585615902720.0, + "grad_norm": 0.013013835157786544, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71585667, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.87695312, + "step": 1914, + "time_per_iteration": 4.952203989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185283, + "balance_loss_mlp": 1.09582841, + "epoch": 0.36841092727972297, + "flos": 837089402880.0, + "grad_norm": 0.02834339080565921, + "language_loss": 0.8768307, + "learning_rate": 0.0007285760564309179, + "loss": 0.88868356, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.89257812, + "step": 1915, + "time_per_iteration": 3.100893974304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185476, + "balance_loss_mlp": 1.09602106, + "epoch": 0.36860330896498655, + "flos": 691209913344.0, + "grad_norm": 0.028423235038061073, + "language_loss": 0.92041719, + "learning_rate": 0.0007282989299232448, + "loss": 0.93227196, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.89257812, + "step": 1916, + "time_per_iteration": 3.0683393478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.10048962, + "epoch": 0.3687956906502501, + "flos": 555239877120.0, + "grad_norm": 0.03332088686108748, + "language_loss": 0.92434603, + "learning_rate": 0.0007280217147820668, + "loss": 0.93624407, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.89111328, + "step": 1917, + "time_per_iteration": 2.635451078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188211, + "balance_loss_mlp": 1.09894717, + "epoch": 0.3689880723355137, + "flos": 577819078656.0, + "grad_norm": 0.027623597033391085, + "language_loss": 0.8697632, + "learning_rate": 0.0007277444111150079, + "loss": 0.88164532, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.890625, + "step": 1918, + "time_per_iteration": 2.810635805130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184664, + "balance_loss_mlp": 1.09540033, + "epoch": 0.3691804540207772, + "flos": 529886370816.0, + "grad_norm": 0.029489830132381867, + "language_loss": 0.91299617, + "learning_rate": 0.0007274670190297272, + "loss": 0.92484283, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.890625, + "step": 1919, + "time_per_iteration": 2.615386486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118238, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3693728357060408, + "flos": 562180368384.0, + "grad_norm": 0.025570373781710027, + "language_loss": 0.90037912, + "learning_rate": 0.0007271895386339179, + "loss": 0.91220295, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.88476562, + "step": 1920, + "time_per_iteration": 2.7868921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192586, + "balance_loss_mlp": 1.10375118, + "epoch": 0.3695652173913043, + "flos": 580899557376.0, + "grad_norm": 0.02893533685872539, + "language_loss": 0.90819347, + "learning_rate": 0.0007269119700353073, + "loss": 0.92011935, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.88623047, + "step": 1921, + "time_per_iteration": 2.7836573123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178636, + "balance_loss_mlp": 1.09023082, + "epoch": 0.3697575990765679, + "flos": 514059007488.0, + "grad_norm": 0.024390447267758214, + "language_loss": 0.90977228, + "learning_rate": 0.0007266343133416571, + "loss": 0.92155862, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.8828125, + "step": 1922, + "time_per_iteration": 2.800387382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173615, + "balance_loss_mlp": 1.08816528, + "epoch": 0.3699499807618315, + "flos": 1573903607808.0, + "grad_norm": 0.0066311072211368925, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78290522, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.85546875, + "step": 1923, + "time_per_iteration": 4.845300912857056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176045, + "balance_loss_mlp": 1.08844995, + "epoch": 0.37014236244709503, + "flos": 498324243456.0, + "grad_norm": 0.031949393340513096, + "language_loss": 0.9351213, + "learning_rate": 0.0007260787361004556, + "loss": 0.94688171, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.87744141, + "step": 1924, + "time_per_iteration": 2.5984597206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175598, + "balance_loss_mlp": 1.0905304, + "epoch": 0.3703347441323586, + "flos": 1447605433344.0, + "grad_norm": 0.008500773473990196, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74937099, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.8515625, + "step": 1925, + "time_per_iteration": 4.886027097702026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197031, + "balance_loss_mlp": 1.10862505, + "epoch": 0.37052712581762215, + "flos": 564713627136.0, + "grad_norm": 0.03178088368953176, + "language_loss": 0.94516188, + "learning_rate": 0.0007255228077730903, + "loss": 0.95713222, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.88183594, + "step": 1926, + "time_per_iteration": 2.6847593784332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185383, + "balance_loss_mlp": 1.09731126, + "epoch": 0.37071950750288574, + "flos": 927570667008.0, + "grad_norm": 0.029564625514678724, + "language_loss": 0.89603549, + "learning_rate": 0.0007252447122218632, + "loss": 0.90788931, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.88037109, + "step": 1927, + "time_per_iteration": 3.106748342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179784, + "balance_loss_mlp": 1.0919987, + "epoch": 0.37091188918814927, + "flos": 419200710144.0, + "grad_norm": 0.03402230349378661, + "language_loss": 0.98334146, + "learning_rate": 0.0007249665292228834, + "loss": 0.99513936, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.87939453, + "step": 1928, + "time_per_iteration": 2.5786120891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186321, + "balance_loss_mlp": 1.09801054, + "epoch": 0.37110427087341286, + "flos": 464146265088.0, + "grad_norm": 0.029271450765855984, + "language_loss": 0.9102214, + "learning_rate": 0.000724688258884151, + "loss": 0.92208457, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.88183594, + "step": 1929, + "time_per_iteration": 2.5388894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185686, + "balance_loss_mlp": 1.09780467, + "epoch": 0.3712966525586764, + "flos": 851080449024.0, + "grad_norm": 0.02435916983518334, + "language_loss": 0.9136247, + "learning_rate": 0.0007244099013137002, + "loss": 0.92548156, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.88037109, + "step": 1930, + "time_per_iteration": 3.0708000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.09159458, + "epoch": 0.37148903424394, + "flos": 927557932032.0, + "grad_norm": 0.024720397528266293, + "language_loss": 0.95256186, + "learning_rate": 0.0007241314566195993, + "loss": 0.96435952, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.88232422, + "step": 1931, + "time_per_iteration": 3.2293543815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179876, + "balance_loss_mlp": 1.09180403, + "epoch": 0.37168141592920356, + "flos": 520820852736.0, + "grad_norm": 0.029266961451931986, + "language_loss": 0.92750597, + "learning_rate": 0.0007238529249099496, + "loss": 0.93930471, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.88232422, + "step": 1932, + "time_per_iteration": 2.6091582775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.10263062, + "epoch": 0.3718737976144671, + "flos": 1449059715072.0, + "grad_norm": 0.015165360012205364, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79045337, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.859375, + "step": 1933, + "time_per_iteration": 4.854676246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184357, + "balance_loss_mlp": 1.09614182, + "epoch": 0.3720661792997307, + "flos": 760953022464.0, + "grad_norm": 0.028795817149727888, + "language_loss": 0.88381398, + "learning_rate": 0.000723295600876581, + "loss": 0.89565754, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.8828125, + "step": 1934, + "time_per_iteration": 2.9830405712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118189, + "balance_loss_mlp": 1.09396136, + "epoch": 0.3722585609849942, + "flos": 518044546560.0, + "grad_norm": 0.028690096062057496, + "language_loss": 0.95446575, + "learning_rate": 0.0007230168087692344, + "loss": 0.96628463, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.88085938, + "step": 1935, + "time_per_iteration": 2.651982307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181923, + "balance_loss_mlp": 1.09404159, + "epoch": 0.3724509426702578, + "flos": 783868597248.0, + "grad_norm": 0.02900654324264667, + "language_loss": 0.88952625, + "learning_rate": 0.0007227379300790839, + "loss": 0.90134549, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.88037109, + "step": 1936, + "time_per_iteration": 3.0127265453338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177948, + "balance_loss_mlp": 1.09006691, + "epoch": 0.37264332435552133, + "flos": 392599039488.0, + "grad_norm": 0.02836050450865214, + "language_loss": 0.94049299, + "learning_rate": 0.0007224589649143997, + "loss": 0.95227242, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.88037109, + "step": 1937, + "time_per_iteration": 2.5600061416625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178201, + "balance_loss_mlp": 1.09074926, + "epoch": 0.3728357060407849, + "flos": 543912345600.0, + "grad_norm": 0.027673862011078548, + "language_loss": 0.89373219, + "learning_rate": 0.0007221799133834861, + "loss": 0.90551418, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.87597656, + "step": 1938, + "time_per_iteration": 2.646632671356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011797, + "balance_loss_mlp": 1.0919621, + "epoch": 0.3730280877260485, + "flos": 434483581440.0, + "grad_norm": 0.03019004471989451, + "language_loss": 0.90666437, + "learning_rate": 0.00072190077559468, + "loss": 0.91846132, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.87890625, + "step": 1939, + "time_per_iteration": 2.5193679332733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118304, + "balance_loss_mlp": 1.0957315, + "epoch": 0.37322046941131204, + "flos": 532510953984.0, + "grad_norm": 0.02812892901872328, + "language_loss": 0.95514065, + "learning_rate": 0.0007216215516563527, + "loss": 0.96697104, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.87451172, + "step": 1940, + "time_per_iteration": 2.6975200176239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184025, + "balance_loss_mlp": 1.09666896, + "epoch": 0.3734128510965756, + "flos": 532576081920.0, + "grad_norm": 0.028733495674926814, + "language_loss": 0.91960251, + "learning_rate": 0.0007213422416769083, + "loss": 0.93144274, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.875, + "step": 1941, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183262, + "balance_loss_mlp": 1.09561944, + "epoch": 0.37360523278183916, + "flos": 501432920064.0, + "grad_norm": 0.028111058318233337, + "language_loss": 0.83044219, + "learning_rate": 0.0007210628457647849, + "loss": 0.84227479, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.87792969, + "step": 1942, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182498, + "balance_loss_mlp": 1.09475958, + "epoch": 0.37379761446710275, + "flos": 549111846912.0, + "grad_norm": 0.03172951338735415, + "language_loss": 0.86608446, + "learning_rate": 0.000720783364028453, + "loss": 0.87790942, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.87890625, + "step": 1943, + "time_per_iteration": 2.7782797813415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176645, + "balance_loss_mlp": 1.08909822, + "epoch": 0.3739899961523663, + "flos": 476739425280.0, + "grad_norm": 0.0265564263320471, + "language_loss": 0.94348681, + "learning_rate": 0.0007205037965764177, + "loss": 0.95525324, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.87695312, + "step": 1944, + "time_per_iteration": 2.5670034885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198539, + "balance_loss_mlp": 1.11003804, + "epoch": 0.37418237783762986, + "flos": 613076034048.0, + "grad_norm": 0.032068934234115415, + "language_loss": 0.94037992, + "learning_rate": 0.0007202241435172161, + "loss": 0.95236534, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.8828125, + "step": 1945, + "time_per_iteration": 2.7505762577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119095, + "balance_loss_mlp": 1.10283065, + "epoch": 0.3743747595228934, + "flos": 767628272640.0, + "grad_norm": 0.02891432689626354, + "language_loss": 0.95249915, + "learning_rate": 0.0007199444049594198, + "loss": 0.9644087, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.88085938, + "step": 1946, + "time_per_iteration": 2.9690663814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179721, + "balance_loss_mlp": 1.09188759, + "epoch": 0.374567141208157, + "flos": 525490598400.0, + "grad_norm": 0.029648083740235674, + "language_loss": 0.90769064, + "learning_rate": 0.0007196645810116322, + "loss": 0.91948783, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.87988281, + "step": 1947, + "time_per_iteration": 2.690214157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178535, + "balance_loss_mlp": 1.09065437, + "epoch": 0.37475952289342057, + "flos": 682613025792.0, + "grad_norm": 0.029716110952303924, + "language_loss": 0.91939867, + "learning_rate": 0.0007193846717824912, + "loss": 0.93118405, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.88037109, + "step": 1948, + "time_per_iteration": 2.9668121337890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179187, + "balance_loss_mlp": 1.09140122, + "epoch": 0.3749519045786841, + "flos": 461215507968.0, + "grad_norm": 0.032662314662123194, + "language_loss": 0.97396064, + "learning_rate": 0.0007191046773806669, + "loss": 0.98575246, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.87939453, + "step": 1949, + "time_per_iteration": 2.5580427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189402, + "balance_loss_mlp": 1.10166442, + "epoch": 0.3751442862639477, + "flos": 956386687488.0, + "grad_norm": 0.03764484603893814, + "language_loss": 0.94282359, + "learning_rate": 0.0007188245979148631, + "loss": 0.95471758, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.87890625, + "step": 1950, + "time_per_iteration": 3.1307644844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185097, + "balance_loss_mlp": 1.09678674, + "epoch": 0.3753366679492112, + "flos": 528805392384.0, + "grad_norm": 0.0321726971318772, + "language_loss": 0.95554888, + "learning_rate": 0.0007185444334938157, + "loss": 0.96739984, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.8828125, + "step": 1951, + "time_per_iteration": 2.7235019207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181124, + "balance_loss_mlp": 1.09324276, + "epoch": 0.3755290496344748, + "flos": 522848550912.0, + "grad_norm": 0.029170285322497422, + "language_loss": 0.91979843, + "learning_rate": 0.0007182641842262947, + "loss": 0.93160963, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.88037109, + "step": 1952, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179821, + "balance_loss_mlp": 1.09193957, + "epoch": 0.37572143131973834, + "flos": 622371864576.0, + "grad_norm": 0.029206332986401715, + "language_loss": 0.85116351, + "learning_rate": 0.0007179838502211022, + "loss": 0.86296165, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.88037109, + "step": 1953, + "time_per_iteration": 2.8308520317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185603, + "balance_loss_mlp": 1.0973407, + "epoch": 0.37591381300500193, + "flos": 772273823232.0, + "grad_norm": 0.030259488278154622, + "language_loss": 0.94510454, + "learning_rate": 0.0007177034315870738, + "loss": 0.9569605, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.88232422, + "step": 1954, + "time_per_iteration": 2.966627359390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09908688, + "epoch": 0.37610619469026546, + "flos": 521480864256.0, + "grad_norm": 0.02960656624392615, + "language_loss": 0.99060822, + "learning_rate": 0.0007174229284330773, + "loss": 1.00248265, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.88330078, + "step": 1955, + "time_per_iteration": 2.642186403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182076, + "balance_loss_mlp": 1.09338391, + "epoch": 0.37629857637552905, + "flos": 599970582528.0, + "grad_norm": 0.025408092842649905, + "language_loss": 0.92700577, + "learning_rate": 0.0007171423408680141, + "loss": 0.93882644, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.88671875, + "step": 1956, + "time_per_iteration": 2.8501906394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180409, + "balance_loss_mlp": 1.09138381, + "epoch": 0.37649095806079264, + "flos": 566018187264.0, + "grad_norm": 0.027446848492574977, + "language_loss": 0.96095192, + "learning_rate": 0.0007168616690008176, + "loss": 0.97275609, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.88818359, + "step": 1957, + "time_per_iteration": 2.658282995223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183288, + "balance_loss_mlp": 1.09440601, + "epoch": 0.37668333974605617, + "flos": 593568579072.0, + "grad_norm": 0.029268558303355535, + "language_loss": 0.93381131, + "learning_rate": 0.0007165809129404545, + "loss": 0.9456442, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.88671875, + "step": 1958, + "time_per_iteration": 2.738896608352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185047, + "balance_loss_mlp": 1.09621239, + "epoch": 0.37687572143131975, + "flos": 420364280832.0, + "grad_norm": 0.028940223287944336, + "language_loss": 0.94791234, + "learning_rate": 0.0007163000727959239, + "loss": 0.95976275, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.88623047, + "step": 1959, + "time_per_iteration": 2.5175514221191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122541, + "balance_loss_mlp": 1.14034271, + "epoch": 0.3770681031165833, + "flos": 1360384568832.0, + "grad_norm": 0.031863979933265396, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79184484, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.8515625, + "step": 1960, + "time_per_iteration": 4.834294557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187625, + "balance_loss_mlp": 1.0985992, + "epoch": 0.3772604848018469, + "flos": 646153568256.0, + "grad_norm": 0.027699188267120346, + "language_loss": 0.9236567, + "learning_rate": 0.00071573814069052, + "loss": 0.93553299, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.88818359, + "step": 1961, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195985, + "balance_loss_mlp": 1.10681665, + "epoch": 0.3774528664871104, + "flos": 903200810496.0, + "grad_norm": 0.025601029742712816, + "language_loss": 0.93588847, + "learning_rate": 0.0007154570489478081, + "loss": 0.94784832, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.88964844, + "step": 1962, + "time_per_iteration": 3.2312510013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198663, + "balance_loss_mlp": 1.1095897, + "epoch": 0.377645248172374, + "flos": 789462868992.0, + "grad_norm": 0.028157211525065163, + "language_loss": 0.92405236, + "learning_rate": 0.0007151758735572514, + "loss": 0.93603897, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.88867188, + "step": 1963, + "time_per_iteration": 3.0338857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192995, + "balance_loss_mlp": 1.10396981, + "epoch": 0.3778376298576376, + "flos": 587924642304.0, + "grad_norm": 0.030822839560022956, + "language_loss": 0.89740217, + "learning_rate": 0.0007148946146280119, + "loss": 0.90933216, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.88818359, + "step": 1964, + "time_per_iteration": 2.795830488204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193161, + "balance_loss_mlp": 1.10656738, + "epoch": 0.3780300115429011, + "flos": 1399669997568.0, + "grad_norm": 0.013238700163895742, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.7338531, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.8671875, + "step": 1965, + "time_per_iteration": 4.866962909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120089, + "balance_loss_mlp": 1.11372375, + "epoch": 0.3782223932281647, + "flos": 1360631619072.0, + "grad_norm": 0.015556792607008025, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76542836, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.87304688, + "step": 1966, + "time_per_iteration": 4.942438364028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179172, + "balance_loss_mlp": 1.09114802, + "epoch": 0.37841477491342823, + "flos": 705515865600.0, + "grad_norm": 0.024767419651172896, + "language_loss": 0.90831983, + "learning_rate": 0.0007140503377003022, + "loss": 0.92011154, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.88183594, + "step": 1967, + "time_per_iteration": 2.9852232933044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118121, + "balance_loss_mlp": 1.09318614, + "epoch": 0.3786071565986918, + "flos": 530155614720.0, + "grad_norm": 0.02676934241732637, + "language_loss": 0.92451024, + "learning_rate": 0.000713768745708599, + "loss": 0.93632239, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.88183594, + "step": 1968, + "time_per_iteration": 2.6276321411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180899, + "balance_loss_mlp": 1.09311283, + "epoch": 0.37879953828395535, + "flos": 994900039680.0, + "grad_norm": 0.026029915049846697, + "language_loss": 0.85207623, + "learning_rate": 0.0007134870707245085, + "loss": 0.86388516, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.87939453, + "step": 1969, + "time_per_iteration": 3.2757370471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118867, + "balance_loss_mlp": 1.10074103, + "epoch": 0.37899191996921894, + "flos": 627792219648.0, + "grad_norm": 0.029282968357198087, + "language_loss": 0.91297084, + "learning_rate": 0.0007132053128573864, + "loss": 0.92485756, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.88085938, + "step": 1970, + "time_per_iteration": 2.713987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184407, + "balance_loss_mlp": 1.09633517, + "epoch": 0.37918430165448247, + "flos": 687519088128.0, + "grad_norm": 0.026716081838251738, + "language_loss": 0.91701669, + "learning_rate": 0.0007129234722166211, + "loss": 0.92886078, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.88232422, + "step": 1971, + "time_per_iteration": 2.830312728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178089, + "balance_loss_mlp": 1.09025514, + "epoch": 0.37937668333974606, + "flos": 476617901568.0, + "grad_norm": 0.023390773702336033, + "language_loss": 0.97041333, + "learning_rate": 0.0007126415489116328, + "loss": 0.98219419, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.87988281, + "step": 1972, + "time_per_iteration": 2.6577088832855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186585, + "balance_loss_mlp": 1.09903812, + "epoch": 0.37956906502500964, + "flos": 708823928832.0, + "grad_norm": 0.02822522227358307, + "language_loss": 0.89341533, + "learning_rate": 0.0007123595430518736, + "loss": 0.90528119, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.87695312, + "step": 1973, + "time_per_iteration": 2.8803040981292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187247, + "balance_loss_mlp": 1.09974778, + "epoch": 0.3797614467102732, + "flos": 427558553088.0, + "grad_norm": 0.030455517002935972, + "language_loss": 0.93240166, + "learning_rate": 0.0007120774547468282, + "loss": 0.94427419, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.87646484, + "step": 1974, + "time_per_iteration": 2.5190658569335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185963, + "balance_loss_mlp": 1.09836841, + "epoch": 0.37995382839553676, + "flos": 482880916992.0, + "grad_norm": 0.028219754054602288, + "language_loss": 0.89357984, + "learning_rate": 0.0007117952841060128, + "loss": 0.9054395, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.87744141, + "step": 1975, + "time_per_iteration": 2.6428894996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184241, + "balance_loss_mlp": 1.09631252, + "epoch": 0.3801462100808003, + "flos": 561670078464.0, + "grad_norm": 0.02907805968320273, + "language_loss": 0.90876186, + "learning_rate": 0.0007115130312389756, + "loss": 0.92060423, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.88085938, + "step": 1976, + "time_per_iteration": 2.669287919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.10066783, + "epoch": 0.3803385917660639, + "flos": 465887255040.0, + "grad_norm": 0.031138982719559682, + "language_loss": 0.88565898, + "learning_rate": 0.0007112306962552973, + "loss": 0.89754546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.88134766, + "step": 1977, + "time_per_iteration": 2.617105007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188488, + "balance_loss_mlp": 1.10055935, + "epoch": 0.3805309734513274, + "flos": 522904946688.0, + "grad_norm": 0.027881475391737562, + "language_loss": 0.92461807, + "learning_rate": 0.0007109482792645896, + "loss": 0.93650293, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.88085938, + "step": 1978, + "time_per_iteration": 2.7350404262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191644, + "balance_loss_mlp": 1.10352468, + "epoch": 0.380723355136591, + "flos": 592552728576.0, + "grad_norm": 0.03010131618310245, + "language_loss": 0.91373634, + "learning_rate": 0.0007106657803764969, + "loss": 0.92565274, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.88183594, + "step": 1979, + "time_per_iteration": 2.7113609313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188099, + "balance_loss_mlp": 1.10007489, + "epoch": 0.38091573682185453, + "flos": 623854344192.0, + "grad_norm": 0.03122566409921124, + "language_loss": 0.90192807, + "learning_rate": 0.0007103831997006948, + "loss": 0.91380906, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.88183594, + "step": 1980, + "time_per_iteration": 2.7460203170776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183293, + "balance_loss_mlp": 1.09507859, + "epoch": 0.3811081185071181, + "flos": 570175641600.0, + "grad_norm": 0.027157726640451497, + "language_loss": 0.92157245, + "learning_rate": 0.0007101005373468908, + "loss": 0.9334054, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.8828125, + "step": 1981, + "time_per_iteration": 2.869722604751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176795, + "balance_loss_mlp": 1.08891392, + "epoch": 0.3813005001923817, + "flos": 585990269952.0, + "grad_norm": 0.026054611177121254, + "language_loss": 0.92786968, + "learning_rate": 0.0007098177934248242, + "loss": 0.9396376, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.88037109, + "step": 1982, + "time_per_iteration": 2.7341668605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179814, + "balance_loss_mlp": 1.09188521, + "epoch": 0.38149288187764524, + "flos": 622810295808.0, + "grad_norm": 0.03120804506271422, + "language_loss": 0.94404829, + "learning_rate": 0.0007095349680442661, + "loss": 0.95584643, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.88085938, + "step": 1983, + "time_per_iteration": 2.845836639404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182966, + "balance_loss_mlp": 1.09522831, + "epoch": 0.3816852635629088, + "flos": 571797109248.0, + "grad_norm": 0.027372063240090748, + "language_loss": 0.86448967, + "learning_rate": 0.0007092520613150188, + "loss": 0.87631935, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.87890625, + "step": 1984, + "time_per_iteration": 2.6740176677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178711, + "balance_loss_mlp": 1.09106863, + "epoch": 0.38187764524817236, + "flos": 566678198784.0, + "grad_norm": 0.03160695384354602, + "language_loss": 0.87573516, + "learning_rate": 0.0007089690733469165, + "loss": 0.88752234, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.87792969, + "step": 1985, + "time_per_iteration": 2.717921733856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178571, + "balance_loss_mlp": 1.09073794, + "epoch": 0.38207002693343595, + "flos": 632398838784.0, + "grad_norm": 0.031031403109496963, + "language_loss": 0.90504575, + "learning_rate": 0.000708686004249825, + "loss": 0.91683149, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.87988281, + "step": 1986, + "time_per_iteration": 2.758554697036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179432, + "balance_loss_mlp": 1.09164619, + "epoch": 0.3822624086186995, + "flos": 549840989184.0, + "grad_norm": 0.025201133141653974, + "language_loss": 0.97533029, + "learning_rate": 0.0007084028541336413, + "loss": 0.98712462, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.87939453, + "step": 1987, + "time_per_iteration": 2.6981115341186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187219, + "balance_loss_mlp": 1.09909916, + "epoch": 0.38245479030396307, + "flos": 615066802176.0, + "grad_norm": 0.02853553744793089, + "language_loss": 0.9291808, + "learning_rate": 0.0007081196231082942, + "loss": 0.94105303, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.8828125, + "step": 1988, + "time_per_iteration": 2.7912278175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.09851646, + "epoch": 0.38264717198922665, + "flos": 669303458304.0, + "grad_norm": 0.029318681320032423, + "language_loss": 0.88455558, + "learning_rate": 0.0007078363112837436, + "loss": 0.89642197, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.8828125, + "step": 1989, + "time_per_iteration": 2.8133885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187352, + "balance_loss_mlp": 1.09927964, + "epoch": 0.3828395536744902, + "flos": 455686364160.0, + "grad_norm": 0.029265262626364436, + "language_loss": 0.9249233, + "learning_rate": 0.000707552918769981, + "loss": 0.93679678, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.88232422, + "step": 1990, + "time_per_iteration": 2.538587808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180802, + "balance_loss_mlp": 1.09277809, + "epoch": 0.3830319353597538, + "flos": 500482197504.0, + "grad_norm": 0.02588536582900798, + "language_loss": 0.91112638, + "learning_rate": 0.000707269445677029, + "loss": 0.92293441, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.88183594, + "step": 1991, + "time_per_iteration": 2.7578041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183391, + "balance_loss_mlp": 1.09536684, + "epoch": 0.3832243170450173, + "flos": 745466035200.0, + "grad_norm": 0.02707218781991338, + "language_loss": 0.91718936, + "learning_rate": 0.0007069858921149416, + "loss": 0.92902327, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.88183594, + "step": 1992, + "time_per_iteration": 2.948418617248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184259, + "balance_loss_mlp": 1.09613955, + "epoch": 0.3834166987302809, + "flos": 579345219072.0, + "grad_norm": 0.02587271093699699, + "language_loss": 0.92343616, + "learning_rate": 0.0007067022581938043, + "loss": 0.93527877, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.8828125, + "step": 1993, + "time_per_iteration": 2.881967782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09965289, + "epoch": 0.3836090804155444, + "flos": 537608397312.0, + "grad_norm": 0.029882536442049617, + "language_loss": 0.91833031, + "learning_rate": 0.0007064185440237334, + "loss": 0.9302085, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.88330078, + "step": 1994, + "time_per_iteration": 2.7481510639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.10189474, + "epoch": 0.383801462100808, + "flos": 603051061248.0, + "grad_norm": 0.027232179622410133, + "language_loss": 0.91516536, + "learning_rate": 0.0007061347497148764, + "loss": 0.92706549, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.8828125, + "step": 1995, + "time_per_iteration": 2.762807846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191619, + "balance_loss_mlp": 1.10321367, + "epoch": 0.38399384378607154, + "flos": 573798610944.0, + "grad_norm": 0.03191203592253993, + "language_loss": 0.9478448, + "learning_rate": 0.0007058508753774122, + "loss": 0.95976096, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.88476562, + "step": 1996, + "time_per_iteration": 2.7208473682403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185202, + "balance_loss_mlp": 1.09708297, + "epoch": 0.38418622547133513, + "flos": 537779586048.0, + "grad_norm": 0.03234926235653744, + "language_loss": 0.93760306, + "learning_rate": 0.0007055669211215505, + "loss": 0.94945514, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.8828125, + "step": 1997, + "time_per_iteration": 2.6605474948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182194, + "balance_loss_mlp": 1.09397876, + "epoch": 0.3843786071565987, + "flos": 574013460480.0, + "grad_norm": 0.03558568539094479, + "language_loss": 0.86620909, + "learning_rate": 0.0007052828870575322, + "loss": 0.87803102, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.88378906, + "step": 1998, + "time_per_iteration": 2.6478962898254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179215, + "balance_loss_mlp": 1.09100008, + "epoch": 0.38457098884186225, + "flos": 730079104512.0, + "grad_norm": 0.027610192556292087, + "language_loss": 0.94167769, + "learning_rate": 0.0007049987732956291, + "loss": 0.95346981, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.88378906, + "step": 1999, + "time_per_iteration": 2.9643850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190926, + "balance_loss_mlp": 1.10199583, + "epoch": 0.38476337052712584, + "flos": 584620581888.0, + "grad_norm": 0.023866575274933036, + "language_loss": 0.8787694, + "learning_rate": 0.0007047145799461439, + "loss": 0.89067864, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.88720703, + "step": 2000, + "time_per_iteration": 2.8542819023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191076, + "balance_loss_mlp": 1.10200322, + "epoch": 0.38495575221238937, + "flos": 554158898688.0, + "grad_norm": 0.025960095413567152, + "language_loss": 0.89154112, + "learning_rate": 0.00070443030711941, + "loss": 0.90345186, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.88867188, + "step": 2001, + "time_per_iteration": 2.770023822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189246, + "balance_loss_mlp": 1.10084057, + "epoch": 0.38514813389765296, + "flos": 655676983296.0, + "grad_norm": 0.026490656569535233, + "language_loss": 0.88696259, + "learning_rate": 0.0007041459549257924, + "loss": 0.89885509, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.88476562, + "step": 2002, + "time_per_iteration": 4.357714414596558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_mlp": 1.09392142, + "epoch": 0.3853405155829165, + "flos": 869645913600.0, + "grad_norm": 0.03138294802585753, + "language_loss": 0.86704218, + "learning_rate": 0.0007038615234756859, + "loss": 0.87886453, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.88476562, + "step": 2003, + "time_per_iteration": 3.154315233230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09135854, + "epoch": 0.3855328972681801, + "flos": 547468185600.0, + "grad_norm": 0.030993794918127784, + "language_loss": 0.91032863, + "learning_rate": 0.000703577012879517, + "loss": 0.92212439, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.88378906, + "step": 2004, + "time_per_iteration": 2.6320230960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184907, + "balance_loss_mlp": 1.09673953, + "epoch": 0.3857252789534436, + "flos": 535098607104.0, + "grad_norm": 0.029525133384240967, + "language_loss": 0.9687134, + "learning_rate": 0.0007032924232477423, + "loss": 0.98056245, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.88330078, + "step": 2005, + "time_per_iteration": 2.650982618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184324, + "balance_loss_mlp": 1.09630013, + "epoch": 0.3859176606387072, + "flos": 492766901760.0, + "grad_norm": 0.029334702789067958, + "language_loss": 0.8823278, + "learning_rate": 0.0007030077546908493, + "loss": 0.89417106, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.88183594, + "step": 2006, + "time_per_iteration": 2.642333745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203979, + "balance_loss_mlp": 1.11700439, + "epoch": 0.3861100423239708, + "flos": 1490155991040.0, + "grad_norm": 0.02217822259323008, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84268641, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.87109375, + "step": 2007, + "time_per_iteration": 4.759521961212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184336, + "balance_loss_mlp": 1.09635913, + "epoch": 0.3863024240092343, + "flos": 474692261376.0, + "grad_norm": 0.030825589148035897, + "language_loss": 0.87378025, + "learning_rate": 0.0007024381812438117, + "loss": 0.88562357, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.88134766, + "step": 2008, + "time_per_iteration": 2.5227372646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184691, + "balance_loss_mlp": 1.09728634, + "epoch": 0.3864948056944979, + "flos": 717978769920.0, + "grad_norm": 0.032935981886219476, + "language_loss": 0.91112518, + "learning_rate": 0.0007021532765747951, + "loss": 0.92297208, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.87548828, + "step": 2009, + "time_per_iteration": 2.963550567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182737, + "balance_loss_mlp": 1.0952853, + "epoch": 0.38668718737976143, + "flos": 728954465280.0, + "grad_norm": 0.030267959416106823, + "language_loss": 0.86631739, + "learning_rate": 0.0007018682934229162, + "loss": 0.87814474, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.87597656, + "step": 2010, + "time_per_iteration": 2.955132246017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179617, + "balance_loss_mlp": 1.09235525, + "epoch": 0.386879569065025, + "flos": 526488984576.0, + "grad_norm": 0.02588052645359636, + "language_loss": 0.89375025, + "learning_rate": 0.0007015832318988152, + "loss": 0.90554643, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.87402344, + "step": 2011, + "time_per_iteration": 2.612443208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117942, + "balance_loss_mlp": 1.09454346, + "epoch": 0.38707195075028855, + "flos": 1530724512768.0, + "grad_norm": 0.010241364382771095, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.75069499, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.84960938, + "step": 2012, + "time_per_iteration": 4.952507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187813, + "balance_loss_mlp": 1.10040927, + "epoch": 0.38726433243555214, + "flos": 558385483776.0, + "grad_norm": 0.026729103388188073, + "language_loss": 0.89776802, + "learning_rate": 0.0007010128741766604, + "loss": 0.90964615, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.87548828, + "step": 2013, + "time_per_iteration": 2.759916067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184734, + "balance_loss_mlp": 1.09756815, + "epoch": 0.38745671412081567, + "flos": 554755783680.0, + "grad_norm": 0.0314384592840016, + "language_loss": 0.91517645, + "learning_rate": 0.0007007275782000391, + "loss": 0.92702377, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.87304688, + "step": 2014, + "time_per_iteration": 2.6659133434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181864, + "balance_loss_mlp": 1.09469819, + "epoch": 0.38764909580607926, + "flos": 459344262144.0, + "grad_norm": 0.028810992523736655, + "language_loss": 0.92611015, + "learning_rate": 0.0007004422042940605, + "loss": 0.9379288, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.87304688, + "step": 2015, + "time_per_iteration": 2.4901411533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180932, + "balance_loss_mlp": 1.09376657, + "epoch": 0.38784147749134285, + "flos": 523258784256.0, + "grad_norm": 0.030339968140386194, + "language_loss": 0.98432136, + "learning_rate": 0.0007001567525695169, + "loss": 0.99613065, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.87304688, + "step": 2016, + "time_per_iteration": 2.605134963989258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182969, + "balance_loss_mlp": 1.09575546, + "epoch": 0.3880338591766064, + "flos": 667400011776.0, + "grad_norm": 0.023304348995526428, + "language_loss": 0.90603948, + "learning_rate": 0.0006998712231372303, + "loss": 0.91786909, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.87353516, + "step": 2017, + "time_per_iteration": 2.9866511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187647, + "balance_loss_mlp": 1.10024321, + "epoch": 0.38822624086186996, + "flos": 595175310336.0, + "grad_norm": 0.027834044235160192, + "language_loss": 0.92810535, + "learning_rate": 0.0006995856161080532, + "loss": 0.93998176, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.87548828, + "step": 2018, + "time_per_iteration": 2.8917806148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181908, + "balance_loss_mlp": 1.09426534, + "epoch": 0.3884186225471335, + "flos": 613681651200.0, + "grad_norm": 0.030912624722110756, + "language_loss": 0.90135586, + "learning_rate": 0.0006992999315928679, + "loss": 0.91317499, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.87792969, + "step": 2019, + "time_per_iteration": 2.821570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179846, + "balance_loss_mlp": 1.0924896, + "epoch": 0.3886110042323971, + "flos": 608243831808.0, + "grad_norm": 0.025167723735071885, + "language_loss": 0.91748118, + "learning_rate": 0.0006990141697025871, + "loss": 0.92927969, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.875, + "step": 2020, + "time_per_iteration": 2.774073600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181915, + "balance_loss_mlp": 1.09684753, + "epoch": 0.3888033859176606, + "flos": 1531193869824.0, + "grad_norm": 0.011544022481713089, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77541554, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.8515625, + "step": 2021, + "time_per_iteration": 4.741650581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174887, + "balance_loss_mlp": 1.08734, + "epoch": 0.3889957676029242, + "flos": 693671313408.0, + "grad_norm": 0.03334226176751645, + "language_loss": 0.90383756, + "learning_rate": 0.0006984424142405392, + "loss": 0.91558647, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.87695312, + "step": 2022, + "time_per_iteration": 2.839838981628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174992, + "balance_loss_mlp": 1.08734977, + "epoch": 0.3891881492881878, + "flos": 516194767872.0, + "grad_norm": 0.031660307701904165, + "language_loss": 0.90829813, + "learning_rate": 0.0006981564208907474, + "loss": 0.92004812, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.87792969, + "step": 2023, + "time_per_iteration": 2.6160523891448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179623, + "balance_loss_mlp": 1.09178972, + "epoch": 0.3893805309734513, + "flos": 630175756800.0, + "grad_norm": 0.02822603249283798, + "language_loss": 0.96692258, + "learning_rate": 0.0006978703506098102, + "loss": 0.97871882, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.87988281, + "step": 2024, + "time_per_iteration": 2.770775556564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177682, + "balance_loss_mlp": 1.08994389, + "epoch": 0.3895729126587149, + "flos": 545206172160.0, + "grad_norm": 0.026225366557941037, + "language_loss": 0.95314252, + "learning_rate": 0.00069758420350879, + "loss": 0.96491939, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.87890625, + "step": 2025, + "time_per_iteration": 2.615687608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179844, + "balance_loss_mlp": 1.09201062, + "epoch": 0.38976529434397844, + "flos": 619406178816.0, + "grad_norm": 0.03181269468531491, + "language_loss": 0.9379099, + "learning_rate": 0.000697297979698779, + "loss": 0.94970834, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.87988281, + "step": 2026, + "time_per_iteration": 2.723860740661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187768, + "balance_loss_mlp": 1.10007727, + "epoch": 0.38995767602924203, + "flos": 836344797696.0, + "grad_norm": 0.025703512313876988, + "language_loss": 0.89683533, + "learning_rate": 0.0006970116792908992, + "loss": 0.90871298, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.87841797, + "step": 2027, + "time_per_iteration": 3.0871434211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117977, + "balance_loss_mlp": 1.09203207, + "epoch": 0.39015005771450556, + "flos": 542646716928.0, + "grad_norm": 0.03022946762166595, + "language_loss": 0.88945854, + "learning_rate": 0.000696725302396302, + "loss": 0.9012562, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.87890625, + "step": 2028, + "time_per_iteration": 2.632178783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174959, + "balance_loss_mlp": 1.0871253, + "epoch": 0.39034243939976915, + "flos": 1009140864000.0, + "grad_norm": 0.026055335602768993, + "language_loss": 0.92111158, + "learning_rate": 0.0006964388491261692, + "loss": 0.93286121, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.87988281, + "step": 2029, + "time_per_iteration": 3.2683680057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174119, + "balance_loss_mlp": 1.08633304, + "epoch": 0.3905348210850327, + "flos": 680240222208.0, + "grad_norm": 0.029787695509808892, + "language_loss": 0.96251416, + "learning_rate": 0.0006961523195917114, + "loss": 0.97425532, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.87939453, + "step": 2030, + "time_per_iteration": 2.807161331176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182527, + "balance_loss_mlp": 1.09459865, + "epoch": 0.39072720277029627, + "flos": 549988709376.0, + "grad_norm": 0.03099080969443711, + "language_loss": 0.86433041, + "learning_rate": 0.0006958657139041696, + "loss": 0.87615567, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.88085938, + "step": 2031, + "time_per_iteration": 2.728208065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119693, + "balance_loss_mlp": 1.11052704, + "epoch": 0.39091958445555985, + "flos": 1551051159552.0, + "grad_norm": 0.01789751173127641, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77909899, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.86523438, + "step": 2032, + "time_per_iteration": 4.911708354949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09193051, + "epoch": 0.3911119661408234, + "flos": 505051886592.0, + "grad_norm": 0.03095157096826047, + "language_loss": 0.85940099, + "learning_rate": 0.0006952922745149434, + "loss": 0.87119675, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.87792969, + "step": 2033, + "time_per_iteration": 2.649538040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_mlp": 1.08903146, + "epoch": 0.391304347826087, + "flos": 558329088000.0, + "grad_norm": 0.028319463440814277, + "language_loss": 0.94666743, + "learning_rate": 0.000695005441035888, + "loss": 0.95843232, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.87597656, + "step": 2034, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178574, + "balance_loss_mlp": 1.09293365, + "epoch": 0.3914967295113505, + "flos": 1502941807104.0, + "grad_norm": 0.0063133772361172544, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7490201, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.85742188, + "step": 2035, + "time_per_iteration": 4.863725423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180506, + "balance_loss_mlp": 1.09338748, + "epoch": 0.3916891111966141, + "flos": 708329101824.0, + "grad_norm": 0.025753563122139746, + "language_loss": 0.86980474, + "learning_rate": 0.0006944315470656863, + "loss": 0.88160974, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.87255859, + "step": 2036, + "time_per_iteration": 2.936588764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188418, + "balance_loss_mlp": 1.10110939, + "epoch": 0.3918814928818776, + "flos": 557408564736.0, + "grad_norm": 0.031943380680049066, + "language_loss": 0.99613088, + "learning_rate": 0.000694144486797345, + "loss": 1.00801504, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.87451172, + "step": 2037, + "time_per_iteration": 2.676107883453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193756, + "balance_loss_mlp": 1.10868835, + "epoch": 0.3920738745671412, + "flos": 1541685471744.0, + "grad_norm": 0.012882287356254449, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8071419, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.8515625, + "step": 2038, + "time_per_iteration": 4.63246750831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178826, + "balance_loss_mlp": 1.0916127, + "epoch": 0.39226625625240474, + "flos": 499804721664.0, + "grad_norm": 0.027391930017631044, + "language_loss": 0.96627682, + "learning_rate": 0.0006935701402514156, + "loss": 0.97806513, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.87353516, + "step": 2039, + "time_per_iteration": 2.5613086223602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177521, + "balance_loss_mlp": 1.092453, + "epoch": 0.39245863793766833, + "flos": 1350450920448.0, + "grad_norm": 0.011737641894846437, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74212414, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.8515625, + "step": 2040, + "time_per_iteration": 4.902123689651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176176, + "balance_loss_mlp": 1.08881962, + "epoch": 0.3926510196229319, + "flos": 1348114142208.0, + "grad_norm": 0.028665962134257456, + "language_loss": 0.92107272, + "learning_rate": 0.0006929954931031422, + "loss": 0.93283451, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.875, + "step": 2041, + "time_per_iteration": 3.7387020587921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.08902013, + "epoch": 0.39284340130819545, + "flos": 500603721216.0, + "grad_norm": 0.024641039111334598, + "language_loss": 0.95021844, + "learning_rate": 0.0006927080570819805, + "loss": 0.96198076, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.87353516, + "step": 2042, + "time_per_iteration": 2.5837514400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117531, + "balance_loss_mlp": 1.08814418, + "epoch": 0.39303578299345904, + "flos": 521341876224.0, + "grad_norm": 0.03605238478740547, + "language_loss": 0.89998531, + "learning_rate": 0.0006924205462449161, + "loss": 0.9117384, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.87304688, + "step": 2043, + "time_per_iteration": 2.560842514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.08664155, + "epoch": 0.39322816467872257, + "flos": 909537686016.0, + "grad_norm": 0.029197625514705252, + "language_loss": 0.89668262, + "learning_rate": 0.0006921329607035702, + "loss": 0.90841925, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.87158203, + "step": 2044, + "time_per_iteration": 3.2215418815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185916, + "balance_loss_mlp": 1.09860718, + "epoch": 0.39342054636398616, + "flos": 518641431552.0, + "grad_norm": 0.026194219642157263, + "language_loss": 0.94294739, + "learning_rate": 0.0006918453005695938, + "loss": 0.95480657, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.87451172, + "step": 2045, + "time_per_iteration": 2.637197732925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183114, + "balance_loss_mlp": 1.09594774, + "epoch": 0.3936129280492497, + "flos": 549011790336.0, + "grad_norm": 0.026944227420126074, + "language_loss": 0.91576457, + "learning_rate": 0.0006915575659546662, + "loss": 0.92759573, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.87304688, + "step": 2046, + "time_per_iteration": 2.7570858001708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185485, + "balance_loss_mlp": 1.098176, + "epoch": 0.3938053097345133, + "flos": 527140263936.0, + "grad_norm": 0.02948359624940754, + "language_loss": 0.88347399, + "learning_rate": 0.0006912697569704959, + "loss": 0.89532876, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.87451172, + "step": 2047, + "time_per_iteration": 2.635467290878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09899104, + "epoch": 0.39399769141977686, + "flos": 472588701696.0, + "grad_norm": 0.02995196024762557, + "language_loss": 0.93503523, + "learning_rate": 0.0006909818737288205, + "loss": 0.94689775, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.87402344, + "step": 2048, + "time_per_iteration": 2.558013916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181668, + "balance_loss_mlp": 1.09488404, + "epoch": 0.3941900731050404, + "flos": 502726746624.0, + "grad_norm": 0.02878603575662113, + "language_loss": 0.88763595, + "learning_rate": 0.000690693916341406, + "loss": 0.89945263, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.86914062, + "step": 2049, + "time_per_iteration": 2.5820720195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178505, + "balance_loss_mlp": 1.09152949, + "epoch": 0.394382454790304, + "flos": 582006732288.0, + "grad_norm": 0.024885306311727563, + "language_loss": 0.90003175, + "learning_rate": 0.0006904058849200475, + "loss": 0.91181684, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.87109375, + "step": 2050, + "time_per_iteration": 2.7304697036743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118427, + "balance_loss_mlp": 1.09700906, + "epoch": 0.3945748364755675, + "flos": 514844545536.0, + "grad_norm": 0.02745844528377672, + "language_loss": 0.91741204, + "learning_rate": 0.0006901177795765683, + "loss": 0.92925465, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.87402344, + "step": 2051, + "time_per_iteration": 2.610621213912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180664, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3947672181608311, + "flos": 595057789440.0, + "grad_norm": 0.03028158635704326, + "language_loss": 0.89240891, + "learning_rate": 0.0006898296004228213, + "loss": 0.90421557, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.87109375, + "step": 2052, + "time_per_iteration": 2.747377395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119046, + "balance_loss_mlp": 1.10634613, + "epoch": 0.39495959984609463, + "flos": 1551049158144.0, + "grad_norm": 0.018267218432335405, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.793172, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.84179688, + "step": 2053, + "time_per_iteration": 4.871596336364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117553, + "balance_loss_mlp": 1.08845937, + "epoch": 0.3951519815313582, + "flos": 497523242496.0, + "grad_norm": 0.028876315996474663, + "language_loss": 0.87133646, + "learning_rate": 0.0006892530211320763, + "loss": 0.88309175, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.87207031, + "step": 2054, + "time_per_iteration": 2.696796417236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117541, + "balance_loss_mlp": 1.08824456, + "epoch": 0.39534436321662175, + "flos": 532222244352.0, + "grad_norm": 0.031248767008087052, + "language_loss": 0.9121244, + "learning_rate": 0.000688964621218926, + "loss": 0.92387855, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.87304688, + "step": 2055, + "time_per_iteration": 2.6398446559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176401, + "balance_loss_mlp": 1.08899677, + "epoch": 0.39553674490188534, + "flos": 703724484096.0, + "grad_norm": 0.031024749515969993, + "language_loss": 0.88066703, + "learning_rate": 0.0006886761479432037, + "loss": 0.89243108, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.87548828, + "step": 2056, + "time_per_iteration": 2.896899700164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184707, + "balance_loss_mlp": 1.09720743, + "epoch": 0.3957291265871489, + "flos": 410656215552.0, + "grad_norm": 0.031805347037857014, + "language_loss": 0.92354834, + "learning_rate": 0.0006883876014169045, + "loss": 0.93539548, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.87646484, + "step": 2057, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118858, + "balance_loss_mlp": 1.10108006, + "epoch": 0.39592150827241246, + "flos": 619638492672.0, + "grad_norm": 0.03245947566344542, + "language_loss": 0.97519982, + "learning_rate": 0.000688098981752052, + "loss": 0.98708564, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.87646484, + "step": 2058, + "time_per_iteration": 2.7079999446868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183973, + "balance_loss_mlp": 1.09642518, + "epoch": 0.39611388995767605, + "flos": 822720324096.0, + "grad_norm": 0.029593298786174956, + "language_loss": 0.88381338, + "learning_rate": 0.0006878102890606982, + "loss": 0.89565313, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.87695312, + "step": 2059, + "time_per_iteration": 3.089268922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182646, + "balance_loss_mlp": 1.09524131, + "epoch": 0.3963062716429396, + "flos": 493214065152.0, + "grad_norm": 0.03350279358204369, + "language_loss": 0.88991904, + "learning_rate": 0.0006875215234549239, + "loss": 0.9017455, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.87548828, + "step": 2060, + "time_per_iteration": 2.538806200027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182648, + "balance_loss_mlp": 1.09533882, + "epoch": 0.39649865332820317, + "flos": 585833817600.0, + "grad_norm": 0.030947291001002426, + "language_loss": 0.93147129, + "learning_rate": 0.0006872326850468376, + "loss": 0.9432978, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.87451172, + "step": 2061, + "time_per_iteration": 2.6593003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179357, + "balance_loss_mlp": 1.09214342, + "epoch": 0.3966910350134667, + "flos": 459511448064.0, + "grad_norm": 0.03264577108022065, + "language_loss": 0.89072591, + "learning_rate": 0.0006869437739485762, + "loss": 0.90251946, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.87353516, + "step": 2062, + "time_per_iteration": 2.605191230773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180604, + "balance_loss_mlp": 1.0932951, + "epoch": 0.3968834166987303, + "flos": 509614844928.0, + "grad_norm": 0.02743430972643364, + "language_loss": 0.9889155, + "learning_rate": 0.0006866547902723053, + "loss": 1.00072145, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.87451172, + "step": 2063, + "time_per_iteration": 2.6466383934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178614, + "balance_loss_mlp": 1.09116209, + "epoch": 0.3970757983839938, + "flos": 573742215168.0, + "grad_norm": 0.030016333454088624, + "language_loss": 0.87640852, + "learning_rate": 0.000686365734130218, + "loss": 0.88819462, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.87597656, + "step": 2064, + "time_per_iteration": 2.6795899868011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178875, + "balance_loss_mlp": 1.09161353, + "epoch": 0.3972681800692574, + "flos": 482585476608.0, + "grad_norm": 0.03115409384976, + "language_loss": 0.90479839, + "learning_rate": 0.000686076605634536, + "loss": 0.91658711, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.87402344, + "step": 2065, + "time_per_iteration": 2.6956639289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176026, + "balance_loss_mlp": 1.0887177, + "epoch": 0.397460561754521, + "flos": 488904887808.0, + "grad_norm": 0.028660372999824147, + "language_loss": 0.91924292, + "learning_rate": 0.0006857874048975088, + "loss": 0.93100321, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.87451172, + "step": 2066, + "time_per_iteration": 2.541707992553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182319, + "balance_loss_mlp": 1.09515274, + "epoch": 0.3976529434397845, + "flos": 422895538176.0, + "grad_norm": 0.03007540042591745, + "language_loss": 0.93814421, + "learning_rate": 0.0006854981320314142, + "loss": 0.94996738, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.87304688, + "step": 2067, + "time_per_iteration": 2.455916166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118284, + "balance_loss_mlp": 1.09586513, + "epoch": 0.3978453251250481, + "flos": 546621522432.0, + "grad_norm": 0.0330596148196893, + "language_loss": 0.94973123, + "learning_rate": 0.0006852087871485579, + "loss": 0.96155965, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.87109375, + "step": 2068, + "time_per_iteration": 2.609492063522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175372, + "balance_loss_mlp": 1.08801544, + "epoch": 0.39803770681031164, + "flos": 652001620992.0, + "grad_norm": 0.0336676185790188, + "language_loss": 0.8912071, + "learning_rate": 0.0006849193703612735, + "loss": 0.90296078, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.875, + "step": 2069, + "time_per_iteration": 2.816309690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.09071827, + "epoch": 0.39823008849557523, + "flos": 741426101760.0, + "grad_norm": 0.026625397702565265, + "language_loss": 0.84925234, + "learning_rate": 0.0006846298817819225, + "loss": 0.86102879, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.87060547, + "step": 2070, + "time_per_iteration": 2.9875504970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175555, + "balance_loss_mlp": 1.088485, + "epoch": 0.39842247018083876, + "flos": 385888860672.0, + "grad_norm": 0.03226539532166374, + "language_loss": 0.89664173, + "learning_rate": 0.0006843403215228945, + "loss": 0.90839732, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.87207031, + "step": 2071, + "time_per_iteration": 2.4326088428497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173604, + "balance_loss_mlp": 1.08648539, + "epoch": 0.39861485186610235, + "flos": 534762233856.0, + "grad_norm": 0.028550920618746804, + "language_loss": 0.88238078, + "learning_rate": 0.0006840506896966065, + "loss": 0.89411676, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.87255859, + "step": 2072, + "time_per_iteration": 2.6961326599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_mlp": 1.09084272, + "epoch": 0.39880723355136594, + "flos": 644412578304.0, + "grad_norm": 0.03366874484709253, + "language_loss": 0.90951228, + "learning_rate": 0.0006837609864155038, + "loss": 0.9212895, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.87011719, + "step": 2073, + "time_per_iteration": 2.8584561347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119321, + "balance_loss_mlp": 1.10623515, + "epoch": 0.39899961523662947, + "flos": 516891709440.0, + "grad_norm": 0.031985803275243696, + "language_loss": 0.90341693, + "learning_rate": 0.0006834712117920592, + "loss": 0.91534901, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.87109375, + "step": 2074, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186501, + "balance_loss_mlp": 1.09933496, + "epoch": 0.39919199692189306, + "flos": 465338033664.0, + "grad_norm": 0.0320663192521817, + "language_loss": 0.92968071, + "learning_rate": 0.0006831813659387729, + "loss": 0.94154572, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.87304688, + "step": 2075, + "time_per_iteration": 2.5216238498687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184926, + "balance_loss_mlp": 1.09785569, + "epoch": 0.3993843786071566, + "flos": 532678139904.0, + "grad_norm": 0.03441409861038799, + "language_loss": 0.91210699, + "learning_rate": 0.0006828914489681733, + "loss": 0.92395616, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.87207031, + "step": 2076, + "time_per_iteration": 2.686810255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186966, + "balance_loss_mlp": 1.10008633, + "epoch": 0.3995767602924202, + "flos": 505023688704.0, + "grad_norm": 0.02837279486305722, + "language_loss": 0.91445708, + "learning_rate": 0.0006826014609928162, + "loss": 0.92632675, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.87011719, + "step": 2077, + "time_per_iteration": 2.6775381565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225517, + "balance_loss_mlp": 1.13892365, + "epoch": 0.3997691419776837, + "flos": 1457471225856.0, + "grad_norm": 0.023004253676312834, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84424907, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.8671875, + "step": 2078, + "time_per_iteration": 4.87092661857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117794, + "balance_loss_mlp": 1.09134626, + "epoch": 0.3999615236629473, + "flos": 531755615232.0, + "grad_norm": 0.028989200184594895, + "language_loss": 0.86860782, + "learning_rate": 0.0006820212724781896, + "loss": 0.88038719, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.8671875, + "step": 2079, + "time_per_iteration": 2.6908116340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176834, + "balance_loss_mlp": 1.09033561, + "epoch": 0.4001539053482108, + "flos": 696361024512.0, + "grad_norm": 0.02837619494351951, + "language_loss": 0.90808308, + "learning_rate": 0.0006817310721641694, + "loss": 0.91985142, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.86621094, + "step": 2080, + "time_per_iteration": 2.8117949962615967 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4728127731400704.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/training_args.bin b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dec1b7e0db130318069c72434f32c2789119b732 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c077e5103b778b39b648e3a5a2e73e36256d052f444290e14e15f87c36156cb +size 7992 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/zero_to_fp32.py b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-2080/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/added_tokens.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..987150c78c9255ac53c0408588036e10466fc436 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_perturbed", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/generation_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c22e2e20866d271ba7171f97e3a4bbb2fc7f0496 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4c3041a365fe2776363d81ad1956c5456b524facf26a4702c49f689e5557c1 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e71e86f9a51b55a1b6aa769bf86658ea057b5054 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82dac5f866d55a8052897b2a035088f7c296af11a2389ac4b0e684a6ac94c286 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cf34b7e1720090ef5769cd9e4af50216f2fee11 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0851091d869cb93547ef4a41aac7c0e32454b561fb7a1adce76f1820ea18df80 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..106340460d93528b36dd6cd712b652414dd02487 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d1db55d209a60016d59e501f4e73f4eb7b30ba832004c668a3f30ad4c84ca32 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b276a5da52b88359353608cb801efbcc7aa5fe0e --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56100fb6feafb603bfdc4b3ab9bfb8f3229269007fec674553ba6ec4be6be8fa +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81642fa8bdd94936a7b2f903d91d8df787bb790e --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9afda3d6cd8fc8c7dff99096c7b8354f9b0966e01df2839443b35e55b245a826 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbad277bdbad9120831bfbb44e07e20f4aa6baf0 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0632eae5e94ac4f4860cc2c0a6b200421dda6a26981a1b366efb02dd7a7f304 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdd5fcd428db463876063b385c771e5a3a704055 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9521218eb326a6355940ef82de9ef69f9d38ce5ae31d8645f30a298a310a729b +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/latest b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/latest new file mode 100644 index 0000000000000000000000000000000000000000..804da059f781bacb3f274fb2103e4bc7f9bb7407 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/latest @@ -0,0 +1 @@ +global_step3120 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..62ce7e51575ebe4e4eb74816529035d7a2c07188 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91338e75344894f2ad76eea72388d75308388c071d06e956a71114d3759c79be +size 3759043888 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model.safetensors.index.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..01fe755c95da02467d97df3e39228dbbb26b065f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/model.safetensors.index.json @@ -0,0 +1,674 @@ +{ + "metadata": { + "total_size": 8731443232 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_0.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_1.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_2.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_3.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/special_tokens_map.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/tokenizer.model b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/tokenizer_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/trainer_state.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..248ab72bc27e1b8b2bdfe23631ce564d500527fd --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/trainer_state.json @@ -0,0 +1,46833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002308580223162, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02574398, + "balance_loss_mlp": 1.85189414, + "epoch": 0.00019238168526356292, + "flos": 471022176768.0, + "grad_norm": 12.86455737221305, + "language_loss": 2.79777646, + "learning_rate": 0.0, + "loss": 1.8614465, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 7.2109375, + "step": 1, + "time_per_iteration": 21.83068585395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02254613, + "balance_loss_mlp": 1.76785779, + "epoch": 0.00038476337052712584, + "flos": 505537981440.0, + "grad_norm": 51.581369656319104, + "language_loss": 12.34714699, + "learning_rate": 0.00013726078121135892, + "loss": 12.3696928, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 4.875, + "step": 2, + "time_per_iteration": 2.6192572116851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235864, + "balance_loss_mlp": 1.75177932, + "epoch": 0.0005771450557906887, + "flos": 600333152256.0, + "grad_norm": 53.41660983156924, + "language_loss": 12.32898235, + "learning_rate": 0.00021755319103969496, + "loss": 12.35134125, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 4.84765625, + "step": 3, + "time_per_iteration": 2.887979030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02281771, + "balance_loss_mlp": 1.79577887, + "epoch": 0.0007695267410542517, + "flos": 581496442368.0, + "grad_norm": 15.812083363335244, + "language_loss": 9.24414825, + "learning_rate": 0.00027452156242271784, + "loss": 9.26696682, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 4.8671875, + "step": 4, + "time_per_iteration": 2.6792547702789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02454864, + "balance_loss_mlp": 1.95551991, + "epoch": 0.0009619084263178145, + "flos": 487153164288.0, + "grad_norm": 10.3691594005885, + "language_loss": 9.1886158, + "learning_rate": 0.0003187096642208417, + "loss": 9.21316433, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 4.98828125, + "step": 5, + "time_per_iteration": 2.627883195877075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0247156, + "balance_loss_mlp": 1.97450531, + "epoch": 0.0011542901115813775, + "flos": 561166519296.0, + "grad_norm": 9.061082825397735, + "language_loss": 9.31672573, + "learning_rate": 0.0003548139722510539, + "loss": 9.34144115, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 4.96875, + "step": 6, + "time_per_iteration": 2.697327136993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02496704, + "balance_loss_mlp": 1.9977417, + "epoch": 0.0013466717968449403, + "flos": 534950886912.0, + "grad_norm": 5.1401213461899875, + "language_loss": 8.45638084, + "learning_rate": 0.00038533972973918044, + "loss": 8.48134804, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 4.984375, + "step": 7, + "time_per_iteration": 2.6605119705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02367166, + "balance_loss_mlp": 1.8800292, + "epoch": 0.0015390534821085034, + "flos": 493333587456.0, + "grad_norm": 4.765795170053606, + "language_loss": 7.86978722, + "learning_rate": 0.0004117823436340768, + "loss": 7.89345884, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 4.87890625, + "step": 8, + "time_per_iteration": 2.60813570022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02377529, + "balance_loss_mlp": 1.89153647, + "epoch": 0.0017314351673720662, + "flos": 565775139840.0, + "grad_norm": 2.6394105736579268, + "language_loss": 7.60834789, + "learning_rate": 0.00043510638207938993, + "loss": 7.63212299, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 4.8671875, + "step": 9, + "time_per_iteration": 2.871943712234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0239868, + "balance_loss_mlp": 1.91802776, + "epoch": 0.001923816852635629, + "flos": 594508568064.0, + "grad_norm": 2.7082435786924752, + "language_loss": 7.06748104, + "learning_rate": 0.00045597044543220066, + "loss": 7.09146786, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 4.8125, + "step": 10, + "time_per_iteration": 2.671294689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02381293, + "balance_loss_mlp": 1.90254807, + "epoch": 0.002116198537899192, + "flos": 610894611456.0, + "grad_norm": 2.113301815517677, + "language_loss": 6.83692646, + "learning_rate": 0.00047484428652143135, + "loss": 6.86073971, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.79296875, + "step": 11, + "time_per_iteration": 2.885416269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02427226, + "balance_loss_mlp": 1.95687437, + "epoch": 0.002308580223162755, + "flos": 546174359040.0, + "grad_norm": 1.7416212933802626, + "language_loss": 6.4295001, + "learning_rate": 0.0004920747534624128, + "loss": 6.45377207, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.70703125, + "step": 12, + "time_per_iteration": 2.6201112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503769, + "balance_loss_mlp": 2.03265429, + "epoch": 0.002500961908426318, + "flos": 645923255808.0, + "grad_norm": 2.43618245016211, + "language_loss": 6.0048914, + "learning_rate": 0.0005079252465375872, + "loss": 6.02992916, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.71484375, + "step": 13, + "time_per_iteration": 2.852263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02634854, + "balance_loss_mlp": 2.15916157, + "epoch": 0.0026933435936898806, + "flos": 488848492032.0, + "grad_norm": 4.143842376760835, + "language_loss": 5.42230844, + "learning_rate": 0.0005226005109505393, + "loss": 5.44865704, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 4.76171875, + "step": 14, + "time_per_iteration": 2.5524611473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02844198, + "balance_loss_mlp": 2.3646903, + "epoch": 0.0028857252789534437, + "flos": 435525628416.0, + "grad_norm": 5.672862092220106, + "language_loss": 4.15845776, + "learning_rate": 0.0005362628552605367, + "loss": 4.18689966, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 4.80078125, + "step": 15, + "time_per_iteration": 2.7353649139404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03208902, + "balance_loss_mlp": 2.72252893, + "epoch": 0.0030781069642170067, + "flos": 597840826368.0, + "grad_norm": 3.947061509829782, + "language_loss": 2.26971245, + "learning_rate": 0.0005490431248454357, + "loss": 2.30180168, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 4.87109375, + "step": 16, + "time_per_iteration": 2.676703929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03601284, + "balance_loss_mlp": 3.10232162, + "epoch": 0.0032704886494805694, + "flos": 1541510280192.0, + "grad_norm": 0.6213816402988768, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.793064, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 5.0, + "step": 17, + "time_per_iteration": 6.1610119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334326, + "balance_loss_mlp": 2.85841203, + "epoch": 0.0034628703347441324, + "flos": 474970237440.0, + "grad_norm": 2.8341915883282045, + "language_loss": 1.71282685, + "learning_rate": 0.0005723671632907488, + "loss": 1.74625945, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 4.85546875, + "step": 18, + "time_per_iteration": 2.638371467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02881518, + "balance_loss_mlp": 2.39934015, + "epoch": 0.0036552520200076955, + "flos": 449477743104.0, + "grad_norm": 2.8867361132515086, + "language_loss": 1.68530536, + "learning_rate": 0.0005830738490244919, + "loss": 1.71412063, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.828125, + "step": 19, + "time_per_iteration": 2.56374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02402526, + "balance_loss_mlp": 1.92301893, + "epoch": 0.003847633705271258, + "flos": 637350563328.0, + "grad_norm": 0.6925173808128176, + "language_loss": 1.38203168, + "learning_rate": 0.0005932312266435596, + "loss": 1.406057, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.80078125, + "step": 20, + "time_per_iteration": 2.763998508453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02421171, + "balance_loss_mlp": 1.94814897, + "epoch": 0.004040015390534821, + "flos": 590590158336.0, + "grad_norm": 1.6265477944222306, + "language_loss": 1.40919662, + "learning_rate": 0.0006028929207788754, + "loss": 1.43340826, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.734375, + "step": 21, + "time_per_iteration": 2.746016502380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575294, + "balance_loss_mlp": 2.10036469, + "epoch": 0.004232397075798384, + "flos": 757865812992.0, + "grad_norm": 1.576079326940489, + "language_loss": 1.40810275, + "learning_rate": 0.0006121050677327902, + "loss": 1.43385565, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.75390625, + "step": 22, + "time_per_iteration": 2.9607386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550906, + "balance_loss_mlp": 2.07025433, + "epoch": 0.004424778761061947, + "flos": 527726415360.0, + "grad_norm": 0.6323448080178445, + "language_loss": 1.22419024, + "learning_rate": 0.0006209076479463684, + "loss": 1.24969923, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.8125, + "step": 23, + "time_per_iteration": 2.5966527462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02511897, + "balance_loss_mlp": 2.02285314, + "epoch": 0.00461716044632551, + "flos": 549217907712.0, + "grad_norm": 0.22573529074246063, + "language_loss": 1.26396596, + "learning_rate": 0.0006293355346737718, + "loss": 1.28908491, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.8984375, + "step": 24, + "time_per_iteration": 2.672264575958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02557217, + "balance_loss_mlp": 2.05978036, + "epoch": 0.004809542131589073, + "flos": 568751559168.0, + "grad_norm": 0.10471299124135865, + "language_loss": 1.20974565, + "learning_rate": 0.0006374193284416834, + "loss": 1.23531783, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.96875, + "step": 25, + "time_per_iteration": 2.7392375469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02658191, + "balance_loss_mlp": 2.15503263, + "epoch": 0.005001923816852636, + "flos": 471583584768.0, + "grad_norm": 0.16888144752152706, + "language_loss": 1.20314312, + "learning_rate": 0.0006451860277489461, + "loss": 1.22972512, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.02734375, + "step": 26, + "time_per_iteration": 2.6047253608703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02722422, + "balance_loss_mlp": 2.21582985, + "epoch": 0.005194305502116198, + "flos": 416380743168.0, + "grad_norm": 0.22424567034217777, + "language_loss": 1.28844571, + "learning_rate": 0.0006526595731190848, + "loss": 1.31566989, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.0625, + "step": 27, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02743244, + "balance_loss_mlp": 2.2351265, + "epoch": 0.005386687187379761, + "flos": 629995835904.0, + "grad_norm": 0.15642653525507078, + "language_loss": 1.18914986, + "learning_rate": 0.0006598612921618983, + "loss": 1.2165823, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.078125, + "step": 28, + "time_per_iteration": 2.8519153594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02748247, + "balance_loss_mlp": 2.24051118, + "epoch": 0.005579068872643324, + "flos": 888019997184.0, + "grad_norm": 0.1209301216257677, + "language_loss": 1.12191987, + "learning_rate": 0.0006668102665011454, + "loss": 1.14940238, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.07421875, + "step": 29, + "time_per_iteration": 3.2244889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02691091, + "balance_loss_mlp": 2.18411779, + "epoch": 0.005771450557906887, + "flos": 548657952768.0, + "grad_norm": 0.1098895199150706, + "language_loss": 1.21368051, + "learning_rate": 0.0006735236364718957, + "loss": 1.24059153, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.06640625, + "step": 30, + "time_per_iteration": 2.642730474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02653145, + "balance_loss_mlp": 2.14769816, + "epoch": 0.00596383224317045, + "flos": 533068907520.0, + "grad_norm": 0.11046596793449442, + "language_loss": 1.1970098, + "learning_rate": 0.0006800168558381346, + "loss": 1.22354114, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.05078125, + "step": 31, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0257592, + "balance_loss_mlp": 2.07123542, + "epoch": 0.0061562139284340135, + "flos": 590162460672.0, + "grad_norm": 0.10949645130098669, + "language_loss": 1.22987807, + "learning_rate": 0.0006863039060567947, + "loss": 1.25563729, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.04296875, + "step": 32, + "time_per_iteration": 2.733224868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505923, + "balance_loss_mlp": 2.00390816, + "epoch": 0.006348595613697576, + "flos": 619441107456.0, + "grad_norm": 0.0835016489973258, + "language_loss": 1.14437437, + "learning_rate": 0.0006923974775611263, + "loss": 1.16943359, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.015625, + "step": 33, + "time_per_iteration": 2.820788621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02482464, + "balance_loss_mlp": 1.98159432, + "epoch": 0.006540977298961139, + "flos": 779298908160.0, + "grad_norm": 0.08776573315434787, + "language_loss": 1.10869515, + "learning_rate": 0.0006983091239737814, + "loss": 1.13351965, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.00390625, + "step": 34, + "time_per_iteration": 2.9917590618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02373805, + "balance_loss_mlp": 1.87636864, + "epoch": 0.006733358984224702, + "flos": 668372201472.0, + "grad_norm": 0.0744368555221442, + "language_loss": 1.09626412, + "learning_rate": 0.0007040493939600222, + "loss": 1.12000227, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 4.96875, + "step": 35, + "time_per_iteration": 2.813040256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308046, + "balance_loss_mlp": 1.81175399, + "epoch": 0.006925740669488265, + "flos": 565495162368.0, + "grad_norm": 0.06560236116646054, + "language_loss": 1.0974791, + "learning_rate": 0.0007096279445021078, + "loss": 1.12055957, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 4.95703125, + "step": 36, + "time_per_iteration": 2.715013027191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02240602, + "balance_loss_mlp": 1.74888754, + "epoch": 0.007118122354751828, + "flos": 551111347200.0, + "grad_norm": 0.05581405617561486, + "language_loss": 1.16120386, + "learning_rate": 0.0007150536386503726, + "loss": 1.18360972, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.91015625, + "step": 37, + "time_per_iteration": 2.8262643814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218804, + "balance_loss_mlp": 1.7293781, + "epoch": 0.007310504040015391, + "flos": 703813807104.0, + "grad_norm": 0.06412720029508237, + "language_loss": 1.08394384, + "learning_rate": 0.0007203346302358509, + "loss": 1.10613179, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.890625, + "step": 38, + "time_per_iteration": 2.9149320125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0220325, + "balance_loss_mlp": 1.71954608, + "epoch": 0.007502885725278953, + "flos": 600500338176.0, + "grad_norm": 0.08018675586540955, + "language_loss": 1.13587177, + "learning_rate": 0.000725478437577282, + "loss": 1.15790427, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.84375, + "step": 39, + "time_per_iteration": 2.7649383544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194939, + "balance_loss_mlp": 1.71237946, + "epoch": 0.007695267410542516, + "flos": 561427031040.0, + "grad_norm": 0.11080304178085185, + "language_loss": 1.08546591, + "learning_rate": 0.0007304920078549186, + "loss": 1.10741532, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.83203125, + "step": 40, + "time_per_iteration": 2.7245187759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02164234, + "balance_loss_mlp": 1.68548942, + "epoch": 0.007887649095806078, + "flos": 509230808064.0, + "grad_norm": 0.12864951336881933, + "language_loss": 1.10053396, + "learning_rate": 0.0007353817735343603, + "loss": 1.12217629, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.79296875, + "step": 41, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02109951, + "balance_loss_mlp": 1.63425827, + "epoch": 0.008080030781069641, + "flos": 504904166400.0, + "grad_norm": 0.0888118324595499, + "language_loss": 1.05816543, + "learning_rate": 0.0007401537019902344, + "loss": 1.07926488, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.76171875, + "step": 42, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065976, + "balance_loss_mlp": 1.59219027, + "epoch": 0.008272412466333205, + "flos": 519106059264.0, + "grad_norm": 0.08974821197730459, + "language_loss": 1.0785954, + "learning_rate": 0.0007448133392900729, + "loss": 1.09925508, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.7421875, + "step": 43, + "time_per_iteration": 2.677175998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955434, + "balance_loss_mlp": 1.4839375, + "epoch": 0.008464794151596768, + "flos": 609183820800.0, + "grad_norm": 0.06237767914218564, + "language_loss": 1.03785229, + "learning_rate": 0.0007493658489441491, + "loss": 1.05740666, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.71875, + "step": 44, + "time_per_iteration": 2.8553237915039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01864539, + "balance_loss_mlp": 1.39800107, + "epoch": 0.00865717583686033, + "flos": 539006283264.0, + "grad_norm": 0.049849947719683325, + "language_loss": 1.08088911, + "learning_rate": 0.0007538160463002316, + "loss": 1.09953451, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.66796875, + "step": 45, + "time_per_iteration": 2.637796640396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01780353, + "balance_loss_mlp": 1.31572247, + "epoch": 0.008849557522123894, + "flos": 509009227776.0, + "grad_norm": 0.046919324832442044, + "language_loss": 1.11748755, + "learning_rate": 0.0007581684291577274, + "loss": 1.1352911, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.6484375, + "step": 46, + "time_per_iteration": 2.5655901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764453, + "balance_loss_mlp": 1.30211222, + "epoch": 0.009041939207387457, + "flos": 626507125248.0, + "grad_norm": 0.05937298040562763, + "language_loss": 1.13580585, + "learning_rate": 0.0007624272050891776, + "loss": 1.15345049, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.625, + "step": 47, + "time_per_iteration": 2.804643392562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776852, + "balance_loss_mlp": 1.31908798, + "epoch": 0.00923432089265102, + "flos": 550609789440.0, + "grad_norm": 0.07500714899038924, + "language_loss": 1.03489327, + "learning_rate": 0.0007665963158851307, + "loss": 1.05266178, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.578125, + "step": 48, + "time_per_iteration": 2.781435489654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01771411, + "balance_loss_mlp": 1.3170805, + "epoch": 0.009426702577914583, + "flos": 563678310912.0, + "grad_norm": 0.07921486390615404, + "language_loss": 1.12758589, + "learning_rate": 0.0007706794594783609, + "loss": 1.14529991, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.54296875, + "step": 49, + "time_per_iteration": 2.739976644515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017484, + "balance_loss_mlp": 1.29483247, + "epoch": 0.009619084263178146, + "flos": 617925700608.0, + "grad_norm": 0.05671895540127436, + "language_loss": 1.10915053, + "learning_rate": 0.0007746801096530423, + "loss": 1.12663448, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.53515625, + "step": 50, + "time_per_iteration": 2.7333760261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01715641, + "balance_loss_mlp": 1.2616924, + "epoch": 0.009811465948441709, + "flos": 542488263168.0, + "grad_norm": 0.04785443300923319, + "language_loss": 1.16231108, + "learning_rate": 0.0007786015338021173, + "loss": 1.17946756, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.5390625, + "step": 51, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01700387, + "balance_loss_mlp": 1.24720073, + "epoch": 0.010003847633705272, + "flos": 536976583680.0, + "grad_norm": 0.04536583817216675, + "language_loss": 1.08076, + "learning_rate": 0.0007824468089603051, + "loss": 1.0977639, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.53125, + "step": 52, + "time_per_iteration": 2.6839513778686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01675834, + "balance_loss_mlp": 1.2218852, + "epoch": 0.010196229318968833, + "flos": 910805316096.0, + "grad_norm": 0.04374839581732082, + "language_loss": 1.0833261, + "learning_rate": 0.0007862188363098669, + "loss": 1.10008454, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.5390625, + "step": 53, + "time_per_iteration": 3.1748838424682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01650634, + "balance_loss_mlp": 1.19477725, + "epoch": 0.010388611004232396, + "flos": 586969190400.0, + "grad_norm": 0.045477377455174536, + "language_loss": 1.08262885, + "learning_rate": 0.0007899203543304438, + "loss": 1.09913516, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.55859375, + "step": 54, + "time_per_iteration": 2.7011117935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01588572, + "balance_loss_mlp": 1.13195276, + "epoch": 0.01058099268949596, + "flos": 503471351808.0, + "grad_norm": 0.05216939031034974, + "language_loss": 1.22650576, + "learning_rate": 0.0007935539507422731, + "loss": 1.24239147, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.56640625, + "step": 55, + "time_per_iteration": 2.6142656803131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553155, + "balance_loss_mlp": 1.09462798, + "epoch": 0.010773374374759523, + "flos": 545558008320.0, + "grad_norm": 0.04278176221573414, + "language_loss": 1.12836909, + "learning_rate": 0.0007971220733732573, + "loss": 1.14390063, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.5859375, + "step": 56, + "time_per_iteration": 2.718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586959, + "balance_loss_mlp": 1.1318655, + "epoch": 0.010965756060023086, + "flos": 527285982720.0, + "grad_norm": 0.06958617519474361, + "language_loss": 1.08844507, + "learning_rate": 0.0008006270400641869, + "loss": 1.10431468, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.55078125, + "step": 57, + "time_per_iteration": 2.702324628829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01576177, + "balance_loss_mlp": 1.12375367, + "epoch": 0.011158137745286649, + "flos": 578097054720.0, + "grad_norm": 0.08376433329063605, + "language_loss": 1.09231043, + "learning_rate": 0.0008040710477125043, + "loss": 1.10807228, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.5234375, + "step": 58, + "time_per_iteration": 2.733733892440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587306, + "balance_loss_mlp": 1.13793492, + "epoch": 0.011350519430550212, + "flos": 530314068480.0, + "grad_norm": 0.056261163559927586, + "language_loss": 1.098104, + "learning_rate": 0.0008074561805429771, + "loss": 1.11397719, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.4921875, + "step": 59, + "time_per_iteration": 2.604173183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_mlp": 1.0886867, + "epoch": 0.011542901115813775, + "flos": 556970133504.0, + "grad_norm": 0.07546157909609297, + "language_loss": 1.07214928, + "learning_rate": 0.0008107844176832545, + "loss": 1.08748412, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.45703125, + "step": 60, + "time_per_iteration": 2.670180082321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01515203, + "balance_loss_mlp": 1.07155395, + "epoch": 0.011735282801077338, + "flos": 573175529472.0, + "grad_norm": 0.06932920743779293, + "language_loss": 1.09267807, + "learning_rate": 0.0008140576401132568, + "loss": 1.10783005, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.44921875, + "step": 61, + "time_per_iteration": 2.635917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01537914, + "balance_loss_mlp": 1.0965538, + "epoch": 0.0119276644863409, + "flos": 616716467712.0, + "grad_norm": 0.056166475672555005, + "language_loss": 1.10548615, + "learning_rate": 0.0008172776370494935, + "loss": 1.12086535, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.42578125, + "step": 62, + "time_per_iteration": 2.709764242172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.015397, + "balance_loss_mlp": 1.10024714, + "epoch": 0.012120046171604464, + "flos": 502084199424.0, + "grad_norm": 0.046962065793300374, + "language_loss": 1.17909575, + "learning_rate": 0.0008204461118185703, + "loss": 1.19449282, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.40625, + "step": 63, + "time_per_iteration": 2.5971004962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545078, + "balance_loss_mlp": 1.10943925, + "epoch": 0.012312427856868027, + "flos": 474301493760.0, + "grad_norm": 0.04671162143151921, + "language_loss": 1.07277906, + "learning_rate": 0.0008235646872681536, + "loss": 1.08822989, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 4.3671875, + "step": 64, + "time_per_iteration": 2.567622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01534227, + "balance_loss_mlp": 1.10240316, + "epoch": 0.012504809542131588, + "flos": 539470910976.0, + "grad_norm": 0.04435006978162803, + "language_loss": 1.0673492, + "learning_rate": 0.0008266349107584288, + "loss": 1.08269131, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 4.328125, + "step": 65, + "time_per_iteration": 2.6833384037017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149994, + "balance_loss_mlp": 1.07345641, + "epoch": 0.012697191227395151, + "flos": 609856567296.0, + "grad_norm": 0.04524096047594039, + "language_loss": 1.09403265, + "learning_rate": 0.0008296582587724851, + "loss": 1.10903215, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 4.2734375, + "step": 66, + "time_per_iteration": 2.692337989807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01482262, + "balance_loss_mlp": 1.05806744, + "epoch": 0.012889572912658714, + "flos": 769397460480.0, + "grad_norm": 0.04198159389490698, + "language_loss": 1.06809163, + "learning_rate": 0.0008326361411800136, + "loss": 1.08291411, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 4.25, + "step": 67, + "time_per_iteration": 2.923720598220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474655, + "balance_loss_mlp": 1.05503809, + "epoch": 0.013081954597922277, + "flos": 535020744192.0, + "grad_norm": 0.041919130945389606, + "language_loss": 1.07100165, + "learning_rate": 0.0008355699051851403, + "loss": 1.0857482, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 4.203125, + "step": 68, + "time_per_iteration": 2.7417044639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462817, + "balance_loss_mlp": 1.04701531, + "epoch": 0.01327433628318584, + "flos": 574180646400.0, + "grad_norm": 0.041322055356332446, + "language_loss": 1.14468551, + "learning_rate": 0.0008384608389860635, + "loss": 1.15931368, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 4.1640625, + "step": 69, + "time_per_iteration": 2.6545376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450151, + "balance_loss_mlp": 1.03930819, + "epoch": 0.013466717968449404, + "flos": 498259115520.0, + "grad_norm": 0.039605765449237204, + "language_loss": 1.04742777, + "learning_rate": 0.000841310175171381, + "loss": 1.06192923, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 4.11328125, + "step": 70, + "time_per_iteration": 2.5687999725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441096, + "balance_loss_mlp": 1.03101599, + "epoch": 0.013659099653712967, + "flos": 566621803008.0, + "grad_norm": 0.03646297128801074, + "language_loss": 1.03104186, + "learning_rate": 0.000844119093875517, + "loss": 1.04545283, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 4.1015625, + "step": 71, + "time_per_iteration": 2.698259115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433469, + "balance_loss_mlp": 1.02720368, + "epoch": 0.01385148133897653, + "flos": 574942715904.0, + "grad_norm": 0.02854119406997066, + "language_loss": 1.07372236, + "learning_rate": 0.0008468887257134666, + "loss": 1.08805704, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 4.06445312, + "step": 72, + "time_per_iteration": 2.7074387073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422625, + "balance_loss_mlp": 1.01941192, + "epoch": 0.014043863024240093, + "flos": 577958066688.0, + "grad_norm": 0.03113282173853564, + "language_loss": 1.10314119, + "learning_rate": 0.0008496201545131264, + "loss": 1.11736751, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 4.03515625, + "step": 73, + "time_per_iteration": 2.725660562515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425762, + "balance_loss_mlp": 1.02655351, + "epoch": 0.014236244709503656, + "flos": 940263883776.0, + "grad_norm": 0.033199488198319166, + "language_loss": 1.07624495, + "learning_rate": 0.0008523144198617317, + "loss": 1.0905025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.99414062, + "step": 74, + "time_per_iteration": 3.2577481269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437934, + "balance_loss_mlp": 1.04139662, + "epoch": 0.014428626394767219, + "flos": 529495603200.0, + "grad_norm": 0.03119178099318558, + "language_loss": 1.07016373, + "learning_rate": 0.0008549725194813783, + "loss": 1.08454299, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.96679688, + "step": 75, + "time_per_iteration": 2.727982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437754, + "balance_loss_mlp": 1.0446496, + "epoch": 0.014621008080030782, + "flos": 805282226688.0, + "grad_norm": 0.02968258762679391, + "language_loss": 1.06415534, + "learning_rate": 0.0008575954114472099, + "loss": 1.07853293, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.93164062, + "step": 76, + "time_per_iteration": 3.172807455062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143975, + "balance_loss_mlp": 1.04950643, + "epoch": 0.014813389765294343, + "flos": 698356521984.0, + "grad_norm": 0.031905123056971844, + "language_loss": 1.03629625, + "learning_rate": 0.0008601840162606118, + "loss": 1.05069387, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.90234375, + "step": 77, + "time_per_iteration": 3.029114007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438585, + "balance_loss_mlp": 1.05158365, + "epoch": 0.015005771450557906, + "flos": 598164464640.0, + "grad_norm": 0.026994348673938514, + "language_loss": 1.09661531, + "learning_rate": 0.000862739218788641, + "loss": 1.11100101, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.86914062, + "step": 78, + "time_per_iteration": 2.795952320098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440626, + "balance_loss_mlp": 1.05705774, + "epoch": 0.01519815313582147, + "flos": 550492268544.0, + "grad_norm": 0.029495859587709627, + "language_loss": 1.07574832, + "learning_rate": 0.0008652618700799138, + "loss": 1.09015465, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.83789062, + "step": 79, + "time_per_iteration": 2.6552224159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430975, + "balance_loss_mlp": 1.05084014, + "epoch": 0.015390534821085032, + "flos": 431440032768.0, + "grad_norm": 0.037998818197719206, + "language_loss": 1.07206631, + "learning_rate": 0.0008677527890662774, + "loss": 1.08637595, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.80664062, + "step": 80, + "time_per_iteration": 2.530073881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424927, + "balance_loss_mlp": 1.04727161, + "epoch": 0.015582916506348595, + "flos": 525184424448.0, + "grad_norm": 0.03521308344632083, + "language_loss": 1.08168781, + "learning_rate": 0.0008702127641587799, + "loss": 1.09593713, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.78125, + "step": 81, + "time_per_iteration": 2.6248533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01426595, + "balance_loss_mlp": 1.05141926, + "epoch": 0.015775298191612157, + "flos": 576616576512.0, + "grad_norm": 0.026523126631237747, + "language_loss": 1.036394, + "learning_rate": 0.0008726425547457192, + "loss": 1.05065989, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.75585938, + "step": 82, + "time_per_iteration": 2.759159564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424967, + "balance_loss_mlp": 1.05303442, + "epoch": 0.01596767987687572, + "flos": 611439103488.0, + "grad_norm": 0.03656915183129864, + "language_loss": 1.03032446, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457414, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.72265625, + "step": 83, + "time_per_iteration": 2.739105224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431924, + "balance_loss_mlp": 1.06151688, + "epoch": 0.016160061562139283, + "flos": 568232537088.0, + "grad_norm": 0.03323001720600938, + "language_loss": 1.08511543, + "learning_rate": 0.0008774144832015932, + "loss": 1.09943461, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.70703125, + "step": 84, + "time_per_iteration": 2.7144806385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02085876, + "balance_loss_mlp": 1.68762207, + "epoch": 0.016352443247402846, + "flos": 1414499701248.0, + "grad_norm": 0.1388747380481991, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76860189, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.984375, + "step": 85, + "time_per_iteration": 4.569611072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450774, + "balance_loss_mlp": 1.08532572, + "epoch": 0.01654482493266641, + "flos": 731785165824.0, + "grad_norm": 0.04601998260491519, + "language_loss": 1.03772068, + "learning_rate": 0.0008820741205014318, + "loss": 1.05222845, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.65625, + "step": 86, + "time_per_iteration": 2.8604419231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014606, + "balance_loss_mlp": 1.09744096, + "epoch": 0.016737206617929972, + "flos": 537404281344.0, + "grad_norm": 0.03433335749497543, + "language_loss": 1.05140662, + "learning_rate": 0.0008843634575408404, + "loss": 1.06601262, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.62695312, + "step": 87, + "time_per_iteration": 2.677731513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145769, + "balance_loss_mlp": 1.09777355, + "epoch": 0.016929588303193535, + "flos": 538129420800.0, + "grad_norm": 0.05036212092144492, + "language_loss": 1.06815004, + "learning_rate": 0.0008866266301555082, + "loss": 1.08272696, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.59765625, + "step": 88, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145347, + "balance_loss_mlp": 1.09622347, + "epoch": 0.017121969988457098, + "flos": 527791543296.0, + "grad_norm": 0.030252065691096418, + "language_loss": 1.07441962, + "learning_rate": 0.0008888642296509615, + "loss": 1.08895445, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.56445312, + "step": 89, + "time_per_iteration": 2.590280771255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145473, + "balance_loss_mlp": 1.10034442, + "epoch": 0.01731435167372066, + "flos": 626767636992.0, + "grad_norm": 0.041554939890322294, + "language_loss": 1.12743318, + "learning_rate": 0.0008910768275115906, + "loss": 1.14198053, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.54101562, + "step": 90, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145373, + "balance_loss_mlp": 1.10220587, + "epoch": 0.017506733358984224, + "flos": 497384254464.0, + "grad_norm": 0.05646737130307679, + "language_loss": 1.07978606, + "learning_rate": 0.0008932649762767675, + "loss": 1.0943234, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.50976562, + "step": 91, + "time_per_iteration": 2.5964808464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01457202, + "balance_loss_mlp": 1.10911036, + "epoch": 0.017699115044247787, + "flos": 747217758720.0, + "grad_norm": 0.04050166442287704, + "language_loss": 1.1018101, + "learning_rate": 0.0008954292103690864, + "loss": 1.11638212, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.47851562, + "step": 92, + "time_per_iteration": 2.9288997650146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01459372, + "balance_loss_mlp": 1.11395121, + "epoch": 0.01789149672951135, + "flos": 516520407552.0, + "grad_norm": 0.054281950557984966, + "language_loss": 1.12496912, + "learning_rate": 0.0008975700468778296, + "loss": 1.13956285, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.45117188, + "step": 93, + "time_per_iteration": 2.5800487995147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462727, + "balance_loss_mlp": 1.11978543, + "epoch": 0.018083878414774913, + "flos": 587229702144.0, + "grad_norm": 0.04557553976021738, + "language_loss": 1.05795836, + "learning_rate": 0.0008996879863005366, + "loss": 1.07258558, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.42578125, + "step": 94, + "time_per_iteration": 2.6668198108673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146929, + "balance_loss_mlp": 1.12882805, + "epoch": 0.018276260100038477, + "flos": 498369905664.0, + "grad_norm": 0.055406629054909326, + "language_loss": 1.06168532, + "learning_rate": 0.0009017835132453337, + "loss": 1.07637823, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.40234375, + "step": 95, + "time_per_iteration": 2.588728904724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146889, + "balance_loss_mlp": 1.1312896, + "epoch": 0.01846864178530204, + "flos": 641232043008.0, + "grad_norm": 0.04012691806662063, + "language_loss": 1.05874133, + "learning_rate": 0.0009038570970964896, + "loss": 1.0734303, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.37890625, + "step": 96, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464817, + "balance_loss_mlp": 1.12912345, + "epoch": 0.018661023470565603, + "flos": 512667125760.0, + "grad_norm": 0.027884025705687265, + "language_loss": 1.03269148, + "learning_rate": 0.0009059091926454854, + "loss": 1.04733968, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.359375, + "step": 97, + "time_per_iteration": 2.6100950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470726, + "balance_loss_mlp": 1.13694024, + "epoch": 0.018853405155829166, + "flos": 932696308224.0, + "grad_norm": 0.03936003805775877, + "language_loss": 1.02435613, + "learning_rate": 0.0009079402406897198, + "loss": 1.03906357, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.33984375, + "step": 98, + "time_per_iteration": 3.2489542961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467854, + "balance_loss_mlp": 1.13616598, + "epoch": 0.01904578684109273, + "flos": 577586764800.0, + "grad_norm": 0.036005296184057074, + "language_loss": 1.04073858, + "learning_rate": 0.0009099506686008212, + "loss": 1.05541718, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.31835938, + "step": 99, + "time_per_iteration": 2.7905051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467812, + "balance_loss_mlp": 1.13822246, + "epoch": 0.019238168526356292, + "flos": 559520856576.0, + "grad_norm": 0.02696843746399884, + "language_loss": 1.07409596, + "learning_rate": 0.0009119408908644013, + "loss": 1.08877409, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.296875, + "step": 100, + "time_per_iteration": 2.7075607776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456893, + "balance_loss_mlp": 1.12882876, + "epoch": 0.019430550211619855, + "flos": 725103184896.0, + "grad_norm": 0.03304065923870771, + "language_loss": 1.12780023, + "learning_rate": 0.0009139113095929519, + "loss": 1.14236927, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.28125, + "step": 101, + "time_per_iteration": 2.86230731010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460167, + "balance_loss_mlp": 1.13439226, + "epoch": 0.019622931896883418, + "flos": 500456001024.0, + "grad_norm": 0.030619133870748612, + "language_loss": 1.06594038, + "learning_rate": 0.0009158623150134762, + "loss": 1.08054209, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 3.2578125, + "step": 102, + "time_per_iteration": 2.563690185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458611, + "balance_loss_mlp": 1.13569677, + "epoch": 0.01981531358214698, + "flos": 510281587200.0, + "grad_norm": 0.03276303076426602, + "language_loss": 1.06164801, + "learning_rate": 0.000917794285931332, + "loss": 1.0762341, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 3.22851562, + "step": 103, + "time_per_iteration": 2.6599903106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462945, + "balance_loss_mlp": 1.1421293, + "epoch": 0.020007695267410544, + "flos": 522392655360.0, + "grad_norm": 0.026505304013468463, + "language_loss": 0.98227251, + "learning_rate": 0.0009197075901716639, + "loss": 0.99690199, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 3.20703125, + "step": 104, + "time_per_iteration": 2.726245880126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469463, + "balance_loss_mlp": 1.14998221, + "epoch": 0.020200076952674107, + "flos": 534443324928.0, + "grad_norm": 0.029933884589862427, + "language_loss": 1.08736229, + "learning_rate": 0.0009216025849997171, + "loss": 1.10205698, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 3.19335938, + "step": 105, + "time_per_iteration": 2.8023486137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468836, + "balance_loss_mlp": 1.15221632, + "epoch": 0.020392458637937667, + "flos": 686082270720.0, + "grad_norm": 0.024520994280375335, + "language_loss": 1.03054178, + "learning_rate": 0.0009234796175212258, + "loss": 1.04523015, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 3.1640625, + "step": 106, + "time_per_iteration": 2.9396088123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469456, + "balance_loss_mlp": 1.15512502, + "epoch": 0.02058484032320123, + "flos": 703414307328.0, + "grad_norm": 0.02898567585615155, + "language_loss": 1.07201982, + "learning_rate": 0.000925339025064007, + "loss": 1.08671439, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 3.140625, + "step": 107, + "time_per_iteration": 2.9473297595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_mlp": 1.16001439, + "epoch": 0.020777222008464793, + "flos": 640326982656.0, + "grad_norm": 0.02770789473723963, + "language_loss": 0.99879742, + "learning_rate": 0.0009271811355418027, + "loss": 1.01352561, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 3.125, + "step": 108, + "time_per_iteration": 2.8551387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469504, + "balance_loss_mlp": 1.15803361, + "epoch": 0.020969603693728356, + "flos": 683320700928.0, + "grad_norm": 0.029161506766480293, + "language_loss": 1.06637371, + "learning_rate": 0.0009290062678013548, + "loss": 1.08106875, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 3.11132812, + "step": 109, + "time_per_iteration": 2.821951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468675, + "balance_loss_mlp": 1.15949392, + "epoch": 0.02116198537899192, + "flos": 534419129856.0, + "grad_norm": 0.03188637458086245, + "language_loss": 1.05070233, + "learning_rate": 0.0009308147319536321, + "loss": 1.06538928, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 3.08789062, + "step": 110, + "time_per_iteration": 2.6315042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469018, + "balance_loss_mlp": 1.16212535, + "epoch": 0.021354367064255482, + "flos": 718727377920.0, + "grad_norm": 0.030955966903197116, + "language_loss": 1.11490715, + "learning_rate": 0.0009326068296900676, + "loss": 1.12959719, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 3.06445312, + "step": 111, + "time_per_iteration": 2.8208162784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474326, + "balance_loss_mlp": 1.16934085, + "epoch": 0.021546748749519045, + "flos": 520623467520.0, + "grad_norm": 0.027870670355515197, + "language_loss": 1.02138007, + "learning_rate": 0.0009343828545846161, + "loss": 1.03612328, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 3.04492188, + "step": 112, + "time_per_iteration": 2.759277105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474098, + "balance_loss_mlp": 1.17063916, + "epoch": 0.021739130434782608, + "flos": 506161062912.0, + "grad_norm": 0.03372988233582904, + "language_loss": 1.06662297, + "learning_rate": 0.0009361430923823841, + "loss": 1.08136404, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 3.02929688, + "step": 113, + "time_per_iteration": 2.565107822418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471087, + "balance_loss_mlp": 1.1693449, + "epoch": 0.02193151212004617, + "flos": 464426242560.0, + "grad_norm": 0.03803370713592907, + "language_loss": 1.10115385, + "learning_rate": 0.0009378878212755459, + "loss": 1.11586463, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 3.01171875, + "step": 114, + "time_per_iteration": 2.491929292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471993, + "balance_loss_mlp": 1.17253923, + "epoch": 0.022123893805309734, + "flos": 553331701248.0, + "grad_norm": 0.029753755152528143, + "language_loss": 1.00006115, + "learning_rate": 0.0009396173121672103, + "loss": 1.014781, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.98828125, + "step": 115, + "time_per_iteration": 2.6869561672210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473585, + "balance_loss_mlp": 1.1754663, + "epoch": 0.022316275490573297, + "flos": 637378761216.0, + "grad_norm": 0.032022590728611564, + "language_loss": 1.0593642, + "learning_rate": 0.0009413318289238633, + "loss": 1.07410002, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.97460938, + "step": 116, + "time_per_iteration": 2.7639846801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474428, + "balance_loss_mlp": 1.17859828, + "epoch": 0.02250865717583686, + "flos": 800315039232.0, + "grad_norm": 0.032750944460810345, + "language_loss": 0.98115921, + "learning_rate": 0.0009430316286169771, + "loss": 0.99590349, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.95117188, + "step": 117, + "time_per_iteration": 3.020703077316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469481, + "balance_loss_mlp": 1.17536783, + "epoch": 0.022701038861100423, + "flos": 457062782976.0, + "grad_norm": 0.027209249322999743, + "language_loss": 1.0327785, + "learning_rate": 0.0009447169617543361, + "loss": 1.04747331, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.9375, + "step": 118, + "time_per_iteration": 2.5938501358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466386, + "balance_loss_mlp": 1.17437065, + "epoch": 0.022893420546363986, + "flos": 584186153472.0, + "grad_norm": 0.028075325054819567, + "language_loss": 1.10005641, + "learning_rate": 0.0009463880725016029, + "loss": 1.11472011, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.91992188, + "step": 119, + "time_per_iteration": 2.7082488536834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467196, + "balance_loss_mlp": 1.17861414, + "epoch": 0.02308580223162755, + "flos": 562477810176.0, + "grad_norm": 0.032360539397207934, + "language_loss": 1.05048943, + "learning_rate": 0.0009480451988946134, + "loss": 1.06516147, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.89257812, + "step": 120, + "time_per_iteration": 2.808687686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461098, + "balance_loss_mlp": 1.17423272, + "epoch": 0.023278183916891113, + "flos": 772645125120.0, + "grad_norm": 0.033180722862994706, + "language_loss": 1.06113267, + "learning_rate": 0.0009496885730428627, + "loss": 1.07574379, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.875, + "step": 121, + "time_per_iteration": 3.0043137073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466426, + "balance_loss_mlp": 1.18070555, + "epoch": 0.023470565602154676, + "flos": 554430144000.0, + "grad_norm": 0.030787275004595428, + "language_loss": 1.04567683, + "learning_rate": 0.0009513184213246156, + "loss": 1.06034112, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.86328125, + "step": 122, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462554, + "balance_loss_mlp": 1.17835939, + "epoch": 0.02366294728741824, + "flos": 561166519296.0, + "grad_norm": 0.030499039091632818, + "language_loss": 1.08099937, + "learning_rate": 0.0009529349645740552, + "loss": 1.09562504, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.84765625, + "step": 123, + "time_per_iteration": 2.69850492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460088, + "balance_loss_mlp": 1.17741883, + "epoch": 0.0238553289726818, + "flos": 469516955136.0, + "grad_norm": 0.026549221517309443, + "language_loss": 1.06623578, + "learning_rate": 0.0009545384182608524, + "loss": 1.08083653, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.83203125, + "step": 124, + "time_per_iteration": 2.5435874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462583, + "balance_loss_mlp": 1.18144011, + "epoch": 0.024047710657945365, + "flos": 561103392768.0, + "grad_norm": 0.03287811385355005, + "language_loss": 1.04055512, + "learning_rate": 0.0009561289926625252, + "loss": 1.05518079, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.81640625, + "step": 125, + "time_per_iteration": 2.6661720275878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464029, + "balance_loss_mlp": 1.18460226, + "epoch": 0.024240092343208928, + "flos": 505770295296.0, + "grad_norm": 0.030159442314643806, + "language_loss": 1.08985233, + "learning_rate": 0.0009577068930299292, + "loss": 1.10449266, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.79882812, + "step": 126, + "time_per_iteration": 2.596027135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456959, + "balance_loss_mlp": 1.17944014, + "epoch": 0.02443247402847249, + "flos": 436752325632.0, + "grad_norm": 0.03465787530540315, + "language_loss": 1.04454637, + "learning_rate": 0.0009592723197462087, + "loss": 1.05911589, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.77929688, + "step": 127, + "time_per_iteration": 2.6355836391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145855, + "balance_loss_mlp": 1.18236613, + "epoch": 0.024624855713736054, + "flos": 685068421632.0, + "grad_norm": 0.03103018628328697, + "language_loss": 1.00976562, + "learning_rate": 0.0009608254684795125, + "loss": 1.02435124, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.765625, + "step": 128, + "time_per_iteration": 2.956745147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01452077, + "balance_loss_mlp": 1.17741859, + "epoch": 0.024817237398999614, + "flos": 526113679872.0, + "grad_norm": 0.03378324138815482, + "language_loss": 1.03947771, + "learning_rate": 0.0009623665303297678, + "loss": 1.05399847, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.75, + "step": 129, + "time_per_iteration": 2.762612819671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145448, + "balance_loss_mlp": 1.18115723, + "epoch": 0.025009619084263177, + "flos": 656886216192.0, + "grad_norm": 0.03318348770393379, + "language_loss": 1.08023834, + "learning_rate": 0.0009638956919697878, + "loss": 1.09478307, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.73339844, + "step": 130, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453293, + "balance_loss_mlp": 1.18130565, + "epoch": 0.02520200076952674, + "flos": 455369456640.0, + "grad_norm": 0.028803226470227133, + "language_loss": 1.00211501, + "learning_rate": 0.0009654131357809714, + "loss": 1.01664793, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.71875, + "step": 131, + "time_per_iteration": 2.593409776687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454951, + "balance_loss_mlp": 1.18534708, + "epoch": 0.025394382454790303, + "flos": 841268324352.0, + "grad_norm": 0.035993676074610494, + "language_loss": 1.09494662, + "learning_rate": 0.0009669190399838441, + "loss": 1.10949612, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.69824219, + "step": 132, + "time_per_iteration": 3.1307294368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454062, + "balance_loss_mlp": 1.18588877, + "epoch": 0.025586764140053866, + "flos": 582228312576.0, + "grad_norm": 0.03305283337163912, + "language_loss": 1.02299893, + "learning_rate": 0.0009684135787636724, + "loss": 1.03753948, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.68359375, + "step": 133, + "time_per_iteration": 2.8118627071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454726, + "balance_loss_mlp": 1.18798327, + "epoch": 0.02577914582531743, + "flos": 791677218816.0, + "grad_norm": 0.03011124606519955, + "language_loss": 1.06380379, + "learning_rate": 0.0009698969223913726, + "loss": 1.07835102, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.66894531, + "step": 134, + "time_per_iteration": 3.0371806621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450237, + "balance_loss_mlp": 1.18454385, + "epoch": 0.025971527510580992, + "flos": 596062906368.0, + "grad_norm": 0.030569012833979448, + "language_loss": 1.08986592, + "learning_rate": 0.0009713692373399265, + "loss": 1.10436833, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.65820312, + "step": 135, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01684837, + "balance_loss_mlp": 1.39873505, + "epoch": 0.026163909195844555, + "flos": 1581074411520.0, + "grad_norm": 0.08870187959024729, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81141067, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.8671875, + "step": 136, + "time_per_iteration": 5.94019627571106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0161422, + "balance_loss_mlp": 1.33116913, + "epoch": 0.026356290881108118, + "flos": 1505160886272.0, + "grad_norm": 0.07212137850421584, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79425257, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.8359375, + "step": 137, + "time_per_iteration": 4.865153074264526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469938, + "balance_loss_mlp": 1.20901299, + "epoch": 0.02654867256637168, + "flos": 598340382720.0, + "grad_norm": 0.040535745966457745, + "language_loss": 1.01652551, + "learning_rate": 0.0009757216201974225, + "loss": 1.03122485, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.609375, + "step": 138, + "time_per_iteration": 2.8955435752868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487517, + "balance_loss_mlp": 1.22802222, + "epoch": 0.026741054251635244, + "flos": 546135427584.0, + "grad_norm": 0.04340470282065083, + "language_loss": 1.06732666, + "learning_rate": 0.0009771514130396581, + "loss": 1.08220184, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.59472656, + "step": 139, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01498511, + "balance_loss_mlp": 1.24044681, + "epoch": 0.026933435936898807, + "flos": 507845657088.0, + "grad_norm": 0.04879945782970011, + "language_loss": 1.07520163, + "learning_rate": 0.00097857095638274, + "loss": 1.09018672, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.58007812, + "step": 140, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01492411, + "balance_loss_mlp": 1.23558652, + "epoch": 0.02712581762216237, + "flos": 742253299200.0, + "grad_norm": 0.043929969627725114, + "language_loss": 0.98754954, + "learning_rate": 0.0009799803961288726, + "loss": 1.00247359, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.5703125, + "step": 141, + "time_per_iteration": 3.008998394012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470778, + "balance_loss_mlp": 1.21567059, + "epoch": 0.027318199307425933, + "flos": 849777890304.0, + "grad_norm": 0.03716164217421175, + "language_loss": 1.04960537, + "learning_rate": 0.000981379875086876, + "loss": 1.06431305, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.55371094, + "step": 142, + "time_per_iteration": 3.057098865509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469037, + "balance_loss_mlp": 1.21535933, + "epoch": 0.027510580992689496, + "flos": 576638043648.0, + "grad_norm": 0.03712962317624948, + "language_loss": 1.00046849, + "learning_rate": 0.0009827695330590185, + "loss": 1.01515889, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.5390625, + "step": 143, + "time_per_iteration": 2.638338327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450228, + "balance_loss_mlp": 1.19750416, + "epoch": 0.02770296267795306, + "flos": 773789230080.0, + "grad_norm": 0.030455330453953735, + "language_loss": 0.99027133, + "learning_rate": 0.0009841495069248256, + "loss": 1.00477362, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.52929688, + "step": 144, + "time_per_iteration": 2.981438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441391, + "balance_loss_mlp": 1.19009781, + "epoch": 0.027895344363216622, + "flos": 570448888320.0, + "grad_norm": 0.031624263879455494, + "language_loss": 0.98723662, + "learning_rate": 0.0009855199307219871, + "loss": 1.00165045, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.51464844, + "step": 145, + "time_per_iteration": 2.6923046112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440125, + "balance_loss_mlp": 1.1903578, + "epoch": 0.028087726048480186, + "flos": 548408174592.0, + "grad_norm": 0.029995844711875903, + "language_loss": 1.00586843, + "learning_rate": 0.0009868809357244854, + "loss": 1.02026975, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.49902344, + "step": 146, + "time_per_iteration": 2.6284868717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01436833, + "balance_loss_mlp": 1.18782902, + "epoch": 0.02828010773374375, + "flos": 525872633856.0, + "grad_norm": 0.03288909570778387, + "language_loss": 1.05042541, + "learning_rate": 0.0009882326505180556, + "loss": 1.06479371, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.49121094, + "step": 147, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425728, + "balance_loss_mlp": 1.1783452, + "epoch": 0.02847248941900731, + "flos": 773771765760.0, + "grad_norm": 0.031738987003727674, + "language_loss": 1.02499485, + "learning_rate": 0.0009895752010730906, + "loss": 1.03925204, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.47460938, + "step": 148, + "time_per_iteration": 2.9316182136535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424571, + "balance_loss_mlp": 1.17785549, + "epoch": 0.028664871104270875, + "flos": 535469908992.0, + "grad_norm": 0.028294299214345536, + "language_loss": 1.0900923, + "learning_rate": 0.0009909087108150867, + "loss": 1.10433793, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.46777344, + "step": 149, + "time_per_iteration": 2.697423219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014274, + "balance_loss_mlp": 1.18182933, + "epoch": 0.028857252789534438, + "flos": 368604487680.0, + "grad_norm": 0.03525963963400797, + "language_loss": 1.09753942, + "learning_rate": 0.0009922333006927371, + "loss": 1.11181331, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.45605469, + "step": 150, + "time_per_iteration": 2.483644723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433542, + "balance_loss_mlp": 1.18911529, + "epoch": 0.029049634474798, + "flos": 516483477504.0, + "grad_norm": 0.03341635886009217, + "language_loss": 1.03220332, + "learning_rate": 0.0009935490892437632, + "loss": 1.04653883, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.44433594, + "step": 151, + "time_per_iteration": 2.604599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438911, + "balance_loss_mlp": 1.19553363, + "epoch": 0.029242016160061564, + "flos": 589348724736.0, + "grad_norm": 0.030166761621646727, + "language_loss": 1.01782072, + "learning_rate": 0.0009948561926585687, + "loss": 1.03220987, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.43359375, + "step": 152, + "time_per_iteration": 2.7724709510803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445258, + "balance_loss_mlp": 1.20350146, + "epoch": 0.029434397845325123, + "flos": 553136317440.0, + "grad_norm": 0.030739210798008048, + "language_loss": 1.05873716, + "learning_rate": 0.0009961547248418122, + "loss": 1.07318974, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.41699219, + "step": 153, + "time_per_iteration": 2.6247737407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440878, + "balance_loss_mlp": 1.19988418, + "epoch": 0.029626779530588686, + "flos": 604607400960.0, + "grad_norm": 0.030186385343499288, + "language_loss": 1.02632022, + "learning_rate": 0.0009974447974719707, + "loss": 1.04072905, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.40917969, + "step": 154, + "time_per_iteration": 2.730053663253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431891, + "balance_loss_mlp": 1.19194651, + "epoch": 0.02981916121585225, + "flos": 622217413632.0, + "grad_norm": 0.02801027733601246, + "language_loss": 1.04305005, + "learning_rate": 0.0009987265200589763, + "loss": 1.05736899, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.3984375, + "step": 155, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423605, + "balance_loss_mlp": 1.18537688, + "epoch": 0.030011542901115813, + "flos": 662879987712.0, + "grad_norm": 0.0349007823819893, + "language_loss": 1.04218483, + "learning_rate": 0.001, + "loss": 1.05642092, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.38085938, + "step": 156, + "time_per_iteration": 2.8801028728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420835, + "balance_loss_mlp": 1.18289316, + "epoch": 0.030203924586379376, + "flos": 652818084864.0, + "grad_norm": 0.029403473562715665, + "language_loss": 1.01930022, + "learning_rate": 0.0009999999029413921, + "loss": 1.03350854, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.37792969, + "step": 157, + "time_per_iteration": 2.8549368381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415444, + "balance_loss_mlp": 1.17921925, + "epoch": 0.03039630627164294, + "flos": 532443824640.0, + "grad_norm": 0.03295212675068383, + "language_loss": 1.02716291, + "learning_rate": 0.0009999996117656068, + "loss": 1.04131734, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.36035156, + "step": 158, + "time_per_iteration": 2.6989729404449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_mlp": 1.17530584, + "epoch": 0.030588687956906502, + "flos": 587294830080.0, + "grad_norm": 0.0291076208082698, + "language_loss": 0.96305156, + "learning_rate": 0.0009999991264727564, + "loss": 0.97715545, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.34863281, + "step": 159, + "time_per_iteration": 2.7609338760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140999, + "balance_loss_mlp": 1.1752907, + "epoch": 0.030781069642170065, + "flos": 514286592000.0, + "grad_norm": 0.030494101007586163, + "language_loss": 1.0725081, + "learning_rate": 0.0009999984470630296, + "loss": 1.08660805, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.34472656, + "step": 160, + "time_per_iteration": 2.5805158615112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410287, + "balance_loss_mlp": 1.17711365, + "epoch": 0.030973451327433628, + "flos": 719559304704.0, + "grad_norm": 0.025032822394785544, + "language_loss": 0.95934659, + "learning_rate": 0.0009999975735366902, + "loss": 0.97344947, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.32910156, + "step": 161, + "time_per_iteration": 3.078343629837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409543, + "balance_loss_mlp": 1.17675149, + "epoch": 0.03116583301269719, + "flos": 1111614400512.0, + "grad_norm": 0.029903967107167622, + "language_loss": 0.98009437, + "learning_rate": 0.0009999965058940775, + "loss": 0.99418974, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.32519531, + "step": 162, + "time_per_iteration": 3.49137544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_mlp": 1.17689729, + "epoch": 0.031358214697960754, + "flos": 451833082368.0, + "grad_norm": 0.11336845133687022, + "language_loss": 1.0463953, + "learning_rate": 0.0009999952441356057, + "loss": 1.06047678, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.30957031, + "step": 163, + "time_per_iteration": 2.531280755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406979, + "balance_loss_mlp": 1.17676246, + "epoch": 0.031550596383224314, + "flos": 1257085658112.0, + "grad_norm": 0.03183858769064714, + "language_loss": 1.05248928, + "learning_rate": 0.000999993788261765, + "loss": 1.06655908, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.30078125, + "step": 164, + "time_per_iteration": 3.5714328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408503, + "balance_loss_mlp": 1.17943025, + "epoch": 0.03174297806848788, + "flos": 669322924032.0, + "grad_norm": 0.03191781964215587, + "language_loss": 1.06263065, + "learning_rate": 0.00099999213827312, + "loss": 1.07671571, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.29101562, + "step": 165, + "time_per_iteration": 2.7947938442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409995, + "balance_loss_mlp": 1.18101788, + "epoch": 0.03193535975375144, + "flos": 552363514368.0, + "grad_norm": 0.03891580789868065, + "language_loss": 1.01044345, + "learning_rate": 0.000999990294170312, + "loss": 1.0245434, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.29003906, + "step": 166, + "time_per_iteration": 2.6462574005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140342, + "balance_loss_mlp": 1.17577803, + "epoch": 0.032127741439015006, + "flos": 544739543040.0, + "grad_norm": 0.03757156138401865, + "language_loss": 1.05309296, + "learning_rate": 0.0009999882559540566, + "loss": 1.06712723, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.27636719, + "step": 167, + "time_per_iteration": 2.629549503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140941, + "balance_loss_mlp": 1.18234003, + "epoch": 0.032320123124278566, + "flos": 549513348096.0, + "grad_norm": 0.028659149555752484, + "language_loss": 1.01791751, + "learning_rate": 0.000999986023625145, + "loss": 1.03201175, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.27050781, + "step": 168, + "time_per_iteration": 2.7051401138305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01589355, + "balance_loss_mlp": 1.35360718, + "epoch": 0.03251250480954213, + "flos": 1308815430144.0, + "grad_norm": 0.08201951270186027, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80513763, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.35546875, + "step": 169, + "time_per_iteration": 4.9428627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407645, + "balance_loss_mlp": 1.18257797, + "epoch": 0.03270488649480569, + "flos": 562201835520.0, + "grad_norm": 0.03970113019311383, + "language_loss": 1.02863848, + "learning_rate": 0.0009999809766328958, + "loss": 1.04271495, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.25, + "step": 170, + "time_per_iteration": 2.675811529159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415662, + "balance_loss_mlp": 1.19193029, + "epoch": 0.03289726818006926, + "flos": 483338813952.0, + "grad_norm": 0.03325277263778645, + "language_loss": 1.04760146, + "learning_rate": 0.0009999781619715177, + "loss": 1.06175804, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.23632812, + "step": 171, + "time_per_iteration": 2.5431392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01419714, + "balance_loss_mlp": 1.1972214, + "epoch": 0.03308964986533282, + "flos": 675820254720.0, + "grad_norm": 0.02950894161591202, + "language_loss": 1.04164565, + "learning_rate": 0.000999975153201402, + "loss": 1.05584288, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.22363281, + "step": 172, + "time_per_iteration": 2.812837600708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422366, + "balance_loss_mlp": 1.20044637, + "epoch": 0.033282031550596385, + "flos": 610340660736.0, + "grad_norm": 0.03086814843966846, + "language_loss": 1.02532911, + "learning_rate": 0.0009999719503237174, + "loss": 1.03955269, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 2.21777344, + "step": 173, + "time_per_iteration": 2.755462646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416936, + "balance_loss_mlp": 1.1959697, + "epoch": 0.033474413235859944, + "flos": 468995931648.0, + "grad_norm": 0.048603642070708566, + "language_loss": 1.1131072, + "learning_rate": 0.0009999685533397073, + "loss": 1.12727666, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 2.20800781, + "step": 174, + "time_per_iteration": 2.566751003265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01414495, + "balance_loss_mlp": 1.19438744, + "epoch": 0.03366679492112351, + "flos": 580714907136.0, + "grad_norm": 0.03243683176756354, + "language_loss": 1.02908182, + "learning_rate": 0.00099996496225069, + "loss": 1.04322672, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 2.19921875, + "step": 175, + "time_per_iteration": 2.67861008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407523, + "balance_loss_mlp": 1.1883682, + "epoch": 0.03385917660638707, + "flos": 638885435904.0, + "grad_norm": 0.029120554083078395, + "language_loss": 1.05784094, + "learning_rate": 0.0009999611770580604, + "loss": 1.0719161, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 2.18945312, + "step": 176, + "time_per_iteration": 2.8410942554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401607, + "balance_loss_mlp": 1.18302441, + "epoch": 0.03405155829165064, + "flos": 442739366400.0, + "grad_norm": 0.031490867136515936, + "language_loss": 1.04703283, + "learning_rate": 0.0009999571977632876, + "loss": 1.06104875, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 2.18359375, + "step": 177, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399051, + "balance_loss_mlp": 1.1813277, + "epoch": 0.034243939976914196, + "flos": 467274407424.0, + "grad_norm": 0.029366691437037535, + "language_loss": 1.0724479, + "learning_rate": 0.0009999530243679166, + "loss": 1.08643842, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 2.17480469, + "step": 178, + "time_per_iteration": 2.5423247814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01392432, + "balance_loss_mlp": 1.17556691, + "epoch": 0.03443632166217776, + "flos": 780712257024.0, + "grad_norm": 0.02507202069561695, + "language_loss": 1.01653552, + "learning_rate": 0.0009999486568735675, + "loss": 1.03045988, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 2.16601562, + "step": 179, + "time_per_iteration": 3.111632823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381684, + "balance_loss_mlp": 1.16567647, + "epoch": 0.03462870334744132, + "flos": 1265758407168.0, + "grad_norm": 0.027829136834509844, + "language_loss": 1.02053452, + "learning_rate": 0.0009999440952819362, + "loss": 1.03435147, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 2.15722656, + "step": 180, + "time_per_iteration": 3.6354756355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375883, + "balance_loss_mlp": 1.16035271, + "epoch": 0.03482108503270489, + "flos": 608302228992.0, + "grad_norm": 0.033531921209289, + "language_loss": 1.02966988, + "learning_rate": 0.0009999393395947935, + "loss": 1.04342866, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 2.15234375, + "step": 181, + "time_per_iteration": 2.8509652614593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372611, + "balance_loss_mlp": 1.15774834, + "epoch": 0.03501346671796845, + "flos": 539314458624.0, + "grad_norm": 0.029990628161131794, + "language_loss": 1.05946589, + "learning_rate": 0.0009999343898139858, + "loss": 1.07319212, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 2.14550781, + "step": 182, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375908, + "balance_loss_mlp": 1.16161704, + "epoch": 0.035205848403232015, + "flos": 519498828288.0, + "grad_norm": 0.03419998284579487, + "language_loss": 1.04830694, + "learning_rate": 0.0009999292459414348, + "loss": 1.06206608, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 2.13964844, + "step": 183, + "time_per_iteration": 2.563997983932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386507, + "balance_loss_mlp": 1.17269289, + "epoch": 0.035398230088495575, + "flos": 473333306880.0, + "grad_norm": 0.03346089667402367, + "language_loss": 1.09292293, + "learning_rate": 0.0009999239079791374, + "loss": 1.10678792, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 2.13476562, + "step": 184, + "time_per_iteration": 2.5561137199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387981, + "balance_loss_mlp": 1.17512131, + "epoch": 0.03559061177375914, + "flos": 513094823424.0, + "grad_norm": 0.03551516541146116, + "language_loss": 1.01857162, + "learning_rate": 0.0009999183759291659, + "loss": 1.03245139, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 2.125, + "step": 185, + "time_per_iteration": 2.689763307571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383562, + "balance_loss_mlp": 1.17108345, + "epoch": 0.0357829934590227, + "flos": 478350159360.0, + "grad_norm": 0.03945465081959485, + "language_loss": 1.04534364, + "learning_rate": 0.0009999126497936682, + "loss": 1.05917931, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 2.12109375, + "step": 186, + "time_per_iteration": 2.5142176151275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375295, + "balance_loss_mlp": 1.16415167, + "epoch": 0.03597537514428627, + "flos": 645884324352.0, + "grad_norm": 0.029215470851159726, + "language_loss": 1.06864357, + "learning_rate": 0.0009999067295748676, + "loss": 1.08239663, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 2.10742188, + "step": 187, + "time_per_iteration": 2.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370561, + "balance_loss_mlp": 1.16056204, + "epoch": 0.03616775682954983, + "flos": 582269245440.0, + "grad_norm": 0.03159066859467708, + "language_loss": 1.0519886, + "learning_rate": 0.000999900615275062, + "loss": 1.06569433, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 2.09570312, + "step": 188, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01368603, + "balance_loss_mlp": 1.15898561, + "epoch": 0.03636013851481339, + "flos": 383264277504.0, + "grad_norm": 0.043734318168479426, + "language_loss": 1.10731864, + "learning_rate": 0.0009998943068966256, + "loss": 1.1210047, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 2.09179688, + "step": 189, + "time_per_iteration": 2.4394500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365543, + "balance_loss_mlp": 1.15668833, + "epoch": 0.03655252020007695, + "flos": 584307677184.0, + "grad_norm": 0.02577278402121573, + "language_loss": 1.05579162, + "learning_rate": 0.0009998878044420072, + "loss": 1.06944704, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 2.0859375, + "step": 190, + "time_per_iteration": 2.7022814750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365865, + "balance_loss_mlp": 1.15882242, + "epoch": 0.03674490188534051, + "flos": 472597433856.0, + "grad_norm": 0.03520388751206912, + "language_loss": 1.01277018, + "learning_rate": 0.0009998811079137318, + "loss": 1.02642882, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 2.07324219, + "step": 191, + "time_per_iteration": 2.5930585861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136447, + "balance_loss_mlp": 1.15742755, + "epoch": 0.03693728357060408, + "flos": 529411009536.0, + "grad_norm": 0.03125533686722731, + "language_loss": 1.02464271, + "learning_rate": 0.0009998742173143987, + "loss": 1.0382874, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 2.07324219, + "step": 192, + "time_per_iteration": 2.6235413551330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358793, + "balance_loss_mlp": 1.15222692, + "epoch": 0.03712966525586764, + "flos": 800345238528.0, + "grad_norm": 0.02848545485219292, + "language_loss": 1.02800548, + "learning_rate": 0.0009998671326466833, + "loss": 1.04159343, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 2.06835938, + "step": 193, + "time_per_iteration": 2.991110324859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351781, + "balance_loss_mlp": 1.1463598, + "epoch": 0.037322046941131205, + "flos": 831358144512.0, + "grad_norm": 0.03513998418582105, + "language_loss": 1.0392077, + "learning_rate": 0.0009998598539133362, + "loss": 1.05272543, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 2.05664062, + "step": 194, + "time_per_iteration": 3.0204203128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349328, + "balance_loss_mlp": 1.14371598, + "epoch": 0.037514428626394765, + "flos": 438588642816.0, + "grad_norm": 0.028816536284039847, + "language_loss": 1.04176903, + "learning_rate": 0.0009998523811171828, + "loss": 1.05526221, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 2.05859375, + "step": 195, + "time_per_iteration": 2.5615782737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345129, + "balance_loss_mlp": 1.14047015, + "epoch": 0.03770681031165833, + "flos": 512638927872.0, + "grad_norm": 0.030721230574493993, + "language_loss": 1.05052435, + "learning_rate": 0.0009998447142611248, + "loss": 1.06397557, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 2.04882812, + "step": 196, + "time_per_iteration": 2.6310269832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347072, + "balance_loss_mlp": 1.14289033, + "epoch": 0.03789919199692189, + "flos": 808842069504.0, + "grad_norm": 0.024329502455983587, + "language_loss": 0.97805226, + "learning_rate": 0.0009998368533481387, + "loss": 0.99152303, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 2.04394531, + "step": 197, + "time_per_iteration": 3.0467066764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344143, + "balance_loss_mlp": 1.14043784, + "epoch": 0.03809157368218546, + "flos": 691791335424.0, + "grad_norm": 0.028391473090668865, + "language_loss": 1.00891113, + "learning_rate": 0.0009998287983812762, + "loss": 1.0223527, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 2.0390625, + "step": 198, + "time_per_iteration": 2.8457672595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342798, + "balance_loss_mlp": 1.14023721, + "epoch": 0.03828395536744902, + "flos": 519004001280.0, + "grad_norm": 0.02890411668538335, + "language_loss": 1.07749867, + "learning_rate": 0.0009998205493636646, + "loss": 1.09092665, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 2.02734375, + "step": 199, + "time_per_iteration": 2.66135573387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336213, + "balance_loss_mlp": 1.13432038, + "epoch": 0.038476337052712584, + "flos": 582762071040.0, + "grad_norm": 0.025165239757241963, + "language_loss": 0.99723649, + "learning_rate": 0.0009998121062985063, + "loss": 1.01059866, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 2.02050781, + "step": 200, + "time_per_iteration": 2.70021915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340101, + "balance_loss_mlp": 1.13868463, + "epoch": 0.03866871873797614, + "flos": 578272972800.0, + "grad_norm": 0.025940014565947116, + "language_loss": 1.01401794, + "learning_rate": 0.0009998034691890794, + "loss": 1.02741897, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 2.015625, + "step": 201, + "time_per_iteration": 2.7596118450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134112, + "balance_loss_mlp": 1.14018106, + "epoch": 0.03886110042323971, + "flos": 541771855872.0, + "grad_norm": 0.03045868040347491, + "language_loss": 1.06763899, + "learning_rate": 0.0009997946380387369, + "loss": 1.08105016, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 2.01074219, + "step": 202, + "time_per_iteration": 2.6249613761901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341912, + "balance_loss_mlp": 1.14192665, + "epoch": 0.03905348210850327, + "flos": 719239669248.0, + "grad_norm": 0.02826530469295273, + "language_loss": 1.09111357, + "learning_rate": 0.0009997856128509076, + "loss": 1.1045326, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 2.00097656, + "step": 203, + "time_per_iteration": 2.8254761695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336015, + "balance_loss_mlp": 1.13660145, + "epoch": 0.039245863793766836, + "flos": 428396484096.0, + "grad_norm": 0.028264614074004907, + "language_loss": 1.0366801, + "learning_rate": 0.0009997763936290952, + "loss": 1.05004025, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.99511719, + "step": 204, + "time_per_iteration": 2.4907312393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334897, + "balance_loss_mlp": 1.13624632, + "epoch": 0.039438245479030395, + "flos": 664269141504.0, + "grad_norm": 0.0294297584821439, + "language_loss": 1.09143519, + "learning_rate": 0.0009997669803768789, + "loss": 1.10478401, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.98730469, + "step": 205, + "time_per_iteration": 2.787046194076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332958, + "balance_loss_mlp": 1.13497555, + "epoch": 0.03963062716429396, + "flos": 636495168000.0, + "grad_norm": 0.025164669035445293, + "language_loss": 1.04324186, + "learning_rate": 0.0009997573730979134, + "loss": 1.05657148, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.98242188, + "step": 206, + "time_per_iteration": 2.744339942932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388672, + "balance_loss_mlp": 1.18687439, + "epoch": 0.03982300884955752, + "flos": 1421587186176.0, + "grad_norm": 0.04225268457123109, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80581868, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 2.01953125, + "step": 207, + "time_per_iteration": 4.62822699546814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338974, + "balance_loss_mlp": 1.14251721, + "epoch": 0.04001539053482109, + "flos": 690519702528.0, + "grad_norm": 0.029734692172116686, + "language_loss": 1.02667236, + "learning_rate": 0.0009997375764747294, + "loss": 1.04006195, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.96875, + "step": 208, + "time_per_iteration": 3.0006470680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332342, + "balance_loss_mlp": 1.1360755, + "epoch": 0.04020777222008465, + "flos": 534751500288.0, + "grad_norm": 0.02521302149444487, + "language_loss": 1.00535607, + "learning_rate": 0.0009997273871381967, + "loss": 1.01867938, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.96679688, + "step": 209, + "time_per_iteration": 2.6790220737457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01335368, + "balance_loss_mlp": 1.14005554, + "epoch": 0.040400153905348214, + "flos": 568996608000.0, + "grad_norm": 0.04055154679799505, + "language_loss": 1.05331016, + "learning_rate": 0.0009997170037902862, + "loss": 1.06666374, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.95703125, + "step": 210, + "time_per_iteration": 2.748340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331596, + "balance_loss_mlp": 1.13647389, + "epoch": 0.040592535590611774, + "flos": 714678712320.0, + "grad_norm": 0.0276705792773584, + "language_loss": 1.07916689, + "learning_rate": 0.0009997064264350292, + "loss": 1.09248281, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.95507812, + "step": 211, + "time_per_iteration": 2.8284339904785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332545, + "balance_loss_mlp": 1.13761449, + "epoch": 0.04078491727587533, + "flos": 579206231040.0, + "grad_norm": 0.026753366885260317, + "language_loss": 1.01893198, + "learning_rate": 0.0009996956550765317, + "loss": 1.03225756, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.953125, + "step": 212, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_mlp": 1.13668597, + "epoch": 0.0409772989611389, + "flos": 553368631296.0, + "grad_norm": 0.03340351088011317, + "language_loss": 0.96620274, + "learning_rate": 0.0009996846897189762, + "loss": 0.97951126, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.9453125, + "step": 213, + "time_per_iteration": 2.62785005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327396, + "balance_loss_mlp": 1.13332307, + "epoch": 0.04116968064640246, + "flos": 556764016128.0, + "grad_norm": 0.026256493309422244, + "language_loss": 1.0283711, + "learning_rate": 0.0009996735303666193, + "loss": 1.04164505, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.94433594, + "step": 214, + "time_per_iteration": 2.745412588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324547, + "balance_loss_mlp": 1.13152313, + "epoch": 0.041362062331666026, + "flos": 579651393024.0, + "grad_norm": 0.025801807715809106, + "language_loss": 1.04973316, + "learning_rate": 0.0009996621770237937, + "loss": 1.06297863, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.93359375, + "step": 215, + "time_per_iteration": 2.7359023094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_mlp": 1.12657344, + "epoch": 0.041554444016929586, + "flos": 612700729344.0, + "grad_norm": 0.027594527286323677, + "language_loss": 1.00985026, + "learning_rate": 0.0009996506296949073, + "loss": 1.02304435, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.93164062, + "step": 216, + "time_per_iteration": 2.860781669616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320461, + "balance_loss_mlp": 1.12781918, + "epoch": 0.04174682570219315, + "flos": 529150497792.0, + "grad_norm": 0.030561981852332186, + "language_loss": 1.01172602, + "learning_rate": 0.0009996388883844428, + "loss": 1.02493072, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.9296875, + "step": 217, + "time_per_iteration": 2.614837169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315002, + "balance_loss_mlp": 1.12255037, + "epoch": 0.04193920738745671, + "flos": 512499939840.0, + "grad_norm": 0.024235201889365978, + "language_loss": 1.04092622, + "learning_rate": 0.0009996269530969588, + "loss": 1.05407631, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.92773438, + "step": 218, + "time_per_iteration": 2.5777087211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317845, + "balance_loss_mlp": 1.1255846, + "epoch": 0.04213158907272028, + "flos": 572552448000.0, + "grad_norm": 0.03618883866707401, + "language_loss": 1.04623246, + "learning_rate": 0.0009996148238370888, + "loss": 1.05941105, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.92578125, + "step": 219, + "time_per_iteration": 2.723344564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319419, + "balance_loss_mlp": 1.12830234, + "epoch": 0.04232397075798384, + "flos": 965904098304.0, + "grad_norm": 0.02808123492922437, + "language_loss": 0.99962145, + "learning_rate": 0.0009996025006095421, + "loss": 1.01281559, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.9140625, + "step": 220, + "time_per_iteration": 3.297567844390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355408, + "balance_loss_mlp": 1.16314697, + "epoch": 0.042516352443247404, + "flos": 1472730628608.0, + "grad_norm": 0.031119874656221472, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.79138547, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.92578125, + "step": 221, + "time_per_iteration": 5.484851837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132056, + "balance_loss_mlp": 1.13039756, + "epoch": 0.042708734128510964, + "flos": 655891832832.0, + "grad_norm": 0.027306518139410985, + "language_loss": 0.99887031, + "learning_rate": 0.0009995772722706307, + "loss": 1.0120759, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.90429688, + "step": 222, + "time_per_iteration": 2.801955461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324867, + "balance_loss_mlp": 1.13518083, + "epoch": 0.04290111581377453, + "flos": 432733859328.0, + "grad_norm": 0.025166076900031344, + "language_loss": 1.13987851, + "learning_rate": 0.0009995643671690604, + "loss": 1.15312719, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.89941406, + "step": 223, + "time_per_iteration": 2.4589195251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320058, + "balance_loss_mlp": 1.13142133, + "epoch": 0.04309349749903809, + "flos": 645866860032.0, + "grad_norm": 0.02470776233740571, + "language_loss": 1.01624262, + "learning_rate": 0.0009995512681194023, + "loss": 1.02944326, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.88867188, + "step": 224, + "time_per_iteration": 2.854653835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319788, + "balance_loss_mlp": 1.13124692, + "epoch": 0.04328587918430166, + "flos": 832895745024.0, + "grad_norm": 0.02898896961022835, + "language_loss": 0.98942387, + "learning_rate": 0.0009995379751267417, + "loss": 1.00262189, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.88769531, + "step": 225, + "time_per_iteration": 3.260105609893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317885, + "balance_loss_mlp": 1.12943935, + "epoch": 0.043478260869565216, + "flos": 526115681280.0, + "grad_norm": 0.02601835272599882, + "language_loss": 1.00718379, + "learning_rate": 0.0009995244881962398, + "loss": 1.02036262, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.88671875, + "step": 226, + "time_per_iteration": 2.631685495376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320396, + "balance_loss_mlp": 1.13204539, + "epoch": 0.04367064255482878, + "flos": 440412225024.0, + "grad_norm": 0.02740546356326938, + "language_loss": 1.02089393, + "learning_rate": 0.0009995108073331323, + "loss": 1.03409791, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.88574219, + "step": 227, + "time_per_iteration": 2.6414895057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308962, + "balance_loss_mlp": 1.12156498, + "epoch": 0.04386302424009234, + "flos": 508466737152.0, + "grad_norm": 0.023646446246452554, + "language_loss": 1.04017711, + "learning_rate": 0.0009994969325427309, + "loss": 1.05326676, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.87597656, + "step": 228, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130646, + "balance_loss_mlp": 1.11906338, + "epoch": 0.04405540592535591, + "flos": 541743657984.0, + "grad_norm": 0.02642836262436834, + "language_loss": 1.00691068, + "learning_rate": 0.0009994828638304218, + "loss": 1.0199753, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.87597656, + "step": 229, + "time_per_iteration": 2.604616165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305226, + "balance_loss_mlp": 1.11792421, + "epoch": 0.04424778761061947, + "flos": 447309055488.0, + "grad_norm": 0.039218098968292335, + "language_loss": 1.07079852, + "learning_rate": 0.0009994686012016675, + "loss": 1.08385086, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.875, + "step": 230, + "time_per_iteration": 2.568608045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130487, + "balance_loss_mlp": 1.1187129, + "epoch": 0.044440169295883035, + "flos": 701981492736.0, + "grad_norm": 0.02721662483758601, + "language_loss": 1.06240797, + "learning_rate": 0.000999454144662005, + "loss": 1.07545662, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.86328125, + "step": 231, + "time_per_iteration": 2.9104526042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295735, + "balance_loss_mlp": 1.10957813, + "epoch": 0.044632550981146595, + "flos": 589426587648.0, + "grad_norm": 0.02817980914561194, + "language_loss": 1.003865, + "learning_rate": 0.0009994394942170468, + "loss": 1.01682234, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.86328125, + "step": 232, + "time_per_iteration": 2.674896001815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302928, + "balance_loss_mlp": 1.11667526, + "epoch": 0.04482493266641016, + "flos": 555854226432.0, + "grad_norm": 0.029144066951330677, + "language_loss": 0.98161608, + "learning_rate": 0.0009994246498724808, + "loss": 0.99464536, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.86425781, + "step": 233, + "time_per_iteration": 2.674178123474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302597, + "balance_loss_mlp": 1.11682117, + "epoch": 0.04501731435167372, + "flos": 724069870080.0, + "grad_norm": 0.027038299766394356, + "language_loss": 1.00722432, + "learning_rate": 0.00099940961163407, + "loss": 1.02025032, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.859375, + "step": 234, + "time_per_iteration": 2.8427939414978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301098, + "balance_loss_mlp": 1.11608493, + "epoch": 0.04520969603693728, + "flos": 512797381632.0, + "grad_norm": 0.027022139799708383, + "language_loss": 1.02586675, + "learning_rate": 0.0009993943795076528, + "loss": 1.03887773, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.8515625, + "step": 235, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295671, + "balance_loss_mlp": 1.11094403, + "epoch": 0.04540207772220085, + "flos": 365877846528.0, + "grad_norm": 0.03212133053651388, + "language_loss": 1.0562067, + "learning_rate": 0.0009993789534991427, + "loss": 1.06916356, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.84863281, + "step": 236, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294151, + "balance_loss_mlp": 1.1095196, + "epoch": 0.045594459407464406, + "flos": 523723411968.0, + "grad_norm": 0.029471400038435007, + "language_loss": 1.00276268, + "learning_rate": 0.0009993633336145287, + "loss": 1.01570415, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.84765625, + "step": 237, + "time_per_iteration": 2.6279234886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_mlp": 1.11284053, + "epoch": 0.04578684109272797, + "flos": 673115807232.0, + "grad_norm": 0.032189822363292264, + "language_loss": 1.04537559, + "learning_rate": 0.0009993475198598752, + "loss": 1.05834174, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.83886719, + "step": 238, + "time_per_iteration": 2.98264741897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294154, + "balance_loss_mlp": 1.11047626, + "epoch": 0.04597922277799153, + "flos": 542620520448.0, + "grad_norm": 0.025834809881005002, + "language_loss": 1.01282692, + "learning_rate": 0.0009993315122413212, + "loss": 1.02576852, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.83789062, + "step": 239, + "time_per_iteration": 2.5969364643096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297016, + "balance_loss_mlp": 1.11333883, + "epoch": 0.0461716044632551, + "flos": 459993540096.0, + "grad_norm": 0.025301515003642434, + "language_loss": 1.01210213, + "learning_rate": 0.0009993153107650818, + "loss": 1.02507234, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.83789062, + "step": 240, + "time_per_iteration": 2.590198278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297188, + "balance_loss_mlp": 1.11360526, + "epoch": 0.04636398614851866, + "flos": 456170457600.0, + "grad_norm": 0.0338801607583888, + "language_loss": 1.01026332, + "learning_rate": 0.0009992989154374468, + "loss": 1.0232352, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.83691406, + "step": 241, + "time_per_iteration": 2.5699570178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012963, + "balance_loss_mlp": 1.11271763, + "epoch": 0.046556367833782225, + "flos": 557901390336.0, + "grad_norm": 0.02656657647638049, + "language_loss": 1.0757494, + "learning_rate": 0.0009992823262647817, + "loss": 1.08871233, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.83691406, + "step": 242, + "time_per_iteration": 2.6949496269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293965, + "balance_loss_mlp": 1.11047852, + "epoch": 0.046748749519045785, + "flos": 594087601152.0, + "grad_norm": 0.02772781005565529, + "language_loss": 1.02479577, + "learning_rate": 0.0009992655432535264, + "loss": 1.03773546, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.8359375, + "step": 243, + "time_per_iteration": 2.7783396244049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286985, + "balance_loss_mlp": 1.10454702, + "epoch": 0.04694113120430935, + "flos": 570941713920.0, + "grad_norm": 0.021337056529223342, + "language_loss": 1.01771712, + "learning_rate": 0.0009992485664101973, + "loss": 1.03058696, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.82519531, + "step": 244, + "time_per_iteration": 2.679227590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286082, + "balance_loss_mlp": 1.10364425, + "epoch": 0.04713351288957291, + "flos": 865245411840.0, + "grad_norm": 0.03170954338904746, + "language_loss": 1.04355013, + "learning_rate": 0.000999231395741385, + "loss": 1.05641103, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.82519531, + "step": 245, + "time_per_iteration": 3.0976788997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287082, + "balance_loss_mlp": 1.10473943, + "epoch": 0.04732589457483648, + "flos": 538235481600.0, + "grad_norm": 0.02353809889700427, + "language_loss": 1.02393425, + "learning_rate": 0.0009992140312537557, + "loss": 1.03680515, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.82421875, + "step": 246, + "time_per_iteration": 2.6005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_mlp": 1.1048938, + "epoch": 0.04751827626010004, + "flos": 763271431680.0, + "grad_norm": 0.021903859990429042, + "language_loss": 0.96665001, + "learning_rate": 0.000999196472954051, + "loss": 0.97951376, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.81542969, + "step": 247, + "time_per_iteration": 2.95379638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319153, + "balance_loss_mlp": 1.13833618, + "epoch": 0.0477106579453636, + "flos": 1583125578240.0, + "grad_norm": 0.034344144576267104, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80744004, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.80859375, + "step": 248, + "time_per_iteration": 6.070216655731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286412, + "balance_loss_mlp": 1.10521388, + "epoch": 0.04790303963062716, + "flos": 458692982784.0, + "grad_norm": 0.024476775577385278, + "language_loss": 1.04631317, + "learning_rate": 0.0009991607749457578, + "loss": 1.05917728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.8125, + "step": 249, + "time_per_iteration": 2.5741825103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128623, + "balance_loss_mlp": 1.10503209, + "epoch": 0.04809542131589073, + "flos": 783786004992.0, + "grad_norm": 0.021665977114244464, + "language_loss": 1.0235486, + "learning_rate": 0.0009991426352510286, + "loss": 1.03641105, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.81152344, + "step": 250, + "time_per_iteration": 3.004519462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_mlp": 1.10648286, + "epoch": 0.04828780300115429, + "flos": 560321857536.0, + "grad_norm": 0.028059326531900755, + "language_loss": 1.04456568, + "learning_rate": 0.0009991243017719422, + "loss": 1.05743682, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.8046875, + "step": 251, + "time_per_iteration": 2.666212320327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283793, + "balance_loss_mlp": 1.10364354, + "epoch": 0.048480184686417856, + "flos": 502922130432.0, + "grad_norm": 0.02282661348297379, + "language_loss": 0.985008, + "learning_rate": 0.0009991057745156165, + "loss": 0.99784589, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.80078125, + "step": 252, + "time_per_iteration": 2.6053824424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291534, + "balance_loss_mlp": 1.11186218, + "epoch": 0.048672566371681415, + "flos": 1539469120512.0, + "grad_norm": 0.022804524860740846, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83202517, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.796875, + "step": 253, + "time_per_iteration": 5.005317449569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285445, + "balance_loss_mlp": 1.10500991, + "epoch": 0.04886494805694498, + "flos": 538951888896.0, + "grad_norm": 0.028242285238858512, + "language_loss": 1.06865251, + "learning_rate": 0.0009990681387000943, + "loss": 1.08150697, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.80371094, + "step": 254, + "time_per_iteration": 2.743307590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283321, + "balance_loss_mlp": 1.10317183, + "epoch": 0.04905732974220854, + "flos": 681484383744.0, + "grad_norm": 0.028658365214850164, + "language_loss": 1.02065015, + "learning_rate": 0.0009990490301555093, + "loss": 1.03348327, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.80126953, + "step": 255, + "time_per_iteration": 2.989856719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291977, + "balance_loss_mlp": 1.1134491, + "epoch": 0.04924971142747211, + "flos": 1424274895872.0, + "grad_norm": 0.01325206916769545, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80507129, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.78515625, + "step": 256, + "time_per_iteration": 4.888273477554321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281082, + "balance_loss_mlp": 1.10255432, + "epoch": 0.04944209311273567, + "flos": 1561236587520.0, + "grad_norm": 0.00993410716153638, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80523825, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.78515625, + "step": 257, + "time_per_iteration": 4.983605623245239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_mlp": 1.10786438, + "epoch": 0.04963447479799923, + "flos": 1574170850304.0, + "grad_norm": 0.014798835308040135, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71261322, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.77539062, + "step": 258, + "time_per_iteration": 4.888776540756226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_mlp": 1.10310864, + "epoch": 0.049826856483262794, + "flos": 626498393088.0, + "grad_norm": 0.032236291487241595, + "language_loss": 0.9680413, + "learning_rate": 0.0009989706585723202, + "loss": 0.98086333, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.79003906, + "step": 259, + "time_per_iteration": 2.776397705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280186, + "balance_loss_mlp": 1.10175359, + "epoch": 0.05001923816852635, + "flos": 505155945984.0, + "grad_norm": 0.03442249770662494, + "language_loss": 1.03026366, + "learning_rate": 0.0009989505813633442, + "loss": 1.04306555, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.78271484, + "step": 260, + "time_per_iteration": 2.651773691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281097, + "balance_loss_mlp": 1.10295069, + "epoch": 0.05021161985378992, + "flos": 588467132928.0, + "grad_norm": 0.024781843968885862, + "language_loss": 1.02880228, + "learning_rate": 0.000998930310444573, + "loss": 1.04161322, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.78125, + "step": 261, + "time_per_iteration": 2.730717420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_mlp": 1.08796966, + "epoch": 0.05040400153905348, + "flos": 634402341888.0, + "grad_norm": 0.028473185138455738, + "language_loss": 1.01351452, + "learning_rate": 0.0009989098458238765, + "loss": 1.02617574, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.77929688, + "step": 262, + "time_per_iteration": 2.7717010974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272128, + "balance_loss_mlp": 1.09407711, + "epoch": 0.050596383224317046, + "flos": 554808176640.0, + "grad_norm": 0.03464065468219783, + "language_loss": 1.00597906, + "learning_rate": 0.0009988891875091998, + "loss": 1.01870036, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.77880859, + "step": 263, + "time_per_iteration": 2.8842556476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012725, + "balance_loss_mlp": 1.09444928, + "epoch": 0.050788764909580605, + "flos": 550761512448.0, + "grad_norm": 0.02541343292713684, + "language_loss": 0.95014787, + "learning_rate": 0.0009988683355085636, + "loss": 0.96287298, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.77880859, + "step": 264, + "time_per_iteration": 2.7466378211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_mlp": 1.09527469, + "epoch": 0.05098114659484417, + "flos": 606344388096.0, + "grad_norm": 0.02024934595994547, + "language_loss": 1.03858495, + "learning_rate": 0.000998847289830063, + "loss": 1.05131388, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.77587891, + "step": 265, + "time_per_iteration": 2.821997880935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285574, + "balance_loss_mlp": 1.10761857, + "epoch": 0.05117352828010773, + "flos": 439472236032.0, + "grad_norm": 0.026937538773041583, + "language_loss": 0.97004128, + "learning_rate": 0.0009988260504818682, + "loss": 0.98289704, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.77832031, + "step": 266, + "time_per_iteration": 2.557830333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277028, + "balance_loss_mlp": 1.09907281, + "epoch": 0.0513659099653713, + "flos": 506030807040.0, + "grad_norm": 0.02494960853942852, + "language_loss": 1.03986156, + "learning_rate": 0.000998804617472226, + "loss": 1.05263186, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.77832031, + "step": 267, + "time_per_iteration": 2.644099235534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_mlp": 1.09254682, + "epoch": 0.05155829165063486, + "flos": 696714862080.0, + "grad_norm": 0.027664306986101984, + "language_loss": 0.98796493, + "learning_rate": 0.0009987829908094568, + "loss": 1.00066042, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.76953125, + "step": 268, + "time_per_iteration": 2.8291003704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_mlp": 1.08817983, + "epoch": 0.051750673335898424, + "flos": 1350300294144.0, + "grad_norm": 0.03385083640642466, + "language_loss": 1.06218576, + "learning_rate": 0.0009987611705019569, + "loss": 1.07483661, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.76855469, + "step": 269, + "time_per_iteration": 4.150776624679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264769, + "balance_loss_mlp": 1.08795822, + "epoch": 0.051943055021161984, + "flos": 490589481984.0, + "grad_norm": 0.028250493976035247, + "language_loss": 1.04104686, + "learning_rate": 0.0009987391565581978, + "loss": 1.05369449, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.76757812, + "step": 270, + "time_per_iteration": 2.5921454429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266977, + "balance_loss_mlp": 1.09092879, + "epoch": 0.05213543670642555, + "flos": 546880032768.0, + "grad_norm": 0.026669721507250346, + "language_loss": 0.96455419, + "learning_rate": 0.000998716948986726, + "loss": 0.97722399, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.75976562, + "step": 271, + "time_per_iteration": 2.7835500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_mlp": 1.09264266, + "epoch": 0.05232781839168911, + "flos": 604672528896.0, + "grad_norm": 0.03568520247936263, + "language_loss": 0.99334317, + "learning_rate": 0.0009986945477961633, + "loss": 1.00602722, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.75683594, + "step": 272, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_mlp": 1.0953902, + "epoch": 0.052520200076952676, + "flos": 539655561216.0, + "grad_norm": 0.02343402151836954, + "language_loss": 1.0317328, + "learning_rate": 0.0009986719529952066, + "loss": 1.04444528, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.7578125, + "step": 273, + "time_per_iteration": 2.908298969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_mlp": 1.09266984, + "epoch": 0.052712581762216236, + "flos": 464332916736.0, + "grad_norm": 0.028493663433316604, + "language_loss": 1.03350449, + "learning_rate": 0.000998649164592628, + "loss": 1.0461911, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.75927734, + "step": 274, + "time_per_iteration": 2.5805718898773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_mlp": 1.08735609, + "epoch": 0.0529049634474798, + "flos": 549105116160.0, + "grad_norm": 0.024462560446863554, + "language_loss": 1.01155043, + "learning_rate": 0.0009986261825972748, + "loss": 1.02418458, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.75976562, + "step": 275, + "time_per_iteration": 2.675705909729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_mlp": 1.09334803, + "epoch": 0.05309734513274336, + "flos": 619200061440.0, + "grad_norm": 0.026443817532743642, + "language_loss": 1.03055406, + "learning_rate": 0.000998603007018069, + "loss": 1.04324436, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.75585938, + "step": 276, + "time_per_iteration": 2.77298903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264217, + "balance_loss_mlp": 1.08893192, + "epoch": 0.05328972681800693, + "flos": 606617634816.0, + "grad_norm": 0.022439827576013177, + "language_loss": 1.00613213, + "learning_rate": 0.0009985796378640089, + "loss": 1.01877427, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.75195312, + "step": 277, + "time_per_iteration": 2.693049669265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_mlp": 1.08963549, + "epoch": 0.05348210850327049, + "flos": 605730038784.0, + "grad_norm": 0.02549683888178727, + "language_loss": 1.01102281, + "learning_rate": 0.0009985560751441665, + "loss": 1.02366924, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.74902344, + "step": 278, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262716, + "balance_loss_mlp": 1.08757329, + "epoch": 0.053674490188534055, + "flos": 631997337600.0, + "grad_norm": 0.025192100126554, + "language_loss": 1.03316271, + "learning_rate": 0.00099853231886769, + "loss": 1.04578984, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.75048828, + "step": 279, + "time_per_iteration": 2.8228564262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262121, + "balance_loss_mlp": 1.08712184, + "epoch": 0.053866871873797614, + "flos": 480173741568.0, + "grad_norm": 0.02583251996588833, + "language_loss": 1.02629757, + "learning_rate": 0.0009985083690438024, + "loss": 1.03891873, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.74902344, + "step": 280, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260843, + "balance_loss_mlp": 1.08655906, + "epoch": 0.054059253559061174, + "flos": 789489065472.0, + "grad_norm": 0.023704628566171972, + "language_loss": 0.9340027, + "learning_rate": 0.0009984842256818016, + "loss": 0.94661117, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.74169922, + "step": 281, + "time_per_iteration": 3.084801435470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257985, + "balance_loss_mlp": 1.08379591, + "epoch": 0.05425163524432474, + "flos": 629505011712.0, + "grad_norm": 0.027462270528210347, + "language_loss": 1.04308844, + "learning_rate": 0.0009984598887910613, + "loss": 1.05566835, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.74072266, + "step": 282, + "time_per_iteration": 2.729063034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_mlp": 1.08855665, + "epoch": 0.0544440169295883, + "flos": 616992442368.0, + "grad_norm": 0.02580860229759897, + "language_loss": 0.99945354, + "learning_rate": 0.0009984353583810297, + "loss": 1.01208091, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.74072266, + "step": 283, + "time_per_iteration": 2.812309741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258383, + "balance_loss_mlp": 1.08433735, + "epoch": 0.05463639861485187, + "flos": 648929874432.0, + "grad_norm": 0.0290705298354334, + "language_loss": 1.01989841, + "learning_rate": 0.0009984106344612302, + "loss": 1.03248215, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.73925781, + "step": 284, + "time_per_iteration": 2.785377264022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_mlp": 1.0907625, + "epoch": 0.054828780300115426, + "flos": 798584782848.0, + "grad_norm": 0.03167011835004719, + "language_loss": 0.97435868, + "learning_rate": 0.0009983857170412615, + "loss": 0.9869982, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.73046875, + "step": 285, + "time_per_iteration": 2.9822604656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258353, + "balance_loss_mlp": 1.08511817, + "epoch": 0.05502116198537899, + "flos": 550798442496.0, + "grad_norm": 0.02077828299254123, + "language_loss": 0.96197385, + "learning_rate": 0.000998360606130798, + "loss": 0.9745574, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.73095703, + "step": 286, + "time_per_iteration": 2.8340489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_mlp": 1.09461975, + "epoch": 0.05521354367064255, + "flos": 1410906931200.0, + "grad_norm": 0.010589673029146669, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70339394, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.71484375, + "step": 287, + "time_per_iteration": 4.893908500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126535, + "balance_loss_mlp": 1.09235394, + "epoch": 0.05540592535590612, + "flos": 646611465216.0, + "grad_norm": 0.04031113274469801, + "language_loss": 1.02544129, + "learning_rate": 0.0009983098038774552, + "loss": 1.03809476, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.72851562, + "step": 288, + "time_per_iteration": 2.800687551498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_mlp": 1.08712769, + "epoch": 0.05559830704116968, + "flos": 1514315727360.0, + "grad_norm": 0.011752943348929798, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79428822, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.71289062, + "step": 289, + "time_per_iteration": 4.802466630935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_mlp": 1.08869088, + "epoch": 0.055790688726433245, + "flos": 509334867456.0, + "grad_norm": 0.03460900762027919, + "language_loss": 1.00913107, + "learning_rate": 0.0009982582277800948, + "loss": 1.02174735, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.72802734, + "step": 290, + "time_per_iteration": 2.574007749557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255451, + "balance_loss_mlp": 1.08326483, + "epoch": 0.055983070411696804, + "flos": 659074369536.0, + "grad_norm": 0.03439417592421578, + "language_loss": 1.07703924, + "learning_rate": 0.0009982321495648908, + "loss": 1.08959377, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.72021484, + "step": 291, + "time_per_iteration": 2.8004326820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257264, + "balance_loss_mlp": 1.08503067, + "epoch": 0.05617545209696037, + "flos": 588475865088.0, + "grad_norm": 0.024241847728240208, + "language_loss": 0.9905349, + "learning_rate": 0.0009982058779188115, + "loss": 1.00310755, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.72070312, + "step": 292, + "time_per_iteration": 2.763096570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257503, + "balance_loss_mlp": 1.0853169, + "epoch": 0.05636783378222393, + "flos": 612787324416.0, + "grad_norm": 0.027188079674348095, + "language_loss": 1.06693649, + "learning_rate": 0.0009981794128520567, + "loss": 1.07951164, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.72021484, + "step": 293, + "time_per_iteration": 2.7630960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253426, + "balance_loss_mlp": 1.08123958, + "epoch": 0.0565602154674875, + "flos": 669422980608.0, + "grad_norm": 0.030197403892147204, + "language_loss": 1.03523457, + "learning_rate": 0.000998152754374901, + "loss": 1.04776871, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.72021484, + "step": 294, + "time_per_iteration": 2.8583314418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249713, + "balance_loss_mlp": 1.07743168, + "epoch": 0.05675259715275106, + "flos": 618364131840.0, + "grad_norm": 0.026289358543143387, + "language_loss": 0.99071473, + "learning_rate": 0.0009981259024976943, + "loss": 1.00321186, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.72119141, + "step": 295, + "time_per_iteration": 2.719881534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250566, + "balance_loss_mlp": 1.07814193, + "epoch": 0.05694497883801462, + "flos": 753153133056.0, + "grad_norm": 0.03148267511857758, + "language_loss": 0.97962338, + "learning_rate": 0.0009980988572308612, + "loss": 0.99212909, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.72265625, + "step": 296, + "time_per_iteration": 2.9828195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250905, + "balance_loss_mlp": 1.0789572, + "epoch": 0.05713736052327818, + "flos": 713380882944.0, + "grad_norm": 0.02524811137395651, + "language_loss": 1.00250125, + "learning_rate": 0.0009980716185849015, + "loss": 1.01501024, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.71777344, + "step": 297, + "time_per_iteration": 2.9749252796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251066, + "balance_loss_mlp": 1.07959557, + "epoch": 0.05732974220854175, + "flos": 469935920640.0, + "grad_norm": 0.024054663695119705, + "language_loss": 0.96916056, + "learning_rate": 0.0009980441865703904, + "loss": 0.98167121, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.71289062, + "step": 298, + "time_per_iteration": 2.598325252532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250911, + "balance_loss_mlp": 1.07939255, + "epoch": 0.05752212389380531, + "flos": 602540771328.0, + "grad_norm": 0.025930022992042723, + "language_loss": 1.05563986, + "learning_rate": 0.000998016561197978, + "loss": 1.06814897, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.71337891, + "step": 299, + "time_per_iteration": 2.690300703048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250529, + "balance_loss_mlp": 1.07924938, + "epoch": 0.057714505579068875, + "flos": 679949511168.0, + "grad_norm": 0.025847674874905035, + "language_loss": 0.97115421, + "learning_rate": 0.0009979887424783895, + "loss": 0.98365951, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.7109375, + "step": 300, + "time_per_iteration": 2.863856554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249286, + "balance_loss_mlp": 1.07810116, + "epoch": 0.057906887264332435, + "flos": 597011627520.0, + "grad_norm": 0.02594453351976595, + "language_loss": 0.96475613, + "learning_rate": 0.0009979607304224248, + "loss": 0.97724897, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.70996094, + "step": 301, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248659, + "balance_loss_mlp": 1.0772841, + "epoch": 0.058099268949596, + "flos": 553164515328.0, + "grad_norm": 0.024492956239426298, + "language_loss": 1.0387162, + "learning_rate": 0.000997932525040959, + "loss": 1.05120289, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.71191406, + "step": 302, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252345, + "balance_loss_mlp": 1.08111238, + "epoch": 0.05829165063485956, + "flos": 509230808064.0, + "grad_norm": 0.038324718957869854, + "language_loss": 1.05616117, + "learning_rate": 0.000997904126344943, + "loss": 1.06868458, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.71044922, + "step": 303, + "time_per_iteration": 2.611621141433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125091, + "balance_loss_mlp": 1.080441, + "epoch": 0.05848403232012313, + "flos": 616362630144.0, + "grad_norm": 0.028818083574726525, + "language_loss": 1.02425826, + "learning_rate": 0.0009978755343454018, + "loss": 1.03676736, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.70263672, + "step": 304, + "time_per_iteration": 2.750213384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245805, + "balance_loss_mlp": 1.07490659, + "epoch": 0.05867641400538669, + "flos": 501079082496.0, + "grad_norm": 0.025195073137535502, + "language_loss": 1.02874422, + "learning_rate": 0.0009978467490534355, + "loss": 1.04120219, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.70703125, + "step": 305, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124905, + "balance_loss_mlp": 1.07853293, + "epoch": 0.05886879569065025, + "flos": 532378696704.0, + "grad_norm": 0.026491629776715375, + "language_loss": 0.99473399, + "learning_rate": 0.00099781777048022, + "loss": 1.00722456, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.703125, + "step": 306, + "time_per_iteration": 2.731084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012482, + "balance_loss_mlp": 1.07782638, + "epoch": 0.05906117737591381, + "flos": 490040260608.0, + "grad_norm": 0.025118942729794178, + "language_loss": 1.01122224, + "learning_rate": 0.0009977885986370057, + "loss": 1.02370417, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.70166016, + "step": 307, + "time_per_iteration": 2.548307418823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247075, + "balance_loss_mlp": 1.0766536, + "epoch": 0.05925355906117737, + "flos": 592709180928.0, + "grad_norm": 0.029001286226925486, + "language_loss": 0.96780527, + "learning_rate": 0.000997759233535118, + "loss": 0.98027599, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.70214844, + "step": 308, + "time_per_iteration": 2.7876322269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247056, + "balance_loss_mlp": 1.07668173, + "epoch": 0.05944594074644094, + "flos": 564787487232.0, + "grad_norm": 0.026648157056946717, + "language_loss": 1.03345561, + "learning_rate": 0.0009977296751859576, + "loss": 1.04592621, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.70166016, + "step": 309, + "time_per_iteration": 2.71488094329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_mlp": 1.07958508, + "epoch": 0.0596383224317045, + "flos": 539807284224.0, + "grad_norm": 0.023775477335694146, + "language_loss": 1.04459929, + "learning_rate": 0.0009976999236009998, + "loss": 1.05709469, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.69726562, + "step": 310, + "time_per_iteration": 2.7919182777404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_mlp": 1.08618629, + "epoch": 0.059830704116968066, + "flos": 562052113920.0, + "grad_norm": 0.02942700961653022, + "language_loss": 1.06853497, + "learning_rate": 0.0009976699787917955, + "loss": 1.08109009, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.69091797, + "step": 311, + "time_per_iteration": 2.6729257106781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012565, + "balance_loss_mlp": 1.08789062, + "epoch": 0.060023085802231625, + "flos": 1574047325184.0, + "grad_norm": 0.029063497479097016, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74699497, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.68359375, + "step": 312, + "time_per_iteration": 4.972649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249775, + "balance_loss_mlp": 1.08021212, + "epoch": 0.06021546748749519, + "flos": 483627523584.0, + "grad_norm": 0.0314235925459163, + "language_loss": 0.98280072, + "learning_rate": 0.0009976095095472243, + "loss": 0.9952985, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.69335938, + "step": 313, + "time_per_iteration": 2.5644209384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125234, + "balance_loss_mlp": 1.08287179, + "epoch": 0.06040784917275875, + "flos": 621423143424.0, + "grad_norm": 0.030123719928355924, + "language_loss": 0.99538821, + "learning_rate": 0.0009975789851353334, + "loss": 1.00791156, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.69238281, + "step": 314, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256592, + "balance_loss_mlp": 1.08741045, + "epoch": 0.06060023085802232, + "flos": 484602441216.0, + "grad_norm": 0.026992074473858402, + "language_loss": 1.01683283, + "learning_rate": 0.0009975482675461487, + "loss": 1.02939868, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.68945312, + "step": 315, + "time_per_iteration": 2.67146897315979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_mlp": 1.08054566, + "epoch": 0.06079261254328588, + "flos": 582985652736.0, + "grad_norm": 0.0292304668639163, + "language_loss": 0.99909455, + "learning_rate": 0.0009975173567915952, + "loss": 1.01158559, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.68310547, + "step": 316, + "time_per_iteration": 2.693526268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124983, + "balance_loss_mlp": 1.08131599, + "epoch": 0.060984994228549444, + "flos": 689008298496.0, + "grad_norm": 0.03272213432041067, + "language_loss": 0.93868685, + "learning_rate": 0.000997486252883674, + "loss": 0.95118511, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.68261719, + "step": 317, + "time_per_iteration": 2.837315082550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252509, + "balance_loss_mlp": 1.08399427, + "epoch": 0.061177375913813004, + "flos": 1316747398656.0, + "grad_norm": 0.031012352820614663, + "language_loss": 0.98949343, + "learning_rate": 0.0009974549558344602, + "loss": 1.00201845, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.68261719, + "step": 318, + "time_per_iteration": 3.686920166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_mlp": 1.08321846, + "epoch": 0.06136975759907657, + "flos": 575400612864.0, + "grad_norm": 0.027925836735275204, + "language_loss": 1.08640313, + "learning_rate": 0.000997423465656105, + "loss": 1.09892082, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.68310547, + "step": 319, + "time_per_iteration": 2.7691538333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250553, + "balance_loss_mlp": 1.08218133, + "epoch": 0.06156213928434013, + "flos": 528564346368.0, + "grad_norm": 0.033042319608268485, + "language_loss": 1.06051123, + "learning_rate": 0.0009973917823608335, + "loss": 1.07301688, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.68115234, + "step": 320, + "time_per_iteration": 2.583859443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251303, + "balance_loss_mlp": 1.08364725, + "epoch": 0.061754520969603696, + "flos": 496589984256.0, + "grad_norm": 0.025351519610416894, + "language_loss": 0.99929821, + "learning_rate": 0.0009973599059609462, + "loss": 1.01181126, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.67382812, + "step": 321, + "time_per_iteration": 2.7139415740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246641, + "balance_loss_mlp": 1.07893777, + "epoch": 0.061946902654867256, + "flos": 441044038656.0, + "grad_norm": 0.025867704850659153, + "language_loss": 0.98033404, + "learning_rate": 0.000997327836468819, + "loss": 0.99280047, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.67431641, + "step": 322, + "time_per_iteration": 2.598400831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250362, + "balance_loss_mlp": 1.08280182, + "epoch": 0.06213928434013082, + "flos": 600042441216.0, + "grad_norm": 0.02535167136018297, + "language_loss": 1.01516175, + "learning_rate": 0.000997295573896902, + "loss": 1.02766538, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.67285156, + "step": 323, + "time_per_iteration": 2.8295648097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125071, + "balance_loss_mlp": 1.0847702, + "epoch": 0.06233166602539438, + "flos": 1453114384896.0, + "grad_norm": 0.012451454042686489, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82446748, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.65625, + "step": 324, + "time_per_iteration": 4.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244164, + "balance_loss_mlp": 1.07803345, + "epoch": 0.06252404771065795, + "flos": 1466628794880.0, + "grad_norm": 0.009026829376029815, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79816103, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.65820312, + "step": 325, + "time_per_iteration": 4.859014272689819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252677, + "balance_loss_mlp": 1.08535445, + "epoch": 0.06271642939592151, + "flos": 465235975680.0, + "grad_norm": 0.02899330239765154, + "language_loss": 0.95714885, + "learning_rate": 0.000997197627828043, + "loss": 0.96967566, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.67041016, + "step": 326, + "time_per_iteration": 2.5137081146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250284, + "balance_loss_mlp": 1.08343852, + "epoch": 0.06290881108118507, + "flos": 533431477248.0, + "grad_norm": 0.02712212536791958, + "language_loss": 0.90827119, + "learning_rate": 0.0009971645930629716, + "loss": 0.92077404, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.66552734, + "step": 327, + "time_per_iteration": 2.6867988109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_mlp": 1.08260453, + "epoch": 0.06310119276644863, + "flos": 674767474176.0, + "grad_norm": 0.026247049513885422, + "language_loss": 1.04735494, + "learning_rate": 0.0009971313652814872, + "loss": 1.0598489, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.66503906, + "step": 328, + "time_per_iteration": 2.845618724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245995, + "balance_loss_mlp": 1.07924485, + "epoch": 0.0632935744517122, + "flos": 772050241536.0, + "grad_norm": 0.03020034978800923, + "language_loss": 1.02482498, + "learning_rate": 0.0009970979444964903, + "loss": 1.03728485, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.66455078, + "step": 329, + "time_per_iteration": 2.967315196990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249674, + "balance_loss_mlp": 1.08316231, + "epoch": 0.06348595613697576, + "flos": 562974638592.0, + "grad_norm": 0.027434293654228625, + "language_loss": 1.03562641, + "learning_rate": 0.0009970643307209556, + "loss": 1.04812312, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.66210938, + "step": 330, + "time_per_iteration": 2.7991747856140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_mlp": 1.0814544, + "epoch": 0.06367833782223932, + "flos": 677383325184.0, + "grad_norm": 0.030236705728133754, + "language_loss": 1.00163436, + "learning_rate": 0.0009970305239679334, + "loss": 1.01411343, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.66162109, + "step": 331, + "time_per_iteration": 2.8012547492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243208, + "balance_loss_mlp": 1.07669675, + "epoch": 0.06387071950750288, + "flos": 496348938240.0, + "grad_norm": 0.029279450628507057, + "language_loss": 1.04491925, + "learning_rate": 0.0009969965242505483, + "loss": 1.05735123, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.66210938, + "step": 332, + "time_per_iteration": 2.658085584640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251001, + "balance_loss_mlp": 1.08463287, + "epoch": 0.06406310119276645, + "flos": 534556116480.0, + "grad_norm": 0.029350032940601952, + "language_loss": 1.00548685, + "learning_rate": 0.0009969623315820007, + "loss": 1.01799679, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.66064453, + "step": 333, + "time_per_iteration": 2.6670596599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238877, + "balance_loss_mlp": 1.07246125, + "epoch": 0.06425548287803001, + "flos": 457164840960.0, + "grad_norm": 0.03277849846880731, + "language_loss": 1.00979996, + "learning_rate": 0.000996927945975565, + "loss": 1.02218866, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 1.66113281, + "step": 334, + "time_per_iteration": 2.5448765754699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245409, + "balance_loss_mlp": 1.0792315, + "epoch": 0.06444786456329357, + "flos": 561122858496.0, + "grad_norm": 0.03573042475309631, + "language_loss": 0.98108363, + "learning_rate": 0.0009968933674445906, + "loss": 0.99353766, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 1.65869141, + "step": 335, + "time_per_iteration": 2.679093360900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242425, + "balance_loss_mlp": 1.07672429, + "epoch": 0.06464024624855713, + "flos": 667356350976.0, + "grad_norm": 0.0316377115871937, + "language_loss": 0.99817598, + "learning_rate": 0.0009968585960025028, + "loss": 1.01060021, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 1.65380859, + "step": 336, + "time_per_iteration": 2.9642832279205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246223, + "balance_loss_mlp": 1.08085632, + "epoch": 0.0648326279338207, + "flos": 1524555549696.0, + "grad_norm": 0.012731648189289846, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78899413, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.65039062, + "step": 337, + "time_per_iteration": 4.799122333526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_mlp": 1.07683408, + "epoch": 0.06502500961908426, + "flos": 1145214959616.0, + "grad_norm": 0.030168792806873873, + "language_loss": 0.98216963, + "learning_rate": 0.0009967884744390583, + "loss": 0.99459207, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 1.65087891, + "step": 338, + "time_per_iteration": 3.513155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243978, + "balance_loss_mlp": 1.07865858, + "epoch": 0.06521739130434782, + "flos": 583693327872.0, + "grad_norm": 0.025823410577593665, + "language_loss": 0.98998213, + "learning_rate": 0.0009967531243449256, + "loss": 1.00242186, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 1.64990234, + "step": 339, + "time_per_iteration": 2.6683707237243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_mlp": 1.07453787, + "epoch": 0.06540977298961138, + "flos": 498658615296.0, + "grad_norm": 0.02384437782241591, + "language_loss": 1.06067204, + "learning_rate": 0.000996717581394126, + "loss": 1.07306671, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 1.64599609, + "step": 340, + "time_per_iteration": 2.5471885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236124, + "balance_loss_mlp": 1.07171023, + "epoch": 0.06560215467487496, + "flos": 543903613440.0, + "grad_norm": 0.02318937955413124, + "language_loss": 1.0712086, + "learning_rate": 0.000996681845600459, + "loss": 1.08356977, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 1.640625, + "step": 341, + "time_per_iteration": 2.651742458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240028, + "balance_loss_mlp": 1.07575738, + "epoch": 0.06579453636013852, + "flos": 414351043584.0, + "grad_norm": 0.026316803994829763, + "language_loss": 0.99228215, + "learning_rate": 0.0009966459169777982, + "loss": 1.00468254, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 1.63916016, + "step": 342, + "time_per_iteration": 2.4996230602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244627, + "balance_loss_mlp": 1.08045232, + "epoch": 0.06598691804540208, + "flos": 561680812032.0, + "grad_norm": 0.03097158399986616, + "language_loss": 1.07124209, + "learning_rate": 0.0009966097955400924, + "loss": 1.08368838, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 1.63818359, + "step": 343, + "time_per_iteration": 2.7243080139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238691, + "balance_loss_mlp": 1.07451606, + "epoch": 0.06617929973066564, + "flos": 573301782528.0, + "grad_norm": 0.022915441754152527, + "language_loss": 1.00964892, + "learning_rate": 0.0009965734813013652, + "loss": 1.02203584, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 1.63818359, + "step": 344, + "time_per_iteration": 2.8087360858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237027, + "balance_loss_mlp": 1.07375824, + "epoch": 0.06637168141592921, + "flos": 491464343040.0, + "grad_norm": 0.024444849604151265, + "language_loss": 1.03758335, + "learning_rate": 0.0009965369742757151, + "loss": 1.04995358, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 1.62890625, + "step": 345, + "time_per_iteration": 2.5691587924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237907, + "balance_loss_mlp": 1.07459044, + "epoch": 0.06656406310119277, + "flos": 1081037924352.0, + "grad_norm": 0.024807678995847144, + "language_loss": 0.99529493, + "learning_rate": 0.0009965002744773152, + "loss": 1.00767398, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 1.62939453, + "step": 346, + "time_per_iteration": 3.507969856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239522, + "balance_loss_mlp": 1.07611036, + "epoch": 0.06675644478645633, + "flos": 514723021824.0, + "grad_norm": 0.02663627628784384, + "language_loss": 0.97097999, + "learning_rate": 0.0009964633819204139, + "loss": 0.98337519, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 1.63037109, + "step": 347, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_mlp": 1.09986115, + "epoch": 0.06694882647171989, + "flos": 1450534189056.0, + "grad_norm": 0.030948258254188146, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83063102, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 1.6171875, + "step": 348, + "time_per_iteration": 5.152506589889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236206, + "balance_loss_mlp": 1.07427216, + "epoch": 0.06714120815698346, + "flos": 1555397266944.0, + "grad_norm": 0.0077968992848742235, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76390088, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.61523438, + "step": 349, + "time_per_iteration": 4.909464120864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242005, + "balance_loss_mlp": 1.07873547, + "epoch": 0.06733358984224702, + "flos": 881615992320.0, + "grad_norm": 0.03432587789196913, + "language_loss": 0.97228402, + "learning_rate": 0.000996351547842304, + "loss": 0.98470408, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 1.62890625, + "step": 350, + "time_per_iteration": 3.1799545288085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240315, + "balance_loss_mlp": 1.0778569, + "epoch": 0.06752597152751058, + "flos": 519917793792.0, + "grad_norm": 0.030803186893757592, + "language_loss": 0.96182388, + "learning_rate": 0.0009963138843953744, + "loss": 0.97422707, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 1.62060547, + "step": 351, + "time_per_iteration": 2.5873348712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238163, + "balance_loss_mlp": 1.07565665, + "epoch": 0.06771835321277414, + "flos": 540882258432.0, + "grad_norm": 0.023778523337364334, + "language_loss": 0.99575555, + "learning_rate": 0.000996276028262306, + "loss": 1.00813723, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 1.62109375, + "step": 352, + "time_per_iteration": 2.7943532466888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238104, + "balance_loss_mlp": 1.07583654, + "epoch": 0.0679107348980377, + "flos": 461615007744.0, + "grad_norm": 0.02720743117278016, + "language_loss": 1.06749547, + "learning_rate": 0.0009962379794577964, + "loss": 1.07987642, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 1.61865234, + "step": 353, + "time_per_iteration": 2.589200973510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239427, + "balance_loss_mlp": 1.07711196, + "epoch": 0.06810311658330127, + "flos": 637207572480.0, + "grad_norm": 0.02321502152829773, + "language_loss": 0.95908678, + "learning_rate": 0.000996199737996617, + "loss": 0.97148108, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 1.61914062, + "step": 354, + "time_per_iteration": 2.8822708129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123871, + "balance_loss_mlp": 1.07687151, + "epoch": 0.06829549826856483, + "flos": 465626743296.0, + "grad_norm": 0.030894548658215056, + "language_loss": 1.05554581, + "learning_rate": 0.0009961613038936149, + "loss": 1.06793284, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 1.61425781, + "step": 355, + "time_per_iteration": 2.576930522918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.07456315, + "epoch": 0.06848787995382839, + "flos": 635896281600.0, + "grad_norm": 0.0286185110148739, + "language_loss": 0.9730283, + "learning_rate": 0.000996122677163711, + "loss": 0.98538941, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 1.61132812, + "step": 356, + "time_per_iteration": 2.850829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237686, + "balance_loss_mlp": 1.07637215, + "epoch": 0.06868026163909195, + "flos": 807780556800.0, + "grad_norm": 0.03078602082995562, + "language_loss": 1.03526855, + "learning_rate": 0.000996083857821902, + "loss": 1.04764557, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 1.60888672, + "step": 357, + "time_per_iteration": 3.124053716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237273, + "balance_loss_mlp": 1.07605469, + "epoch": 0.06887264332435553, + "flos": 440151713280.0, + "grad_norm": 0.02263887650004652, + "language_loss": 1.01701617, + "learning_rate": 0.0009960448458832588, + "loss": 1.0293889, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 1.60791016, + "step": 358, + "time_per_iteration": 2.6918816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242041, + "balance_loss_mlp": 1.08077514, + "epoch": 0.06906502500961909, + "flos": 485785477632.0, + "grad_norm": 0.021707311176365728, + "language_loss": 1.01897752, + "learning_rate": 0.000996005641362927, + "loss": 1.03139794, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 1.60839844, + "step": 359, + "time_per_iteration": 2.601358652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238764, + "balance_loss_mlp": 1.07725942, + "epoch": 0.06925740669488265, + "flos": 734885110272.0, + "grad_norm": 0.024380378407611886, + "language_loss": 1.04387617, + "learning_rate": 0.0009959662442761274, + "loss": 1.05626392, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 1.61083984, + "step": 360, + "time_per_iteration": 2.9404215812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236823, + "balance_loss_mlp": 1.07589066, + "epoch": 0.0694497883801462, + "flos": 553570745856.0, + "grad_norm": 0.023221163769242582, + "language_loss": 0.97943044, + "learning_rate": 0.000995926654638155, + "loss": 0.99179876, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 1.60498047, + "step": 361, + "time_per_iteration": 2.811624526977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234495, + "balance_loss_mlp": 1.07413495, + "epoch": 0.06964217006540978, + "flos": 679243837440.0, + "grad_norm": 0.025577226237571565, + "language_loss": 1.00741839, + "learning_rate": 0.00099588687246438, + "loss": 1.01976323, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 1.59912109, + "step": 362, + "time_per_iteration": 2.826204538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235331, + "balance_loss_mlp": 1.0749228, + "epoch": 0.06983455175067334, + "flos": 525260285952.0, + "grad_norm": 0.054619150892928216, + "language_loss": 1.0805161, + "learning_rate": 0.0009958468977702471, + "loss": 1.09286952, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 1.59960938, + "step": 363, + "time_per_iteration": 2.5742297172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_mlp": 1.11000061, + "epoch": 0.0700269334359369, + "flos": 1580173353984.0, + "grad_norm": 0.0347214045967213, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81004167, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.59179688, + "step": 364, + "time_per_iteration": 4.815373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234235, + "balance_loss_mlp": 1.07420838, + "epoch": 0.07021931512120046, + "flos": 1014856659456.0, + "grad_norm": 0.027565425727799023, + "language_loss": 0.95424879, + "learning_rate": 0.0009957663708830612, + "loss": 0.96659118, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 1.59667969, + "step": 365, + "time_per_iteration": 3.3032214641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238249, + "balance_loss_mlp": 1.07874703, + "epoch": 0.07041169680646403, + "flos": 824431114752.0, + "grad_norm": 0.03609893162101238, + "language_loss": 0.99641442, + "learning_rate": 0.0009957258187212714, + "loss": 1.00879693, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 1.59228516, + "step": 366, + "time_per_iteration": 3.143951654434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_mlp": 1.0748291, + "epoch": 0.07060407849172759, + "flos": 1417290743808.0, + "grad_norm": 0.015479474187128486, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80427808, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.578125, + "step": 367, + "time_per_iteration": 4.856614112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232866, + "balance_loss_mlp": 1.07417488, + "epoch": 0.07079646017699115, + "flos": 513941486592.0, + "grad_norm": 0.03158452537667852, + "language_loss": 0.9606331, + "learning_rate": 0.0009956441370400167, + "loss": 0.97296178, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 1.58398438, + "step": 368, + "time_per_iteration": 2.6471550464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231431, + "balance_loss_mlp": 1.07288289, + "epoch": 0.07098884186225471, + "flos": 541548274176.0, + "grad_norm": 0.03366854249700899, + "language_loss": 1.02536654, + "learning_rate": 0.0009956030075522636, + "loss": 1.03768086, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 1.58251953, + "step": 369, + "time_per_iteration": 2.764350175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_mlp": 1.07183695, + "epoch": 0.07118122354751828, + "flos": 549738931200.0, + "grad_norm": 0.025388205653796188, + "language_loss": 1.02520657, + "learning_rate": 0.0009955616856543587, + "loss": 1.03751087, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 1.58300781, + "step": 370, + "time_per_iteration": 2.6488449573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233332, + "balance_loss_mlp": 1.07483125, + "epoch": 0.07137360523278184, + "flos": 622076424192.0, + "grad_norm": 0.025131147277089937, + "language_loss": 0.94016552, + "learning_rate": 0.0009955201713623448, + "loss": 0.95249885, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 1.58203125, + "step": 371, + "time_per_iteration": 2.7475128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231201, + "balance_loss_mlp": 1.07594299, + "epoch": 0.0715659869180454, + "flos": 1505973347328.0, + "grad_norm": 0.011087848535678398, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77903926, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 1.55664062, + "step": 372, + "time_per_iteration": 4.930227518081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.0769937, + "epoch": 0.07175836860330896, + "flos": 496481195520.0, + "grad_norm": 0.02946804107059058, + "language_loss": 1.07406306, + "learning_rate": 0.0009954365656605333, + "loss": 1.08641148, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 1.57910156, + "step": 373, + "time_per_iteration": 2.5494606494903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235693, + "balance_loss_mlp": 1.07862246, + "epoch": 0.07195075028857253, + "flos": 787081333248.0, + "grad_norm": 0.030340412148976308, + "language_loss": 1.00769055, + "learning_rate": 0.0009953944742831947, + "loss": 1.02004743, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 1.57519531, + "step": 374, + "time_per_iteration": 2.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234053, + "balance_loss_mlp": 1.07707787, + "epoch": 0.0721431319738361, + "flos": 594346111488.0, + "grad_norm": 0.024760984543104554, + "language_loss": 1.04227853, + "learning_rate": 0.0009953521905766642, + "loss": 1.05461907, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 1.57421875, + "step": 375, + "time_per_iteration": 2.9470102787017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233349, + "balance_loss_mlp": 1.07642198, + "epoch": 0.07233551365909965, + "flos": 549328697856.0, + "grad_norm": 0.025099095391344205, + "language_loss": 1.02903581, + "learning_rate": 0.0009953097145573577, + "loss": 1.04136944, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 1.57373047, + "step": 376, + "time_per_iteration": 2.656438112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232315, + "balance_loss_mlp": 1.0754832, + "epoch": 0.07252789534436321, + "flos": 959167723008.0, + "grad_norm": 0.028756244795243427, + "language_loss": 1.01008701, + "learning_rate": 0.000995267046241766, + "loss": 1.02241015, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 1.57275391, + "step": 377, + "time_per_iteration": 3.2601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226098, + "balance_loss_mlp": 1.06931448, + "epoch": 0.07272027702962677, + "flos": 508655390208.0, + "grad_norm": 0.025279277167219092, + "language_loss": 1.00209188, + "learning_rate": 0.0009952241856464547, + "loss": 1.01435292, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 1.57226562, + "step": 378, + "time_per_iteration": 2.616483688354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228279, + "balance_loss_mlp": 1.07159042, + "epoch": 0.07291265871489035, + "flos": 613551395328.0, + "grad_norm": 0.025059419305224793, + "language_loss": 1.0761106, + "learning_rate": 0.0009951811327880632, + "loss": 1.08839345, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 1.57128906, + "step": 379, + "time_per_iteration": 2.7666382789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_mlp": 1.07063651, + "epoch": 0.0731050404001539, + "flos": 496741707264.0, + "grad_norm": 0.032880990240464036, + "language_loss": 1.00766444, + "learning_rate": 0.0009951378876833063, + "loss": 1.01993108, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 1.56445312, + "step": 380, + "time_per_iteration": 2.5504086017608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230504, + "balance_loss_mlp": 1.07433975, + "epoch": 0.07329742208541747, + "flos": 641129985024.0, + "grad_norm": 0.0343074889031262, + "language_loss": 1.0780232, + "learning_rate": 0.0009950944503489736, + "loss": 1.0903281, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 1.56591797, + "step": 381, + "time_per_iteration": 2.7695260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231248, + "balance_loss_mlp": 1.07537043, + "epoch": 0.07348980377068103, + "flos": 817740401664.0, + "grad_norm": 0.027198888726283066, + "language_loss": 1.01785743, + "learning_rate": 0.0009950508208019285, + "loss": 1.03016996, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 1.56298828, + "step": 382, + "time_per_iteration": 2.9918277263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227944, + "balance_loss_mlp": 1.07187521, + "epoch": 0.0736821854559446, + "flos": 509669239296.0, + "grad_norm": 0.03113985633155724, + "language_loss": 1.05612254, + "learning_rate": 0.0009950069990591096, + "loss": 1.06840205, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 1.56494141, + "step": 383, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_mlp": 1.09392548, + "epoch": 0.07387456714120816, + "flos": 1558048046592.0, + "grad_norm": 0.03338671968111017, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77649409, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 1.54492188, + "step": 384, + "time_per_iteration": 4.854166269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229749, + "balance_loss_mlp": 1.0736798, + "epoch": 0.07406694882647172, + "flos": 526643435520.0, + "grad_norm": 0.03274978311793036, + "language_loss": 0.98781282, + "learning_rate": 0.0009949187790542777, + "loss": 1.00011039, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 1.56494141, + "step": 385, + "time_per_iteration": 2.728701591491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123258, + "balance_loss_mlp": 1.07636821, + "epoch": 0.07425933051173528, + "flos": 498823799808.0, + "grad_norm": 0.026908846939264777, + "language_loss": 0.94723004, + "learning_rate": 0.0009948743808265148, + "loss": 0.95955586, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 1.56640625, + "step": 386, + "time_per_iteration": 2.6850693225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231135, + "balance_loss_mlp": 1.07511437, + "epoch": 0.07445171219699885, + "flos": 506057003520.0, + "grad_norm": 0.05633654869747302, + "language_loss": 1.04553366, + "learning_rate": 0.0009948297904714782, + "loss": 1.05784488, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 1.56445312, + "step": 387, + "time_per_iteration": 2.6746010780334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231627, + "balance_loss_mlp": 1.07555866, + "epoch": 0.07464409388226241, + "flos": 555116352000.0, + "grad_norm": 0.03450843374667126, + "language_loss": 0.9665134, + "learning_rate": 0.0009947850080064796, + "loss": 0.97882968, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 1.56494141, + "step": 388, + "time_per_iteration": 2.7839057445526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230193, + "balance_loss_mlp": 1.07431459, + "epoch": 0.07483647556752597, + "flos": 778274325504.0, + "grad_norm": 0.021592891008175935, + "language_loss": 1.01240289, + "learning_rate": 0.0009947400334489047, + "loss": 1.02470493, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 1.56298828, + "step": 389, + "time_per_iteration": 2.9945342540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_mlp": 1.07411718, + "epoch": 0.07502885725278953, + "flos": 613681651200.0, + "grad_norm": 0.023383004705128753, + "language_loss": 0.92341155, + "learning_rate": 0.0009946948668162145, + "loss": 0.93570244, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 1.55371094, + "step": 390, + "time_per_iteration": 2.7355024814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122989, + "balance_loss_mlp": 1.07496524, + "epoch": 0.0752212389380531, + "flos": 689854961664.0, + "grad_norm": 0.026752200694656208, + "language_loss": 0.97335494, + "learning_rate": 0.0009946495081259441, + "loss": 0.98565376, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 1.55322266, + "step": 391, + "time_per_iteration": 2.799938678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227768, + "balance_loss_mlp": 1.07303405, + "epoch": 0.07541362062331666, + "flos": 767050853376.0, + "grad_norm": 0.02596026064524479, + "language_loss": 1.01604676, + "learning_rate": 0.0009946039573957035, + "loss": 1.02832437, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 1.55126953, + "step": 392, + "time_per_iteration": 2.932504415512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123199, + "balance_loss_mlp": 1.07768571, + "epoch": 0.07560600230858022, + "flos": 589908679680.0, + "grad_norm": 0.028382748029943367, + "language_loss": 0.97495323, + "learning_rate": 0.000994558214643177, + "loss": 0.98727316, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.752694845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228178, + "balance_loss_mlp": 1.07425475, + "epoch": 0.07579838399384378, + "flos": 751144900608.0, + "grad_norm": 0.028291982513743617, + "language_loss": 0.99160051, + "learning_rate": 0.000994512279886123, + "loss": 1.00388229, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 1.54296875, + "step": 394, + "time_per_iteration": 3.06592059135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228894, + "balance_loss_mlp": 1.07530475, + "epoch": 0.07599076567910736, + "flos": 524550609408.0, + "grad_norm": 0.023352712612718218, + "language_loss": 0.98641121, + "learning_rate": 0.0009944661531423758, + "loss": 0.99870014, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 1.53955078, + "step": 395, + "time_per_iteration": 2.6720728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122919, + "balance_loss_mlp": 1.07555354, + "epoch": 0.07618314736437092, + "flos": 552185594880.0, + "grad_norm": 0.026216962171459895, + "language_loss": 0.97914684, + "learning_rate": 0.000994419834429843, + "loss": 0.99143875, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 1.54003906, + "step": 396, + "time_per_iteration": 2.6652910709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226861, + "balance_loss_mlp": 1.07308066, + "epoch": 0.07637552904963447, + "flos": 699432771072.0, + "grad_norm": 0.029361663168223213, + "language_loss": 1.03114796, + "learning_rate": 0.0009943733237665069, + "loss": 1.0434165, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 1.54150391, + "step": 397, + "time_per_iteration": 2.808711290359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227023, + "balance_loss_mlp": 1.07329071, + "epoch": 0.07656791073489803, + "flos": 580635042816.0, + "grad_norm": 0.02000560632750303, + "language_loss": 1.01598048, + "learning_rate": 0.0009943266211704248, + "loss": 1.02825069, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 1.54101562, + "step": 398, + "time_per_iteration": 2.9420461654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226854, + "balance_loss_mlp": 1.0732646, + "epoch": 0.0767602924201616, + "flos": 418037139456.0, + "grad_norm": 0.02425852476792673, + "language_loss": 1.03237891, + "learning_rate": 0.000994279726659728, + "loss": 1.04464746, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 1.53955078, + "step": 399, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230296, + "balance_loss_mlp": 1.07675469, + "epoch": 0.07695267410542517, + "flos": 483888035328.0, + "grad_norm": 0.030174375239475117, + "language_loss": 1.02145576, + "learning_rate": 0.0009942326402526231, + "loss": 1.03375876, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 1.5390625, + "step": 400, + "time_per_iteration": 2.5265390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224857, + "balance_loss_mlp": 1.07184029, + "epoch": 0.07714505579068873, + "flos": 532026860544.0, + "grad_norm": 0.024483465572707617, + "language_loss": 0.99344772, + "learning_rate": 0.0009941853619673902, + "loss": 1.0056963, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 1.53369141, + "step": 401, + "time_per_iteration": 2.660491704940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224912, + "balance_loss_mlp": 1.07218146, + "epoch": 0.07733743747595229, + "flos": 806439066624.0, + "grad_norm": 0.032921156451595594, + "language_loss": 1.03587961, + "learning_rate": 0.0009941378918223844, + "loss": 1.04812872, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 1.53076172, + "step": 402, + "time_per_iteration": 3.078272819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222316, + "balance_loss_mlp": 1.06972802, + "epoch": 0.07752981916121585, + "flos": 623613298176.0, + "grad_norm": 0.02596227047756477, + "language_loss": 0.96322513, + "learning_rate": 0.0009940902298360354, + "loss": 0.97544825, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 1.52929688, + "step": 403, + "time_per_iteration": 2.78222918510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224993, + "balance_loss_mlp": 1.07288182, + "epoch": 0.07772220084647942, + "flos": 729542618112.0, + "grad_norm": 0.031231063897144088, + "language_loss": 1.06544566, + "learning_rate": 0.0009940423760268473, + "loss": 1.07769561, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 1.52441406, + "step": 404, + "time_per_iteration": 2.8572018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226552, + "balance_loss_mlp": 1.07472658, + "epoch": 0.07791458253174298, + "flos": 556468575744.0, + "grad_norm": 0.029548764371286118, + "language_loss": 0.99639893, + "learning_rate": 0.0009939943304133982, + "loss": 1.00866449, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 1.52148438, + "step": 405, + "time_per_iteration": 2.607412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226106, + "balance_loss_mlp": 1.07409084, + "epoch": 0.07810696421700654, + "flos": 554234760192.0, + "grad_norm": 0.031141101296471768, + "language_loss": 1.06411445, + "learning_rate": 0.0009939460930143416, + "loss": 1.07637548, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 1.5234375, + "step": 406, + "time_per_iteration": 2.6132876873016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223027, + "balance_loss_mlp": 1.07120168, + "epoch": 0.0782993459022701, + "flos": 651878095872.0, + "grad_norm": 0.023437908852709077, + "language_loss": 1.00106847, + "learning_rate": 0.0009938976638484043, + "loss": 1.01329875, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 1.52148438, + "step": 407, + "time_per_iteration": 2.905681610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218745, + "balance_loss_mlp": 1.06691968, + "epoch": 0.07849172758753367, + "flos": 497160672768.0, + "grad_norm": 0.02891290096917658, + "language_loss": 0.99991584, + "learning_rate": 0.0009938490429343887, + "loss": 1.01210332, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 1.52148438, + "step": 408, + "time_per_iteration": 2.539567708969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222677, + "balance_loss_mlp": 1.07066166, + "epoch": 0.07868410927279723, + "flos": 579075975168.0, + "grad_norm": 0.030601656563413092, + "language_loss": 0.99965751, + "learning_rate": 0.0009938002302911709, + "loss": 1.01188421, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 1.5234375, + "step": 409, + "time_per_iteration": 2.732064962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220028, + "balance_loss_mlp": 1.0680126, + "epoch": 0.07887649095806079, + "flos": 524066515968.0, + "grad_norm": 0.03256443285635905, + "language_loss": 1.03146362, + "learning_rate": 0.0009937512259377015, + "loss": 1.04366398, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 1.5234375, + "step": 410, + "time_per_iteration": 2.6500303745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221864, + "balance_loss_mlp": 1.07013464, + "epoch": 0.07906887264332435, + "flos": 558437876736.0, + "grad_norm": 0.023780630120827737, + "language_loss": 1.01466393, + "learning_rate": 0.000993702029893006, + "loss": 1.02688265, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 1.52050781, + "step": 411, + "time_per_iteration": 2.7921671867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221791, + "balance_loss_mlp": 1.07010949, + "epoch": 0.07926125432858792, + "flos": 823362871296.0, + "grad_norm": 0.04077078343290612, + "language_loss": 1.01153946, + "learning_rate": 0.0009936526421761838, + "loss": 1.02375734, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 1.52001953, + "step": 412, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217861, + "balance_loss_mlp": 1.06632257, + "epoch": 0.07945363601385148, + "flos": 563393604096.0, + "grad_norm": 0.02717343044282308, + "language_loss": 1.04004121, + "learning_rate": 0.000993603062806409, + "loss": 1.05221987, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 1.51855469, + "step": 413, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219172, + "balance_loss_mlp": 1.06844354, + "epoch": 0.07964601769911504, + "flos": 518884478976.0, + "grad_norm": 0.031245789494761384, + "language_loss": 1.07179379, + "learning_rate": 0.0009935532918029298, + "loss": 1.08398533, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 1.51025391, + "step": 414, + "time_per_iteration": 2.668151617050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224604, + "balance_loss_mlp": 1.07387555, + "epoch": 0.0798383993843786, + "flos": 540300109824.0, + "grad_norm": 0.025221671350570463, + "language_loss": 0.99906069, + "learning_rate": 0.0009935033291850694, + "loss": 1.01130676, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 1.51025391, + "step": 415, + "time_per_iteration": 2.64747953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.06547058, + "epoch": 0.08003078106964218, + "flos": 486121850880.0, + "grad_norm": 0.027121462600521052, + "language_loss": 1.02766061, + "learning_rate": 0.0009934531749722247, + "loss": 1.03982067, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 1.50830078, + "step": 416, + "time_per_iteration": 2.5705764293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121625, + "balance_loss_mlp": 1.06576049, + "epoch": 0.08022316275490574, + "flos": 519275246592.0, + "grad_norm": 0.027391361962933233, + "language_loss": 1.00515926, + "learning_rate": 0.0009934028291838672, + "loss": 1.01732171, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 1.5078125, + "step": 417, + "time_per_iteration": 2.7232770919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219761, + "balance_loss_mlp": 1.0695101, + "epoch": 0.0804155444401693, + "flos": 495046379520.0, + "grad_norm": 0.028534904701295792, + "language_loss": 0.95904237, + "learning_rate": 0.0009933522918395433, + "loss": 0.97123998, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 1.50537109, + "step": 418, + "time_per_iteration": 2.670992374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_mlp": 1.11595154, + "epoch": 0.08060792612543285, + "flos": 1584853833216.0, + "grad_norm": 0.03473829356439328, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79516399, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 1.49609375, + "step": 419, + "time_per_iteration": 4.9051830768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222046, + "balance_loss_mlp": 1.07246244, + "epoch": 0.08080030781069643, + "flos": 526358728704.0, + "grad_norm": 0.03232182071246488, + "language_loss": 1.15746891, + "learning_rate": 0.000993250642561551, + "loss": 1.16968942, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 1.49853516, + "step": 420, + "time_per_iteration": 2.596930503845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224313, + "balance_loss_mlp": 1.07487273, + "epoch": 0.08099268949595999, + "flos": 547756895232.0, + "grad_norm": 0.03306568774928502, + "language_loss": 1.00193918, + "learning_rate": 0.0009931995306673466, + "loss": 1.01418233, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 1.49707031, + "step": 421, + "time_per_iteration": 2.704012155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223697, + "balance_loss_mlp": 1.0744468, + "epoch": 0.08118507118122355, + "flos": 511373299200.0, + "grad_norm": 0.026268861479682264, + "language_loss": 1.0597651, + "learning_rate": 0.000993148227296103, + "loss": 1.07200205, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 1.49511719, + "step": 422, + "time_per_iteration": 2.6110117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224578, + "balance_loss_mlp": 1.0751853, + "epoch": 0.08137745286648711, + "flos": 722001239040.0, + "grad_norm": 0.024088300997991936, + "language_loss": 0.92380643, + "learning_rate": 0.000993096732467738, + "loss": 0.9360522, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 1.49658203, + "step": 423, + "time_per_iteration": 2.9790220260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224383, + "balance_loss_mlp": 1.0753237, + "epoch": 0.08156983455175067, + "flos": 680817641472.0, + "grad_norm": 0.029818930066630327, + "language_loss": 1.0177561, + "learning_rate": 0.0009930450462022435, + "loss": 1.02999997, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 1.49316406, + "step": 424, + "time_per_iteration": 2.8023674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223, + "balance_loss_mlp": 1.07518005, + "epoch": 0.08176221623701424, + "flos": 1456588359168.0, + "grad_norm": 0.012435251357338771, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80412811, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.48046875, + "step": 425, + "time_per_iteration": 4.96533989906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219597, + "balance_loss_mlp": 1.0711571, + "epoch": 0.0819545979222778, + "flos": 1558883071488.0, + "grad_norm": 0.04204100969257126, + "language_loss": 1.00605047, + "learning_rate": 0.0009929410994402065, + "loss": 1.01824641, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 1.48681641, + "step": 426, + "time_per_iteration": 3.850475311279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220758, + "balance_loss_mlp": 1.07236588, + "epoch": 0.08214697960754136, + "flos": 513800497152.0, + "grad_norm": 0.03975912273964659, + "language_loss": 1.03955805, + "learning_rate": 0.0009928888389840196, + "loss": 1.05176568, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 1.48632812, + "step": 427, + "time_per_iteration": 2.6892385482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224824, + "balance_loss_mlp": 1.07633698, + "epoch": 0.08233936129280492, + "flos": 596221360128.0, + "grad_norm": 0.02633667259549893, + "language_loss": 1.0604248, + "learning_rate": 0.0009928363871714147, + "loss": 1.07267296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 1.48730469, + "step": 428, + "time_per_iteration": 2.666851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224039, + "balance_loss_mlp": 1.07550442, + "epoch": 0.08253174297806849, + "flos": 573164795904.0, + "grad_norm": 0.03052010415677114, + "language_loss": 0.99677718, + "learning_rate": 0.0009927837440227556, + "loss": 1.00901759, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 1.48779297, + "step": 429, + "time_per_iteration": 2.810197591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228416, + "balance_loss_mlp": 1.07992899, + "epoch": 0.08272412466333205, + "flos": 624642610176.0, + "grad_norm": 0.029909202440675912, + "language_loss": 0.93710327, + "learning_rate": 0.0009927309095584798, + "loss": 0.94938743, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 1.48730469, + "step": 430, + "time_per_iteration": 2.98052978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122165, + "balance_loss_mlp": 1.07316256, + "epoch": 0.08291650634859561, + "flos": 514994267136.0, + "grad_norm": 0.038201439099628094, + "language_loss": 1.07072532, + "learning_rate": 0.0009926778837991, + "loss": 1.08294177, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 1.48730469, + "step": 431, + "time_per_iteration": 2.613912582397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223506, + "balance_loss_mlp": 1.07516193, + "epoch": 0.08310888803385917, + "flos": 668541388800.0, + "grad_norm": 0.02618037233016902, + "language_loss": 1.04762018, + "learning_rate": 0.000992624666765202, + "loss": 1.05985522, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 1.48583984, + "step": 432, + "time_per_iteration": 2.785602331161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224029, + "balance_loss_mlp": 1.07659137, + "epoch": 0.08330126971912274, + "flos": 584490326016.0, + "grad_norm": 0.023129420064945467, + "language_loss": 1.02043724, + "learning_rate": 0.000992571258477447, + "loss": 1.03267753, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 1.4765625, + "step": 433, + "time_per_iteration": 2.7774012088775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225333, + "balance_loss_mlp": 1.07799041, + "epoch": 0.0834936514043863, + "flos": 562497275904.0, + "grad_norm": 0.02412369992445121, + "language_loss": 0.95710295, + "learning_rate": 0.0009925176589565695, + "loss": 0.9693563, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 1.47558594, + "step": 434, + "time_per_iteration": 2.7975149154663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224713, + "balance_loss_mlp": 1.07751381, + "epoch": 0.08368603308964986, + "flos": 495513008640.0, + "grad_norm": 0.023499028814372425, + "language_loss": 1.06310439, + "learning_rate": 0.0009924638682233791, + "loss": 1.07535148, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 1.47412109, + "step": 435, + "time_per_iteration": 2.5623626708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247864, + "balance_loss_mlp": 1.10328674, + "epoch": 0.08387841477491342, + "flos": 1391808983040.0, + "grad_norm": 0.0329185074425942, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80812454, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.44726562, + "step": 436, + "time_per_iteration": 4.5364601612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219037, + "balance_loss_mlp": 1.07174218, + "epoch": 0.084070796460177, + "flos": 800353970688.0, + "grad_norm": 0.025226905267595717, + "language_loss": 0.95941472, + "learning_rate": 0.0009923557132036668, + "loss": 0.97160506, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 1.47509766, + "step": 437, + "time_per_iteration": 3.031538963317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219746, + "balance_loss_mlp": 1.07226074, + "epoch": 0.08426317814544056, + "flos": 560096274432.0, + "grad_norm": 0.024291343012928023, + "language_loss": 0.99699497, + "learning_rate": 0.0009923013489591345, + "loss": 1.00919247, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 1.47705078, + "step": 438, + "time_per_iteration": 2.741021156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217749, + "balance_loss_mlp": 1.07073975, + "epoch": 0.08445555983070412, + "flos": 811883616768.0, + "grad_norm": 0.02787309358423107, + "language_loss": 0.97740996, + "learning_rate": 0.0009922467935862681, + "loss": 0.98958743, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 1.47216797, + "step": 439, + "time_per_iteration": 3.0727341175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215984, + "balance_loss_mlp": 1.06907046, + "epoch": 0.08464794151596768, + "flos": 511169183232.0, + "grad_norm": 0.02418736148641671, + "language_loss": 1.01547837, + "learning_rate": 0.0009921920471062478, + "loss": 1.0276382, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 1.47119141, + "step": 440, + "time_per_iteration": 2.5793957710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214442, + "balance_loss_mlp": 1.06805265, + "epoch": 0.08484032320123125, + "flos": 557473692672.0, + "grad_norm": 0.02549300900866748, + "language_loss": 0.99590349, + "learning_rate": 0.0009921371095403281, + "loss": 1.00804806, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 1.46582031, + "step": 441, + "time_per_iteration": 2.633976936340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215261, + "balance_loss_mlp": 1.06887233, + "epoch": 0.08503270488649481, + "flos": 528360230400.0, + "grad_norm": 0.023285649852896013, + "language_loss": 1.02823853, + "learning_rate": 0.0009920819809098379, + "loss": 1.04039121, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 1.46582031, + "step": 442, + "time_per_iteration": 2.5975728034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213611, + "balance_loss_mlp": 1.06722176, + "epoch": 0.08522508657175837, + "flos": 615385711104.0, + "grad_norm": 0.021771679570127336, + "language_loss": 0.97986722, + "learning_rate": 0.0009920266612361798, + "loss": 0.99200332, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 1.46582031, + "step": 443, + "time_per_iteration": 2.7284042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214332, + "balance_loss_mlp": 1.06803846, + "epoch": 0.08541746825702193, + "flos": 620986713600.0, + "grad_norm": 0.024601404202987703, + "language_loss": 0.97963679, + "learning_rate": 0.0009919711505408308, + "loss": 0.9917801, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 1.46484375, + "step": 444, + "time_per_iteration": 2.797030448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216522, + "balance_loss_mlp": 1.07051492, + "epoch": 0.08560984994228549, + "flos": 483888035328.0, + "grad_norm": 0.023417740932750293, + "language_loss": 0.96522343, + "learning_rate": 0.000991915448845342, + "loss": 0.97738856, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 1.46191406, + "step": 445, + "time_per_iteration": 2.544638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_mlp": 1.06945765, + "epoch": 0.08580223162754906, + "flos": 518176803840.0, + "grad_norm": 0.025018627604332305, + "language_loss": 1.05275297, + "learning_rate": 0.000991859556171339, + "loss": 1.0649066, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 1.4609375, + "step": 446, + "time_per_iteration": 2.5865097045898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214045, + "balance_loss_mlp": 1.06856191, + "epoch": 0.08599461331281262, + "flos": 532519686144.0, + "grad_norm": 0.025883227843611877, + "language_loss": 1.07190132, + "learning_rate": 0.000991803472540521, + "loss": 1.08404183, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 1.45654297, + "step": 447, + "time_per_iteration": 2.6001055240631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213992, + "balance_loss_mlp": 1.06879497, + "epoch": 0.08618699499807618, + "flos": 791633558016.0, + "grad_norm": 0.022461373320799196, + "language_loss": 1.02303076, + "learning_rate": 0.0009917471979746615, + "loss": 1.03517067, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 1.45361328, + "step": 448, + "time_per_iteration": 2.9621376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218395, + "balance_loss_mlp": 1.07300746, + "epoch": 0.08637937668333974, + "flos": 567114628608.0, + "grad_norm": 0.02449904215267775, + "language_loss": 1.00404847, + "learning_rate": 0.0009916907324956086, + "loss": 1.01623249, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 1.45556641, + "step": 449, + "time_per_iteration": 2.691150188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214944, + "balance_loss_mlp": 1.0697943, + "epoch": 0.08657175836860331, + "flos": 446117286912.0, + "grad_norm": 0.025714213043280993, + "language_loss": 0.97109705, + "learning_rate": 0.0009916340761252837, + "loss": 0.98324645, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 1.453125, + "step": 450, + "time_per_iteration": 2.6118698120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212599, + "balance_loss_mlp": 1.067307, + "epoch": 0.08676414005386687, + "flos": 845588235264.0, + "grad_norm": 0.02612794411743426, + "language_loss": 0.94540501, + "learning_rate": 0.0009915772288856832, + "loss": 0.95753098, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 1.45458984, + "step": 451, + "time_per_iteration": 3.0883219242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213701, + "balance_loss_mlp": 1.06926715, + "epoch": 0.08695652173913043, + "flos": 604483875840.0, + "grad_norm": 0.02003375948944636, + "language_loss": 0.95739877, + "learning_rate": 0.000991520190798877, + "loss": 0.96953583, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 1.44580078, + "step": 452, + "time_per_iteration": 2.8387818336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213572, + "balance_loss_mlp": 1.06928122, + "epoch": 0.08714890342439399, + "flos": 732000015360.0, + "grad_norm": 0.027770143088691506, + "language_loss": 1.06693339, + "learning_rate": 0.0009914629618870089, + "loss": 1.07906914, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 1.44433594, + "step": 453, + "time_per_iteration": 2.9403207302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_mlp": 1.0905838, + "epoch": 0.08734128510965757, + "flos": 1485454044672.0, + "grad_norm": 0.02536208637588336, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79910266, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.43945312, + "step": 454, + "time_per_iteration": 4.803662061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121994, + "balance_loss_mlp": 1.07631683, + "epoch": 0.08753366679492113, + "flos": 1526266340352.0, + "grad_norm": 0.01817690946373191, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82647902, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.4375, + "step": 455, + "time_per_iteration": 4.812621355056763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213204, + "balance_loss_mlp": 1.06919885, + "epoch": 0.08772604848018468, + "flos": 722524263936.0, + "grad_norm": 0.030160618436618963, + "language_loss": 0.98162878, + "learning_rate": 0.0009912901304235883, + "loss": 0.99376082, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 1.44140625, + "step": 456, + "time_per_iteration": 2.9147355556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217818, + "balance_loss_mlp": 1.07386112, + "epoch": 0.08791843016544824, + "flos": 709466476032.0, + "grad_norm": 0.03064824893295274, + "language_loss": 0.96399593, + "learning_rate": 0.000991232138434397, + "loss": 0.97617412, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 1.44091797, + "step": 457, + "time_per_iteration": 2.8735082149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121922, + "balance_loss_mlp": 1.07540572, + "epoch": 0.08811081185071182, + "flos": 474021516288.0, + "grad_norm": 0.03193385229896835, + "language_loss": 1.03185177, + "learning_rate": 0.000991173955731976, + "loss": 1.04404402, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 1.43945312, + "step": 458, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220724, + "balance_loss_mlp": 1.07762539, + "epoch": 0.08830319353597538, + "flos": 686314584576.0, + "grad_norm": 0.057581270182385194, + "language_loss": 1.06524456, + "learning_rate": 0.0009911155823389137, + "loss": 1.07745171, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 1.43212891, + "step": 459, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218235, + "balance_loss_mlp": 1.07513571, + "epoch": 0.08849557522123894, + "flos": 574608344064.0, + "grad_norm": 0.027044136096108284, + "language_loss": 1.01923048, + "learning_rate": 0.000991057018277873, + "loss": 1.03141284, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 1.43212891, + "step": 460, + "time_per_iteration": 2.746169090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212445, + "balance_loss_mlp": 1.0693934, + "epoch": 0.0886879569065025, + "flos": 565627419648.0, + "grad_norm": 0.031092379840733354, + "language_loss": 1.03267121, + "learning_rate": 0.0009909982635715898, + "loss": 1.04479575, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 1.43164062, + "step": 461, + "time_per_iteration": 2.6196396350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212854, + "balance_loss_mlp": 1.06956458, + "epoch": 0.08888033859176607, + "flos": 564956674560.0, + "grad_norm": 0.030181357689894217, + "language_loss": 1.02059078, + "learning_rate": 0.0009909393182428751, + "loss": 1.03271937, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 1.43408203, + "step": 462, + "time_per_iteration": 2.679793357849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216843, + "balance_loss_mlp": 1.07345808, + "epoch": 0.08907272027702963, + "flos": 466742650368.0, + "grad_norm": 0.029240136547664795, + "language_loss": 0.9639132, + "learning_rate": 0.000990880182314614, + "loss": 0.97608161, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 1.43505859, + "step": 463, + "time_per_iteration": 2.712097644805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212421, + "balance_loss_mlp": 1.06922734, + "epoch": 0.08926510196229319, + "flos": 682843338240.0, + "grad_norm": 0.026287763165510035, + "language_loss": 0.96174729, + "learning_rate": 0.0009908208558097643, + "loss": 0.97387147, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 1.43310547, + "step": 464, + "time_per_iteration": 2.906903028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217208, + "balance_loss_mlp": 1.07406175, + "epoch": 0.08945748364755675, + "flos": 597821360640.0, + "grad_norm": 0.024374741633963998, + "language_loss": 0.98668623, + "learning_rate": 0.000990761338751359, + "loss": 0.99885827, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 1.43261719, + "step": 465, + "time_per_iteration": 2.7994933128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225639, + "balance_loss_mlp": 1.08506775, + "epoch": 0.08964986533282032, + "flos": 1589340930048.0, + "grad_norm": 0.02575129149720033, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74885261, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.40625, + "step": 466, + "time_per_iteration": 4.9763429164886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221953, + "balance_loss_mlp": 1.07861578, + "epoch": 0.08984224701808388, + "flos": 534549385728.0, + "grad_norm": 0.024628184063577727, + "language_loss": 1.01551545, + "learning_rate": 0.0009906417330663815, + "loss": 1.02773499, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 1.43457031, + "step": 467, + "time_per_iteration": 2.614560842514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232523, + "balance_loss_mlp": 1.08994913, + "epoch": 0.09003462870334744, + "flos": 479850103296.0, + "grad_norm": 0.03230737833956583, + "language_loss": 0.98222148, + "learning_rate": 0.0009905816444862442, + "loss": 0.99454677, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 1.42675781, + "step": 468, + "time_per_iteration": 2.598146438598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223867, + "balance_loss_mlp": 1.08124495, + "epoch": 0.090227010388611, + "flos": 654902178816.0, + "grad_norm": 0.027522185030294237, + "language_loss": 0.95659769, + "learning_rate": 0.0009905213654454216, + "loss": 0.96883637, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 1.42724609, + "step": 469, + "time_per_iteration": 2.8876352310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219852, + "balance_loss_mlp": 1.07737279, + "epoch": 0.09041939207387456, + "flos": 619358515200.0, + "grad_norm": 0.023282407360439072, + "language_loss": 1.03878951, + "learning_rate": 0.0009904608959673158, + "loss": 1.0509882, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 1.42578125, + "step": 470, + "time_per_iteration": 2.7882330417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213781, + "balance_loss_mlp": 1.0718745, + "epoch": 0.09061177375913813, + "flos": 455295596544.0, + "grad_norm": 0.02882877970469751, + "language_loss": 1.04707062, + "learning_rate": 0.000990400236075403, + "loss": 1.05920839, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 1.41992188, + "step": 471, + "time_per_iteration": 2.5016987323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_mlp": 1.07574117, + "epoch": 0.0908041554444017, + "flos": 545308230144.0, + "grad_norm": 0.02444258884202674, + "language_loss": 1.01020849, + "learning_rate": 0.0009903393857932338, + "loss": 1.02238584, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 1.42089844, + "step": 472, + "time_per_iteration": 2.644397497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218613, + "balance_loss_mlp": 1.07732654, + "epoch": 0.09099653712966525, + "flos": 565466964480.0, + "grad_norm": 0.02685769494428931, + "language_loss": 0.99245131, + "learning_rate": 0.0009902783451444317, + "loss": 1.00463748, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 1.41357422, + "step": 473, + "time_per_iteration": 2.7087745666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214499, + "balance_loss_mlp": 1.07292593, + "epoch": 0.09118891881492881, + "flos": 475501994496.0, + "grad_norm": 0.029476649456104027, + "language_loss": 1.02896917, + "learning_rate": 0.0009902171141526956, + "loss": 1.04111421, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 1.41650391, + "step": 474, + "time_per_iteration": 2.5271990299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215154, + "balance_loss_mlp": 1.07410538, + "epoch": 0.09138130050019239, + "flos": 546990822912.0, + "grad_norm": 0.02490932279529465, + "language_loss": 0.89845926, + "learning_rate": 0.000990155692841797, + "loss": 0.9106108, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 1.41113281, + "step": 475, + "time_per_iteration": 2.958740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214039, + "balance_loss_mlp": 1.07303798, + "epoch": 0.09157368218545595, + "flos": 733973319168.0, + "grad_norm": 0.02740759839690251, + "language_loss": 1.01869047, + "learning_rate": 0.0009900940812355818, + "loss": 1.03083086, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 1.41064453, + "step": 476, + "time_per_iteration": 2.891787528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205639, + "balance_loss_mlp": 1.06478107, + "epoch": 0.0917660638707195, + "flos": 612072918528.0, + "grad_norm": 0.029261712768775452, + "language_loss": 0.99624813, + "learning_rate": 0.00099003227935797, + "loss": 1.0083046, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 1.40917969, + "step": 477, + "time_per_iteration": 2.7569031715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207057, + "balance_loss_mlp": 1.06605613, + "epoch": 0.09195844555598306, + "flos": 657018473472.0, + "grad_norm": 0.026965523070242428, + "language_loss": 1.02860427, + "learning_rate": 0.000989970287232955, + "loss": 1.04067481, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 1.41064453, + "step": 478, + "time_per_iteration": 2.7705225944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212938, + "balance_loss_mlp": 1.07212758, + "epoch": 0.09215082724124664, + "flos": 477540426240.0, + "grad_norm": 0.02578247385618595, + "language_loss": 0.99767786, + "learning_rate": 0.0009899081048846043, + "loss": 1.00980723, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 1.40869141, + "step": 479, + "time_per_iteration": 2.5488922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215229, + "balance_loss_mlp": 1.07437098, + "epoch": 0.0923432089265102, + "flos": 525325413888.0, + "grad_norm": 0.029009434883925433, + "language_loss": 1.05276799, + "learning_rate": 0.0009898457323370593, + "loss": 1.06492031, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 1.40917969, + "step": 480, + "time_per_iteration": 2.5628790855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213957, + "balance_loss_mlp": 1.07314658, + "epoch": 0.09253559061177376, + "flos": 546638986752.0, + "grad_norm": 0.030643020391807937, + "language_loss": 1.01694977, + "learning_rate": 0.000989783169614535, + "loss": 1.02908933, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 1.40869141, + "step": 481, + "time_per_iteration": 2.6431851387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206421, + "balance_loss_mlp": 1.06718445, + "epoch": 0.09272797229703732, + "flos": 1541334362112.0, + "grad_norm": 0.00793715508899474, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79959178, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.39257812, + "step": 482, + "time_per_iteration": 4.84259295463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211177, + "balance_loss_mlp": 1.07041514, + "epoch": 0.09292035398230089, + "flos": 691064194560.0, + "grad_norm": 0.029391602229229655, + "language_loss": 0.99036419, + "learning_rate": 0.000989657473741779, + "loss": 1.00247598, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 1.40820312, + "step": 483, + "time_per_iteration": 2.8193717002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210505, + "balance_loss_mlp": 1.06964695, + "epoch": 0.09311273566756445, + "flos": 510822076416.0, + "grad_norm": 0.026713621627667553, + "language_loss": 1.0060308, + "learning_rate": 0.0009895943406403465, + "loss": 1.01813591, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 1.40917969, + "step": 484, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210956, + "balance_loss_mlp": 1.07071841, + "epoch": 0.09330511735282801, + "flos": 660583045632.0, + "grad_norm": 0.02538483632370611, + "language_loss": 0.94170594, + "learning_rate": 0.0009895310174615338, + "loss": 0.95381546, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 1.40283203, + "step": 485, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210991, + "balance_loss_mlp": 1.0725174, + "epoch": 0.09349749903809157, + "flos": 1456021673472.0, + "grad_norm": 0.008074315810691821, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.7692951, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.38476562, + "step": 486, + "time_per_iteration": 4.652726888656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208546, + "balance_loss_mlp": 1.06868994, + "epoch": 0.09368988072335514, + "flos": 521899829760.0, + "grad_norm": 0.021962490795067104, + "language_loss": 0.97574425, + "learning_rate": 0.0009894038009701782, + "loss": 0.98782969, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 1.39892578, + "step": 487, + "time_per_iteration": 2.647747755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207771, + "balance_loss_mlp": 1.06786692, + "epoch": 0.0938822624086187, + "flos": 498751941120.0, + "grad_norm": 0.02403393711112831, + "language_loss": 1.01297927, + "learning_rate": 0.0009893399077070253, + "loss": 1.02505696, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 1.39941406, + "step": 488, + "time_per_iteration": 2.5559775829315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209251, + "balance_loss_mlp": 1.07006216, + "epoch": 0.09407464409388226, + "flos": 534223746048.0, + "grad_norm": 0.02465812888810929, + "language_loss": 0.94380867, + "learning_rate": 0.0009892758244652718, + "loss": 0.95590127, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 1.39208984, + "step": 489, + "time_per_iteration": 2.6696364879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203933, + "balance_loss_mlp": 1.06398153, + "epoch": 0.09426702577914582, + "flos": 587090714112.0, + "grad_norm": 0.02607881729553482, + "language_loss": 1.01920152, + "learning_rate": 0.0009892115512697968, + "loss": 1.03124094, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 1.39990234, + "step": 490, + "time_per_iteration": 2.645073652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205245, + "balance_loss_mlp": 1.06524527, + "epoch": 0.0944594074644094, + "flos": 504463733760.0, + "grad_norm": 0.02086232355550113, + "language_loss": 1.01703966, + "learning_rate": 0.0009891470881455537, + "loss": 1.02909207, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 1.40039062, + "step": 491, + "time_per_iteration": 2.669978618621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207443, + "balance_loss_mlp": 1.06777763, + "epoch": 0.09465178914967295, + "flos": 572114016768.0, + "grad_norm": 0.026976181820206353, + "language_loss": 1.00743008, + "learning_rate": 0.0009890824351175692, + "loss": 1.01950443, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 1.39697266, + "step": 492, + "time_per_iteration": 2.6572952270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207157, + "balance_loss_mlp": 1.06796801, + "epoch": 0.09484417083493651, + "flos": 550418408448.0, + "grad_norm": 0.023611014675858334, + "language_loss": 1.04079592, + "learning_rate": 0.0009890175922109435, + "loss": 1.05286753, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 1.39208984, + "step": 493, + "time_per_iteration": 2.622361183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120413, + "balance_loss_mlp": 1.06498933, + "epoch": 0.09503655252020007, + "flos": 825271047168.0, + "grad_norm": 0.02510100112233158, + "language_loss": 1.0275588, + "learning_rate": 0.0009889525594508513, + "loss": 1.03960025, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 1.39160156, + "step": 494, + "time_per_iteration": 3.0307581424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202477, + "balance_loss_mlp": 1.06333554, + "epoch": 0.09522893420546363, + "flos": 405517839360.0, + "grad_norm": 0.02234367718934989, + "language_loss": 0.96151906, + "learning_rate": 0.0009888873368625404, + "loss": 0.97354376, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 1.39160156, + "step": 495, + "time_per_iteration": 2.4793317317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205465, + "balance_loss_mlp": 1.06665742, + "epoch": 0.0954213158907272, + "flos": 692255963136.0, + "grad_norm": 0.025506351191757377, + "language_loss": 1.00908709, + "learning_rate": 0.0009888219244713326, + "loss": 1.02114165, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 1.38818359, + "step": 496, + "time_per_iteration": 2.865914821624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206499, + "balance_loss_mlp": 1.06773937, + "epoch": 0.09561369757599077, + "flos": 520074246144.0, + "grad_norm": 0.030124833611481355, + "language_loss": 1.02319717, + "learning_rate": 0.0009887563223026229, + "loss": 1.03526211, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 1.38671875, + "step": 497, + "time_per_iteration": 2.689708948135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210899, + "balance_loss_mlp": 1.07376099, + "epoch": 0.09580607926125433, + "flos": 1388781623808.0, + "grad_norm": 0.014650036919455408, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80279064, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 1.37109375, + "step": 498, + "time_per_iteration": 4.940208196640015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203477, + "balance_loss_mlp": 1.06476545, + "epoch": 0.09599846094651789, + "flos": 718825433088.0, + "grad_norm": 0.028840614245688557, + "language_loss": 0.98952407, + "learning_rate": 0.0009886245487346482, + "loss": 1.00155878, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 1.38427734, + "step": 499, + "time_per_iteration": 3.023056745529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205479, + "balance_loss_mlp": 1.06690967, + "epoch": 0.09619084263178146, + "flos": 386893977600.0, + "grad_norm": 0.031706482821381415, + "language_loss": 1.0340035, + "learning_rate": 0.0009885583773865422, + "loss": 1.04605842, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 1.38183594, + "step": 500, + "time_per_iteration": 2.422914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202787, + "balance_loss_mlp": 1.06479073, + "epoch": 0.09638322431704502, + "flos": 535172467200.0, + "grad_norm": 0.02878579188863982, + "language_loss": 0.99392897, + "learning_rate": 0.0009884920163632524, + "loss": 1.00595689, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 1.37988281, + "step": 501, + "time_per_iteration": 2.6820154190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203474, + "balance_loss_mlp": 1.0655731, + "epoch": 0.09657560600230858, + "flos": 501656501760.0, + "grad_norm": 0.02635733095705931, + "language_loss": 1.03128934, + "learning_rate": 0.000988425465690543, + "loss": 1.04332411, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 1.37890625, + "step": 502, + "time_per_iteration": 2.605536699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204627, + "balance_loss_mlp": 1.06677341, + "epoch": 0.09676798768757214, + "flos": 530331532800.0, + "grad_norm": 0.023374032620567947, + "language_loss": 1.00861204, + "learning_rate": 0.0009883587253942505, + "loss": 1.02065825, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 1.37841797, + "step": 503, + "time_per_iteration": 2.7548091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204765, + "balance_loss_mlp": 1.06686366, + "epoch": 0.09696036937283571, + "flos": 464556498432.0, + "grad_norm": 0.029206950172382878, + "language_loss": 1.0685035, + "learning_rate": 0.0009882917955002862, + "loss": 1.08055115, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 1.37890625, + "step": 504, + "time_per_iteration": 2.520970344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200777, + "balance_loss_mlp": 1.06297076, + "epoch": 0.09715275105809927, + "flos": 536010398208.0, + "grad_norm": 0.02484338661637091, + "language_loss": 0.9770751, + "learning_rate": 0.0009882246760346343, + "loss": 0.98908287, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 1.37695312, + "step": 505, + "time_per_iteration": 2.6314897537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204578, + "balance_loss_mlp": 1.06672478, + "epoch": 0.09734513274336283, + "flos": 455881747968.0, + "grad_norm": 0.02756591702740651, + "language_loss": 1.04990697, + "learning_rate": 0.0009881573670233533, + "loss": 1.06195283, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 1.37451172, + "step": 506, + "time_per_iteration": 2.492464780807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203948, + "balance_loss_mlp": 1.06619, + "epoch": 0.09753751442862639, + "flos": 509827693056.0, + "grad_norm": 0.02954706972608782, + "language_loss": 0.97619581, + "learning_rate": 0.0009880898684925747, + "loss": 0.98823535, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 1.37353516, + "step": 507, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120195, + "balance_loss_mlp": 1.06438243, + "epoch": 0.09772989611388996, + "flos": 485246989824.0, + "grad_norm": 0.02487380392257162, + "language_loss": 0.96617985, + "learning_rate": 0.0009880221804685037, + "loss": 0.97819936, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 1.37158203, + "step": 508, + "time_per_iteration": 2.5352439880371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209412, + "balance_loss_mlp": 1.0741806, + "epoch": 0.09792227779915352, + "flos": 1569316454400.0, + "grad_norm": 0.016823619827393988, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80553836, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.3515625, + "step": 509, + "time_per_iteration": 4.694217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205455, + "balance_loss_mlp": 1.06831706, + "epoch": 0.09811465948441708, + "flos": 588914296320.0, + "grad_norm": 0.032012577058462416, + "language_loss": 1.03636336, + "learning_rate": 0.0009878862360456733, + "loss": 1.04841793, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 1.37011719, + "step": 510, + "time_per_iteration": 2.73879337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208431, + "balance_loss_mlp": 1.07148337, + "epoch": 0.09830704116968064, + "flos": 614128814592.0, + "grad_norm": 0.028115444050206044, + "language_loss": 0.94855493, + "learning_rate": 0.0009878179796996922, + "loss": 0.96063924, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 1.36914062, + "step": 511, + "time_per_iteration": 2.6949734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207361, + "balance_loss_mlp": 1.07050836, + "epoch": 0.09849942285494422, + "flos": 539935538688.0, + "grad_norm": 0.022608937638108787, + "language_loss": 0.9790619, + "learning_rate": 0.0009877495339659754, + "loss": 0.99113548, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 1.36816406, + "step": 512, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214076, + "balance_loss_mlp": 1.0773195, + "epoch": 0.09869180454020778, + "flos": 621603064320.0, + "grad_norm": 0.029833187637910333, + "language_loss": 0.94261241, + "learning_rate": 0.000987680898871096, + "loss": 0.95475316, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 1.3671875, + "step": 513, + "time_per_iteration": 2.6975760459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120845, + "balance_loss_mlp": 1.07145417, + "epoch": 0.09888418622547133, + "flos": 813059922432.0, + "grad_norm": 0.032512892127392744, + "language_loss": 0.9726817, + "learning_rate": 0.0009876120744417, + "loss": 0.98476619, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 1.36767578, + "step": 514, + "time_per_iteration": 2.9514927864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214576, + "balance_loss_mlp": 1.07762837, + "epoch": 0.0990765679107349, + "flos": 536857061376.0, + "grad_norm": 0.028495408786163776, + "language_loss": 1.0346663, + "learning_rate": 0.0009875430607045078, + "loss": 1.04681206, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 1.36523438, + "step": 515, + "time_per_iteration": 2.669271230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209323, + "balance_loss_mlp": 1.07242322, + "epoch": 0.09926894959599845, + "flos": 588970692096.0, + "grad_norm": 0.026228231589839293, + "language_loss": 0.98752952, + "learning_rate": 0.000987473857686313, + "loss": 0.9996227, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 1.36474609, + "step": 516, + "time_per_iteration": 2.7055716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120601, + "balance_loss_mlp": 1.06934881, + "epoch": 0.09946133128126203, + "flos": 642386881536.0, + "grad_norm": 0.0302129460476142, + "language_loss": 1.04248524, + "learning_rate": 0.0009874044654139824, + "loss": 1.05454528, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 1.36230469, + "step": 517, + "time_per_iteration": 2.726618528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200307, + "balance_loss_mlp": 1.06340742, + "epoch": 0.09965371296652559, + "flos": 466725186048.0, + "grad_norm": 0.03251153136411229, + "language_loss": 1.02563679, + "learning_rate": 0.0009873348839144563, + "loss": 1.03763986, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 1.36474609, + "step": 518, + "time_per_iteration": 2.5855953693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200913, + "balance_loss_mlp": 1.06439471, + "epoch": 0.09984609465178915, + "flos": 484558780416.0, + "grad_norm": 0.029627125773621466, + "language_loss": 1.03352094, + "learning_rate": 0.000987265113214749, + "loss": 1.04552996, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 1.36279297, + "step": 519, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201703, + "balance_loss_mlp": 1.06566191, + "epoch": 0.1000384763370527, + "flos": 570095050752.0, + "grad_norm": 0.028931775658430137, + "language_loss": 1.07544637, + "learning_rate": 0.0009871951533419476, + "loss": 1.08746338, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 1.35986328, + "step": 520, + "time_per_iteration": 2.6423709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200484, + "balance_loss_mlp": 1.06439495, + "epoch": 0.10023085802231628, + "flos": 546925694976.0, + "grad_norm": 0.025491893219336172, + "language_loss": 0.95403761, + "learning_rate": 0.0009871250043232132, + "loss": 0.96604246, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 1.36035156, + "step": 521, + "time_per_iteration": 2.7604362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198813, + "balance_loss_mlp": 1.06205583, + "epoch": 0.10042323970757984, + "flos": 504439538688.0, + "grad_norm": 0.029888360913216814, + "language_loss": 0.96113187, + "learning_rate": 0.0009870546661857797, + "loss": 0.97311997, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 1.36328125, + "step": 522, + "time_per_iteration": 2.578458547592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195212, + "balance_loss_mlp": 1.05931365, + "epoch": 0.1006156213928434, + "flos": 771724601856.0, + "grad_norm": 0.029426081780707294, + "language_loss": 1.05752206, + "learning_rate": 0.0009869841389569553, + "loss": 1.0694741, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 1.35839844, + "step": 523, + "time_per_iteration": 2.958531618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.05846703, + "epoch": 0.10080800307810696, + "flos": 491008447488.0, + "grad_norm": 0.024593893632090205, + "language_loss": 0.96497846, + "learning_rate": 0.0009869134226641206, + "loss": 0.97692204, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 1.35839844, + "step": 524, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196113, + "balance_loss_mlp": 1.06030965, + "epoch": 0.10100038476337053, + "flos": 455712560640.0, + "grad_norm": 0.026556514945601337, + "language_loss": 0.98348475, + "learning_rate": 0.0009868425173347303, + "loss": 0.99544585, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 1.35742188, + "step": 525, + "time_per_iteration": 2.6460907459259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196515, + "balance_loss_mlp": 1.06099772, + "epoch": 0.10119276644863409, + "flos": 557573749248.0, + "grad_norm": 0.022458491608374247, + "language_loss": 1.03332829, + "learning_rate": 0.0009867714229963125, + "loss": 1.04529333, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 1.35449219, + "step": 526, + "time_per_iteration": 2.693362236022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119647, + "balance_loss_mlp": 1.0609529, + "epoch": 0.10138514813389765, + "flos": 517219350528.0, + "grad_norm": 0.028969258136437262, + "language_loss": 1.0161202, + "learning_rate": 0.000986700139676468, + "loss": 1.02808487, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 1.35449219, + "step": 527, + "time_per_iteration": 2.5826644897460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202893, + "balance_loss_mlp": 1.06742311, + "epoch": 0.10157752981916121, + "flos": 501563175936.0, + "grad_norm": 0.023004964960346017, + "language_loss": 0.98490077, + "learning_rate": 0.0009866286674028717, + "loss": 0.99692971, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 1.35400391, + "step": 528, + "time_per_iteration": 2.626595973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204326, + "balance_loss_mlp": 1.06876123, + "epoch": 0.10176991150442478, + "flos": 658093447680.0, + "grad_norm": 0.024381421822087013, + "language_loss": 0.95674849, + "learning_rate": 0.0009865570062032717, + "loss": 0.96879184, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 1.35498047, + "step": 529, + "time_per_iteration": 2.916924238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203456, + "balance_loss_mlp": 1.0680815, + "epoch": 0.10196229318968834, + "flos": 574402226688.0, + "grad_norm": 0.021344584600364362, + "language_loss": 0.99175954, + "learning_rate": 0.0009864851561054893, + "loss": 1.00379407, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 1.35302734, + "step": 530, + "time_per_iteration": 2.750075578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203649, + "balance_loss_mlp": 1.06856096, + "epoch": 0.1021546748749519, + "flos": 519255780864.0, + "grad_norm": 0.027896087186932737, + "language_loss": 0.99157, + "learning_rate": 0.0009864131171374191, + "loss": 1.00360656, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 1.35009766, + "step": 531, + "time_per_iteration": 2.6506359577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202329, + "balance_loss_mlp": 1.06728852, + "epoch": 0.10234705656021546, + "flos": 610953008640.0, + "grad_norm": 0.021304730024267197, + "language_loss": 0.98848057, + "learning_rate": 0.0009863408893270292, + "loss": 1.0005039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 1.34960938, + "step": 532, + "time_per_iteration": 2.827632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202805, + "balance_loss_mlp": 1.06776476, + "epoch": 0.10253943824547904, + "flos": 602912073216.0, + "grad_norm": 0.02650069508154076, + "language_loss": 0.95645475, + "learning_rate": 0.0009862684727023605, + "loss": 0.96848285, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 1.34960938, + "step": 533, + "time_per_iteration": 2.730771541595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206135, + "balance_loss_mlp": 1.07152414, + "epoch": 0.1027318199307426, + "flos": 664156349952.0, + "grad_norm": 0.02579556790717569, + "language_loss": 0.96718729, + "learning_rate": 0.0009861958672915283, + "loss": 0.97924864, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 1.34521484, + "step": 534, + "time_per_iteration": 2.825239419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202189, + "balance_loss_mlp": 1.06776834, + "epoch": 0.10292420161600616, + "flos": 684529933824.0, + "grad_norm": 0.02492376876437301, + "language_loss": 0.95656139, + "learning_rate": 0.0009861230731227201, + "loss": 0.96858335, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 1.34326172, + "step": 535, + "time_per_iteration": 2.858596086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203815, + "balance_loss_mlp": 1.06958508, + "epoch": 0.10311658330126972, + "flos": 491268959232.0, + "grad_norm": 0.02833674325523021, + "language_loss": 0.99709427, + "learning_rate": 0.0009860500902241973, + "loss": 1.00913239, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 1.34130859, + "step": 536, + "time_per_iteration": 2.5780303478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197149, + "balance_loss_mlp": 1.06291902, + "epoch": 0.10330896498653329, + "flos": 432686195712.0, + "grad_norm": 0.024484943889946764, + "language_loss": 1.03652823, + "learning_rate": 0.0009859769186242942, + "loss": 1.0484997, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 1.34130859, + "step": 537, + "time_per_iteration": 2.5104598999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119791, + "balance_loss_mlp": 1.06415713, + "epoch": 0.10350134667179685, + "flos": 550641990144.0, + "grad_norm": 0.0271300181774947, + "language_loss": 0.97886324, + "learning_rate": 0.0009859035583514187, + "loss": 0.99084234, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 1.33642578, + "step": 538, + "time_per_iteration": 2.6156880855560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197994, + "balance_loss_mlp": 1.06395507, + "epoch": 0.10369372835706041, + "flos": 641826926592.0, + "grad_norm": 0.024416305433678544, + "language_loss": 1.00991774, + "learning_rate": 0.0009858300094340517, + "loss": 1.02189767, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 1.33935547, + "step": 539, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198436, + "balance_loss_mlp": 1.06468332, + "epoch": 0.10388611004232397, + "flos": 522765958656.0, + "grad_norm": 0.025798430155835095, + "language_loss": 0.9342165, + "learning_rate": 0.0009857562719007473, + "loss": 0.94620085, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 1.33642578, + "step": 540, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204547, + "balance_loss_mlp": 1.07122386, + "epoch": 0.10407849172758753, + "flos": 703739947008.0, + "grad_norm": 0.023593197084580173, + "language_loss": 0.95331407, + "learning_rate": 0.0009856823457801331, + "loss": 0.96535957, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 1.33203125, + "step": 541, + "time_per_iteration": 2.889531373977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202711, + "balance_loss_mlp": 1.06924474, + "epoch": 0.1042708734128511, + "flos": 503944711680.0, + "grad_norm": 0.023957714626313076, + "language_loss": 1.02856565, + "learning_rate": 0.00098560823110091, + "loss": 1.04059267, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 1.33349609, + "step": 542, + "time_per_iteration": 2.6067047119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205134, + "balance_loss_mlp": 1.07185781, + "epoch": 0.10446325509811466, + "flos": 486640872960.0, + "grad_norm": 0.0231214260398276, + "language_loss": 1.01405394, + "learning_rate": 0.000985533927891851, + "loss": 1.02610517, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 1.33154297, + "step": 543, + "time_per_iteration": 2.6622776985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201388, + "balance_loss_mlp": 1.06820762, + "epoch": 0.10465563678337822, + "flos": 569713015296.0, + "grad_norm": 0.023482705287667723, + "language_loss": 1.01015687, + "learning_rate": 0.0009854594361818044, + "loss": 1.02217078, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 1.33056641, + "step": 544, + "time_per_iteration": 2.7061924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195672, + "balance_loss_mlp": 1.06244385, + "epoch": 0.10484801846864178, + "flos": 627242998272.0, + "grad_norm": 0.023194608242680787, + "language_loss": 0.99799937, + "learning_rate": 0.0009853847559996897, + "loss": 1.00995612, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 1.33105469, + "step": 545, + "time_per_iteration": 2.742445707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192128, + "balance_loss_mlp": 1.05885231, + "epoch": 0.10504040015390535, + "flos": 744812754432.0, + "grad_norm": 0.025865682249952955, + "language_loss": 0.99192667, + "learning_rate": 0.0009853098873745, + "loss": 1.00384796, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 1.33154297, + "step": 546, + "time_per_iteration": 3.0260400772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192867, + "balance_loss_mlp": 1.05997264, + "epoch": 0.10523278183916891, + "flos": 587842050048.0, + "grad_norm": 0.02599355243407578, + "language_loss": 0.98197657, + "learning_rate": 0.0009852348303353027, + "loss": 0.99390525, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 1.32763672, + "step": 547, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191481, + "balance_loss_mlp": 1.05844367, + "epoch": 0.10542516352443247, + "flos": 871145857536.0, + "grad_norm": 0.02495252935664815, + "language_loss": 0.91398883, + "learning_rate": 0.000985159584911237, + "loss": 0.92590368, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 1.32910156, + "step": 548, + "time_per_iteration": 3.1012043952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119193, + "balance_loss_mlp": 1.05913138, + "epoch": 0.10561754520969603, + "flos": 506412842496.0, + "grad_norm": 0.025955858684814606, + "language_loss": 0.9925828, + "learning_rate": 0.0009850841511315162, + "loss": 1.00450206, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 1.32666016, + "step": 549, + "time_per_iteration": 2.626220464706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192876, + "balance_loss_mlp": 1.06022012, + "epoch": 0.1058099268949596, + "flos": 561147053568.0, + "grad_norm": 0.02554357007654854, + "language_loss": 0.98952115, + "learning_rate": 0.0009850085290254256, + "loss": 1.00144982, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 1.32519531, + "step": 550, + "time_per_iteration": 2.7464635372161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.06161487, + "epoch": 0.10600230858022316, + "flos": 563159288832.0, + "grad_norm": 0.020736613501838204, + "language_loss": 0.9519307, + "learning_rate": 0.0009849327186223246, + "loss": 0.9638744, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 1.32617188, + "step": 551, + "time_per_iteration": 2.7678163051605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199655, + "balance_loss_mlp": 1.06728542, + "epoch": 0.10619469026548672, + "flos": 495317624832.0, + "grad_norm": 0.02236411826292933, + "language_loss": 1.02411103, + "learning_rate": 0.000984856719951646, + "loss": 1.03610754, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 1.32226562, + "step": 552, + "time_per_iteration": 2.5607285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.06404662, + "epoch": 0.10638707195075028, + "flos": 677463916032.0, + "grad_norm": 0.025808282690500464, + "language_loss": 1.00531495, + "learning_rate": 0.0009847805330428943, + "loss": 1.01727724, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 1.3203125, + "step": 553, + "time_per_iteration": 2.8748667240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190787, + "balance_loss_mlp": 1.05860806, + "epoch": 0.10657945363601386, + "flos": 489035143680.0, + "grad_norm": 0.02571681940882287, + "language_loss": 1.04715252, + "learning_rate": 0.0009847041579256481, + "loss": 1.05906045, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 1.3203125, + "step": 554, + "time_per_iteration": 2.56693696975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191519, + "balance_loss_mlp": 1.05948246, + "epoch": 0.10677183532127742, + "flos": 483970627584.0, + "grad_norm": 0.020874824601389917, + "language_loss": 1.01746583, + "learning_rate": 0.0009846275946295592, + "loss": 1.02938092, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 1.31884766, + "step": 555, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195781, + "balance_loss_mlp": 1.06369734, + "epoch": 0.10696421700654098, + "flos": 657581156352.0, + "grad_norm": 0.023085993180182653, + "language_loss": 0.93557143, + "learning_rate": 0.0009845508431843518, + "loss": 0.94752926, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 1.31933594, + "step": 556, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192823, + "balance_loss_mlp": 1.06088233, + "epoch": 0.10715659869180454, + "flos": 568792492032.0, + "grad_norm": 0.026087632201688016, + "language_loss": 0.9692713, + "learning_rate": 0.0009844739036198233, + "loss": 0.9811995, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 1.31787109, + "step": 557, + "time_per_iteration": 2.6583988666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192362, + "balance_loss_mlp": 1.06051683, + "epoch": 0.10734898037706811, + "flos": 541743657984.0, + "grad_norm": 0.02708275038302545, + "language_loss": 1.03564882, + "learning_rate": 0.0009843967759658448, + "loss": 1.04757237, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 1.31689453, + "step": 558, + "time_per_iteration": 2.6571173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209854, + "balance_loss_mlp": 1.07920074, + "epoch": 0.10754136206233167, + "flos": 1479731518464.0, + "grad_norm": 0.021017403581586082, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73977602, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.30664062, + "step": 559, + "time_per_iteration": 4.901749134063721 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191994, + "balance_loss_mlp": 1.06024349, + "epoch": 0.10773374374759523, + "flos": 513411730944.0, + "grad_norm": 0.02623387515623986, + "language_loss": 1.03025067, + "learning_rate": 0.000984241956509384, + "loss": 1.04217052, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 1.31591797, + "step": 560, + "time_per_iteration": 2.642380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011916, + "balance_loss_mlp": 1.06013584, + "epoch": 0.10792612543285879, + "flos": 497477580288.0, + "grad_norm": 0.029111560342126648, + "language_loss": 1.01683569, + "learning_rate": 0.0009841642647670078, + "loss": 1.02875161, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 1.31298828, + "step": 561, + "time_per_iteration": 2.5994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.06027901, + "epoch": 0.10811850711812235, + "flos": 736836946944.0, + "grad_norm": 0.027918527501713815, + "language_loss": 0.94711685, + "learning_rate": 0.0009840863850553944, + "loss": 0.95903373, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 1.3125, + "step": 562, + "time_per_iteration": 2.980377435684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193377, + "balance_loss_mlp": 1.06215191, + "epoch": 0.10831088880338592, + "flos": 612676534272.0, + "grad_norm": 0.025174626098757973, + "language_loss": 0.99795747, + "learning_rate": 0.0009840083174047782, + "loss": 1.00989127, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 1.31054688, + "step": 563, + "time_per_iteration": 2.7209153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194645, + "balance_loss_mlp": 1.0633713, + "epoch": 0.10850327048864948, + "flos": 557497887744.0, + "grad_norm": 0.021851565940339403, + "language_loss": 0.93414235, + "learning_rate": 0.0009839300618454685, + "loss": 0.94608879, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 1.31103516, + "step": 564, + "time_per_iteration": 2.833120584487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194873, + "balance_loss_mlp": 1.06402934, + "epoch": 0.10869565217391304, + "flos": 604436212224.0, + "grad_norm": 0.021697209366751603, + "language_loss": 0.98980927, + "learning_rate": 0.0009838516184078466, + "loss": 1.00175798, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 1.30664062, + "step": 565, + "time_per_iteration": 2.805722236633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193483, + "balance_loss_mlp": 1.06263876, + "epoch": 0.1088880338591766, + "flos": 527205391872.0, + "grad_norm": 0.024778377976546286, + "language_loss": 0.97356248, + "learning_rate": 0.0009837729871223669, + "loss": 0.98549736, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 1.30664062, + "step": 566, + "time_per_iteration": 2.652186155319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119656, + "balance_loss_mlp": 1.0658114, + "epoch": 0.10908041554444017, + "flos": 621416412672.0, + "grad_norm": 0.023487449334803984, + "language_loss": 0.99301046, + "learning_rate": 0.0009836941680195568, + "loss": 1.00497603, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 1.30566406, + "step": 567, + "time_per_iteration": 2.7732484340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.06144011, + "epoch": 0.10927279722970373, + "flos": 899673168384.0, + "grad_norm": 0.026216288845653656, + "language_loss": 0.95416081, + "learning_rate": 0.0009836151611300166, + "loss": 0.96608174, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 1.3046875, + "step": 568, + "time_per_iteration": 3.174981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.06049693, + "epoch": 0.10946517891496729, + "flos": 529699719168.0, + "grad_norm": 0.02336242427092275, + "language_loss": 1.03071296, + "learning_rate": 0.0009835359664844194, + "loss": 1.04262161, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 1.30273438, + "step": 569, + "time_per_iteration": 2.595041513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190102, + "balance_loss_mlp": 1.06173706, + "epoch": 0.10965756060023085, + "flos": 1563991426560.0, + "grad_norm": 0.006726678932110135, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82226908, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 4.911731719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193915, + "balance_loss_mlp": 1.0634526, + "epoch": 0.10984994228549443, + "flos": 514099940352.0, + "grad_norm": 0.027266515996607284, + "language_loss": 1.00165153, + "learning_rate": 0.0009833770140481118, + "loss": 1.01359057, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 1.30273438, + "step": 571, + "time_per_iteration": 2.6079747676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197777, + "balance_loss_mlp": 1.06741011, + "epoch": 0.11004232397075799, + "flos": 956273895936.0, + "grad_norm": 0.026548665437539986, + "language_loss": 0.90315044, + "learning_rate": 0.000983297256319112, + "loss": 0.91512823, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 1.30175781, + "step": 572, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_mlp": 1.05776477, + "epoch": 0.11023470565602154, + "flos": 489228526080.0, + "grad_norm": 0.026034490292812715, + "language_loss": 0.95817071, + "learning_rate": 0.000983217310957477, + "loss": 0.97005343, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 1.30322266, + "step": 573, + "time_per_iteration": 2.7447898387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190883, + "balance_loss_mlp": 1.06056309, + "epoch": 0.1104270873412851, + "flos": 656990275584.0, + "grad_norm": 0.026590820610190004, + "language_loss": 1.00224817, + "learning_rate": 0.000983137177994244, + "loss": 1.01415706, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 1.30126953, + "step": 574, + "time_per_iteration": 2.846140146255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185115, + "balance_loss_mlp": 1.0552249, + "epoch": 0.11061946902654868, + "flos": 724747345920.0, + "grad_norm": 0.019709272455133778, + "language_loss": 0.93286896, + "learning_rate": 0.0009830568574605235, + "loss": 0.94472009, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 1.29736328, + "step": 575, + "time_per_iteration": 2.922821044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185727, + "balance_loss_mlp": 1.05569339, + "epoch": 0.11081185071181224, + "flos": 836867822592.0, + "grad_norm": 0.025292755419638515, + "language_loss": 0.97880363, + "learning_rate": 0.0009829763493874992, + "loss": 0.99066085, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 1.29833984, + "step": 576, + "time_per_iteration": 3.022394895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183726, + "balance_loss_mlp": 1.05412149, + "epoch": 0.1110042323970758, + "flos": 610282263552.0, + "grad_norm": 0.023453623229808367, + "language_loss": 1.02838886, + "learning_rate": 0.0009828956538064264, + "loss": 1.04022622, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 1.29541016, + "step": 577, + "time_per_iteration": 2.817147970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182671, + "balance_loss_mlp": 1.05316234, + "epoch": 0.11119661408233936, + "flos": 597039825408.0, + "grad_norm": 0.025026186935027953, + "language_loss": 0.99076784, + "learning_rate": 0.0009828147707486344, + "loss": 1.00259459, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 1.29492188, + "step": 578, + "time_per_iteration": 2.6778078079223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186939, + "balance_loss_mlp": 1.05752516, + "epoch": 0.11138899576760293, + "flos": 556887541248.0, + "grad_norm": 0.027590262528076937, + "language_loss": 0.96720088, + "learning_rate": 0.0009827337002455245, + "loss": 0.97907031, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 1.29394531, + "step": 579, + "time_per_iteration": 2.6259562969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188781, + "balance_loss_mlp": 1.05951095, + "epoch": 0.11158137745286649, + "flos": 691062193152.0, + "grad_norm": 0.0223692175133054, + "language_loss": 0.94567806, + "learning_rate": 0.0009826524423285712, + "loss": 0.9575659, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 1.29150391, + "step": 580, + "time_per_iteration": 2.9144554138183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118328, + "balance_loss_mlp": 1.05386627, + "epoch": 0.11177375913813005, + "flos": 764306747904.0, + "grad_norm": 0.02877171771660235, + "language_loss": 0.97941083, + "learning_rate": 0.0009825709970293218, + "loss": 0.9912436, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 1.29296875, + "step": 581, + "time_per_iteration": 2.8999927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181128, + "balance_loss_mlp": 1.05223894, + "epoch": 0.11196614082339361, + "flos": 808030334976.0, + "grad_norm": 0.029325346048851512, + "language_loss": 1.03732872, + "learning_rate": 0.0009824893643793956, + "loss": 1.04913998, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 1.28857422, + "step": 582, + "time_per_iteration": 3.0697131156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.05731773, + "epoch": 0.11215852250865718, + "flos": 559724972544.0, + "grad_norm": 0.028740695003145394, + "language_loss": 0.98446089, + "learning_rate": 0.0009824075444104857, + "loss": 0.99632728, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 1.29150391, + "step": 583, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190407, + "balance_loss_mlp": 1.06147003, + "epoch": 0.11235090419392074, + "flos": 514575301632.0, + "grad_norm": 0.02293328270345756, + "language_loss": 1.02460003, + "learning_rate": 0.000982325537154357, + "loss": 1.03650403, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 1.28808594, + "step": 584, + "time_per_iteration": 2.590156078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188149, + "balance_loss_mlp": 1.05954635, + "epoch": 0.1125432858791843, + "flos": 492432529920.0, + "grad_norm": 0.028214107652977688, + "language_loss": 1.0381788, + "learning_rate": 0.0009822433426428484, + "loss": 1.05006027, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 1.28564453, + "step": 585, + "time_per_iteration": 2.566488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188321, + "balance_loss_mlp": 1.05957532, + "epoch": 0.11273566756444786, + "flos": 511727136768.0, + "grad_norm": 0.027438709113267498, + "language_loss": 0.95940274, + "learning_rate": 0.0009821609609078697, + "loss": 0.971286, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 1.28710938, + "step": 586, + "time_per_iteration": 2.6117701530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189545, + "balance_loss_mlp": 1.06098938, + "epoch": 0.11292804924971142, + "flos": 623639494656.0, + "grad_norm": 0.025949033694362005, + "language_loss": 0.97216725, + "learning_rate": 0.0009820783919814045, + "loss": 0.98406273, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 1.28515625, + "step": 587, + "time_per_iteration": 2.798182249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181783, + "balance_loss_mlp": 1.05360925, + "epoch": 0.113120430934975, + "flos": 479038368768.0, + "grad_norm": 0.03012596671256698, + "language_loss": 0.94172156, + "learning_rate": 0.0009819956358955095, + "loss": 0.95353937, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 1.28125, + "step": 588, + "time_per_iteration": 2.54179310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197707, + "balance_loss_mlp": 1.06905663, + "epoch": 0.11331281262023855, + "flos": 467990814720.0, + "grad_norm": 0.02502737191739997, + "language_loss": 0.9542653, + "learning_rate": 0.0009819126926823127, + "loss": 0.96624243, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 1.28613281, + "step": 589, + "time_per_iteration": 2.5262975692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191554, + "balance_loss_mlp": 1.06333208, + "epoch": 0.11350519430550211, + "flos": 651610853376.0, + "grad_norm": 0.023462259875113876, + "language_loss": 0.96713853, + "learning_rate": 0.000981829562374016, + "loss": 0.97905409, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 1.28173828, + "step": 590, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192039, + "balance_loss_mlp": 1.06415117, + "epoch": 0.11369757599076567, + "flos": 558860845056.0, + "grad_norm": 0.030341732837715945, + "language_loss": 1.07369685, + "learning_rate": 0.0009817462450028933, + "loss": 1.08561718, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 1.27832031, + "step": 591, + "time_per_iteration": 2.638333559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_mlp": 1.06215453, + "epoch": 0.11388995767602925, + "flos": 572305397760.0, + "grad_norm": 0.0238596111294556, + "language_loss": 0.94198918, + "learning_rate": 0.0009816627406012916, + "loss": 0.9538886, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 1.27734375, + "step": 592, + "time_per_iteration": 2.800842523574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191939, + "balance_loss_mlp": 1.06395626, + "epoch": 0.1140823393612928, + "flos": 741743009280.0, + "grad_norm": 0.025351621893671843, + "language_loss": 0.93787777, + "learning_rate": 0.0009815790492016295, + "loss": 0.94979715, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 1.27929688, + "step": 593, + "time_per_iteration": 2.9331579208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191026, + "balance_loss_mlp": 1.06337643, + "epoch": 0.11427472104655637, + "flos": 700251236352.0, + "grad_norm": 0.02689478502881467, + "language_loss": 0.96601468, + "learning_rate": 0.0009814951708363993, + "loss": 0.97792494, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 1.27587891, + "step": 594, + "time_per_iteration": 2.832094192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200218, + "balance_loss_mlp": 1.07414246, + "epoch": 0.11446710273181993, + "flos": 1480352598528.0, + "grad_norm": 0.020191453180706247, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79191208, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 1.25976562, + "step": 595, + "time_per_iteration": 4.752530574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187485, + "balance_loss_mlp": 1.06026483, + "epoch": 0.1146594844170835, + "flos": 495912508416.0, + "grad_norm": 0.02910362847653251, + "language_loss": 0.97498882, + "learning_rate": 0.0009813268533395648, + "loss": 0.98686367, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 1.27148438, + "step": 596, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187961, + "balance_loss_mlp": 1.06093144, + "epoch": 0.11485186610234706, + "flos": 475790704128.0, + "grad_norm": 0.02927093575191284, + "language_loss": 0.98108673, + "learning_rate": 0.0009812424142733073, + "loss": 0.99296629, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 1.26953125, + "step": 597, + "time_per_iteration": 2.5622098445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187255, + "balance_loss_mlp": 1.06046438, + "epoch": 0.11504424778761062, + "flos": 732619094016.0, + "grad_norm": 0.02047017320895946, + "language_loss": 0.92490959, + "learning_rate": 0.000981157788372175, + "loss": 0.93678212, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 1.26708984, + "step": 598, + "time_per_iteration": 3.017120599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185489, + "balance_loss_mlp": 1.05855536, + "epoch": 0.11523662947287418, + "flos": 546962625024.0, + "grad_norm": 0.02044602685826044, + "language_loss": 0.96609688, + "learning_rate": 0.0009810729756690223, + "loss": 0.97795177, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 1.26855469, + "step": 599, + "time_per_iteration": 2.7182610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190213, + "balance_loss_mlp": 1.06323159, + "epoch": 0.11542901115813775, + "flos": 776387616768.0, + "grad_norm": 0.023703305464208416, + "language_loss": 0.99939269, + "learning_rate": 0.0009809879761967766, + "loss": 1.01129484, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 1.26904297, + "step": 600, + "time_per_iteration": 2.9586148262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189892, + "balance_loss_mlp": 1.06319618, + "epoch": 0.11562139284340131, + "flos": 732212863488.0, + "grad_norm": 0.024193120208057816, + "language_loss": 0.99113685, + "learning_rate": 0.0009809027899884378, + "loss": 1.00303578, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 1.26611328, + "step": 601, + "time_per_iteration": 2.885070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183816, + "balance_loss_mlp": 1.05731082, + "epoch": 0.11581377452866487, + "flos": 537039710208.0, + "grad_norm": 0.022696091128935367, + "language_loss": 0.96568906, + "learning_rate": 0.0009808174170770779, + "loss": 0.97752714, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 1.26416016, + "step": 602, + "time_per_iteration": 2.7809743881225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191742, + "balance_loss_mlp": 1.0662384, + "epoch": 0.11600615621392843, + "flos": 1559211617280.0, + "grad_norm": 0.013792800863456836, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86089987, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 1.25390625, + "step": 603, + "time_per_iteration": 4.860181570053101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187966, + "balance_loss_mlp": 1.06169963, + "epoch": 0.116198537899192, + "flos": 538467795456.0, + "grad_norm": 0.022659628017063727, + "language_loss": 1.02766323, + "learning_rate": 0.0009806461112779462, + "loss": 1.03954291, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 1.26171875, + "step": 604, + "time_per_iteration": 2.614189863204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187324, + "balance_loss_mlp": 1.06091404, + "epoch": 0.11639091958445556, + "flos": 455137142784.0, + "grad_norm": 0.0301649070939891, + "language_loss": 1.00891566, + "learning_rate": 0.0009805601784566814, + "loss": 1.02078903, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 1.26318359, + "step": 605, + "time_per_iteration": 2.470878839492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119223, + "balance_loss_mlp": 1.06658351, + "epoch": 0.11658330126971912, + "flos": 556151668224.0, + "grad_norm": 0.025758302551065336, + "language_loss": 1.05099356, + "learning_rate": 0.0009804740590654089, + "loss": 1.0629158, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 1.25537109, + "step": 606, + "time_per_iteration": 2.631462812423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_mlp": 1.06588733, + "epoch": 0.11677568295498268, + "flos": 717600737280.0, + "grad_norm": 0.02545612001836415, + "language_loss": 0.99629396, + "learning_rate": 0.0009803877531375635, + "loss": 1.00821078, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 1.25683594, + "step": 607, + "time_per_iteration": 2.879645586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191881, + "balance_loss_mlp": 1.06613898, + "epoch": 0.11696806464024626, + "flos": 610898614272.0, + "grad_norm": 0.023619167708177922, + "language_loss": 0.99668628, + "learning_rate": 0.0009803012607066523, + "loss": 1.008605, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 1.25634766, + "step": 608, + "time_per_iteration": 2.717660427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189661, + "balance_loss_mlp": 1.06406212, + "epoch": 0.11716044632550981, + "flos": 521415736320.0, + "grad_norm": 0.023557070356346427, + "language_loss": 0.97414643, + "learning_rate": 0.0009802145818062543, + "loss": 0.98604298, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 1.25488281, + "step": 609, + "time_per_iteration": 2.7209720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190685, + "balance_loss_mlp": 1.064991, + "epoch": 0.11735282801077337, + "flos": 508488204288.0, + "grad_norm": 0.03039581956620226, + "language_loss": 1.01476204, + "learning_rate": 0.0009801277164700212, + "loss": 1.02666891, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 1.25585938, + "step": 610, + "time_per_iteration": 2.5900633335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190447, + "balance_loss_mlp": 1.06489623, + "epoch": 0.11754520969603693, + "flos": 687835995648.0, + "grad_norm": 0.028512829376260446, + "language_loss": 0.97853899, + "learning_rate": 0.0009800406647316776, + "loss": 0.99044347, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 1.25439453, + "step": 611, + "time_per_iteration": 2.8018290996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_mlp": 1.06088257, + "epoch": 0.1177375913813005, + "flos": 1545756331008.0, + "grad_norm": 0.00764509792440145, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78099126, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 1.24023438, + "step": 612, + "time_per_iteration": 4.767510175704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_mlp": 1.05974686, + "epoch": 0.11792997306656407, + "flos": 521537260032.0, + "grad_norm": 0.0290479345737112, + "language_loss": 0.97953087, + "learning_rate": 0.000979866002183916, + "loss": 0.99138713, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 1.2578125, + "step": 613, + "time_per_iteration": 2.6752681732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182111, + "balance_loss_mlp": 1.05632174, + "epoch": 0.11812235475182763, + "flos": 667488608256.0, + "grad_norm": 0.030776001440310688, + "language_loss": 0.9883132, + "learning_rate": 0.0009797783914423082, + "loss": 1.00013435, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 1.25683594, + "step": 614, + "time_per_iteration": 2.8556718826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182577, + "balance_loss_mlp": 1.05697787, + "epoch": 0.11831473643709119, + "flos": 622504121856.0, + "grad_norm": 0.02739500646081478, + "language_loss": 0.93579996, + "learning_rate": 0.0009796905944342094, + "loss": 0.94762576, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 1.25488281, + "step": 615, + "time_per_iteration": 2.80253267288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187072, + "balance_loss_mlp": 1.06152117, + "epoch": 0.11850711812235475, + "flos": 457694596608.0, + "grad_norm": 0.020858577781052552, + "language_loss": 0.96166766, + "learning_rate": 0.0009796026111937057, + "loss": 0.9735384, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 1.25439453, + "step": 616, + "time_per_iteration": 2.5763044357299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189497, + "balance_loss_mlp": 1.06404102, + "epoch": 0.11869949980761832, + "flos": 514927137792.0, + "grad_norm": 0.022050319992180305, + "language_loss": 0.96050835, + "learning_rate": 0.0009795144417549552, + "loss": 0.97240329, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 1.25341797, + "step": 617, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186044, + "balance_loss_mlp": 1.06092167, + "epoch": 0.11889188149288188, + "flos": 536156116992.0, + "grad_norm": 0.0238791856796517, + "language_loss": 0.97532642, + "learning_rate": 0.0009794260861521883, + "loss": 0.98718691, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 1.25292969, + "step": 618, + "time_per_iteration": 2.784257173538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_mlp": 1.06445491, + "epoch": 0.11908426317814544, + "flos": 499644266496.0, + "grad_norm": 0.024260475486046627, + "language_loss": 0.96495152, + "learning_rate": 0.0009793375444197075, + "loss": 0.97684348, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 1.25, + "step": 619, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189567, + "balance_loss_mlp": 1.06482673, + "epoch": 0.119276644863409, + "flos": 661067139072.0, + "grad_norm": 0.023292068214373615, + "language_loss": 0.96012962, + "learning_rate": 0.000979248816591888, + "loss": 0.97202522, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 1.25, + "step": 620, + "time_per_iteration": 2.783372640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184512, + "balance_loss_mlp": 1.06001019, + "epoch": 0.11946902654867257, + "flos": 760152021504.0, + "grad_norm": 0.02911418191745056, + "language_loss": 0.95521206, + "learning_rate": 0.0009791599027031766, + "loss": 0.96705711, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 1.24755859, + "step": 621, + "time_per_iteration": 3.04338002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185972, + "balance_loss_mlp": 1.06156564, + "epoch": 0.11966140823393613, + "flos": 682213526016.0, + "grad_norm": 0.0317276180850791, + "language_loss": 0.96021026, + "learning_rate": 0.0009790708027880932, + "loss": 0.97206998, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 1.24658203, + "step": 622, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184547, + "balance_loss_mlp": 1.06171417, + "epoch": 0.11985378991919969, + "flos": 1454298147840.0, + "grad_norm": 0.011779966077399251, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78611839, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 1.23046875, + "step": 623, + "time_per_iteration": 4.88221549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.06291461, + "epoch": 0.12004617160446325, + "flos": 528898718208.0, + "grad_norm": 0.0243802584204396, + "language_loss": 1.01341891, + "learning_rate": 0.0009788920450172487, + "loss": 1.0252955, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 1.25, + "step": 624, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190724, + "balance_loss_mlp": 1.06655562, + "epoch": 0.12023855328972682, + "flos": 475176354816.0, + "grad_norm": 0.025839680970612892, + "language_loss": 0.99598378, + "learning_rate": 0.0009788023872308875, + "loss": 1.00789118, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 1.24414062, + "step": 625, + "time_per_iteration": 2.5168616771698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_mlp": 1.06723785, + "epoch": 0.12043093497499038, + "flos": 1535051880960.0, + "grad_norm": 0.008994278182213968, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76618505, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 1.22460938, + "step": 626, + "time_per_iteration": 4.739393472671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194547, + "balance_loss_mlp": 1.07128501, + "epoch": 0.12062331666025394, + "flos": 540914459136.0, + "grad_norm": 0.025390703641747513, + "language_loss": 1.01758838, + "learning_rate": 0.0009786225140303285, + "loss": 1.02953386, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 1.23486328, + "step": 627, + "time_per_iteration": 2.627995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_mlp": 1.06683803, + "epoch": 0.1208156983455175, + "flos": 512999496192.0, + "grad_norm": 0.027559316114759484, + "language_loss": 1.00245547, + "learning_rate": 0.0009785322986859634, + "loss": 1.0143609, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 1.23925781, + "step": 628, + "time_per_iteration": 2.657465696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011787, + "balance_loss_mlp": 1.05481803, + "epoch": 0.12100808003078108, + "flos": 597589046784.0, + "grad_norm": 0.024406659961039724, + "language_loss": 1.01031506, + "learning_rate": 0.0009784418975588838, + "loss": 1.02210212, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 1.24121094, + "step": 629, + "time_per_iteration": 2.6953535079956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187008, + "balance_loss_mlp": 1.063555, + "epoch": 0.12120046171604464, + "flos": 524066515968.0, + "grad_norm": 0.02180733694842763, + "language_loss": 0.99517697, + "learning_rate": 0.0009783513106841862, + "loss": 1.00704694, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 1.23681641, + "step": 630, + "time_per_iteration": 2.7234978675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189331, + "balance_loss_mlp": 1.06687927, + "epoch": 0.1213928434013082, + "flos": 1557907057152.0, + "grad_norm": 0.011472153843238986, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77922034, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 1.2265625, + "step": 631, + "time_per_iteration": 4.975109100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184278, + "balance_loss_mlp": 1.06072986, + "epoch": 0.12158522508657175, + "flos": 496387869696.0, + "grad_norm": 0.025959921000511615, + "language_loss": 0.96498066, + "learning_rate": 0.0009781695798326854, + "loss": 0.97682351, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 1.23779297, + "step": 632, + "time_per_iteration": 2.5740485191345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_mlp": 1.0608983, + "epoch": 0.12177760677183531, + "flos": 476589703680.0, + "grad_norm": 0.025554774573744533, + "language_loss": 0.96275663, + "learning_rate": 0.0009780784359264365, + "loss": 0.9746002, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 1.23681641, + "step": 633, + "time_per_iteration": 2.604390859603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_mlp": 1.05543518, + "epoch": 0.12196998845709889, + "flos": 1471784635392.0, + "grad_norm": 0.009598735556444526, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75365245, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 1.21289062, + "step": 634, + "time_per_iteration": 4.757449626922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_mlp": 1.05424869, + "epoch": 0.12216237014236245, + "flos": 587748724224.0, + "grad_norm": 0.021555120902870813, + "language_loss": 0.93822527, + "learning_rate": 0.000977895591329867, + "loss": 0.94999647, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 1.23095703, + "step": 635, + "time_per_iteration": 2.7859792709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_mlp": 1.05851305, + "epoch": 0.12235475182762601, + "flos": 599106455040.0, + "grad_norm": 0.023775729584682537, + "language_loss": 0.96009773, + "learning_rate": 0.000977803890710533, + "loss": 0.97191262, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 1.23193359, + "step": 636, + "time_per_iteration": 2.76069712638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180701, + "balance_loss_mlp": 1.05762947, + "epoch": 0.12254713351288957, + "flos": 498760673280.0, + "grad_norm": 0.024707427516876792, + "language_loss": 1.00440359, + "learning_rate": 0.0009777120045912774, + "loss": 1.01621056, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 1.23291016, + "step": 637, + "time_per_iteration": 2.5980072021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118065, + "balance_loss_mlp": 1.05772126, + "epoch": 0.12273951519815314, + "flos": 606980204544.0, + "grad_norm": 0.02489341207380848, + "language_loss": 0.99891078, + "learning_rate": 0.0009776199330077736, + "loss": 1.01071739, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 1.23144531, + "step": 638, + "time_per_iteration": 2.704040288925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181154, + "balance_loss_mlp": 1.05841601, + "epoch": 0.1229318968834167, + "flos": 598984931328.0, + "grad_norm": 0.02631208797714665, + "language_loss": 1.02141118, + "learning_rate": 0.0009775276759957667, + "loss": 1.03322268, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 1.22949219, + "step": 639, + "time_per_iteration": 2.7442896366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.05700564, + "epoch": 0.12312427856868026, + "flos": 679588942848.0, + "grad_norm": 0.026802425502252814, + "language_loss": 1.01084137, + "learning_rate": 0.0009774352335910745, + "loss": 1.02264071, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 1.23144531, + "step": 640, + "time_per_iteration": 2.8294076919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117918, + "balance_loss_mlp": 1.05625129, + "epoch": 0.12331666025394382, + "flos": 610043218944.0, + "grad_norm": 0.020742791942005383, + "language_loss": 1.02118182, + "learning_rate": 0.000977342605829586, + "loss": 1.03297377, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 1.23144531, + "step": 641, + "time_per_iteration": 2.7078418731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180028, + "balance_loss_mlp": 1.05748129, + "epoch": 0.12350904193920739, + "flos": 763840118784.0, + "grad_norm": 0.025027209312251563, + "language_loss": 0.94737858, + "learning_rate": 0.0009772497927472623, + "loss": 0.95917892, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 1.22753906, + "step": 642, + "time_per_iteration": 3.0655579566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177096, + "balance_loss_mlp": 1.05454898, + "epoch": 0.12370142362447095, + "flos": 542049831936.0, + "grad_norm": 0.02608476880613399, + "language_loss": 0.96273685, + "learning_rate": 0.0009771567943801368, + "loss": 0.97450781, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 1.22753906, + "step": 643, + "time_per_iteration": 2.7343406677246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179725, + "balance_loss_mlp": 1.05727291, + "epoch": 0.12389380530973451, + "flos": 549252836352.0, + "grad_norm": 0.02435000122960196, + "language_loss": 0.99357152, + "learning_rate": 0.0009770636107643152, + "loss": 1.00536871, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 1.2265625, + "step": 644, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_mlp": 1.05516136, + "epoch": 0.12408618699499807, + "flos": 541352890368.0, + "grad_norm": 0.02246298440278387, + "language_loss": 0.95392644, + "learning_rate": 0.0009769702419359738, + "loss": 0.96570063, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 1.22460938, + "step": 645, + "time_per_iteration": 2.674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.05904841, + "epoch": 0.12427856868026164, + "flos": 747159361536.0, + "grad_norm": 0.023095982047370255, + "language_loss": 0.97586024, + "learning_rate": 0.000976876687931362, + "loss": 0.98767477, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 1.22607422, + "step": 646, + "time_per_iteration": 2.9833688735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189298, + "balance_loss_mlp": 1.06703711, + "epoch": 0.1244709503655252, + "flos": 534744769536.0, + "grad_norm": 0.03060863164707411, + "language_loss": 0.94044995, + "learning_rate": 0.0009767829487868005, + "loss": 0.95234299, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 1.22460938, + "step": 647, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182997, + "balance_loss_mlp": 1.06073558, + "epoch": 0.12466333205078876, + "flos": 509111285760.0, + "grad_norm": 0.028982594733012217, + "language_loss": 0.98960567, + "learning_rate": 0.000976689024538682, + "loss": 1.00143564, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 1.22460938, + "step": 648, + "time_per_iteration": 2.5837948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183924, + "balance_loss_mlp": 1.06171107, + "epoch": 0.12485571373605232, + "flos": 682639222272.0, + "grad_norm": 0.03213416167398649, + "language_loss": 0.97804081, + "learning_rate": 0.0009765949152234716, + "loss": 0.98988008, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 1.22412109, + "step": 649, + "time_per_iteration": 2.876009702682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_mlp": 1.07243347, + "epoch": 0.1250480954213159, + "flos": 1333198748160.0, + "grad_norm": 0.014891788740719425, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79879445, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 1.2109375, + "step": 650, + "time_per_iteration": 4.675558805465698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_mlp": 1.06152093, + "epoch": 0.12524047710657946, + "flos": 940196754432.0, + "grad_norm": 0.027794334398077363, + "language_loss": 0.91408408, + "learning_rate": 0.0009764061415379919, + "loss": 0.9259119, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 1.21435547, + "step": 651, + "time_per_iteration": 3.260758399963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184193, + "balance_loss_mlp": 1.06288576, + "epoch": 0.12543285879184302, + "flos": 514900941312.0, + "grad_norm": 0.027655948956122736, + "language_loss": 0.97430605, + "learning_rate": 0.0009763114772410109, + "loss": 0.986148, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 1.21484375, + "step": 652, + "time_per_iteration": 2.60402512550354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179616, + "balance_loss_mlp": 1.05849957, + "epoch": 0.12562524047710658, + "flos": 719682829824.0, + "grad_norm": 0.022040452281994895, + "language_loss": 0.94100869, + "learning_rate": 0.0009762166280235146, + "loss": 0.95280486, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 1.21289062, + "step": 653, + "time_per_iteration": 2.953866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177042, + "balance_loss_mlp": 1.05592513, + "epoch": 0.12581762216237014, + "flos": 564798220800.0, + "grad_norm": 0.026345633512325176, + "language_loss": 0.96725851, + "learning_rate": 0.0009761215939223267, + "loss": 0.97902894, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 1.21289062, + "step": 654, + "time_per_iteration": 2.6936216354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176243, + "balance_loss_mlp": 1.0553174, + "epoch": 0.1260100038476337, + "flos": 482900382720.0, + "grad_norm": 0.0302310026354778, + "language_loss": 0.97697163, + "learning_rate": 0.0009760263749743428, + "loss": 0.98873413, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 1.2109375, + "step": 655, + "time_per_iteration": 2.5425992012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173716, + "balance_loss_mlp": 1.05302835, + "epoch": 0.12620238553289725, + "flos": 576701170176.0, + "grad_norm": 0.026173940013352312, + "language_loss": 0.96703827, + "learning_rate": 0.0009759309712165299, + "loss": 0.97877538, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 1.20849609, + "step": 656, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182641, + "balance_loss_mlp": 1.06185794, + "epoch": 0.12639476721816084, + "flos": 532185314304.0, + "grad_norm": 0.024272217680215723, + "language_loss": 1.00863099, + "learning_rate": 0.0009758353826859272, + "loss": 1.02045751, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 1.20947266, + "step": 657, + "time_per_iteration": 2.621317148208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183764, + "balance_loss_mlp": 1.06288576, + "epoch": 0.1265871489034244, + "flos": 691231380480.0, + "grad_norm": 0.02639198012969831, + "language_loss": 0.9913975, + "learning_rate": 0.0009757396094196456, + "loss": 1.00323522, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 1.21044922, + "step": 658, + "time_per_iteration": 2.8867759704589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183942, + "balance_loss_mlp": 1.06311166, + "epoch": 0.12677953058868796, + "flos": 538242212352.0, + "grad_norm": 0.02343039495549204, + "language_loss": 0.91435432, + "learning_rate": 0.0009756436514548673, + "loss": 0.92619371, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 1.20996094, + "step": 659, + "time_per_iteration": 2.8055155277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179962, + "balance_loss_mlp": 1.05903614, + "epoch": 0.12697191227395152, + "flos": 520119908352.0, + "grad_norm": 0.02147737158217614, + "language_loss": 0.94944704, + "learning_rate": 0.0009755475088288466, + "loss": 0.96124667, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 1.2109375, + "step": 660, + "time_per_iteration": 2.713801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179144, + "balance_loss_mlp": 1.05826533, + "epoch": 0.12716429395921508, + "flos": 567665851392.0, + "grad_norm": 0.026687699897107686, + "language_loss": 0.99289566, + "learning_rate": 0.0009754511815789095, + "loss": 1.00468707, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 1.21044922, + "step": 661, + "time_per_iteration": 2.739250898361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176549, + "balance_loss_mlp": 1.05590951, + "epoch": 0.12735667564447864, + "flos": 515141987328.0, + "grad_norm": 0.028028480179563667, + "language_loss": 0.94950283, + "learning_rate": 0.0009753546697424533, + "loss": 0.96126837, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 1.20800781, + "step": 662, + "time_per_iteration": 2.71746826171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180242, + "balance_loss_mlp": 1.05941188, + "epoch": 0.1275490573297422, + "flos": 542321077248.0, + "grad_norm": 0.02443290319898258, + "language_loss": 0.98755229, + "learning_rate": 0.0009752579733569475, + "loss": 0.99935466, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 1.20996094, + "step": 663, + "time_per_iteration": 2.631284713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06030273, + "epoch": 0.12774143901500576, + "flos": 1562024853504.0, + "grad_norm": 0.010147906106003043, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76060903, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 1.19335938, + "step": 664, + "time_per_iteration": 4.941519260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188286, + "balance_loss_mlp": 1.06783676, + "epoch": 0.12793382070026935, + "flos": 614873419776.0, + "grad_norm": 0.028758292375382164, + "language_loss": 1.00255466, + "learning_rate": 0.0009750640270890217, + "loss": 1.01443744, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 1.20605469, + "step": 665, + "time_per_iteration": 2.7382516860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185033, + "balance_loss_mlp": 1.06458378, + "epoch": 0.1281262023855329, + "flos": 709117367808.0, + "grad_norm": 0.02727882395737353, + "language_loss": 1.05972624, + "learning_rate": 0.0009749667772818983, + "loss": 1.0715766, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 1.20605469, + "step": 666, + "time_per_iteration": 2.961103677749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117968, + "balance_loss_mlp": 1.06104279, + "epoch": 0.12831858407079647, + "flos": 1428182572032.0, + "grad_norm": 0.005713660367986308, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78115624, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 1.1875, + "step": 667, + "time_per_iteration": 4.799788475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180825, + "balance_loss_mlp": 1.06056714, + "epoch": 0.12851096575606002, + "flos": 450018232320.0, + "grad_norm": 0.027450705632443572, + "language_loss": 1.04045725, + "learning_rate": 0.0009747717245101093, + "loss": 1.05226541, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 1.20410156, + "step": 668, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181103, + "balance_loss_mlp": 1.0609405, + "epoch": 0.12870334744132358, + "flos": 480909614592.0, + "grad_norm": 0.024743463193645603, + "language_loss": 0.94192064, + "learning_rate": 0.00097467392162117, + "loss": 0.95373166, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 1.203125, + "step": 669, + "time_per_iteration": 2.6341683864593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176215, + "balance_loss_mlp": 1.05609953, + "epoch": 0.12889572912658714, + "flos": 640151064576.0, + "grad_norm": 0.020470833753638586, + "language_loss": 0.98179239, + "learning_rate": 0.0009745759344474708, + "loss": 0.99355447, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 1.20263672, + "step": 670, + "time_per_iteration": 2.8753654956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175464, + "balance_loss_mlp": 1.05530083, + "epoch": 0.1290881108118507, + "flos": 510954333696.0, + "grad_norm": 0.02496408481001148, + "language_loss": 0.98669916, + "learning_rate": 0.0009744777630270536, + "loss": 0.99845386, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 1.203125, + "step": 671, + "time_per_iteration": 2.601480484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173739, + "balance_loss_mlp": 1.05381489, + "epoch": 0.12928049249711426, + "flos": 672290611200.0, + "grad_norm": 0.0267777739546368, + "language_loss": 1.0349828, + "learning_rate": 0.000974379407398032, + "loss": 1.04672015, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 1.20068359, + "step": 672, + "time_per_iteration": 2.8746023178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176311, + "balance_loss_mlp": 1.05633891, + "epoch": 0.12947287418237785, + "flos": 794998743552.0, + "grad_norm": 0.021070447178693698, + "language_loss": 0.89884377, + "learning_rate": 0.0009742808675985913, + "loss": 0.91060686, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 1.20117188, + "step": 673, + "time_per_iteration": 3.106855869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178925, + "balance_loss_mlp": 1.05895269, + "epoch": 0.1296652558676414, + "flos": 486447490560.0, + "grad_norm": 0.028552559493613055, + "language_loss": 1.00707459, + "learning_rate": 0.0009741821436669876, + "loss": 1.0188638, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 1.20117188, + "step": 674, + "time_per_iteration": 2.6221611499786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_mlp": 1.06097043, + "epoch": 0.12985763755290497, + "flos": 454392537600.0, + "grad_norm": 0.03163366532216525, + "language_loss": 1.04449701, + "learning_rate": 0.0009740832356415492, + "loss": 1.05630445, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 1.19921875, + "step": 675, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179614, + "balance_loss_mlp": 1.05968916, + "epoch": 0.13005001923816853, + "flos": 826434617856.0, + "grad_norm": 0.02755997498495484, + "language_loss": 0.99148017, + "learning_rate": 0.0009739841435606756, + "loss": 1.00327623, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 1.20068359, + "step": 676, + "time_per_iteration": 3.026420831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180175, + "balance_loss_mlp": 1.06058431, + "epoch": 0.1302424009234321, + "flos": 532480754688.0, + "grad_norm": 0.02275953253130011, + "language_loss": 0.97366607, + "learning_rate": 0.0009738848674628377, + "loss": 0.98546779, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 1.19726562, + "step": 677, + "time_per_iteration": 2.710205554962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179059, + "balance_loss_mlp": 1.05927801, + "epoch": 0.13043478260869565, + "flos": 526916682240.0, + "grad_norm": 0.02441501439452981, + "language_loss": 0.97902691, + "learning_rate": 0.000973785407386578, + "loss": 0.99081755, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 1.19921875, + "step": 678, + "time_per_iteration": 2.7785394191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184892, + "balance_loss_mlp": 1.06553924, + "epoch": 0.1306271642939592, + "flos": 627416914944.0, + "grad_norm": 0.023801085732510874, + "language_loss": 0.94469249, + "learning_rate": 0.0009736857633705103, + "loss": 0.95654142, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 1.19482422, + "step": 679, + "time_per_iteration": 2.8619470596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177483, + "balance_loss_mlp": 1.05827415, + "epoch": 0.13081954597922277, + "flos": 551840489472.0, + "grad_norm": 0.024512943765722366, + "language_loss": 1.01033652, + "learning_rate": 0.0009735859354533196, + "loss": 1.02211142, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 1.19335938, + "step": 680, + "time_per_iteration": 2.6954457759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176387, + "balance_loss_mlp": 1.05755925, + "epoch": 0.13101192766448633, + "flos": 537955504128.0, + "grad_norm": 0.029188130773433643, + "language_loss": 1.02405858, + "learning_rate": 0.0009734859236737628, + "loss": 1.03582239, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 1.18945312, + "step": 681, + "time_per_iteration": 2.606597661972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172364, + "balance_loss_mlp": 1.05353606, + "epoch": 0.13120430934974991, + "flos": 504513398784.0, + "grad_norm": 0.02625319928532985, + "language_loss": 1.02007055, + "learning_rate": 0.0009733857280706678, + "loss": 1.03179431, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 1.18945312, + "step": 682, + "time_per_iteration": 2.626211404800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_mlp": 1.05010605, + "epoch": 0.13139669103501347, + "flos": 615422641152.0, + "grad_norm": 0.025135553656080285, + "language_loss": 0.9321503, + "learning_rate": 0.000973285348682934, + "loss": 0.94383633, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 1.18603516, + "step": 683, + "time_per_iteration": 2.71779727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190269, + "balance_loss_mlp": 1.07296753, + "epoch": 0.13158907272027703, + "flos": 1488215614464.0, + "grad_norm": 0.025067429703540995, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7908864, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 1.17382812, + "step": 684, + "time_per_iteration": 4.811431169509888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168738, + "balance_loss_mlp": 1.05048192, + "epoch": 0.1317814544055406, + "flos": 987117614592.0, + "grad_norm": 0.026136533405527674, + "language_loss": 0.93269205, + "learning_rate": 0.0009730840387095046, + "loss": 0.94437939, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 1.18359375, + "step": 685, + "time_per_iteration": 3.3154938220977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117288, + "balance_loss_mlp": 1.05443382, + "epoch": 0.13197383609080415, + "flos": 612628870656.0, + "grad_norm": 0.026271684435729213, + "language_loss": 0.99177825, + "learning_rate": 0.0009729831082019642, + "loss": 1.00350702, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 1.18554688, + "step": 686, + "time_per_iteration": 2.79620623588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.06093395, + "epoch": 0.1321662177760677, + "flos": 495554668032.0, + "grad_norm": 0.02508782879826625, + "language_loss": 0.97052312, + "learning_rate": 0.0009728819940660958, + "loss": 0.98231786, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 1.18652344, + "step": 687, + "time_per_iteration": 2.779193162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178983, + "balance_loss_mlp": 1.06067955, + "epoch": 0.13235859946133127, + "flos": 496843765248.0, + "grad_norm": 0.02705130625621755, + "language_loss": 0.97550011, + "learning_rate": 0.0009727806963411557, + "loss": 0.98728997, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 1.18408203, + "step": 688, + "time_per_iteration": 2.5702319145202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.05883551, + "epoch": 0.13255098114659483, + "flos": 512767182336.0, + "grad_norm": 0.022910122085290585, + "language_loss": 0.96022904, + "learning_rate": 0.000972679215066471, + "loss": 0.97200048, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 1.18408203, + "step": 689, + "time_per_iteration": 2.64780592918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178761, + "balance_loss_mlp": 1.06050563, + "epoch": 0.13274336283185842, + "flos": 548399442432.0, + "grad_norm": 0.030606528220640358, + "language_loss": 1.08985806, + "learning_rate": 0.0009725775502814401, + "loss": 1.10164571, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 1.18359375, + "step": 690, + "time_per_iteration": 2.5830535888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06120849, + "epoch": 0.13293574451712198, + "flos": 642002844672.0, + "grad_norm": 0.023439513257655937, + "language_loss": 0.94635952, + "learning_rate": 0.0009724757020255327, + "loss": 0.95815468, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 1.18408203, + "step": 691, + "time_per_iteration": 2.827944278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183334, + "balance_loss_mlp": 1.06517375, + "epoch": 0.13312812620238554, + "flos": 492469459968.0, + "grad_norm": 0.028212898490696088, + "language_loss": 0.96836531, + "learning_rate": 0.0009723736703382902, + "loss": 0.98019874, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 1.18261719, + "step": 692, + "time_per_iteration": 2.6144213676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180114, + "balance_loss_mlp": 1.06200123, + "epoch": 0.1333205078876491, + "flos": 509949216768.0, + "grad_norm": 0.023005533645913036, + "language_loss": 0.90654016, + "learning_rate": 0.0009722714552593244, + "loss": 0.91834128, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 1.18212891, + "step": 693, + "time_per_iteration": 2.600128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180549, + "balance_loss_mlp": 1.06262743, + "epoch": 0.13351288957291266, + "flos": 419591477760.0, + "grad_norm": 0.029950659996273835, + "language_loss": 1.05475199, + "learning_rate": 0.000972169056828319, + "loss": 1.06655741, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 1.18017578, + "step": 694, + "time_per_iteration": 2.466643810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178338, + "balance_loss_mlp": 1.0606066, + "epoch": 0.13370527125817622, + "flos": 617050839552.0, + "grad_norm": 0.021764231653516302, + "language_loss": 0.95444119, + "learning_rate": 0.0009720664750850283, + "loss": 0.96622455, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 1.17822266, + "step": 695, + "time_per_iteration": 2.7776308059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173328, + "balance_loss_mlp": 1.05578816, + "epoch": 0.13389765294343978, + "flos": 627169138176.0, + "grad_norm": 0.026088042391715836, + "language_loss": 1.0165019, + "learning_rate": 0.0009719637100692784, + "loss": 1.0282352, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 1.17626953, + "step": 696, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175294, + "balance_loss_mlp": 1.0578016, + "epoch": 0.13409003462870334, + "flos": 610896612864.0, + "grad_norm": 0.027090913840535472, + "language_loss": 0.92017978, + "learning_rate": 0.0009718607618209661, + "loss": 0.93193275, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 1.17578125, + "step": 697, + "time_per_iteration": 2.8413584232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179845, + "balance_loss_mlp": 1.06235278, + "epoch": 0.13428241631396692, + "flos": 685087887360.0, + "grad_norm": 0.024883061853709334, + "language_loss": 0.95573747, + "learning_rate": 0.0009717576303800595, + "loss": 0.96753585, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 1.17578125, + "step": 698, + "time_per_iteration": 3.047100782394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175386, + "balance_loss_mlp": 1.05794048, + "epoch": 0.13447479799923048, + "flos": 509818960896.0, + "grad_norm": 0.024888049065051182, + "language_loss": 0.95325053, + "learning_rate": 0.0009716543157865975, + "loss": 0.96500432, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 1.17529297, + "step": 699, + "time_per_iteration": 2.7481272220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_mlp": 1.05878782, + "epoch": 0.13466717968449404, + "flos": 899058819072.0, + "grad_norm": 0.023872779385430955, + "language_loss": 0.92076075, + "learning_rate": 0.0009715508180806907, + "loss": 0.93252313, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 1.17529297, + "step": 700, + "time_per_iteration": 3.2107367515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173529, + "balance_loss_mlp": 1.05660856, + "epoch": 0.1348595613697576, + "flos": 991694034432.0, + "grad_norm": 0.023513798430807663, + "language_loss": 1.00262749, + "learning_rate": 0.0009714471373025202, + "loss": 1.01436281, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 1.16992188, + "step": 701, + "time_per_iteration": 3.3966751098632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173715, + "balance_loss_mlp": 1.0566988, + "epoch": 0.13505194305502116, + "flos": 488811561984.0, + "grad_norm": 0.028001983236069502, + "language_loss": 0.99373382, + "learning_rate": 0.0009713432734923386, + "loss": 1.00547099, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 1.17089844, + "step": 702, + "time_per_iteration": 2.615107536315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171408, + "balance_loss_mlp": 1.05439234, + "epoch": 0.13524432474028472, + "flos": 614519582208.0, + "grad_norm": 0.024192478681639117, + "language_loss": 0.96606487, + "learning_rate": 0.0009712392266904696, + "loss": 0.97777891, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 1.17089844, + "step": 703, + "time_per_iteration": 2.7448034286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174325, + "balance_loss_mlp": 1.05740499, + "epoch": 0.13543670642554828, + "flos": 906274558464.0, + "grad_norm": 0.025492480769094515, + "language_loss": 0.96012545, + "learning_rate": 0.0009711349969373076, + "loss": 0.97186869, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 1.16992188, + "step": 704, + "time_per_iteration": 3.1337268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172794, + "balance_loss_mlp": 1.05596876, + "epoch": 0.13562908811081184, + "flos": 551747163648.0, + "grad_norm": 0.026772975251671254, + "language_loss": 0.91034031, + "learning_rate": 0.0009710305842733178, + "loss": 0.9220683, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 1.16894531, + "step": 705, + "time_per_iteration": 2.7571139335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_mlp": 1.05031061, + "epoch": 0.1358214697960754, + "flos": 509037425664.0, + "grad_norm": 0.024292049069741084, + "language_loss": 0.98220038, + "learning_rate": 0.0009709259887390373, + "loss": 0.99387223, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 1.16943359, + "step": 706, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168004, + "balance_loss_mlp": 1.05141699, + "epoch": 0.136013851481339, + "flos": 529923300864.0, + "grad_norm": 0.025926611739077732, + "language_loss": 1.00068641, + "learning_rate": 0.0009708212103750737, + "loss": 1.01236641, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 1.16650391, + "step": 707, + "time_per_iteration": 2.6197190284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168587, + "balance_loss_mlp": 1.05219126, + "epoch": 0.13620623316660255, + "flos": 660320532480.0, + "grad_norm": 0.02235622943703988, + "language_loss": 0.96270919, + "learning_rate": 0.0009707162492221051, + "loss": 0.97439504, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 1.16455078, + "step": 708, + "time_per_iteration": 2.8917648792266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171818, + "balance_loss_mlp": 1.05542207, + "epoch": 0.1363986148518661, + "flos": 673082880000.0, + "grad_norm": 0.027649047287573853, + "language_loss": 0.98132068, + "learning_rate": 0.0009706111053208815, + "loss": 0.99303889, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 1.16455078, + "step": 709, + "time_per_iteration": 2.7827165126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173191, + "balance_loss_mlp": 1.05669987, + "epoch": 0.13659099653712967, + "flos": 474004051968.0, + "grad_norm": 0.02773643003805471, + "language_loss": 0.94597077, + "learning_rate": 0.0009705057787122232, + "loss": 0.9577027, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 1.16552734, + "step": 710, + "time_per_iteration": 2.542836904525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169067, + "balance_loss_mlp": 1.05286229, + "epoch": 0.13678337822239323, + "flos": 453647932416.0, + "grad_norm": 0.0248615327032158, + "language_loss": 0.9884814, + "learning_rate": 0.0009704002694370216, + "loss": 1.00017214, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 1.16259766, + "step": 711, + "time_per_iteration": 2.550527811050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164533, + "balance_loss_mlp": 1.04842281, + "epoch": 0.13697575990765679, + "flos": 520625468928.0, + "grad_norm": 0.0274811578413112, + "language_loss": 0.97066599, + "learning_rate": 0.0009702945775362388, + "loss": 0.98231125, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 1.16162109, + "step": 712, + "time_per_iteration": 2.56953501701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116862, + "balance_loss_mlp": 1.05246294, + "epoch": 0.13716814159292035, + "flos": 481365510144.0, + "grad_norm": 0.025544817797380492, + "language_loss": 0.98621845, + "learning_rate": 0.0009701887030509086, + "loss": 0.99790466, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 1.16210938, + "step": 713, + "time_per_iteration": 2.6443872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_mlp": 1.05663013, + "epoch": 0.1373605232781839, + "flos": 546749776896.0, + "grad_norm": 0.02672517687154734, + "language_loss": 1.02031791, + "learning_rate": 0.0009700826460221346, + "loss": 1.03204811, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 1.16455078, + "step": 714, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_mlp": 1.05508566, + "epoch": 0.1375529049634475, + "flos": 710070091776.0, + "grad_norm": 0.027473841831572973, + "language_loss": 1.03736091, + "learning_rate": 0.0009699764064910921, + "loss": 1.04907441, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 1.16308594, + "step": 715, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_mlp": 1.05281401, + "epoch": 0.13774528664871105, + "flos": 487676189184.0, + "grad_norm": 0.02500038679906112, + "language_loss": 0.96403199, + "learning_rate": 0.0009698699844990268, + "loss": 0.9757241, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 1.16455078, + "step": 716, + "time_per_iteration": 2.638272762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116569, + "balance_loss_mlp": 1.04972363, + "epoch": 0.1379376683339746, + "flos": 681458187264.0, + "grad_norm": 0.024933229917961583, + "language_loss": 0.9565106, + "learning_rate": 0.0009697633800872555, + "loss": 0.96816742, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 1.16015625, + "step": 717, + "time_per_iteration": 2.8989553451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168974, + "balance_loss_mlp": 1.05310297, + "epoch": 0.13813005001923817, + "flos": 612225368064.0, + "grad_norm": 0.02330012063083705, + "language_loss": 1.0130372, + "learning_rate": 0.0009696565932971655, + "loss": 1.02472687, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 1.15917969, + "step": 718, + "time_per_iteration": 2.8472671508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171117, + "balance_loss_mlp": 1.05524576, + "epoch": 0.13832243170450173, + "flos": 589926144000.0, + "grad_norm": 0.027418468702626427, + "language_loss": 0.98498988, + "learning_rate": 0.0009695496241702153, + "loss": 0.99670106, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 1.15917969, + "step": 719, + "time_per_iteration": 2.786895990371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167345, + "balance_loss_mlp": 1.05180764, + "epoch": 0.1385148133897653, + "flos": 701319479808.0, + "grad_norm": 0.026285913371991803, + "language_loss": 0.94868541, + "learning_rate": 0.0009694424727479339, + "loss": 0.96035892, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 1.15576172, + "step": 720, + "time_per_iteration": 2.921644926071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117298, + "balance_loss_mlp": 1.05729949, + "epoch": 0.13870719507502885, + "flos": 599366966784.0, + "grad_norm": 0.024279001882637877, + "language_loss": 0.97845113, + "learning_rate": 0.0009693351390719213, + "loss": 0.99018097, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 1.15722656, + "step": 721, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168632, + "balance_loss_mlp": 1.05304694, + "epoch": 0.1388995767602924, + "flos": 587748724224.0, + "grad_norm": 0.03212240351747381, + "language_loss": 0.98596126, + "learning_rate": 0.000969227623183848, + "loss": 0.99764758, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 1.15625, + "step": 722, + "time_per_iteration": 2.7723541259765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_mlp": 1.05205071, + "epoch": 0.139091958445556, + "flos": 652362189312.0, + "grad_norm": 0.025655198862846312, + "language_loss": 0.99224544, + "learning_rate": 0.0009691199251254554, + "loss": 1.00392079, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 1.15527344, + "step": 723, + "time_per_iteration": 2.8426058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165537, + "balance_loss_mlp": 1.05019021, + "epoch": 0.13928434013081956, + "flos": 576905286144.0, + "grad_norm": 0.022500478429048027, + "language_loss": 0.9243086, + "learning_rate": 0.0009690120449385555, + "loss": 0.93596393, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 1.15380859, + "step": 724, + "time_per_iteration": 2.7558276653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168709, + "balance_loss_mlp": 1.05307627, + "epoch": 0.13947672181608312, + "flos": 564314127360.0, + "grad_norm": 0.02294482348940274, + "language_loss": 1.00981367, + "learning_rate": 0.0009689039826650312, + "loss": 1.02150071, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 1.15673828, + "step": 725, + "time_per_iteration": 2.784708261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211281, + "balance_loss_mlp": 1.09550476, + "epoch": 0.13966910350134668, + "flos": 1524949045248.0, + "grad_norm": 0.02639881420994122, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77734339, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 1.15820312, + "step": 726, + "time_per_iteration": 4.9523255825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171441, + "balance_loss_mlp": 1.05604661, + "epoch": 0.13986148518661023, + "flos": 500855500800.0, + "grad_norm": 0.0321160389091748, + "language_loss": 0.98954523, + "learning_rate": 0.0009686873120259941, + "loss": 1.00125957, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 1.15429688, + "step": 727, + "time_per_iteration": 2.584141731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173326, + "balance_loss_mlp": 1.05850363, + "epoch": 0.1400538668718738, + "flos": 599849058816.0, + "grad_norm": 0.027531106684590426, + "language_loss": 0.93834305, + "learning_rate": 0.0009685787037446004, + "loss": 0.95007634, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 1.1484375, + "step": 728, + "time_per_iteration": 2.770592451095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_mlp": 1.05520177, + "epoch": 0.14024624855713735, + "flos": 595168579584.0, + "grad_norm": 0.026051179565135866, + "language_loss": 0.98294961, + "learning_rate": 0.0009684699135448201, + "loss": 0.99465179, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 1.15039062, + "step": 729, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_mlp": 1.04985154, + "epoch": 0.1404386302424009, + "flos": 507585145344.0, + "grad_norm": 0.02205061924934426, + "language_loss": 0.98307908, + "learning_rate": 0.0009683609414688895, + "loss": 0.99472773, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 1.15039062, + "step": 730, + "time_per_iteration": 2.700016975402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.05254078, + "epoch": 0.14063101192766447, + "flos": 574515018240.0, + "grad_norm": 0.021243768346974407, + "language_loss": 0.95329058, + "learning_rate": 0.0009682517875591154, + "loss": 0.96496415, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 1.1484375, + "step": 731, + "time_per_iteration": 2.743590831756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.05264843, + "epoch": 0.14082339361292806, + "flos": 565764406272.0, + "grad_norm": 0.02284757167221282, + "language_loss": 0.93998873, + "learning_rate": 0.0009681424518578749, + "loss": 0.95166153, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 1.14648438, + "step": 732, + "time_per_iteration": 2.757690668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166596, + "balance_loss_mlp": 1.05215514, + "epoch": 0.14101577529819162, + "flos": 464582694912.0, + "grad_norm": 0.02112517179619274, + "language_loss": 0.95363593, + "learning_rate": 0.000968032934407616, + "loss": 0.96530199, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 1.14453125, + "step": 733, + "time_per_iteration": 2.6260647773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_mlp": 1.05257201, + "epoch": 0.14120815698345518, + "flos": 597261405696.0, + "grad_norm": 0.02235342076428548, + "language_loss": 0.90822989, + "learning_rate": 0.0009679232352508571, + "loss": 0.91990006, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 1.14453125, + "step": 734, + "time_per_iteration": 2.7677996158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167689, + "balance_loss_mlp": 1.05334342, + "epoch": 0.14140053866871874, + "flos": 536231978496.0, + "grad_norm": 0.023954026934244203, + "language_loss": 0.90350544, + "learning_rate": 0.0009678133544301871, + "loss": 0.91518235, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 1.14355469, + "step": 735, + "time_per_iteration": 2.6668286323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165912, + "balance_loss_mlp": 1.05147135, + "epoch": 0.1415929203539823, + "flos": 521276748288.0, + "grad_norm": 0.01836780541558419, + "language_loss": 0.98091269, + "learning_rate": 0.0009677032919882658, + "loss": 0.99257177, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 1.14453125, + "step": 736, + "time_per_iteration": 2.654975652694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_mlp": 1.0601368, + "epoch": 0.14178530203924586, + "flos": 483301883904.0, + "grad_norm": 0.025248480485652293, + "language_loss": 1.00008237, + "learning_rate": 0.000967593047967823, + "loss": 1.01183295, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 1.14941406, + "step": 737, + "time_per_iteration": 2.529147148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167635, + "balance_loss_mlp": 1.05319452, + "epoch": 0.14197768372450942, + "flos": 677839220736.0, + "grad_norm": 0.02278890168576414, + "language_loss": 0.9561522, + "learning_rate": 0.0009674826224116593, + "loss": 0.96782857, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 1.14453125, + "step": 738, + "time_per_iteration": 2.8032455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.05606639, + "epoch": 0.14217006540977298, + "flos": 446992147968.0, + "grad_norm": 0.026055784762538982, + "language_loss": 0.97800839, + "learning_rate": 0.0009673720153626455, + "loss": 0.989712, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 1.14306641, + "step": 739, + "time_per_iteration": 2.629868984222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172861, + "balance_loss_mlp": 1.05889642, + "epoch": 0.14236244709503657, + "flos": 497477580288.0, + "grad_norm": 0.02475738760241807, + "language_loss": 0.95941108, + "learning_rate": 0.0009672612268637235, + "loss": 0.97113973, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 1.13964844, + "step": 740, + "time_per_iteration": 2.6037824153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170194, + "balance_loss_mlp": 1.05618262, + "epoch": 0.14255482878030012, + "flos": 649479095808.0, + "grad_norm": 0.03387034378547869, + "language_loss": 0.95329261, + "learning_rate": 0.0009671502569579048, + "loss": 0.96499455, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 1.14013672, + "step": 741, + "time_per_iteration": 2.7700846195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.05657792, + "epoch": 0.14274721046556368, + "flos": 537274025472.0, + "grad_norm": 0.02433568326488268, + "language_loss": 0.98081231, + "learning_rate": 0.0009670391056882719, + "loss": 0.99251777, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 1.13964844, + "step": 742, + "time_per_iteration": 2.696019172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174036, + "balance_loss_mlp": 1.06002402, + "epoch": 0.14293959215082724, + "flos": 958583572992.0, + "grad_norm": 0.027423351639808666, + "language_loss": 0.96458268, + "learning_rate": 0.0009669277730979776, + "loss": 0.97632295, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 1.14013672, + "step": 743, + "time_per_iteration": 3.2084367275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174905, + "balance_loss_mlp": 1.06103587, + "epoch": 0.1431319738360908, + "flos": 694385719296.0, + "grad_norm": 0.02304461389980259, + "language_loss": 0.94654781, + "learning_rate": 0.0009668162592302449, + "loss": 0.9582969, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 1.13867188, + "step": 744, + "time_per_iteration": 2.8862292766571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184206, + "balance_loss_mlp": 1.07009852, + "epoch": 0.14332435552135436, + "flos": 566502280704.0, + "grad_norm": 0.024928546312887438, + "language_loss": 0.9473027, + "learning_rate": 0.0009667045641283676, + "loss": 0.95914471, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 1.14111328, + "step": 745, + "time_per_iteration": 2.6714677810668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_mlp": 1.05672932, + "epoch": 0.14351673720661792, + "flos": 739695845376.0, + "grad_norm": 0.027004630074695047, + "language_loss": 1.03854704, + "learning_rate": 0.0009665926878357092, + "loss": 1.05025315, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 1.13867188, + "step": 746, + "time_per_iteration": 2.9414963722229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168037, + "balance_loss_mlp": 1.05416811, + "epoch": 0.14370911889188148, + "flos": 550351279104.0, + "grad_norm": 0.024394803732961844, + "language_loss": 0.99195439, + "learning_rate": 0.0009664806303957043, + "loss": 1.00363481, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 1.13867188, + "step": 747, + "time_per_iteration": 2.6798276901245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175063, + "balance_loss_mlp": 1.06109881, + "epoch": 0.14390150057714507, + "flos": 591589271040.0, + "grad_norm": 0.028912253716933817, + "language_loss": 0.96970344, + "learning_rate": 0.0009663683918518571, + "loss": 0.98145401, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 1.13964844, + "step": 748, + "time_per_iteration": 2.894670248031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172034, + "balance_loss_mlp": 1.05845118, + "epoch": 0.14409388226240863, + "flos": 592144496640.0, + "grad_norm": 0.025560266799661176, + "language_loss": 0.96381319, + "learning_rate": 0.0009662559722477428, + "loss": 0.97553355, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 1.13574219, + "step": 749, + "time_per_iteration": 2.702796220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193848, + "balance_loss_mlp": 1.08131409, + "epoch": 0.1442862639476722, + "flos": 1514654828544.0, + "grad_norm": 0.02305864885865106, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77356815, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 1.125, + "step": 750, + "time_per_iteration": 5.010634660720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_mlp": 1.05287659, + "epoch": 0.14447864563293575, + "flos": 497855612928.0, + "grad_norm": 0.023714468612350204, + "language_loss": 0.97989428, + "learning_rate": 0.0009660305900333632, + "loss": 0.99155927, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 1.13623047, + "step": 751, + "time_per_iteration": 2.7064144611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_mlp": 1.05845106, + "epoch": 0.1446710273181993, + "flos": 590794274304.0, + "grad_norm": 0.03190287595859636, + "language_loss": 0.91963172, + "learning_rate": 0.0009659176275105992, + "loss": 0.93135297, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 1.13671875, + "step": 752, + "time_per_iteration": 2.7171401977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171619, + "balance_loss_mlp": 1.05803668, + "epoch": 0.14486340900346287, + "flos": 587012851200.0, + "grad_norm": 0.023715921645424867, + "language_loss": 0.93508279, + "learning_rate": 0.0009658044841025701, + "loss": 0.94679892, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 1.13574219, + "step": 753, + "time_per_iteration": 2.77504563331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172686, + "balance_loss_mlp": 1.05900788, + "epoch": 0.14505579068872643, + "flos": 505740096000.0, + "grad_norm": 0.025730958483317315, + "language_loss": 0.9055903, + "learning_rate": 0.0009656911598532021, + "loss": 0.91731715, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 1.13671875, + "step": 754, + "time_per_iteration": 2.642886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172881, + "balance_loss_mlp": 1.05925071, + "epoch": 0.14524817237399, + "flos": 487815177216.0, + "grad_norm": 0.025261406861214447, + "language_loss": 0.98625988, + "learning_rate": 0.0009655776548064917, + "loss": 0.9979887, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 1.13623047, + "step": 755, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169342, + "balance_loss_mlp": 1.05571139, + "epoch": 0.14544055405925355, + "flos": 729449292288.0, + "grad_norm": 0.025093779151575485, + "language_loss": 0.97407329, + "learning_rate": 0.0009654639690065054, + "loss": 0.98576677, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 1.13623047, + "step": 756, + "time_per_iteration": 2.867976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173831, + "balance_loss_mlp": 1.06024873, + "epoch": 0.14563293574451713, + "flos": 594786544128.0, + "grad_norm": 0.02769433731610086, + "language_loss": 0.96328217, + "learning_rate": 0.00096535010249738, + "loss": 0.97502041, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 1.13574219, + "step": 757, + "time_per_iteration": 2.718595266342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171947, + "balance_loss_mlp": 1.05879402, + "epoch": 0.1458253174297807, + "flos": 561622414848.0, + "grad_norm": 0.027253539371253223, + "language_loss": 0.93671888, + "learning_rate": 0.0009652360553233224, + "loss": 0.94843829, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 1.13134766, + "step": 758, + "time_per_iteration": 2.732665538787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_mlp": 1.06835938, + "epoch": 0.14601769911504425, + "flos": 1561186922496.0, + "grad_norm": 0.016548141494889222, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74954832, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 1.12695312, + "step": 759, + "time_per_iteration": 4.9278404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_mlp": 1.04840457, + "epoch": 0.1462100808003078, + "flos": 867822331392.0, + "grad_norm": 0.024551380524627048, + "language_loss": 0.89752859, + "learning_rate": 0.0009650074191575883, + "loss": 0.90914273, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 1.12988281, + "step": 760, + "time_per_iteration": 3.18084716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011658, + "balance_loss_mlp": 1.05302811, + "epoch": 0.14640246248557137, + "flos": 524029585920.0, + "grad_norm": 0.025729752682943422, + "language_loss": 0.95023656, + "learning_rate": 0.0009648928302546766, + "loss": 0.96189463, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 1.12744141, + "step": 761, + "time_per_iteration": 2.707385301589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161728, + "balance_loss_mlp": 1.04895639, + "epoch": 0.14659484417083493, + "flos": 1032241089024.0, + "grad_norm": 0.022974522077421757, + "language_loss": 0.94352418, + "learning_rate": 0.0009647780608643613, + "loss": 0.95514143, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 1.12744141, + "step": 762, + "time_per_iteration": 3.357776165008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116078, + "balance_loss_mlp": 1.04848516, + "epoch": 0.1467872258560985, + "flos": 501656501760.0, + "grad_norm": 0.027279773355913427, + "language_loss": 0.99627388, + "learning_rate": 0.0009646631110312001, + "loss": 1.00788176, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 1.12255859, + "step": 763, + "time_per_iteration": 2.629650115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159049, + "balance_loss_mlp": 1.04665887, + "epoch": 0.14697960754136205, + "flos": 548935928832.0, + "grad_norm": 0.020644179018096606, + "language_loss": 0.95446718, + "learning_rate": 0.0009645479807998203, + "loss": 0.96605766, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 1.12353516, + "step": 764, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_mlp": 1.04510117, + "epoch": 0.14717198922662564, + "flos": 518901943296.0, + "grad_norm": 0.021535065255329562, + "language_loss": 0.99812603, + "learning_rate": 0.0009644326702149196, + "loss": 1.00970435, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 1.12695312, + "step": 765, + "time_per_iteration": 2.711500406265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158907, + "balance_loss_mlp": 1.04618227, + "epoch": 0.1473643709118892, + "flos": 733483221504.0, + "grad_norm": 0.02504361772442387, + "language_loss": 0.95452881, + "learning_rate": 0.0009643171793212653, + "loss": 0.96611786, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 1.12695312, + "step": 766, + "time_per_iteration": 3.130798578262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163931, + "balance_loss_mlp": 1.05115891, + "epoch": 0.14755675259715276, + "flos": 621668192256.0, + "grad_norm": 0.027740201354691706, + "language_loss": 0.99870968, + "learning_rate": 0.0009642015081636952, + "loss": 1.01034904, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 1.12744141, + "step": 767, + "time_per_iteration": 2.701939344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160055, + "balance_loss_mlp": 1.04761696, + "epoch": 0.14774913428241632, + "flos": 453172571136.0, + "grad_norm": 0.025159341457135456, + "language_loss": 0.98449206, + "learning_rate": 0.0009640856567871166, + "loss": 0.99609256, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 1.12402344, + "step": 768, + "time_per_iteration": 2.516721725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_mlp": 1.05262613, + "epoch": 0.14794151596767988, + "flos": 838654474752.0, + "grad_norm": 0.02612823197324643, + "language_loss": 0.99416363, + "learning_rate": 0.0009639696252365072, + "loss": 1.00581241, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 1.12207031, + "step": 769, + "time_per_iteration": 3.06074857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167068, + "balance_loss_mlp": 1.05472481, + "epoch": 0.14813389765294344, + "flos": 687404295168.0, + "grad_norm": 0.02602975967937929, + "language_loss": 0.89651555, + "learning_rate": 0.0009638534135569144, + "loss": 0.90818626, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 1.12304688, + "step": 770, + "time_per_iteration": 2.9440436363220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169876, + "balance_loss_mlp": 1.05753326, + "epoch": 0.148326279338207, + "flos": 510943600128.0, + "grad_norm": 0.028093178265757666, + "language_loss": 1.01150489, + "learning_rate": 0.0009637370217934554, + "loss": 1.02320373, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 1.12304688, + "step": 771, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166681, + "balance_loss_mlp": 1.05443311, + "epoch": 0.14851866102347056, + "flos": 589331260416.0, + "grad_norm": 0.028336871459981, + "language_loss": 0.90924722, + "learning_rate": 0.0009636204499913175, + "loss": 0.92091405, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 1.12207031, + "step": 772, + "time_per_iteration": 2.8592941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157961, + "balance_loss_mlp": 1.04609525, + "epoch": 0.14871104270873411, + "flos": 692247230976.0, + "grad_norm": 0.030313888046816524, + "language_loss": 0.95830965, + "learning_rate": 0.0009635036981957581, + "loss": 0.96988928, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 1.11816406, + "step": 773, + "time_per_iteration": 2.8690600395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160765, + "balance_loss_mlp": 1.04904246, + "epoch": 0.1489034243939977, + "flos": 656282600448.0, + "grad_norm": 0.02808100337337059, + "language_loss": 0.98035401, + "learning_rate": 0.0009633867664521043, + "loss": 0.99196172, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 1.11669922, + "step": 774, + "time_per_iteration": 2.812833070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159463, + "balance_loss_mlp": 1.04788363, + "epoch": 0.14909580607926126, + "flos": 476795821056.0, + "grad_norm": 0.030787585825694654, + "language_loss": 0.97385693, + "learning_rate": 0.0009632696548057527, + "loss": 0.98545158, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 1.11523438, + "step": 775, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_mlp": 1.04910243, + "epoch": 0.14928818776452482, + "flos": 612283765248.0, + "grad_norm": 0.030552265213122824, + "language_loss": 0.94746792, + "learning_rate": 0.0009631523633021704, + "loss": 0.95907569, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 1.11621094, + "step": 776, + "time_per_iteration": 2.789336919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.04408133, + "epoch": 0.14948056944978838, + "flos": 562916241408.0, + "grad_norm": 0.02653866309736765, + "language_loss": 0.98006344, + "learning_rate": 0.0009630348919868936, + "loss": 0.99161637, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 1.11132812, + "step": 777, + "time_per_iteration": 2.708918571472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115506, + "balance_loss_mlp": 1.04395676, + "epoch": 0.14967295113505194, + "flos": 450111558144.0, + "grad_norm": 0.02761804701826243, + "language_loss": 0.92444694, + "learning_rate": 0.0009629172409055293, + "loss": 0.93599755, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 1.11035156, + "step": 778, + "time_per_iteration": 2.522322177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_mlp": 1.0435462, + "epoch": 0.1498653328203155, + "flos": 572428922880.0, + "grad_norm": 0.02112796064723151, + "language_loss": 0.9446094, + "learning_rate": 0.0009627994101037531, + "loss": 0.9561559, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 1.11035156, + "step": 779, + "time_per_iteration": 2.7606184482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154399, + "balance_loss_mlp": 1.0433439, + "epoch": 0.15005771450557906, + "flos": 632407570944.0, + "grad_norm": 0.02232887996041627, + "language_loss": 0.98232067, + "learning_rate": 0.0009626813996273114, + "loss": 0.99386466, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 1.10986328, + "step": 780, + "time_per_iteration": 2.8442463874816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_mlp": 1.04553461, + "epoch": 0.15025009619084262, + "flos": 579165298176.0, + "grad_norm": 0.021576328362923832, + "language_loss": 0.96611506, + "learning_rate": 0.0009625632095220198, + "loss": 0.97768044, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 1.109375, + "step": 781, + "time_per_iteration": 2.823941469192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156174, + "balance_loss_mlp": 1.04492784, + "epoch": 0.1504424778761062, + "flos": 484856222208.0, + "grad_norm": 0.023769174200548453, + "language_loss": 0.96595448, + "learning_rate": 0.0009624448398337637, + "loss": 0.97751617, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 1.11181641, + "step": 782, + "time_per_iteration": 2.517115354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153917, + "balance_loss_mlp": 1.04286146, + "epoch": 0.15063485956136977, + "flos": 763894513152.0, + "grad_norm": 0.022118467112767815, + "language_loss": 0.97773027, + "learning_rate": 0.0009623262906084984, + "loss": 0.98926944, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 1.10986328, + "step": 783, + "time_per_iteration": 2.9971072673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156171, + "balance_loss_mlp": 1.04554462, + "epoch": 0.15082724124663333, + "flos": 498676079616.0, + "grad_norm": 0.021733375764601555, + "language_loss": 0.99047554, + "learning_rate": 0.0009622075618922486, + "loss": 1.00203729, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 1.10546875, + "step": 784, + "time_per_iteration": 2.7209272384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161923, + "balance_loss_mlp": 1.05110586, + "epoch": 0.15101962293189689, + "flos": 510722019840.0, + "grad_norm": 0.02414763506099098, + "language_loss": 0.95223093, + "learning_rate": 0.0009620886537311091, + "loss": 0.96385014, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 1.10742188, + "step": 785, + "time_per_iteration": 2.668501138687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154688, + "balance_loss_mlp": 1.04406226, + "epoch": 0.15121200461716044, + "flos": 458701714944.0, + "grad_norm": 0.026890312379790088, + "language_loss": 0.97208995, + "learning_rate": 0.000961969566171244, + "loss": 0.98363686, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 1.10546875, + "step": 786, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153217, + "balance_loss_mlp": 1.04278123, + "epoch": 0.151404386302424, + "flos": 539017016832.0, + "grad_norm": 0.02528800532756524, + "language_loss": 1.00058115, + "learning_rate": 0.0009618502992588873, + "loss": 1.01211333, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 1.10351562, + "step": 787, + "time_per_iteration": 2.6463584899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154208, + "balance_loss_mlp": 1.04358232, + "epoch": 0.15159676798768756, + "flos": 689616643584.0, + "grad_norm": 0.023869082053813537, + "language_loss": 0.98612797, + "learning_rate": 0.0009617308530403424, + "loss": 0.99766994, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 1.10546875, + "step": 788, + "time_per_iteration": 3.065110921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158206, + "balance_loss_mlp": 1.04758012, + "epoch": 0.15178914967295112, + "flos": 546432869376.0, + "grad_norm": 0.025092696297707027, + "language_loss": 0.95288265, + "learning_rate": 0.0009616112275619825, + "loss": 0.96446472, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 1.10546875, + "step": 789, + "time_per_iteration": 2.7197253704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_mlp": 1.0478847, + "epoch": 0.1519815313582147, + "flos": 512814845952.0, + "grad_norm": 0.020890571468345706, + "language_loss": 0.90545368, + "learning_rate": 0.0009614914228702503, + "loss": 0.91703737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 1.10400391, + "step": 790, + "time_per_iteration": 2.6894142627716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158071, + "balance_loss_mlp": 1.04782641, + "epoch": 0.15217391304347827, + "flos": 685457187840.0, + "grad_norm": 0.02448742031060442, + "language_loss": 0.96480352, + "learning_rate": 0.0009613714390116581, + "loss": 0.97638422, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 1.1015625, + "step": 791, + "time_per_iteration": 2.9898860454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155788, + "balance_loss_mlp": 1.04568636, + "epoch": 0.15236629472874183, + "flos": 645445893120.0, + "grad_norm": 0.023088199171654812, + "language_loss": 0.93995309, + "learning_rate": 0.0009612512760327879, + "loss": 0.95151103, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 1.10009766, + "step": 792, + "time_per_iteration": 2.855648994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154532, + "balance_loss_mlp": 1.0444783, + "epoch": 0.1525586764140054, + "flos": 413764892160.0, + "grad_norm": 0.024948238648346503, + "language_loss": 0.97790802, + "learning_rate": 0.0009611309339802909, + "loss": 0.98945332, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 1.09960938, + "step": 793, + "time_per_iteration": 2.4684345722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153777, + "balance_loss_mlp": 1.04372334, + "epoch": 0.15275105809926895, + "flos": 804233448960.0, + "grad_norm": 0.02131820977076166, + "language_loss": 0.93039513, + "learning_rate": 0.0009610104129008881, + "loss": 0.94193292, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 1.09960938, + "step": 794, + "time_per_iteration": 3.1013269424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155691, + "balance_loss_mlp": 1.04554129, + "epoch": 0.1529434397845325, + "flos": 613542663168.0, + "grad_norm": 0.024012716250022468, + "language_loss": 0.97966266, + "learning_rate": 0.0009608897128413701, + "loss": 0.99121952, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 1.10058594, + "step": 795, + "time_per_iteration": 2.729837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154149, + "balance_loss_mlp": 1.04419053, + "epoch": 0.15313582146979607, + "flos": 616471418880.0, + "grad_norm": 0.02134077894827986, + "language_loss": 0.93399352, + "learning_rate": 0.0009607688338485965, + "loss": 0.945535, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 1.09863281, + "step": 796, + "time_per_iteration": 2.8517422676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04409015, + "epoch": 0.15332820315505963, + "flos": 794992012800.0, + "grad_norm": 0.02204541106277596, + "language_loss": 0.98951191, + "learning_rate": 0.0009606477759694969, + "loss": 1.00104761, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 1.09375, + "step": 797, + "time_per_iteration": 3.0313384532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153537, + "balance_loss_mlp": 1.0440551, + "epoch": 0.1535205848403232, + "flos": 551256339456.0, + "grad_norm": 0.028291975879130113, + "language_loss": 0.99155664, + "learning_rate": 0.0009605265392510703, + "loss": 1.00309205, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 1.09375, + "step": 798, + "time_per_iteration": 2.6558592319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150991, + "balance_loss_mlp": 1.04122281, + "epoch": 0.15371296652558677, + "flos": 536978585088.0, + "grad_norm": 0.02676367025649214, + "language_loss": 1.00762391, + "learning_rate": 0.0009604051237403846, + "loss": 1.01913381, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 1.09667969, + "step": 799, + "time_per_iteration": 2.6129424571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151653, + "balance_loss_mlp": 1.04198015, + "epoch": 0.15390534821085033, + "flos": 396089751552.0, + "grad_norm": 0.02759928767191203, + "language_loss": 0.9523741, + "learning_rate": 0.0009602835294845776, + "loss": 0.96389061, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 1.09570312, + "step": 800, + "time_per_iteration": 2.4865612983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152453, + "balance_loss_mlp": 1.04297161, + "epoch": 0.1540977298961139, + "flos": 536885259264.0, + "grad_norm": 0.0240348205061721, + "language_loss": 0.99338514, + "learning_rate": 0.0009601617565308565, + "loss": 1.00490952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 1.09375, + "step": 801, + "time_per_iteration": 2.646925449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155144, + "balance_loss_mlp": 1.04551864, + "epoch": 0.15429011158137745, + "flos": 725090449920.0, + "grad_norm": 0.022214532903779557, + "language_loss": 0.94821054, + "learning_rate": 0.0009600398049264977, + "loss": 0.95976186, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 1.09521484, + "step": 802, + "time_per_iteration": 3.0287652015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.04627085, + "epoch": 0.154482493266641, + "flos": 621748783104.0, + "grad_norm": 0.025430739734688717, + "language_loss": 1.02679133, + "learning_rate": 0.0009599176747188469, + "loss": 1.03834927, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 1.09423828, + "step": 803, + "time_per_iteration": 2.8240089416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156206, + "balance_loss_mlp": 1.0467242, + "epoch": 0.15467487495190457, + "flos": 526719297024.0, + "grad_norm": 0.024483654101252486, + "language_loss": 0.90705526, + "learning_rate": 0.0009597953659553196, + "loss": 0.91861731, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 1.09375, + "step": 804, + "time_per_iteration": 2.745878219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153494, + "balance_loss_mlp": 1.04386926, + "epoch": 0.15486725663716813, + "flos": 528759730176.0, + "grad_norm": 0.02516296775651391, + "language_loss": 0.97286022, + "learning_rate": 0.0009596728786833997, + "loss": 0.98439509, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 1.09521484, + "step": 805, + "time_per_iteration": 2.6471030712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_mlp": 1.04244983, + "epoch": 0.1550596383224317, + "flos": 1050278799360.0, + "grad_norm": 0.026563720364072098, + "language_loss": 0.9858942, + "learning_rate": 0.0009595502129506415, + "loss": 0.99741489, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 1.09521484, + "step": 806, + "time_per_iteration": 3.3734352588653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115037, + "balance_loss_mlp": 1.04088783, + "epoch": 0.15525202000769528, + "flos": 614836489728.0, + "grad_norm": 0.02624405223250092, + "language_loss": 0.91745955, + "learning_rate": 0.0009594273688046678, + "loss": 0.92896324, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 1.09375, + "step": 807, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153708, + "balance_loss_mlp": 1.04441667, + "epoch": 0.15544440169295884, + "flos": 534102222336.0, + "grad_norm": 0.028049278390969077, + "language_loss": 0.97350299, + "learning_rate": 0.000959304346293171, + "loss": 0.98504007, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 1.09179688, + "step": 808, + "time_per_iteration": 2.7285830974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164275, + "balance_loss_mlp": 1.05464995, + "epoch": 0.1556367833782224, + "flos": 645886325760.0, + "grad_norm": 0.033021349518653896, + "language_loss": 0.99046445, + "learning_rate": 0.0009591811454639125, + "loss": 1.00210714, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 1.09521484, + "step": 809, + "time_per_iteration": 2.842867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155411, + "balance_loss_mlp": 1.04612005, + "epoch": 0.15582916506348596, + "flos": 544952391168.0, + "grad_norm": 0.02421082053858415, + "language_loss": 0.95793635, + "learning_rate": 0.0009590577663647234, + "loss": 0.96949041, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 1.09179688, + "step": 810, + "time_per_iteration": 2.8207406997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158015, + "balance_loss_mlp": 1.04877126, + "epoch": 0.15602154674874952, + "flos": 581214463488.0, + "grad_norm": 0.022734781081273227, + "language_loss": 0.95110512, + "learning_rate": 0.0009589342090435036, + "loss": 0.96268523, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 1.09130859, + "step": 811, + "time_per_iteration": 2.8413872718811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170356, + "balance_loss_mlp": 1.06068361, + "epoch": 0.15621392843401308, + "flos": 536316572160.0, + "grad_norm": 0.026628933906638022, + "language_loss": 0.97807872, + "learning_rate": 0.0009588104735482223, + "loss": 0.98978221, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 1.09570312, + "step": 812, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164587, + "balance_loss_mlp": 1.05524826, + "epoch": 0.15640631011927664, + "flos": 551981478912.0, + "grad_norm": 0.027865461759282353, + "language_loss": 0.94247007, + "learning_rate": 0.0009586865599269177, + "loss": 0.95411587, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 1.09228516, + "step": 813, + "time_per_iteration": 2.655217409133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159004, + "balance_loss_mlp": 1.04985571, + "epoch": 0.1565986918045402, + "flos": 638635657728.0, + "grad_norm": 0.024501009698068087, + "language_loss": 0.98888743, + "learning_rate": 0.0009585624682276977, + "loss": 1.00047755, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 1.09033203, + "step": 814, + "time_per_iteration": 2.7572293281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160029, + "balance_loss_mlp": 1.05073786, + "epoch": 0.15679107348980378, + "flos": 491781250560.0, + "grad_norm": 0.02545428800843787, + "language_loss": 0.97158241, + "learning_rate": 0.0009584381984987386, + "loss": 0.98318267, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 1.09179688, + "step": 815, + "time_per_iteration": 2.554208517074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160766, + "balance_loss_mlp": 1.05185616, + "epoch": 0.15698345517506734, + "flos": 531002277888.0, + "grad_norm": 0.022736041606184667, + "language_loss": 0.98151159, + "learning_rate": 0.0009583137507882864, + "loss": 0.99311924, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 1.08789062, + "step": 816, + "time_per_iteration": 2.6635444164276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158696, + "balance_loss_mlp": 1.04978669, + "epoch": 0.1571758368603309, + "flos": 547077417984.0, + "grad_norm": 0.024009976747476527, + "language_loss": 0.90921289, + "learning_rate": 0.000958189125144656, + "loss": 0.92079985, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 1.08789062, + "step": 817, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156061, + "balance_loss_mlp": 1.04719925, + "epoch": 0.15736821854559446, + "flos": 566743326720.0, + "grad_norm": 0.021547949482456395, + "language_loss": 0.97883654, + "learning_rate": 0.0009580643216162313, + "loss": 0.99039721, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 1.08740234, + "step": 818, + "time_per_iteration": 2.673997640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157698, + "balance_loss_mlp": 1.04888415, + "epoch": 0.15756060023085802, + "flos": 501953943552.0, + "grad_norm": 0.023826624353146583, + "language_loss": 0.90112716, + "learning_rate": 0.0009579393402514652, + "loss": 0.91270417, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 1.08691406, + "step": 819, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156999, + "balance_loss_mlp": 1.04823244, + "epoch": 0.15775298191612158, + "flos": 520271631360.0, + "grad_norm": 0.023927295219635936, + "language_loss": 0.99075627, + "learning_rate": 0.0009578141810988801, + "loss": 1.00232625, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 1.08642578, + "step": 820, + "time_per_iteration": 2.591036558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.04111433, + "epoch": 0.15794536360138514, + "flos": 467087755776.0, + "grad_norm": 0.026283029611425073, + "language_loss": 1.00067806, + "learning_rate": 0.0009576888442070668, + "loss": 1.01217794, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 1.08740234, + "step": 821, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151894, + "balance_loss_mlp": 1.04279363, + "epoch": 0.1581377452866487, + "flos": 518168071680.0, + "grad_norm": 0.02399653039287492, + "language_loss": 1.01290274, + "learning_rate": 0.0009575633296246854, + "loss": 1.02442169, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 1.08984375, + "step": 822, + "time_per_iteration": 2.579575300216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.04312956, + "epoch": 0.15833012697191226, + "flos": 550837373952.0, + "grad_norm": 0.02407632334340799, + "language_loss": 0.91124117, + "learning_rate": 0.0009574376374004652, + "loss": 0.92275965, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 1.0859375, + "step": 823, + "time_per_iteration": 2.661754608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162901, + "balance_loss_mlp": 1.05446815, + "epoch": 0.15852250865717585, + "flos": 488466456576.0, + "grad_norm": 0.026327967105985502, + "language_loss": 0.90841949, + "learning_rate": 0.000957311767583204, + "loss": 0.92004848, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 1.08300781, + "step": 824, + "time_per_iteration": 2.7887372970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156753, + "balance_loss_mlp": 1.04956055, + "epoch": 0.1587148903424394, + "flos": 1312696909824.0, + "grad_norm": 0.010620587901871582, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.8322835, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 1.0703125, + "step": 825, + "time_per_iteration": 4.766167640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151145, + "balance_loss_mlp": 1.04304576, + "epoch": 0.15890727202770297, + "flos": 467832360960.0, + "grad_norm": 0.02959471781097451, + "language_loss": 1.0376749, + "learning_rate": 0.0009570594953650961, + "loss": 1.04918623, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 1.07958984, + "step": 826, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_mlp": 1.04354417, + "epoch": 0.15909965371296653, + "flos": 778606695936.0, + "grad_norm": 0.024366848241159877, + "language_loss": 0.8923949, + "learning_rate": 0.00095693309306219, + "loss": 0.90391278, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 1.08105469, + "step": 827, + "time_per_iteration": 3.1078274250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_mlp": 1.04449332, + "epoch": 0.1592920353982301, + "flos": 1079962950144.0, + "grad_norm": 0.02547465125103231, + "language_loss": 0.98567259, + "learning_rate": 0.0009568065133621244, + "loss": 0.99719906, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 1.08007812, + "step": 828, + "time_per_iteration": 3.3287436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147109, + "balance_loss_mlp": 1.03872418, + "epoch": 0.15948441708349365, + "flos": 726889837056.0, + "grad_norm": 0.026992334830630314, + "language_loss": 0.93815649, + "learning_rate": 0.0009566797563140422, + "loss": 0.94962764, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 1.08251953, + "step": 829, + "time_per_iteration": 2.8641507625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.03788006, + "epoch": 0.1596767987687572, + "flos": 580075087872.0, + "grad_norm": 0.026140449767567974, + "language_loss": 0.96191794, + "learning_rate": 0.0009565528219671547, + "loss": 0.97337818, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 1.08007812, + "step": 830, + "time_per_iteration": 2.9082329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147169, + "balance_loss_mlp": 1.03902268, + "epoch": 0.15986918045402077, + "flos": 530025358848.0, + "grad_norm": 0.02186736495212519, + "language_loss": 0.93771887, + "learning_rate": 0.0009564257103707418, + "loss": 0.94919056, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 1.08007812, + "step": 831, + "time_per_iteration": 4.109540700912476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.04519463, + "epoch": 0.16006156213928435, + "flos": 575669856768.0, + "grad_norm": 0.025156765484562034, + "language_loss": 1.01463771, + "learning_rate": 0.0009562984215741533, + "loss": 1.02617025, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 1.07910156, + "step": 832, + "time_per_iteration": 2.634381055831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148637, + "balance_loss_mlp": 1.0408721, + "epoch": 0.1602539438245479, + "flos": 516674858496.0, + "grad_norm": 0.023022886756030446, + "language_loss": 0.90665066, + "learning_rate": 0.0009561709556268065, + "loss": 0.91813707, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 1.07617188, + "step": 833, + "time_per_iteration": 2.7094552516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115539, + "balance_loss_mlp": 1.04752922, + "epoch": 0.16044632550981147, + "flos": 622161017856.0, + "grad_norm": 0.02456985500743924, + "language_loss": 1.0306673, + "learning_rate": 0.0009560433125781884, + "loss": 1.04222107, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 1.07714844, + "step": 834, + "time_per_iteration": 2.7217955589294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_mlp": 1.04794765, + "epoch": 0.16063870719507503, + "flos": 562127975424.0, + "grad_norm": 0.02550250825542428, + "language_loss": 1.02622008, + "learning_rate": 0.0009559154924778544, + "loss": 1.03778291, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 1.08203125, + "step": 835, + "time_per_iteration": 4.0438151359558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153381, + "balance_loss_mlp": 1.04509139, + "epoch": 0.1608310888803386, + "flos": 806560590336.0, + "grad_norm": 0.023331498233936678, + "language_loss": 0.93980491, + "learning_rate": 0.0009557874953754284, + "loss": 0.95133871, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 1.08154297, + "step": 836, + "time_per_iteration": 3.0253541469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155161, + "balance_loss_mlp": 1.04739583, + "epoch": 0.16102347056560215, + "flos": 601694108160.0, + "grad_norm": 0.024039154316001603, + "language_loss": 0.9449209, + "learning_rate": 0.0009556593213206038, + "loss": 0.95647246, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 1.07617188, + "step": 837, + "time_per_iteration": 2.815293788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148071, + "balance_loss_mlp": 1.04049647, + "epoch": 0.1612158522508657, + "flos": 554614794240.0, + "grad_norm": 0.024490980939479982, + "language_loss": 0.96443379, + "learning_rate": 0.0009555309703631414, + "loss": 0.9759146, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 1.07421875, + "step": 838, + "time_per_iteration": 2.7353601455688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148397, + "balance_loss_mlp": 1.0406791, + "epoch": 0.16140823393612927, + "flos": 557017797120.0, + "grad_norm": 0.026558461299776022, + "language_loss": 0.98485982, + "learning_rate": 0.0009554024425528722, + "loss": 0.99634379, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 1.07568359, + "step": 839, + "time_per_iteration": 2.801539182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146915, + "balance_loss_mlp": 1.03924477, + "epoch": 0.16160061562139286, + "flos": 544908730368.0, + "grad_norm": 0.023933605454050468, + "language_loss": 0.96992832, + "learning_rate": 0.0009552737379396948, + "loss": 0.98139745, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 1.07519531, + "step": 840, + "time_per_iteration": 2.613037586212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148515, + "balance_loss_mlp": 1.04122651, + "epoch": 0.16179299730665642, + "flos": 605006900736.0, + "grad_norm": 0.020652206840645122, + "language_loss": 0.95695615, + "learning_rate": 0.0009551448565735767, + "loss": 0.96844131, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 1.07128906, + "step": 841, + "time_per_iteration": 2.779979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149052, + "balance_loss_mlp": 1.04128659, + "epoch": 0.16198537899191998, + "flos": 788551077888.0, + "grad_norm": 0.02358864683094414, + "language_loss": 0.96423578, + "learning_rate": 0.0009550157985045543, + "loss": 0.97572625, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 1.07617188, + "step": 842, + "time_per_iteration": 3.0352344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148245, + "balance_loss_mlp": 1.04086173, + "epoch": 0.16217776067718354, + "flos": 520829584896.0, + "grad_norm": 0.02127918945612936, + "language_loss": 0.95624614, + "learning_rate": 0.0009548865637827321, + "loss": 0.96772861, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 1.07226562, + "step": 843, + "time_per_iteration": 2.695211172103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.04027128, + "epoch": 0.1623701423624471, + "flos": 506254388736.0, + "grad_norm": 0.02427958482397641, + "language_loss": 0.99469078, + "learning_rate": 0.0009547571524582838, + "loss": 1.00617111, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 1.07617188, + "step": 844, + "time_per_iteration": 2.586859941482544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_mlp": 1.03842914, + "epoch": 0.16256252404771065, + "flos": 498157057536.0, + "grad_norm": 0.025657026114593633, + "language_loss": 1.02873135, + "learning_rate": 0.0009546275645814512, + "loss": 1.04018748, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 1.0703125, + "step": 845, + "time_per_iteration": 2.735323190689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147597, + "balance_loss_mlp": 1.04040384, + "epoch": 0.16275490573297421, + "flos": 503286701568.0, + "grad_norm": 0.024743383464961046, + "language_loss": 1.00377154, + "learning_rate": 0.0009544978002025446, + "loss": 1.01524746, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 1.0703125, + "step": 846, + "time_per_iteration": 2.5876121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_mlp": 1.04189885, + "epoch": 0.16294728741823777, + "flos": 508353945600.0, + "grad_norm": 0.020876938588178177, + "language_loss": 0.94877481, + "learning_rate": 0.0009543678593719434, + "loss": 0.9602648, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 1.06933594, + "step": 847, + "time_per_iteration": 2.69250750541687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159847, + "balance_loss_mlp": 1.05274892, + "epoch": 0.16313966910350133, + "flos": 510756948480.0, + "grad_norm": 0.020936629725758764, + "language_loss": 0.95534647, + "learning_rate": 0.0009542377421400945, + "loss": 0.96694493, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 1.06933594, + "step": 848, + "time_per_iteration": 2.7832183837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146796, + "balance_loss_mlp": 1.03965068, + "epoch": 0.16333205078876492, + "flos": 545056450560.0, + "grad_norm": 0.023544058946573278, + "language_loss": 0.94486761, + "learning_rate": 0.0009541074485575145, + "loss": 0.95633554, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 1.06982422, + "step": 849, + "time_per_iteration": 2.7163026332855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147161, + "balance_loss_mlp": 1.03996801, + "epoch": 0.16352443247402848, + "flos": 508711785984.0, + "grad_norm": 0.023080110816121054, + "language_loss": 1.00550437, + "learning_rate": 0.0009539769786747874, + "loss": 1.01697588, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 1.0703125, + "step": 850, + "time_per_iteration": 2.5918350219726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152854, + "balance_loss_mlp": 1.04547, + "epoch": 0.16371681415929204, + "flos": 543222134784.0, + "grad_norm": 0.022593715242085626, + "language_loss": 0.90895152, + "learning_rate": 0.0009538463325425665, + "loss": 0.92048007, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 1.07226562, + "step": 851, + "time_per_iteration": 2.701662063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146583, + "balance_loss_mlp": 1.03939056, + "epoch": 0.1639091958445556, + "flos": 521760841728.0, + "grad_norm": 0.025319624949764974, + "language_loss": 0.95562863, + "learning_rate": 0.0009537155102115728, + "loss": 0.96709442, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 1.0703125, + "step": 852, + "time_per_iteration": 2.577416181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.03871727, + "epoch": 0.16410157752981916, + "flos": 548482034688.0, + "grad_norm": 0.022217218078565786, + "language_loss": 0.92332971, + "learning_rate": 0.0009535845117325961, + "loss": 0.93478549, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 1.06689453, + "step": 853, + "time_per_iteration": 2.643528699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148166, + "balance_loss_mlp": 1.04135406, + "epoch": 0.16429395921508272, + "flos": 584025698304.0, + "grad_norm": 0.02024018106959617, + "language_loss": 1.00128078, + "learning_rate": 0.0009534533371564946, + "loss": 1.01276231, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 1.06640625, + "step": 854, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150543, + "balance_loss_mlp": 1.04377949, + "epoch": 0.16448634090034628, + "flos": 531961732608.0, + "grad_norm": 0.02843561601072028, + "language_loss": 1.00094676, + "learning_rate": 0.0009533219865341949, + "loss": 1.01245213, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 1.06591797, + "step": 855, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.05014503, + "epoch": 0.16467872258560984, + "flos": 492960284160.0, + "grad_norm": 0.026495144396752456, + "language_loss": 0.95923662, + "learning_rate": 0.0009531904599166916, + "loss": 0.97080612, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 1.06640625, + "step": 856, + "time_per_iteration": 2.638528823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.04101861, + "epoch": 0.16487110427087343, + "flos": 507259505664.0, + "grad_norm": 0.02303677132947941, + "language_loss": 0.95950538, + "learning_rate": 0.0009530587573550478, + "loss": 0.97098505, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 1.06787109, + "step": 857, + "time_per_iteration": 2.5788354873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.04592896, + "epoch": 0.16506348595613698, + "flos": 1436108714496.0, + "grad_norm": 0.011861304780107247, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75470984, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 1.0546875, + "step": 858, + "time_per_iteration": 5.003005027770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153597, + "balance_loss_mlp": 1.04673755, + "epoch": 0.16525586764140054, + "flos": 478089647616.0, + "grad_norm": 0.02595402254221991, + "language_loss": 0.98057735, + "learning_rate": 0.0009527948246039337, + "loss": 0.99211335, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 1.06689453, + "step": 859, + "time_per_iteration": 2.541255474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152748, + "balance_loss_mlp": 1.04622293, + "epoch": 0.1654482493266641, + "flos": 882540518400.0, + "grad_norm": 0.024187417777422206, + "language_loss": 0.96476752, + "learning_rate": 0.000952662594516931, + "loss": 0.97629499, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 1.06347656, + "step": 860, + "time_per_iteration": 3.102233409881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154678, + "balance_loss_mlp": 1.04791439, + "epoch": 0.16564063101192766, + "flos": 628105124352.0, + "grad_norm": 0.02242324391324738, + "language_loss": 0.93166292, + "learning_rate": 0.0009525301886907234, + "loss": 0.94320977, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 1.06591797, + "step": 861, + "time_per_iteration": 2.871971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151309, + "balance_loss_mlp": 1.04487896, + "epoch": 0.16583301269719122, + "flos": 562592603136.0, + "grad_norm": 0.02248996903194516, + "language_loss": 0.97140592, + "learning_rate": 0.0009523976071767155, + "loss": 0.98291898, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 1.0625, + "step": 862, + "time_per_iteration": 2.653031349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146763, + "balance_loss_mlp": 1.04038036, + "epoch": 0.16602539438245478, + "flos": 568983873024.0, + "grad_norm": 0.020794335354585358, + "language_loss": 0.9646408, + "learning_rate": 0.00095226485002638, + "loss": 0.97610843, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 1.06201172, + "step": 863, + "time_per_iteration": 2.7685163021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147042, + "balance_loss_mlp": 1.04075551, + "epoch": 0.16621777606771834, + "flos": 576021692928.0, + "grad_norm": 0.021581021962121343, + "language_loss": 0.96560466, + "learning_rate": 0.0009521319172912576, + "loss": 0.9770751, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 1.06103516, + "step": 864, + "time_per_iteration": 2.762233257293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149511, + "balance_loss_mlp": 1.0432713, + "epoch": 0.16641015775298193, + "flos": 515597882880.0, + "grad_norm": 0.029880870913045234, + "language_loss": 1.0375855, + "learning_rate": 0.0009519988090229579, + "loss": 1.04908061, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 1.06054688, + "step": 865, + "time_per_iteration": 2.7156929969787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148426, + "balance_loss_mlp": 1.04199588, + "epoch": 0.1666025394382455, + "flos": 622849227264.0, + "grad_norm": 0.023088954173990716, + "language_loss": 0.96669209, + "learning_rate": 0.0009518655252731576, + "loss": 0.9781763, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 1.0625, + "step": 866, + "time_per_iteration": 2.76474928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147261, + "balance_loss_mlp": 1.04102135, + "epoch": 0.16679492112350905, + "flos": 549932313600.0, + "grad_norm": 0.021458749489738967, + "language_loss": 0.98467255, + "learning_rate": 0.0009517320660936022, + "loss": 0.99614513, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 1.06054688, + "step": 867, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151692, + "balance_loss_mlp": 1.04545259, + "epoch": 0.1669873028087726, + "flos": 666865526784.0, + "grad_norm": 0.02209258354681387, + "language_loss": 0.92114806, + "learning_rate": 0.0009515984315361051, + "loss": 0.93266487, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 1.06054688, + "step": 868, + "time_per_iteration": 2.845388412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.04563451, + "epoch": 0.16717968449403617, + "flos": 539603168256.0, + "grad_norm": 0.02501334283432316, + "language_loss": 0.95751995, + "learning_rate": 0.000951464621652548, + "loss": 0.96903574, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 1.05761719, + "step": 869, + "time_per_iteration": 2.623375415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148167, + "balance_loss_mlp": 1.04216599, + "epoch": 0.16737206617929973, + "flos": 531278252544.0, + "grad_norm": 0.02062860382438808, + "language_loss": 0.87610328, + "learning_rate": 0.0009513306364948804, + "loss": 0.88758498, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 1.05810547, + "step": 870, + "time_per_iteration": 2.792346239089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148065, + "balance_loss_mlp": 1.04206407, + "epoch": 0.1675644478645633, + "flos": 481756277760.0, + "grad_norm": 0.023236257285911367, + "language_loss": 0.98118269, + "learning_rate": 0.0009511964761151197, + "loss": 0.99266338, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 1.05810547, + "step": 871, + "time_per_iteration": 2.572923183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152601, + "balance_loss_mlp": 1.04669595, + "epoch": 0.16775682954982685, + "flos": 495541206528.0, + "grad_norm": 0.026661505796453877, + "language_loss": 0.99311042, + "learning_rate": 0.0009510621405653521, + "loss": 1.00463641, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 1.05712891, + "step": 872, + "time_per_iteration": 2.6296472549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_mlp": 1.04484987, + "epoch": 0.1679492112350904, + "flos": 753404912640.0, + "grad_norm": 0.029291148216183213, + "language_loss": 0.93300939, + "learning_rate": 0.0009509276298977309, + "loss": 0.94451261, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 1.05273438, + "step": 873, + "time_per_iteration": 3.0177366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150817, + "balance_loss_mlp": 1.04543638, + "epoch": 0.168141592920354, + "flos": 1137731977728.0, + "grad_norm": 0.021155110884158303, + "language_loss": 0.9134444, + "learning_rate": 0.0009507929441644778, + "loss": 0.92495263, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 1.05175781, + "step": 874, + "time_per_iteration": 3.53277325630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160399, + "balance_loss_mlp": 1.05501771, + "epoch": 0.16833397460561755, + "flos": 633553677312.0, + "grad_norm": 0.025508723945600786, + "language_loss": 0.94342184, + "learning_rate": 0.0009506580834178826, + "loss": 0.95502585, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 1.05175781, + "step": 875, + "time_per_iteration": 2.763296365737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151031, + "balance_loss_mlp": 1.04560196, + "epoch": 0.1685263562908811, + "flos": 542542657536.0, + "grad_norm": 0.0234395143242784, + "language_loss": 1.00066125, + "learning_rate": 0.0009505230477103028, + "loss": 1.01217151, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 1.05224609, + "step": 876, + "time_per_iteration": 2.7256453037261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143495, + "balance_loss_mlp": 1.03801847, + "epoch": 0.16871873797614467, + "flos": 620485155840.0, + "grad_norm": 0.02951425183806971, + "language_loss": 0.91949958, + "learning_rate": 0.0009503878370941641, + "loss": 0.93093449, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 1.05273438, + "step": 877, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143733, + "balance_loss_mlp": 1.038257, + "epoch": 0.16891111966140823, + "flos": 607455565824.0, + "grad_norm": 0.02526909046796152, + "language_loss": 0.99137431, + "learning_rate": 0.0009502524516219595, + "loss": 1.00281167, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 1.05273438, + "step": 878, + "time_per_iteration": 2.7107326984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145725, + "balance_loss_mlp": 1.04005778, + "epoch": 0.1691035013466718, + "flos": 553405561344.0, + "grad_norm": 0.023246247090994255, + "language_loss": 0.99022686, + "learning_rate": 0.0009501168913462506, + "loss": 1.00168419, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 1.0546875, + "step": 879, + "time_per_iteration": 2.654356002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04866791, + "epoch": 0.16929588303193535, + "flos": 1479305822208.0, + "grad_norm": 0.014844444469597292, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.802755, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 1.046875, + "step": 880, + "time_per_iteration": 4.877387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114571, + "balance_loss_mlp": 1.04042399, + "epoch": 0.1694882647171989, + "flos": 927846641664.0, + "grad_norm": 0.023879743421000837, + "language_loss": 0.93963408, + "learning_rate": 0.0009498452465949042, + "loss": 0.95109117, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 1.05078125, + "step": 881, + "time_per_iteration": 3.241151809692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0447762, + "epoch": 0.1696806464024625, + "flos": 547151278080.0, + "grad_norm": 0.02293023114251512, + "language_loss": 0.98854458, + "learning_rate": 0.0009497091622247285, + "loss": 1.0000447, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 1.05029297, + "step": 882, + "time_per_iteration": 2.720453977584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145786, + "balance_loss_mlp": 1.0406431, + "epoch": 0.16987302808772606, + "flos": 530294602752.0, + "grad_norm": 0.02459483675822623, + "language_loss": 1.0302248, + "learning_rate": 0.0009495729032619723, + "loss": 1.04168272, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 1.04931641, + "step": 883, + "time_per_iteration": 2.717176675796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151842, + "balance_loss_mlp": 1.04731977, + "epoch": 0.17006540977298962, + "flos": 756478660608.0, + "grad_norm": 0.02507713686866634, + "language_loss": 0.9295364, + "learning_rate": 0.0009494364697595354, + "loss": 0.94105482, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 1.04589844, + "step": 884, + "time_per_iteration": 2.924898147583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157567, + "balance_loss_mlp": 1.05271089, + "epoch": 0.17025779145825318, + "flos": 559874694144.0, + "grad_norm": 0.025110060032482954, + "language_loss": 0.98774076, + "learning_rate": 0.0009492998617703867, + "loss": 0.99931645, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 1.04833984, + "step": 885, + "time_per_iteration": 2.6759417057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.05104423, + "epoch": 0.17045017314351674, + "flos": 513216347136.0, + "grad_norm": 0.0280627140127875, + "language_loss": 0.96898842, + "learning_rate": 0.0009491630793475619, + "loss": 0.98054218, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 1.04492188, + "step": 886, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149096, + "balance_loss_mlp": 1.04452574, + "epoch": 0.1706425548287803, + "flos": 510012343296.0, + "grad_norm": 0.023090423796267925, + "language_loss": 0.94873035, + "learning_rate": 0.0009490261225441643, + "loss": 0.96022129, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 1.04638672, + "step": 887, + "time_per_iteration": 2.960139513015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_mlp": 1.04508829, + "epoch": 0.17083493651404386, + "flos": 718714642944.0, + "grad_norm": 0.024954435208077393, + "language_loss": 0.98478651, + "learning_rate": 0.0009488889914133656, + "loss": 0.99628592, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 1.04833984, + "step": 888, + "time_per_iteration": 3.0498712062835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_mlp": 1.04649353, + "epoch": 0.17102731819930742, + "flos": 560200333824.0, + "grad_norm": 0.020862133880352407, + "language_loss": 0.97394216, + "learning_rate": 0.0009487516860084047, + "loss": 0.98545229, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 1.046875, + "step": 889, + "time_per_iteration": 2.799579381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115955, + "balance_loss_mlp": 1.0542171, + "epoch": 0.17121969988457098, + "flos": 495764788224.0, + "grad_norm": 0.030159167385703775, + "language_loss": 0.99659365, + "learning_rate": 0.0009486142063825884, + "loss": 1.0081892, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 1.05126953, + "step": 890, + "time_per_iteration": 2.5897767543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05718231, + "epoch": 0.17141208156983456, + "flos": 1552105941504.0, + "grad_norm": 0.012289453069715352, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73586774, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 1.03515625, + "step": 891, + "time_per_iteration": 4.971697807312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05527556, + "epoch": 0.17160446325509812, + "flos": 620700005376.0, + "grad_norm": 0.02677753623279009, + "language_loss": 1.00227833, + "learning_rate": 0.0009483387246819542, + "loss": 1.01388383, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 1.05078125, + "step": 892, + "time_per_iteration": 2.7142419815063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153152, + "balance_loss_mlp": 1.04977417, + "epoch": 0.17179684494036168, + "flos": 1384693300224.0, + "grad_norm": 0.011012484205567044, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.8343873, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 1.03515625, + "step": 893, + "time_per_iteration": 4.678752183914185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159751, + "balance_loss_mlp": 1.05446541, + "epoch": 0.17198922662562524, + "flos": 493641762816.0, + "grad_norm": 0.02464509578240857, + "language_loss": 0.9638195, + "learning_rate": 0.0009480625467392688, + "loss": 0.97541702, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 1.05175781, + "step": 894, + "time_per_iteration": 2.6579103469848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158279, + "balance_loss_mlp": 1.05490112, + "epoch": 0.1721816083108888, + "flos": 1461485689344.0, + "grad_norm": 0.014844728137103481, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79152954, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 1.03515625, + "step": 895, + "time_per_iteration": 4.754615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157074, + "balance_loss_mlp": 1.0523603, + "epoch": 0.17237398999615236, + "flos": 529204892160.0, + "grad_norm": 0.024157534092911288, + "language_loss": 0.95005947, + "learning_rate": 0.0009477856729834196, + "loss": 0.96163023, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 1.046875, + "step": 896, + "time_per_iteration": 2.7640984058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.05742288, + "epoch": 0.17256637168141592, + "flos": 605026366464.0, + "grad_norm": 0.02447501108745492, + "language_loss": 0.9782356, + "learning_rate": 0.0009476469753098809, + "loss": 0.98985219, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 1.04394531, + "step": 897, + "time_per_iteration": 2.7016282081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153769, + "balance_loss_mlp": 1.04957986, + "epoch": 0.17275875336667948, + "flos": 510693821952.0, + "grad_norm": 0.025419887327313116, + "language_loss": 0.94868481, + "learning_rate": 0.0009475081038443738, + "loss": 0.96022242, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 1.04345703, + "step": 898, + "time_per_iteration": 2.5731348991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148609, + "balance_loss_mlp": 1.0446589, + "epoch": 0.17295113505194307, + "flos": 666500955648.0, + "grad_norm": 0.02623291269769982, + "language_loss": 0.95752573, + "learning_rate": 0.0009473690586408124, + "loss": 0.96901178, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 1.04101562, + "step": 899, + "time_per_iteration": 2.8549156188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146417, + "balance_loss_mlp": 1.04227531, + "epoch": 0.17314351673720663, + "flos": 556431645696.0, + "grad_norm": 0.022300666942289, + "language_loss": 0.94826102, + "learning_rate": 0.0009472298397531792, + "loss": 0.9597252, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 1.04296875, + "step": 900, + "time_per_iteration": 2.7165167331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145486, + "balance_loss_mlp": 1.04124928, + "epoch": 0.17333589842247019, + "flos": 504606724608.0, + "grad_norm": 0.023477361471443404, + "language_loss": 0.95443118, + "learning_rate": 0.0009470904472355235, + "loss": 0.96588612, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 1.04394531, + "step": 901, + "time_per_iteration": 2.668320655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_mlp": 1.03967023, + "epoch": 0.17352828010773375, + "flos": 557350167552.0, + "grad_norm": 0.02470997420275152, + "language_loss": 0.90534914, + "learning_rate": 0.0009469508811419626, + "loss": 0.91678727, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 1.04296875, + "step": 902, + "time_per_iteration": 2.714174747467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_mlp": 1.05331421, + "epoch": 0.1737206617929973, + "flos": 1557791537664.0, + "grad_norm": 0.011695515468407039, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7276957, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 1.02539062, + "step": 903, + "time_per_iteration": 4.783574104309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146888, + "balance_loss_mlp": 1.04308009, + "epoch": 0.17391304347826086, + "flos": 517755836928.0, + "grad_norm": 0.027522671456014093, + "language_loss": 0.94518518, + "learning_rate": 0.0009466712284439292, + "loss": 0.95665407, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 1.03955078, + "step": 904, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011486, + "balance_loss_mlp": 1.04503071, + "epoch": 0.17410542516352442, + "flos": 542160622080.0, + "grad_norm": 0.027186859166075866, + "language_loss": 0.99262786, + "learning_rate": 0.0009465311419480276, + "loss": 1.00411391, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 1.03710938, + "step": 905, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153491, + "balance_loss_mlp": 1.05011249, + "epoch": 0.17429780684878798, + "flos": 625081041408.0, + "grad_norm": 0.028950662808853365, + "language_loss": 0.96674442, + "learning_rate": 0.0009463908820933622, + "loss": 0.97827929, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 1.03515625, + "step": 906, + "time_per_iteration": 2.8291828632354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151914, + "balance_loss_mlp": 1.04844034, + "epoch": 0.17449018853405157, + "flos": 576848890368.0, + "grad_norm": 0.03002954803612974, + "language_loss": 0.90420532, + "learning_rate": 0.0009462504489343868, + "loss": 0.91572446, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 1.03613281, + "step": 907, + "time_per_iteration": 2.8554108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_mlp": 1.04341269, + "epoch": 0.17468257021931513, + "flos": 534772967424.0, + "grad_norm": 0.024073731406752365, + "language_loss": 1.01002121, + "learning_rate": 0.0009461098425256222, + "loss": 1.02149189, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 1.03808594, + "step": 908, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114306, + "balance_loss_mlp": 1.03930068, + "epoch": 0.1748749519045787, + "flos": 541808785920.0, + "grad_norm": 0.02493910110608304, + "language_loss": 0.93412566, + "learning_rate": 0.0009459690629216567, + "loss": 0.94555628, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 1.0390625, + "step": 909, + "time_per_iteration": 2.670389413833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150977, + "balance_loss_mlp": 1.04688334, + "epoch": 0.17506733358984225, + "flos": 499626802176.0, + "grad_norm": 0.02402970341263653, + "language_loss": 0.96272469, + "learning_rate": 0.0009458281101771457, + "loss": 0.97423446, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 1.04248047, + "step": 910, + "time_per_iteration": 2.6256320476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153015, + "balance_loss_mlp": 1.04906452, + "epoch": 0.1752597152751058, + "flos": 624132320256.0, + "grad_norm": 0.023679811966199643, + "language_loss": 0.91450173, + "learning_rate": 0.0009456869843468122, + "loss": 0.92603183, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 1.04101562, + "step": 911, + "time_per_iteration": 2.863004207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158204, + "balance_loss_mlp": 1.05434883, + "epoch": 0.17545209696036937, + "flos": 521993155584.0, + "grad_norm": 0.029813530713564303, + "language_loss": 0.92364156, + "learning_rate": 0.0009455456854854459, + "loss": 0.93522358, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 1.04003906, + "step": 912, + "time_per_iteration": 2.616231918334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_mlp": 1.04612815, + "epoch": 0.17564447864563293, + "flos": 462945764352.0, + "grad_norm": 0.02810445184103091, + "language_loss": 0.92624664, + "learning_rate": 0.0009454042136479039, + "loss": 0.93774742, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 1.04101562, + "step": 913, + "time_per_iteration": 2.5944247245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.05199766, + "epoch": 0.1758368603308965, + "flos": 481617289728.0, + "grad_norm": 0.02706355326928303, + "language_loss": 0.91841793, + "learning_rate": 0.0009452625688891103, + "loss": 0.92997456, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 1.03808594, + "step": 914, + "time_per_iteration": 2.580941915512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144051, + "balance_loss_mlp": 1.04200745, + "epoch": 0.17602924201616005, + "flos": 1482084856320.0, + "grad_norm": 0.009713749524187035, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79878789, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 1.02148438, + "step": 915, + "time_per_iteration": 4.592097997665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148996, + "balance_loss_mlp": 1.04523647, + "epoch": 0.17622162370142364, + "flos": 603470026752.0, + "grad_norm": 0.02797967110469985, + "language_loss": 1.03421283, + "learning_rate": 0.0009449787608278015, + "loss": 1.0457027, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 1.0390625, + "step": 916, + "time_per_iteration": 2.755580425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_mlp": 1.04677713, + "epoch": 0.1764140053866872, + "flos": 443605495296.0, + "grad_norm": 0.024189441248888145, + "language_loss": 1.00777316, + "learning_rate": 0.0009448365976354704, + "loss": 1.01927423, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 1.03466797, + "step": 917, + "time_per_iteration": 2.4922571182250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_mlp": 1.04567707, + "epoch": 0.17660638707195075, + "flos": 501591373824.0, + "grad_norm": 0.028333637349232343, + "language_loss": 1.01507974, + "learning_rate": 0.0009446942617422558, + "loss": 1.02657032, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 1.03515625, + "step": 918, + "time_per_iteration": 2.574998378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148191, + "balance_loss_mlp": 1.0448128, + "epoch": 0.17679876875721431, + "flos": 539983202304.0, + "grad_norm": 0.02432410226762854, + "language_loss": 0.94564992, + "learning_rate": 0.0009445517532034176, + "loss": 0.9571318, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 1.03515625, + "step": 919, + "time_per_iteration": 2.7170355319976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153425, + "balance_loss_mlp": 1.05009484, + "epoch": 0.17699115044247787, + "flos": 498715011072.0, + "grad_norm": 0.026165935935680888, + "language_loss": 0.99032271, + "learning_rate": 0.0009444090720742824, + "loss": 1.00185692, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 1.03466797, + "step": 920, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149157, + "balance_loss_mlp": 1.04587448, + "epoch": 0.17718353212774143, + "flos": 663915303936.0, + "grad_norm": 0.025722324934358026, + "language_loss": 0.98290348, + "learning_rate": 0.0009442662184102439, + "loss": 0.99439508, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 1.03417969, + "step": 921, + "time_per_iteration": 2.7612035274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145605, + "balance_loss_mlp": 1.04251313, + "epoch": 0.177375913813005, + "flos": 583847778816.0, + "grad_norm": 0.021564117555322487, + "language_loss": 0.93569565, + "learning_rate": 0.000944123192266763, + "loss": 0.94715166, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 1.03222656, + "step": 922, + "time_per_iteration": 2.8110268115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141792, + "balance_loss_mlp": 1.03865182, + "epoch": 0.17756829549826855, + "flos": 553683537408.0, + "grad_norm": 0.021487036209533367, + "language_loss": 0.92858881, + "learning_rate": 0.0009439799936993671, + "loss": 0.94000673, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 1.03271484, + "step": 923, + "time_per_iteration": 2.7440245151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142202, + "balance_loss_mlp": 1.03901482, + "epoch": 0.17776067718353214, + "flos": 557371634688.0, + "grad_norm": 0.02463154633112553, + "language_loss": 0.97990632, + "learning_rate": 0.0009438366227636511, + "loss": 0.99132836, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 1.03320312, + "step": 924, + "time_per_iteration": 2.7032759189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140208, + "balance_loss_mlp": 1.03721154, + "epoch": 0.1779530588687957, + "flos": 659651788800.0, + "grad_norm": 0.022941473179093813, + "language_loss": 0.94988692, + "learning_rate": 0.0009436930795152763, + "loss": 0.96128899, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 1.03125, + "step": 925, + "time_per_iteration": 2.854522943496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143555, + "balance_loss_mlp": 1.04084456, + "epoch": 0.17814544055405926, + "flos": 645671476224.0, + "grad_norm": 0.02421412975678805, + "language_loss": 0.95479, + "learning_rate": 0.0009435493640099713, + "loss": 0.9662255, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 1.02832031, + "step": 926, + "time_per_iteration": 2.8268251419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143389, + "balance_loss_mlp": 1.04077399, + "epoch": 0.17833782223932282, + "flos": 461884251648.0, + "grad_norm": 0.0252062590806445, + "language_loss": 0.94177145, + "learning_rate": 0.0009434054763035314, + "loss": 0.95320535, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 1.02734375, + "step": 927, + "time_per_iteration": 2.629499673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139685, + "balance_loss_mlp": 1.03706956, + "epoch": 0.17853020392458638, + "flos": 760852965888.0, + "grad_norm": 0.02122720378042075, + "language_loss": 0.93181551, + "learning_rate": 0.0009432614164518185, + "loss": 0.94321233, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 1.02734375, + "step": 928, + "time_per_iteration": 2.9364700317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140803, + "balance_loss_mlp": 1.03818727, + "epoch": 0.17872258560984994, + "flos": 784055248896.0, + "grad_norm": 0.023477252169520995, + "language_loss": 0.93520033, + "learning_rate": 0.000943117184510762, + "loss": 0.94660836, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 1.02734375, + "step": 929, + "time_per_iteration": 3.07600474357605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150169, + "balance_loss_mlp": 1.04831696, + "epoch": 0.1789149672951135, + "flos": 1463031295488.0, + "grad_norm": 0.013755703560815407, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7994014, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 1.01953125, + "step": 930, + "time_per_iteration": 5.029282808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153323, + "balance_loss_mlp": 1.05099344, + "epoch": 0.17910734898037706, + "flos": 504930362880.0, + "grad_norm": 0.023999213273897636, + "language_loss": 0.96652937, + "learning_rate": 0.0009428282045846674, + "loss": 0.97806263, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 1.02441406, + "step": 931, + "time_per_iteration": 2.7112410068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145421, + "balance_loss_mlp": 1.04275823, + "epoch": 0.17929973066564064, + "flos": 747669651456.0, + "grad_norm": 0.02006943819739268, + "language_loss": 0.96385491, + "learning_rate": 0.0009426834567118214, + "loss": 0.97530913, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 1.02783203, + "step": 932, + "time_per_iteration": 3.0711913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143693, + "balance_loss_mlp": 1.04098177, + "epoch": 0.1794921123509042, + "flos": 714572651520.0, + "grad_norm": 0.021210123960592832, + "language_loss": 0.89608383, + "learning_rate": 0.0009425385369740155, + "loss": 0.90752071, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 1.02832031, + "step": 933, + "time_per_iteration": 3.059857130050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114727, + "balance_loss_mlp": 1.0451318, + "epoch": 0.17968449403616776, + "flos": 634361409024.0, + "grad_norm": 0.02299955090486112, + "language_loss": 0.96636283, + "learning_rate": 0.0009423934454275125, + "loss": 0.97783554, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 1.02246094, + "step": 934, + "time_per_iteration": 2.85917592048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146917, + "balance_loss_mlp": 1.04477859, + "epoch": 0.17987687572143132, + "flos": 537378084864.0, + "grad_norm": 0.02461268142415081, + "language_loss": 1.01075852, + "learning_rate": 0.0009422481821286418, + "loss": 1.02222764, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 1.02246094, + "step": 935, + "time_per_iteration": 2.7314486503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150005, + "balance_loss_mlp": 1.04777098, + "epoch": 0.18006925740669488, + "flos": 539119074816.0, + "grad_norm": 0.026258801194945027, + "language_loss": 0.98970592, + "learning_rate": 0.0009421027471337998, + "loss": 1.00120604, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 1.0234375, + "step": 936, + "time_per_iteration": 2.6354496479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151337, + "balance_loss_mlp": 1.04891205, + "epoch": 0.18026163909195844, + "flos": 540534425088.0, + "grad_norm": 0.029056123283387615, + "language_loss": 0.94782555, + "learning_rate": 0.0009419571404994493, + "loss": 0.9593389, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 1.02539062, + "step": 937, + "time_per_iteration": 2.6368348598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_mlp": 1.04649317, + "epoch": 0.180454020777222, + "flos": 501682698240.0, + "grad_norm": 0.026973093946582868, + "language_loss": 1.00715971, + "learning_rate": 0.00094181136228212, + "loss": 1.01864934, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 1.02587891, + "step": 938, + "time_per_iteration": 2.710451602935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145832, + "balance_loss_mlp": 1.043455, + "epoch": 0.18064640246248556, + "flos": 500006836224.0, + "grad_norm": 0.02510488837562242, + "language_loss": 0.93535352, + "learning_rate": 0.0009416654125384077, + "loss": 0.9468118, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 1.02490234, + "step": 939, + "time_per_iteration": 2.728480577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145905, + "balance_loss_mlp": 1.04424286, + "epoch": 0.18083878414774912, + "flos": 1522290808320.0, + "grad_norm": 0.01070150853185005, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80918276, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 1.01757812, + "step": 940, + "time_per_iteration": 4.915560007095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145419, + "balance_loss_mlp": 1.04318535, + "epoch": 0.1810311658330127, + "flos": 728665755648.0, + "grad_norm": 0.023936590350452012, + "language_loss": 0.92724693, + "learning_rate": 0.000941372998698552, + "loss": 0.93870103, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 1.0234375, + "step": 941, + "time_per_iteration": 2.993441343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140689, + "balance_loss_mlp": 1.0385505, + "epoch": 0.18122354751827627, + "flos": 566044383744.0, + "grad_norm": 0.025062658148163358, + "language_loss": 0.94270039, + "learning_rate": 0.0009412265347159336, + "loss": 0.95410728, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 1.02246094, + "step": 942, + "time_per_iteration": 2.731416702270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140669, + "balance_loss_mlp": 1.03848326, + "epoch": 0.18141592920353983, + "flos": 520317293568.0, + "grad_norm": 0.024682729806918415, + "language_loss": 0.94559634, + "learning_rate": 0.0009410798994339829, + "loss": 0.95700312, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 1.02294922, + "step": 943, + "time_per_iteration": 2.6001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.03650522, + "epoch": 0.1816083108888034, + "flos": 513476858880.0, + "grad_norm": 0.022579221317186333, + "language_loss": 0.95589852, + "learning_rate": 0.000940933092909628, + "loss": 0.96728498, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 1.02246094, + "step": 944, + "time_per_iteration": 2.6360957622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_mlp": 1.04550409, + "epoch": 0.18180069257406695, + "flos": 493372518912.0, + "grad_norm": 0.02569410792888805, + "language_loss": 0.9276287, + "learning_rate": 0.0009407861151998649, + "loss": 0.93910229, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 1.01953125, + "step": 945, + "time_per_iteration": 2.6910903453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147749, + "balance_loss_mlp": 1.04608703, + "epoch": 0.1819930742593305, + "flos": 571230423552.0, + "grad_norm": 0.024877151530798884, + "language_loss": 0.95025092, + "learning_rate": 0.0009406389663617552, + "loss": 0.96172833, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 1.01757812, + "step": 946, + "time_per_iteration": 2.689232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_mlp": 1.03669131, + "epoch": 0.18218545594459407, + "flos": 607110460416.0, + "grad_norm": 0.026141117268158143, + "language_loss": 0.96229172, + "learning_rate": 0.000940491646452427, + "loss": 0.97367907, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 1.02148438, + "step": 947, + "time_per_iteration": 2.720996618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136776, + "balance_loss_mlp": 1.03473294, + "epoch": 0.18237783762985763, + "flos": 549738931200.0, + "grad_norm": 0.02114848591843324, + "language_loss": 0.99382234, + "learning_rate": 0.000940344155529075, + "loss": 1.00519001, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 1.02148438, + "step": 948, + "time_per_iteration": 2.655764102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136656, + "balance_loss_mlp": 1.03489935, + "epoch": 0.1825702193151212, + "flos": 451674628608.0, + "grad_norm": 0.027816765537183038, + "language_loss": 0.98392528, + "learning_rate": 0.0009401964936489605, + "loss": 0.99529195, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 1.01855469, + "step": 949, + "time_per_iteration": 2.5372273921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_mlp": 1.03615081, + "epoch": 0.18276260100038477, + "flos": 590384040960.0, + "grad_norm": 0.023066854335363023, + "language_loss": 0.93237805, + "learning_rate": 0.0009400486608694108, + "loss": 0.94375616, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 1.01757812, + "step": 950, + "time_per_iteration": 2.7370681762695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139002, + "balance_loss_mlp": 1.03719783, + "epoch": 0.18295498268564833, + "flos": 788709531648.0, + "grad_norm": 0.02337801281240106, + "language_loss": 0.97100747, + "learning_rate": 0.0009399006572478195, + "loss": 0.98239744, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 1.01904297, + "step": 951, + "time_per_iteration": 3.1136744022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144566, + "balance_loss_mlp": 1.04276168, + "epoch": 0.1831473643709119, + "flos": 579225696768.0, + "grad_norm": 0.024500893588447415, + "language_loss": 0.99522519, + "learning_rate": 0.0009397524828416468, + "loss": 1.00667083, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 1.01904297, + "step": 952, + "time_per_iteration": 2.680551767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.03664696, + "epoch": 0.18333974605617545, + "flos": 567963293184.0, + "grad_norm": 0.023361368133084506, + "language_loss": 1.04812968, + "learning_rate": 0.0009396041377084192, + "loss": 1.05951309, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 1.01806641, + "step": 953, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136097, + "balance_loss_mlp": 1.03443527, + "epoch": 0.183532127741439, + "flos": 528069519360.0, + "grad_norm": 0.02324700647994909, + "language_loss": 0.98137838, + "learning_rate": 0.0009394556219057295, + "loss": 0.99273932, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 1.01757812, + "step": 954, + "time_per_iteration": 2.6928489208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147999, + "balance_loss_mlp": 1.04671907, + "epoch": 0.18372450942670257, + "flos": 595643940864.0, + "grad_norm": 0.02338261009959255, + "language_loss": 0.93879586, + "learning_rate": 0.0009393069354912362, + "loss": 0.95027584, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 1.01367188, + "step": 955, + "time_per_iteration": 2.7496042251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.05067647, + "epoch": 0.18391689111196613, + "flos": 646283824128.0, + "grad_norm": 0.029421035614033756, + "language_loss": 0.90626895, + "learning_rate": 0.0009391580785226649, + "loss": 0.91778857, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 1.01367188, + "step": 956, + "time_per_iteration": 2.9440600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.05253601, + "epoch": 0.18410927279722972, + "flos": 1460391975936.0, + "grad_norm": 0.020211591247266292, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80492932, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 1.0, + "step": 957, + "time_per_iteration": 4.738964796066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138037, + "balance_loss_mlp": 1.03623211, + "epoch": 0.18430165448249328, + "flos": 660003624960.0, + "grad_norm": 0.026926680065899915, + "language_loss": 0.95339954, + "learning_rate": 0.0009388598531545196, + "loss": 0.96477991, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 1.01904297, + "step": 958, + "time_per_iteration": 2.859509229660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138629, + "balance_loss_mlp": 1.03687191, + "epoch": 0.18449403616775684, + "flos": 518949606912.0, + "grad_norm": 0.029778126611616895, + "language_loss": 0.94583583, + "learning_rate": 0.000938710484870727, + "loss": 0.9572221, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 1.01855469, + "step": 959, + "time_per_iteration": 2.565548896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137101, + "balance_loss_mlp": 1.03543901, + "epoch": 0.1846864178530204, + "flos": 553824526848.0, + "grad_norm": 0.027283874554685776, + "language_loss": 0.94945395, + "learning_rate": 0.0009385609462644189, + "loss": 0.96082497, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 1.01757812, + "step": 960, + "time_per_iteration": 2.676379919052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138569, + "balance_loss_mlp": 1.03709817, + "epoch": 0.18487879953828396, + "flos": 467115953664.0, + "grad_norm": 0.025693285519799033, + "language_loss": 0.96468461, + "learning_rate": 0.0009384112373936514, + "loss": 0.97607034, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 1.015625, + "step": 961, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154728, + "balance_loss_mlp": 1.05325735, + "epoch": 0.18507118122354752, + "flos": 649683211776.0, + "grad_norm": 0.02725538915325764, + "language_loss": 1.0098747, + "learning_rate": 0.0009382613583165467, + "loss": 1.02142203, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 1.015625, + "step": 962, + "time_per_iteration": 2.8268754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116263, + "balance_loss_mlp": 1.06125438, + "epoch": 0.18526356290881107, + "flos": 627922475520.0, + "grad_norm": 0.027998512126097927, + "language_loss": 0.99849832, + "learning_rate": 0.0009381113090912928, + "loss": 1.01012468, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 1.01464844, + "step": 963, + "time_per_iteration": 2.7762861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147698, + "balance_loss_mlp": 1.04679894, + "epoch": 0.18545594459407463, + "flos": 433645650432.0, + "grad_norm": 0.027008272304904758, + "language_loss": 0.98634118, + "learning_rate": 0.000937961089776144, + "loss": 0.99781811, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 1.00976562, + "step": 964, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149635, + "balance_loss_mlp": 1.04844999, + "epoch": 0.1856483262793382, + "flos": 750426491904.0, + "grad_norm": 0.028502333826765886, + "language_loss": 0.91998804, + "learning_rate": 0.0009378107004294208, + "loss": 0.93148446, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 1.01269531, + "step": 965, + "time_per_iteration": 2.964561939239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_mlp": 1.05057883, + "epoch": 0.18584070796460178, + "flos": 531401777664.0, + "grad_norm": 0.02451376704559663, + "language_loss": 1.00210857, + "learning_rate": 0.0009376601411095096, + "loss": 1.01362348, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 1.00976562, + "step": 966, + "time_per_iteration": 2.6664164066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150482, + "balance_loss_mlp": 1.04953575, + "epoch": 0.18603308964986534, + "flos": 484083419136.0, + "grad_norm": 0.02282308899195351, + "language_loss": 0.93174511, + "learning_rate": 0.0009375094118748622, + "loss": 0.94324994, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 1.01025391, + "step": 967, + "time_per_iteration": 2.544952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142823, + "balance_loss_mlp": 1.041924, + "epoch": 0.1862254713351289, + "flos": 802681112064.0, + "grad_norm": 0.02495680742184495, + "language_loss": 1.00251484, + "learning_rate": 0.0009373585127839976, + "loss": 1.01394308, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 1.00976562, + "step": 968, + "time_per_iteration": 2.973095417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142113, + "balance_loss_mlp": 1.0413574, + "epoch": 0.18641785302039246, + "flos": 479290148352.0, + "grad_norm": 0.02509872783632802, + "language_loss": 0.9944787, + "learning_rate": 0.0009372074438954994, + "loss": 1.00589979, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 1.00830078, + "step": 969, + "time_per_iteration": 2.5303025245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142663, + "balance_loss_mlp": 1.04181159, + "epoch": 0.18661023470565602, + "flos": 389779072512.0, + "grad_norm": 0.02439046514561532, + "language_loss": 1.00939226, + "learning_rate": 0.0009370562052680181, + "loss": 1.02081895, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 1.00927734, + "step": 970, + "time_per_iteration": 2.5023443698883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.04929316, + "epoch": 0.18680261639091958, + "flos": 565775139840.0, + "grad_norm": 0.02213336285369191, + "language_loss": 0.95379293, + "learning_rate": 0.0009369047969602695, + "loss": 0.96529102, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 1.00585938, + "step": 971, + "time_per_iteration": 2.722823143005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154865, + "balance_loss_mlp": 1.05420506, + "epoch": 0.18699499807618314, + "flos": 480230137344.0, + "grad_norm": 0.029574405329312194, + "language_loss": 0.9913702, + "learning_rate": 0.0009367532190310357, + "loss": 1.00291884, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 1.00732422, + "step": 972, + "time_per_iteration": 2.633387327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.0490092, + "epoch": 0.1871873797614467, + "flos": 554328086016.0, + "grad_norm": 0.02905569815438633, + "language_loss": 0.99535728, + "learning_rate": 0.0009366014715391644, + "loss": 1.00685072, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 1.00390625, + "step": 973, + "time_per_iteration": 2.6549065113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153264, + "balance_loss_mlp": 1.05293763, + "epoch": 0.18737976144671029, + "flos": 553952781312.0, + "grad_norm": 0.023481989115367276, + "language_loss": 0.9123525, + "learning_rate": 0.0009364495545435693, + "loss": 0.92388517, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 1.00390625, + "step": 974, + "time_per_iteration": 4.409714221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_mlp": 1.05479944, + "epoch": 0.18757214313197385, + "flos": 503247770112.0, + "grad_norm": 0.022955013749569684, + "language_loss": 0.97297812, + "learning_rate": 0.0009362974681032297, + "loss": 0.98452938, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 1.00390625, + "step": 975, + "time_per_iteration": 2.61857533454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153706, + "balance_loss_mlp": 1.05352271, + "epoch": 0.1877645248172374, + "flos": 676291613184.0, + "grad_norm": 0.028784531937469084, + "language_loss": 0.98011422, + "learning_rate": 0.0009361452122771907, + "loss": 0.9916513, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 1.00244141, + "step": 976, + "time_per_iteration": 2.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.04923177, + "epoch": 0.18795690650250096, + "flos": 405862944768.0, + "grad_norm": 0.029616845561456457, + "language_loss": 0.95658362, + "learning_rate": 0.0009359927871245635, + "loss": 0.9680773, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 1.00195312, + "step": 977, + "time_per_iteration": 2.563232183456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149302, + "balance_loss_mlp": 1.04916573, + "epoch": 0.18814928818776452, + "flos": 639063355392.0, + "grad_norm": 0.027239481801034963, + "language_loss": 0.98439831, + "learning_rate": 0.0009358401927045246, + "loss": 0.99589127, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 1.00195312, + "step": 978, + "time_per_iteration": 2.8147568702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.04498518, + "epoch": 0.18834166987302808, + "flos": 1140115514880.0, + "grad_norm": 0.022094320674951175, + "language_loss": 0.96123868, + "learning_rate": 0.0009356874290763166, + "loss": 0.9726885, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 1.00048828, + "step": 979, + "time_per_iteration": 3.4719691276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149894, + "balance_loss_mlp": 1.04971051, + "epoch": 0.18853405155829164, + "flos": 505815957504.0, + "grad_norm": 0.02560863383472628, + "language_loss": 0.98637187, + "learning_rate": 0.0009355344962992474, + "loss": 0.99787074, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 1.00244141, + "step": 980, + "time_per_iteration": 2.6199324131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139646, + "balance_loss_mlp": 1.03931963, + "epoch": 0.1887264332435552, + "flos": 609370472448.0, + "grad_norm": 0.02150131271194909, + "language_loss": 0.97900265, + "learning_rate": 0.0009353813944326908, + "loss": 0.99039912, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 1.00390625, + "step": 981, + "time_per_iteration": 2.8862478733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143203, + "balance_loss_mlp": 1.04287672, + "epoch": 0.1889188149288188, + "flos": 553592212992.0, + "grad_norm": 0.027403519760576756, + "language_loss": 0.92598587, + "learning_rate": 0.0009352281235360863, + "loss": 0.93741786, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 1.00390625, + "step": 982, + "time_per_iteration": 2.680797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142003, + "balance_loss_mlp": 1.04167616, + "epoch": 0.18911119661408235, + "flos": 419469954048.0, + "grad_norm": 0.02481781093748577, + "language_loss": 0.92531025, + "learning_rate": 0.0009350746836689389, + "loss": 0.93673027, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 1.00390625, + "step": 983, + "time_per_iteration": 2.5687928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152649, + "balance_loss_mlp": 1.05289459, + "epoch": 0.1893035782993459, + "flos": 1485317784576.0, + "grad_norm": 0.01747927461324531, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82591867, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.99804688, + "step": 984, + "time_per_iteration": 4.978898048400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115218, + "balance_loss_mlp": 1.05237782, + "epoch": 0.18949595998460947, + "flos": 509456391168.0, + "grad_norm": 0.033971943902626214, + "language_loss": 0.94133711, + "learning_rate": 0.0009347672972613634, + "loss": 0.95285892, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.99853516, + "step": 985, + "time_per_iteration": 2.5850014686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153583, + "balance_loss_mlp": 1.05382824, + "epoch": 0.18968834166987303, + "flos": 532192045056.0, + "grad_norm": 0.027626772825507382, + "language_loss": 0.93152702, + "learning_rate": 0.0009346133508402735, + "loss": 0.9430629, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.99804688, + "step": 986, + "time_per_iteration": 2.7262227535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.04658782, + "epoch": 0.1898807233551366, + "flos": 500753442816.0, + "grad_norm": 0.02768975875157221, + "language_loss": 0.95335174, + "learning_rate": 0.0009344592356873166, + "loss": 0.96481234, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.99511719, + "step": 987, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149829, + "balance_loss_mlp": 1.05002666, + "epoch": 0.19007310504040015, + "flos": 603359236608.0, + "grad_norm": 0.02899497531058058, + "language_loss": 0.87347138, + "learning_rate": 0.0009343049518623255, + "loss": 0.88496965, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.99853516, + "step": 988, + "time_per_iteration": 2.726668119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143975, + "balance_loss_mlp": 1.04407787, + "epoch": 0.1902654867256637, + "flos": 602764353024.0, + "grad_norm": 0.022945627178248204, + "language_loss": 0.90576518, + "learning_rate": 0.0009341504994251985, + "loss": 0.91720492, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.99951172, + "step": 989, + "time_per_iteration": 2.8518989086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.05247498, + "epoch": 0.19045786841092727, + "flos": 1579231363584.0, + "grad_norm": 0.011944448483625032, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74672347, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.99414062, + "step": 990, + "time_per_iteration": 5.084089517593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144398, + "balance_loss_mlp": 1.04445326, + "epoch": 0.19065025009619085, + "flos": 683054184960.0, + "grad_norm": 0.025253455013724026, + "language_loss": 0.88680583, + "learning_rate": 0.0009338410889544574, + "loss": 0.8982498, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 1.0, + "step": 991, + "time_per_iteration": 3.007277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_mlp": 1.03949153, + "epoch": 0.1908426317814544, + "flos": 603441828864.0, + "grad_norm": 0.02514183514150974, + "language_loss": 0.96243769, + "learning_rate": 0.000933686131040967, + "loss": 0.97383535, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 1.00341797, + "step": 992, + "time_per_iteration": 2.7673017978668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_mlp": 1.04441845, + "epoch": 0.19103501346671797, + "flos": 587433818112.0, + "grad_norm": 0.025095383977303525, + "language_loss": 0.99126339, + "learning_rate": 0.0009335310047555883, + "loss": 1.00270796, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 1.00097656, + "step": 993, + "time_per_iteration": 2.782841920852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145602, + "balance_loss_mlp": 1.04565716, + "epoch": 0.19122739515198153, + "flos": 546834370560.0, + "grad_norm": 0.0365250692916995, + "language_loss": 0.97246122, + "learning_rate": 0.0009333757101585467, + "loss": 0.98391724, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 1.0, + "step": 994, + "time_per_iteration": 2.6937174797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142239, + "balance_loss_mlp": 1.04229414, + "epoch": 0.1914197768372451, + "flos": 522549107712.0, + "grad_norm": 0.02399514581888075, + "language_loss": 1.00362575, + "learning_rate": 0.0009332202473101329, + "loss": 1.01504803, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 1.0, + "step": 995, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137582, + "balance_loss_mlp": 1.03763652, + "epoch": 0.19161215852250865, + "flos": 612387824640.0, + "grad_norm": 0.024864495797513732, + "language_loss": 0.91319168, + "learning_rate": 0.0009330646162707028, + "loss": 0.92456746, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 1.0, + "step": 996, + "time_per_iteration": 2.7450180053710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113947, + "balance_loss_mlp": 1.03962064, + "epoch": 0.1918045402077722, + "flos": 848182619136.0, + "grad_norm": 0.02592603597590215, + "language_loss": 0.92579019, + "learning_rate": 0.0009329088171006779, + "loss": 0.93718487, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.99902344, + "step": 997, + "time_per_iteration": 3.1890194416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_mlp": 1.04457617, + "epoch": 0.19199692189303577, + "flos": 466892371968.0, + "grad_norm": 0.027577096255712943, + "language_loss": 0.95194477, + "learning_rate": 0.0009327528498605446, + "loss": 0.96338999, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 1.0, + "step": 998, + "time_per_iteration": 2.6845622062683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141712, + "balance_loss_mlp": 1.04143262, + "epoch": 0.19218930357829936, + "flos": 532613011968.0, + "grad_norm": 0.026795980657526523, + "language_loss": 0.98209792, + "learning_rate": 0.0009325967146108548, + "loss": 0.99351501, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 1.00341797, + "step": 999, + "time_per_iteration": 2.690363883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145933, + "balance_loss_mlp": 1.04589295, + "epoch": 0.19238168526356292, + "flos": 602727422976.0, + "grad_norm": 0.025877996038880184, + "language_loss": 0.97816348, + "learning_rate": 0.0009324404114122258, + "loss": 0.98962283, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 1.00097656, + "step": 1000, + "time_per_iteration": 2.717535972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139683, + "balance_loss_mlp": 1.03969073, + "epoch": 0.19257406694882648, + "flos": 573154062336.0, + "grad_norm": 0.0251308575536182, + "language_loss": 0.95425117, + "learning_rate": 0.0009322839403253397, + "loss": 0.96564806, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 1.00048828, + "step": 1001, + "time_per_iteration": 2.8128621578216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147687, + "balance_loss_mlp": 1.04793251, + "epoch": 0.19276644863409004, + "flos": 803156473344.0, + "grad_norm": 0.02827819499351052, + "language_loss": 0.93752921, + "learning_rate": 0.0009321273014109439, + "loss": 0.94900608, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.99804688, + "step": 1002, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115103, + "balance_loss_mlp": 1.05127609, + "epoch": 0.1929588303193536, + "flos": 564479311872.0, + "grad_norm": 0.02425681225612504, + "language_loss": 0.92063946, + "learning_rate": 0.0009319704947298513, + "loss": 0.93214977, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.99804688, + "step": 1003, + "time_per_iteration": 2.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148634, + "balance_loss_mlp": 1.04887998, + "epoch": 0.19315121200461716, + "flos": 627987603456.0, + "grad_norm": 0.023688885680104285, + "language_loss": 0.95116329, + "learning_rate": 0.0009318135203429393, + "loss": 0.96264958, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.99804688, + "step": 1004, + "time_per_iteration": 2.7953245639801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146221, + "balance_loss_mlp": 1.04646707, + "epoch": 0.19334359368988072, + "flos": 518583034368.0, + "grad_norm": 0.02448547542723696, + "language_loss": 0.95706153, + "learning_rate": 0.0009316563783111511, + "loss": 0.9685238, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.99804688, + "step": 1005, + "time_per_iteration": 2.7417562007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141812, + "balance_loss_mlp": 1.04224837, + "epoch": 0.19353597537514428, + "flos": 695399568384.0, + "grad_norm": 0.022656832097962477, + "language_loss": 0.91614294, + "learning_rate": 0.0009314990686954943, + "loss": 0.9275611, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.99609375, + "step": 1006, + "time_per_iteration": 2.921147584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143701, + "balance_loss_mlp": 1.04413795, + "epoch": 0.19372835706040784, + "flos": 1212199226880.0, + "grad_norm": 0.0213605480211332, + "language_loss": 0.89449364, + "learning_rate": 0.000931341591557042, + "loss": 0.90593064, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.99609375, + "step": 1007, + "time_per_iteration": 3.6934237480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142723, + "balance_loss_mlp": 1.04292154, + "epoch": 0.19392073874567142, + "flos": 521684980224.0, + "grad_norm": 0.02492230683936131, + "language_loss": 0.9970367, + "learning_rate": 0.0009311839469569325, + "loss": 1.00846386, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.99853516, + "step": 1008, + "time_per_iteration": 2.66283917427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141437, + "balance_loss_mlp": 1.04187346, + "epoch": 0.19411312043093498, + "flos": 589910681088.0, + "grad_norm": 0.028572464719479444, + "language_loss": 0.9835515, + "learning_rate": 0.0009310261349563687, + "loss": 0.99496591, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.99609375, + "step": 1009, + "time_per_iteration": 2.6913864612579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139912, + "balance_loss_mlp": 1.04034853, + "epoch": 0.19430550211619854, + "flos": 580571916288.0, + "grad_norm": 0.022224830980977262, + "language_loss": 0.9288035, + "learning_rate": 0.0009308681556166186, + "loss": 0.94020259, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.99609375, + "step": 1010, + "time_per_iteration": 2.8937342166900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_mlp": 1.04234338, + "epoch": 0.1944978838014621, + "flos": 622245611520.0, + "grad_norm": 0.028831874511777204, + "language_loss": 1.01060331, + "learning_rate": 0.0009307100089990152, + "loss": 1.02202237, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.99609375, + "step": 1011, + "time_per_iteration": 2.7086822986602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114452, + "balance_loss_mlp": 1.04495597, + "epoch": 0.19469026548672566, + "flos": 599814130176.0, + "grad_norm": 0.02434118582542042, + "language_loss": 0.95591187, + "learning_rate": 0.0009305516951649568, + "loss": 0.96735704, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.99609375, + "step": 1012, + "time_per_iteration": 2.7046425342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114359, + "balance_loss_mlp": 1.04402685, + "epoch": 0.19488264717198922, + "flos": 553247107584.0, + "grad_norm": 0.020712874248618226, + "language_loss": 0.93779677, + "learning_rate": 0.0009303932141759057, + "loss": 0.94923264, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.7684950828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145994, + "balance_loss_mlp": 1.0468123, + "epoch": 0.19507502885725278, + "flos": 667312690176.0, + "grad_norm": 0.029421944235057496, + "language_loss": 0.94045115, + "learning_rate": 0.0009302345660933902, + "loss": 0.95191121, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.9921875, + "step": 1014, + "time_per_iteration": 2.8242082595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.04442382, + "epoch": 0.19526741054251634, + "flos": 672327541248.0, + "grad_norm": 0.024449615989116238, + "language_loss": 0.93477654, + "learning_rate": 0.0009300757509790026, + "loss": 0.94621253, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.9921875, + "step": 1015, + "time_per_iteration": 2.840658664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144964, + "balance_loss_mlp": 1.04578233, + "epoch": 0.19545979222777993, + "flos": 448146986496.0, + "grad_norm": 0.028637929544829934, + "language_loss": 1.02226353, + "learning_rate": 0.0009299167688944005, + "loss": 1.0337131, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.9921875, + "step": 1016, + "time_per_iteration": 2.505427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114266, + "balance_loss_mlp": 1.04376352, + "epoch": 0.1956521739130435, + "flos": 570168910848.0, + "grad_norm": 0.02609870742448671, + "language_loss": 0.93148959, + "learning_rate": 0.0009297576199013063, + "loss": 0.94291621, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.98925781, + "step": 1017, + "time_per_iteration": 2.7357168197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155182, + "balance_loss_mlp": 1.05752563, + "epoch": 0.19584455559830705, + "flos": 1458880571904.0, + "grad_norm": 0.02028337436206496, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74157315, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.9765625, + "step": 1018, + "time_per_iteration": 5.09963059425354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.04962921, + "epoch": 0.1960369372835706, + "flos": 1594481307648.0, + "grad_norm": 0.015251553743586253, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80573392, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.97460938, + "step": 1019, + "time_per_iteration": 6.03454852104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146546, + "balance_loss_mlp": 1.0477457, + "epoch": 0.19622931896883417, + "flos": 617252954112.0, + "grad_norm": 0.02445318741287071, + "language_loss": 0.94190967, + "learning_rate": 0.0009292791720892659, + "loss": 0.9533751, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.98828125, + "step": 1020, + "time_per_iteration": 2.8369834423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147421, + "balance_loss_mlp": 1.0486201, + "epoch": 0.19642170065409773, + "flos": 467207278080.0, + "grad_norm": 0.027280190942869837, + "language_loss": 0.98824823, + "learning_rate": 0.0009291193560807218, + "loss": 0.99972242, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.98828125, + "step": 1021, + "time_per_iteration": 2.5833048820495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.0458802, + "epoch": 0.19661408233936128, + "flos": 516288093696.0, + "grad_norm": 0.025303886608753337, + "language_loss": 0.95740455, + "learning_rate": 0.0009289593734732688, + "loss": 0.96885145, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.98828125, + "step": 1022, + "time_per_iteration": 2.5913774967193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149525, + "balance_loss_mlp": 1.05058122, + "epoch": 0.19680646402462484, + "flos": 393493366272.0, + "grad_norm": 0.0253763529676381, + "language_loss": 1.01103711, + "learning_rate": 0.0009287992243290175, + "loss": 1.02253246, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.98974609, + "step": 1023, + "time_per_iteration": 2.4793736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115501, + "balance_loss_mlp": 1.05635238, + "epoch": 0.19699884570988843, + "flos": 627623032320.0, + "grad_norm": 0.02508480994731895, + "language_loss": 0.99886519, + "learning_rate": 0.0009286389087101435, + "loss": 1.01041532, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.98681641, + "step": 1024, + "time_per_iteration": 2.7772202491760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153446, + "balance_loss_mlp": 1.05483615, + "epoch": 0.197191227395152, + "flos": 559073693184.0, + "grad_norm": 0.02445444816711275, + "language_loss": 0.98426372, + "learning_rate": 0.0009284784266788864, + "loss": 0.99579823, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.98632812, + "step": 1025, + "time_per_iteration": 2.6955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150264, + "balance_loss_mlp": 1.05165374, + "epoch": 0.19738360908041555, + "flos": 666249176064.0, + "grad_norm": 0.021666801749132464, + "language_loss": 0.99231869, + "learning_rate": 0.0009283177782975512, + "loss": 1.00382137, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.98632812, + "step": 1026, + "time_per_iteration": 2.9886229038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05529749, + "epoch": 0.1975759907656791, + "flos": 523510563840.0, + "grad_norm": 0.025961932589349316, + "language_loss": 0.98509014, + "learning_rate": 0.000928156963628507, + "loss": 0.99662918, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.98632812, + "step": 1027, + "time_per_iteration": 2.586740493774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149439, + "balance_loss_mlp": 1.05097175, + "epoch": 0.19776837245094267, + "flos": 463484252160.0, + "grad_norm": 0.02550253779434718, + "language_loss": 0.96135926, + "learning_rate": 0.0009279959827341877, + "loss": 0.97285366, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.98486328, + "step": 1028, + "time_per_iteration": 2.723517894744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146754, + "balance_loss_mlp": 1.04852605, + "epoch": 0.19796075413620623, + "flos": 504057503232.0, + "grad_norm": 0.02160335630411572, + "language_loss": 0.96627682, + "learning_rate": 0.0009278348356770915, + "loss": 0.97774434, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.98242188, + "step": 1029, + "time_per_iteration": 2.566802501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144801, + "balance_loss_mlp": 1.04666746, + "epoch": 0.1981531358214698, + "flos": 508570796544.0, + "grad_norm": 0.024261507948164947, + "language_loss": 0.9528529, + "learning_rate": 0.0009276735225197814, + "loss": 0.96430099, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.98144531, + "step": 1030, + "time_per_iteration": 2.6009340286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145205, + "balance_loss_mlp": 1.04702377, + "epoch": 0.19834551750673335, + "flos": 532639208448.0, + "grad_norm": 0.023062563394134136, + "language_loss": 0.95906407, + "learning_rate": 0.0009275120433248847, + "loss": 0.97051609, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.98193359, + "step": 1031, + "time_per_iteration": 2.684858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145757, + "balance_loss_mlp": 1.0477196, + "epoch": 0.1985378991919969, + "flos": 776969765376.0, + "grad_norm": 0.02469129884935611, + "language_loss": 0.94986421, + "learning_rate": 0.0009273503981550931, + "loss": 0.96132183, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.98046875, + "step": 1032, + "time_per_iteration": 3.058094024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.04737103, + "epoch": 0.1987302808772605, + "flos": 435191256576.0, + "grad_norm": 0.025952536265860523, + "language_loss": 0.96777844, + "learning_rate": 0.0009271885870731626, + "loss": 0.9792316, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.97949219, + "step": 1033, + "time_per_iteration": 2.493664503097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153962, + "balance_loss_mlp": 1.05592442, + "epoch": 0.19892266256252406, + "flos": 554653725696.0, + "grad_norm": 0.029222795446194067, + "language_loss": 1.0035603, + "learning_rate": 0.0009270266101419143, + "loss": 1.01509976, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.98046875, + "step": 1034, + "time_per_iteration": 2.626612901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145521, + "balance_loss_mlp": 1.04748368, + "epoch": 0.19911504424778761, + "flos": 550948164096.0, + "grad_norm": 0.02425528851980561, + "language_loss": 0.92802572, + "learning_rate": 0.0009268644674242328, + "loss": 0.9394809, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.98046875, + "step": 1035, + "time_per_iteration": 2.683253288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148174, + "balance_loss_mlp": 1.04994512, + "epoch": 0.19930742593305117, + "flos": 519312176640.0, + "grad_norm": 0.02646778626346152, + "language_loss": 0.91577774, + "learning_rate": 0.0009267021589830678, + "loss": 0.9272595, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.98242188, + "step": 1036, + "time_per_iteration": 2.7614338397979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218948, + "balance_loss_mlp": 1.11824036, + "epoch": 0.19949980761831473, + "flos": 1512637863936.0, + "grad_norm": 0.02467753292442409, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78846025, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 1.0078125, + "step": 1037, + "time_per_iteration": 4.962339878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114988, + "balance_loss_mlp": 1.05184233, + "epoch": 0.1996921893035783, + "flos": 699439501824.0, + "grad_norm": 0.02757683731024766, + "language_loss": 1.02362621, + "learning_rate": 0.000926377045182406, + "loss": 1.03512502, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.98046875, + "step": 1038, + "time_per_iteration": 2.916594982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155504, + "balance_loss_mlp": 1.05727601, + "epoch": 0.19988457098884185, + "flos": 728394510336.0, + "grad_norm": 0.024851830352508646, + "language_loss": 0.97729039, + "learning_rate": 0.0009262142399491296, + "loss": 0.98884547, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.98242188, + "step": 1039, + "time_per_iteration": 3.0976781845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156606, + "balance_loss_mlp": 1.05837739, + "epoch": 0.2000769526741054, + "flos": 561624416256.0, + "grad_norm": 0.025662568358030838, + "language_loss": 0.98388815, + "learning_rate": 0.0009260512692448105, + "loss": 0.99545419, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.98242188, + "step": 1040, + "time_per_iteration": 2.715479850769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.05308211, + "epoch": 0.200269334359369, + "flos": 573164795904.0, + "grad_norm": 0.022253887646478135, + "language_loss": 0.93097693, + "learning_rate": 0.000925888133132719, + "loss": 0.9424901, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.98242188, + "step": 1041, + "time_per_iteration": 2.7987864017486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011912, + "balance_loss_mlp": 1.0923996, + "epoch": 0.20046171604463256, + "flos": 1489152875520.0, + "grad_norm": 0.020655335232781416, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80801636, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.98828125, + "step": 1042, + "time_per_iteration": 4.944507360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154531, + "balance_loss_mlp": 1.05644536, + "epoch": 0.20065409772989612, + "flos": 497577636864.0, + "grad_norm": 0.02609736880654102, + "language_loss": 0.92129564, + "learning_rate": 0.0009255613649386244, + "loss": 0.932841, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.98095703, + "step": 1043, + "time_per_iteration": 2.6478612422943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157191, + "balance_loss_mlp": 1.05915368, + "epoch": 0.20084647941515968, + "flos": 580463127552.0, + "grad_norm": 0.02650777474930283, + "language_loss": 0.87469566, + "learning_rate": 0.0009253977329834838, + "loss": 0.88626754, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.98046875, + "step": 1044, + "time_per_iteration": 2.7641594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161195, + "balance_loss_mlp": 1.06315744, + "epoch": 0.20103886110042324, + "flos": 643287939072.0, + "grad_norm": 0.030624079602620518, + "language_loss": 0.9713465, + "learning_rate": 0.0009252339358742965, + "loss": 0.98295844, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.98046875, + "step": 1045, + "time_per_iteration": 2.811687707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.0594964, + "epoch": 0.2012312427856868, + "flos": 442969678848.0, + "grad_norm": 0.023268596270985206, + "language_loss": 0.93283701, + "learning_rate": 0.000925069973674654, + "loss": 0.94440854, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.9765625, + "step": 1046, + "time_per_iteration": 2.6709671020507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157527, + "balance_loss_mlp": 1.05948889, + "epoch": 0.20142362447095036, + "flos": 555472190976.0, + "grad_norm": 0.022730221646095148, + "language_loss": 0.96496689, + "learning_rate": 0.000924905846448212, + "loss": 0.97654217, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.98046875, + "step": 1047, + "time_per_iteration": 2.7338547706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115317, + "balance_loss_mlp": 1.05522716, + "epoch": 0.20161600615621392, + "flos": 671554738176.0, + "grad_norm": 0.026697286803692055, + "language_loss": 0.96143991, + "learning_rate": 0.0009247415542586906, + "loss": 0.97297156, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.97949219, + "step": 1048, + "time_per_iteration": 2.849416494369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149865, + "balance_loss_mlp": 1.05216146, + "epoch": 0.2018083878414775, + "flos": 574306899456.0, + "grad_norm": 0.021371049275305663, + "language_loss": 0.91504782, + "learning_rate": 0.0009245770971698735, + "loss": 0.92654645, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.97705078, + "step": 1049, + "time_per_iteration": 2.8751590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151512, + "balance_loss_mlp": 1.05376041, + "epoch": 0.20200076952674106, + "flos": 426794482176.0, + "grad_norm": 0.027360075371486055, + "language_loss": 0.97835737, + "learning_rate": 0.0009244124752456087, + "loss": 0.98987252, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.97753906, + "step": 1050, + "time_per_iteration": 2.4985499382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153257, + "balance_loss_mlp": 1.05531442, + "epoch": 0.20219315121200462, + "flos": 537684258816.0, + "grad_norm": 0.025856302906645603, + "language_loss": 0.95370412, + "learning_rate": 0.0009242476885498081, + "loss": 0.96523666, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.97949219, + "step": 1051, + "time_per_iteration": 2.7127723693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150827, + "balance_loss_mlp": 1.05297983, + "epoch": 0.20238553289726818, + "flos": 478834252800.0, + "grad_norm": 0.02631802181941096, + "language_loss": 0.90995431, + "learning_rate": 0.0009240827371464474, + "loss": 0.92146254, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.97851562, + "step": 1052, + "time_per_iteration": 2.527918577194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144335, + "balance_loss_mlp": 1.04667878, + "epoch": 0.20257791458253174, + "flos": 1153846049280.0, + "grad_norm": 0.025276400477213575, + "language_loss": 0.92167991, + "learning_rate": 0.0009239176210995666, + "loss": 0.93312329, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.9765625, + "step": 1053, + "time_per_iteration": 3.4556469917297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144677, + "balance_loss_mlp": 1.04682982, + "epoch": 0.2027702962677953, + "flos": 668148619776.0, + "grad_norm": 0.025342755763179396, + "language_loss": 1.04358864, + "learning_rate": 0.0009237523404732695, + "loss": 1.05503547, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.97851562, + "step": 1054, + "time_per_iteration": 2.894198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144665, + "balance_loss_mlp": 1.04676986, + "epoch": 0.20296267795305886, + "flos": 642452009472.0, + "grad_norm": 0.02468028394334187, + "language_loss": 0.94787639, + "learning_rate": 0.0009235868953317235, + "loss": 0.95932305, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.97900391, + "step": 1055, + "time_per_iteration": 2.812633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148717, + "balance_loss_mlp": 1.05082273, + "epoch": 0.20315505963832242, + "flos": 932129622528.0, + "grad_norm": 0.02533903757078053, + "language_loss": 0.93907225, + "learning_rate": 0.0009234212857391602, + "loss": 0.95055938, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.97900391, + "step": 1056, + "time_per_iteration": 3.2061142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147493, + "balance_loss_mlp": 1.0496459, + "epoch": 0.20334744132358598, + "flos": 563287543296.0, + "grad_norm": 0.019686870604104637, + "language_loss": 0.97330248, + "learning_rate": 0.000923255511759875, + "loss": 0.98477745, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.97851562, + "step": 1057, + "time_per_iteration": 2.7639002799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150039, + "balance_loss_mlp": 1.05219197, + "epoch": 0.20353982300884957, + "flos": 645428428800.0, + "grad_norm": 0.023252811049323967, + "language_loss": 0.95256209, + "learning_rate": 0.000923089573458227, + "loss": 0.96406245, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.97851562, + "step": 1058, + "time_per_iteration": 2.857612133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114962, + "balance_loss_mlp": 1.05177307, + "epoch": 0.20373220469411313, + "flos": 652705293312.0, + "grad_norm": 0.02395962669603635, + "language_loss": 0.93332446, + "learning_rate": 0.0009229234708986392, + "loss": 0.94482064, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.97851562, + "step": 1059, + "time_per_iteration": 2.877995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150688, + "balance_loss_mlp": 1.05436707, + "epoch": 0.2039245863793767, + "flos": 1440396973056.0, + "grad_norm": 0.013896761524226428, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82817578, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.96289062, + "step": 1060, + "time_per_iteration": 4.659267902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142187, + "balance_loss_mlp": 1.04434025, + "epoch": 0.20411696806464025, + "flos": 598127534592.0, + "grad_norm": 0.026599581611848343, + "language_loss": 0.93894625, + "learning_rate": 0.0009225907732636548, + "loss": 0.95036817, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.97851562, + "step": 1061, + "time_per_iteration": 2.7480902671813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115027, + "balance_loss_mlp": 1.05242312, + "epoch": 0.2043093497499038, + "flos": 574897053696.0, + "grad_norm": 0.026136319737411078, + "language_loss": 0.96460152, + "learning_rate": 0.0009224241783174227, + "loss": 0.97610414, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.97851562, + "step": 1062, + "time_per_iteration": 2.676877021789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146738, + "balance_loss_mlp": 1.04874802, + "epoch": 0.20450173143516737, + "flos": 631523977728.0, + "grad_norm": 0.02709710709634581, + "language_loss": 0.94472104, + "learning_rate": 0.0009222574193715802, + "loss": 0.95618844, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.97998047, + "step": 1063, + "time_per_iteration": 2.7604472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141026, + "balance_loss_mlp": 1.04298854, + "epoch": 0.20469411312043093, + "flos": 575146831872.0, + "grad_norm": 0.022769515120839894, + "language_loss": 0.95189404, + "learning_rate": 0.000922090496490869, + "loss": 0.96330428, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.98046875, + "step": 1064, + "time_per_iteration": 2.728154182434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141583, + "balance_loss_mlp": 1.04383183, + "epoch": 0.20488649480569449, + "flos": 638279818752.0, + "grad_norm": 0.022393105289594414, + "language_loss": 0.97629392, + "learning_rate": 0.0009219234097400937, + "loss": 0.9877097, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.97753906, + "step": 1065, + "time_per_iteration": 2.889946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.03989744, + "epoch": 0.20507887649095807, + "flos": 977437747200.0, + "grad_norm": 0.024872828726298618, + "language_loss": 0.9305777, + "learning_rate": 0.0009217561591841237, + "loss": 0.94195515, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.97851562, + "step": 1066, + "time_per_iteration": 3.296248435974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144611, + "balance_loss_mlp": 1.04681206, + "epoch": 0.20527125817622163, + "flos": 487155165696.0, + "grad_norm": 0.024567371957878288, + "language_loss": 0.90358436, + "learning_rate": 0.0009215887448878913, + "loss": 0.91503048, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.97802734, + "step": 1067, + "time_per_iteration": 2.5662190914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137303, + "balance_loss_mlp": 1.03945625, + "epoch": 0.2054636398614852, + "flos": 528210508800.0, + "grad_norm": 0.02249486638659544, + "language_loss": 0.94470721, + "learning_rate": 0.0009214211669163922, + "loss": 0.9560802, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.97851562, + "step": 1068, + "time_per_iteration": 2.6912589073181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139468, + "balance_loss_mlp": 1.04162145, + "epoch": 0.20565602154674875, + "flos": 559323471360.0, + "grad_norm": 0.022635174506508055, + "language_loss": 1.02501464, + "learning_rate": 0.0009212534253346862, + "loss": 1.03640926, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.97851562, + "step": 1069, + "time_per_iteration": 2.708683490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135123, + "balance_loss_mlp": 1.03746641, + "epoch": 0.2058484032320123, + "flos": 505221073920.0, + "grad_norm": 0.02479403914192968, + "language_loss": 0.95383358, + "learning_rate": 0.0009210855202078964, + "loss": 0.96518481, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.9765625, + "step": 1070, + "time_per_iteration": 2.6434948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132203, + "balance_loss_mlp": 1.03478527, + "epoch": 0.20604078491727587, + "flos": 434047151616.0, + "grad_norm": 0.024632817960327506, + "language_loss": 0.96572351, + "learning_rate": 0.0009209174516012091, + "loss": 0.97704554, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.97412109, + "step": 1071, + "time_per_iteration": 2.4891347885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148822, + "balance_loss_mlp": 1.05130851, + "epoch": 0.20623316660253943, + "flos": 609874031616.0, + "grad_norm": 0.024395492192686875, + "language_loss": 0.97482872, + "learning_rate": 0.0009207492195798747, + "loss": 0.98631692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.97509766, + "step": 1072, + "time_per_iteration": 2.758575201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152495, + "balance_loss_mlp": 1.05502975, + "epoch": 0.206425548287803, + "flos": 481393708032.0, + "grad_norm": 0.027205333287948934, + "language_loss": 0.9402262, + "learning_rate": 0.0009205808242092061, + "loss": 0.95175123, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.97460938, + "step": 1073, + "time_per_iteration": 2.6534366607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152896, + "balance_loss_mlp": 1.05562115, + "epoch": 0.20661792997306658, + "flos": 951122784768.0, + "grad_norm": 0.02943422736446298, + "language_loss": 0.93147469, + "learning_rate": 0.0009204122655545808, + "loss": 0.94300359, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.97265625, + "step": 1074, + "time_per_iteration": 3.317518949508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149199, + "balance_loss_mlp": 1.05201948, + "epoch": 0.20681031165833014, + "flos": 604616133120.0, + "grad_norm": 0.024855118115069977, + "language_loss": 0.88961834, + "learning_rate": 0.0009202435436814388, + "loss": 0.90111029, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.97167969, + "step": 1075, + "time_per_iteration": 2.6815345287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142912, + "balance_loss_mlp": 1.04563749, + "epoch": 0.2070026933435937, + "flos": 710265475584.0, + "grad_norm": 0.027130222852878607, + "language_loss": 0.99239773, + "learning_rate": 0.0009200746586552836, + "loss": 1.00382686, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.97265625, + "step": 1076, + "time_per_iteration": 2.9578917026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141976, + "balance_loss_mlp": 1.04451025, + "epoch": 0.20719507502885726, + "flos": 831254085120.0, + "grad_norm": 0.023090334700176834, + "language_loss": 0.92780054, + "learning_rate": 0.0009199056105416825, + "loss": 0.93922031, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.97460938, + "step": 1077, + "time_per_iteration": 3.0944156646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140475, + "balance_loss_mlp": 1.04324794, + "epoch": 0.20738745671412082, + "flos": 639499785216.0, + "grad_norm": 0.023914471883828003, + "language_loss": 0.96186948, + "learning_rate": 0.0009197363994062654, + "loss": 0.97327423, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.97216797, + "step": 1078, + "time_per_iteration": 2.8147799968719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142489, + "balance_loss_mlp": 1.04521394, + "epoch": 0.20757983839938438, + "flos": 686983328256.0, + "grad_norm": 0.02237329029547868, + "language_loss": 0.90686679, + "learning_rate": 0.0009195670253147262, + "loss": 0.91829169, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.97265625, + "step": 1079, + "time_per_iteration": 2.994058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141016, + "balance_loss_mlp": 1.04383624, + "epoch": 0.20777222008464794, + "flos": 520317293568.0, + "grad_norm": 0.026634413874044322, + "language_loss": 0.92195654, + "learning_rate": 0.0009193974883328216, + "loss": 0.93336666, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.97167969, + "step": 1080, + "time_per_iteration": 2.6506502628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140462, + "balance_loss_mlp": 1.04333031, + "epoch": 0.2079646017699115, + "flos": 512469740544.0, + "grad_norm": 0.025261028079588584, + "language_loss": 0.97185814, + "learning_rate": 0.0009192277885263718, + "loss": 0.98326278, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.97119141, + "step": 1081, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143678, + "balance_loss_mlp": 1.04640269, + "epoch": 0.20815698345517505, + "flos": 933467109888.0, + "grad_norm": 0.02363260569338726, + "language_loss": 0.9496327, + "learning_rate": 0.0009190579259612602, + "loss": 0.96106946, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.97265625, + "step": 1082, + "time_per_iteration": 3.2829811573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150642, + "balance_loss_mlp": 1.05336761, + "epoch": 0.20834936514043864, + "flos": 633553677312.0, + "grad_norm": 0.02436625118168465, + "language_loss": 0.97094011, + "learning_rate": 0.000918887900703433, + "loss": 0.98244655, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.97265625, + "step": 1083, + "time_per_iteration": 2.779474973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147642, + "balance_loss_mlp": 1.05079603, + "epoch": 0.2085417468257022, + "flos": 395243088384.0, + "grad_norm": 0.027448171988374206, + "language_loss": 0.98109657, + "learning_rate": 0.0009187177128188999, + "loss": 0.99257296, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.96826172, + "step": 1084, + "time_per_iteration": 2.487755298614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156746, + "balance_loss_mlp": 1.06118774, + "epoch": 0.20873412851096576, + "flos": 1405195138560.0, + "grad_norm": 0.014888537960634525, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78313285, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.95507812, + "step": 1085, + "time_per_iteration": 4.917901515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146704, + "balance_loss_mlp": 1.04981041, + "epoch": 0.20892651019622932, + "flos": 448761335808.0, + "grad_norm": 0.0275038267286557, + "language_loss": 0.93389261, + "learning_rate": 0.000918376849434071, + "loss": 0.94535965, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.96875, + "step": 1086, + "time_per_iteration": 2.5117850303649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153188, + "balance_loss_mlp": 1.05629456, + "epoch": 0.20911889188149288, + "flos": 494080194048.0, + "grad_norm": 0.034273062806107445, + "language_loss": 1.02428699, + "learning_rate": 0.0009182061740661098, + "loss": 1.03581882, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.96875, + "step": 1087, + "time_per_iteration": 2.5270984172821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154258, + "balance_loss_mlp": 1.05736482, + "epoch": 0.20931127356675644, + "flos": 842748802560.0, + "grad_norm": 0.02361505883443172, + "language_loss": 0.92997056, + "learning_rate": 0.0009180353363361127, + "loss": 0.94151306, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.96875, + "step": 1088, + "time_per_iteration": 3.1549112796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154015, + "balance_loss_mlp": 1.05688298, + "epoch": 0.20950365525202, + "flos": 758523823104.0, + "grad_norm": 0.028384526527587387, + "language_loss": 0.93851304, + "learning_rate": 0.0009178643363104044, + "loss": 0.95005322, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.97119141, + "step": 1089, + "time_per_iteration": 4.693684339523315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.05159688, + "epoch": 0.20969603693728356, + "flos": 473491760640.0, + "grad_norm": 0.03411348227976855, + "language_loss": 1.04663801, + "learning_rate": 0.0009176931740553735, + "loss": 1.05812478, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.97070312, + "step": 1090, + "time_per_iteration": 2.5203866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146066, + "balance_loss_mlp": 1.04917288, + "epoch": 0.20988841862254715, + "flos": 978627514368.0, + "grad_norm": 0.027482857176328385, + "language_loss": 0.92998403, + "learning_rate": 0.0009175218496374708, + "loss": 0.94144469, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.96875, + "step": 1091, + "time_per_iteration": 3.362614870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.05544364, + "epoch": 0.2100808003078107, + "flos": 1094818123776.0, + "grad_norm": 0.028049590852478556, + "language_loss": 0.96363866, + "learning_rate": 0.0009173503631232103, + "loss": 0.97516203, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.96875, + "step": 1092, + "time_per_iteration": 3.359970808029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150696, + "balance_loss_mlp": 1.05399334, + "epoch": 0.21027318199307427, + "flos": 1014559217664.0, + "grad_norm": 0.03210489869185377, + "language_loss": 0.94109344, + "learning_rate": 0.0009171787145791691, + "loss": 0.95260036, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.96679688, + "step": 1093, + "time_per_iteration": 3.2180042266845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150028, + "balance_loss_mlp": 1.05323017, + "epoch": 0.21046556367833782, + "flos": 522412121088.0, + "grad_norm": 0.02762257246471406, + "language_loss": 0.92679179, + "learning_rate": 0.000917006904071987, + "loss": 0.93829209, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.96777344, + "step": 1094, + "time_per_iteration": 2.5961859226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152841, + "balance_loss_mlp": 1.0559479, + "epoch": 0.21065794536360138, + "flos": 604839714816.0, + "grad_norm": 0.02570597393175465, + "language_loss": 0.97250223, + "learning_rate": 0.0009168349316683669, + "loss": 0.98403066, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.96875, + "step": 1095, + "time_per_iteration": 2.7164759635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153125, + "balance_loss_mlp": 1.05642295, + "epoch": 0.21085032704886494, + "flos": 604557735936.0, + "grad_norm": 0.022711755724658188, + "language_loss": 0.91088736, + "learning_rate": 0.0009166627974350741, + "loss": 0.92241859, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.96679688, + "step": 1096, + "time_per_iteration": 2.8912341594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05739498, + "epoch": 0.2110427087341285, + "flos": 638831041536.0, + "grad_norm": 0.027939519002465243, + "language_loss": 1.01164758, + "learning_rate": 0.0009164905014389373, + "loss": 1.02318668, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.96484375, + "step": 1097, + "time_per_iteration": 2.758725881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115008, + "balance_loss_mlp": 1.05356789, + "epoch": 0.21123509041939206, + "flos": 523929529344.0, + "grad_norm": 0.027217895626849283, + "language_loss": 0.96537346, + "learning_rate": 0.0009163180437468476, + "loss": 0.97687429, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.96484375, + "step": 1098, + "time_per_iteration": 2.6157684326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011531, + "balance_loss_mlp": 1.05658853, + "epoch": 0.21142747210465565, + "flos": 452193650688.0, + "grad_norm": 0.025540912808389868, + "language_loss": 0.94842321, + "learning_rate": 0.000916145424425759, + "loss": 0.9599542, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.96484375, + "step": 1099, + "time_per_iteration": 2.6368908882141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157233, + "balance_loss_mlp": 1.06081605, + "epoch": 0.2116198537899192, + "flos": 877625723904.0, + "grad_norm": 0.02885196772961066, + "language_loss": 1.02573156, + "learning_rate": 0.0009159726435426885, + "loss": 1.03730392, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.96386719, + "step": 1100, + "time_per_iteration": 3.0916907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011557, + "balance_loss_mlp": 1.05909276, + "epoch": 0.21181223547518277, + "flos": 524674134528.0, + "grad_norm": 0.025603473018395394, + "language_loss": 0.99936807, + "learning_rate": 0.0009157997011647154, + "loss": 1.01092505, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.96582031, + "step": 1101, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152722, + "balance_loss_mlp": 1.05630529, + "epoch": 0.21200461716044633, + "flos": 573425307648.0, + "grad_norm": 0.02306433427515447, + "language_loss": 0.93708789, + "learning_rate": 0.0009156265973589817, + "loss": 0.94861513, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.96386719, + "step": 1102, + "time_per_iteration": 2.786557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148247, + "balance_loss_mlp": 1.05187845, + "epoch": 0.2121969988457099, + "flos": 546174359040.0, + "grad_norm": 0.023119673851329285, + "language_loss": 0.9826746, + "learning_rate": 0.0009154533321926926, + "loss": 0.99415696, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.96337891, + "step": 1103, + "time_per_iteration": 2.6500911712646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150448, + "balance_loss_mlp": 1.05393636, + "epoch": 0.21238938053097345, + "flos": 845353920000.0, + "grad_norm": 0.02523726215492747, + "language_loss": 0.96587884, + "learning_rate": 0.0009152799057331156, + "loss": 0.97738338, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.96484375, + "step": 1104, + "time_per_iteration": 3.1080517768859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148697, + "balance_loss_mlp": 1.05213737, + "epoch": 0.212581762216237, + "flos": 447141869568.0, + "grad_norm": 0.026678256955328494, + "language_loss": 1.00256824, + "learning_rate": 0.0009151063180475805, + "loss": 1.01405525, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.96533203, + "step": 1105, + "time_per_iteration": 2.530207633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153737, + "balance_loss_mlp": 1.05703473, + "epoch": 0.21277414390150057, + "flos": 515385034752.0, + "grad_norm": 0.026680614248996183, + "language_loss": 0.9432478, + "learning_rate": 0.0009149325692034803, + "loss": 0.95478517, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.96679688, + "step": 1106, + "time_per_iteration": 2.576834201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159119, + "balance_loss_mlp": 1.06413269, + "epoch": 0.21296652558676413, + "flos": 1488512329728.0, + "grad_norm": 0.01358013302766655, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80362546, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.94921875, + "step": 1107, + "time_per_iteration": 4.821696996688843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156765, + "balance_loss_mlp": 1.06006265, + "epoch": 0.21315890727202771, + "flos": 847450748928.0, + "grad_norm": 0.031460519319247274, + "language_loss": 0.96369046, + "learning_rate": 0.0009145845883094678, + "loss": 0.97525811, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.96679688, + "step": 1108, + "time_per_iteration": 3.029548168182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159379, + "balance_loss_mlp": 1.06267655, + "epoch": 0.21335128895729127, + "flos": 630555790848.0, + "grad_norm": 0.028067626854192333, + "language_loss": 0.95182431, + "learning_rate": 0.000914410356394654, + "loss": 0.96341801, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.96679688, + "step": 1109, + "time_per_iteration": 2.737241268157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.06352139, + "epoch": 0.21354367064255483, + "flos": 712284441600.0, + "grad_norm": 0.023599510024272945, + "language_loss": 0.92540836, + "learning_rate": 0.0009142359635914709, + "loss": 0.93701446, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.97070312, + "step": 1110, + "time_per_iteration": 3.0267913341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.0645479, + "epoch": 0.2137360523278184, + "flos": 457210503168.0, + "grad_norm": 0.02473497568188501, + "language_loss": 0.9156003, + "learning_rate": 0.0009140614099676245, + "loss": 0.92721474, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.96875, + "step": 1111, + "time_per_iteration": 2.5756866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164266, + "balance_loss_mlp": 1.06727743, + "epoch": 0.21392843401308195, + "flos": 667265026560.0, + "grad_norm": 0.025344438139363285, + "language_loss": 0.90291333, + "learning_rate": 0.0009138866955908821, + "loss": 0.91455603, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.96972656, + "step": 1112, + "time_per_iteration": 2.9406254291534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_mlp": 1.06319368, + "epoch": 0.2141208156983455, + "flos": 750361363968.0, + "grad_norm": 0.02581510235299489, + "language_loss": 0.89949894, + "learning_rate": 0.0009137118205290738, + "loss": 0.91109931, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.96826172, + "step": 1113, + "time_per_iteration": 2.966989278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162558, + "balance_loss_mlp": 1.06547356, + "epoch": 0.21431319738360907, + "flos": 420010443264.0, + "grad_norm": 0.024953242249854055, + "language_loss": 1.00419319, + "learning_rate": 0.0009135367848500924, + "loss": 1.01581883, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.97070312, + "step": 1114, + "time_per_iteration": 2.4954934120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161216, + "balance_loss_mlp": 1.06456113, + "epoch": 0.21450557906887263, + "flos": 610238602752.0, + "grad_norm": 0.030213425802119154, + "language_loss": 0.9839642, + "learning_rate": 0.0009133615886218927, + "loss": 0.99557638, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.96630859, + "step": 1115, + "time_per_iteration": 2.71352219581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152764, + "balance_loss_mlp": 1.05625272, + "epoch": 0.21469796075413622, + "flos": 562974638592.0, + "grad_norm": 0.027635545182738433, + "language_loss": 0.99806535, + "learning_rate": 0.0009131862319124917, + "loss": 1.00959289, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.96484375, + "step": 1116, + "time_per_iteration": 2.630807876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153717, + "balance_loss_mlp": 1.05720496, + "epoch": 0.21489034243939978, + "flos": 595737266688.0, + "grad_norm": 0.024806539819872384, + "language_loss": 0.94489264, + "learning_rate": 0.0009130107147899691, + "loss": 0.95642984, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.96484375, + "step": 1117, + "time_per_iteration": 2.7123875617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154765, + "balance_loss_mlp": 1.05825305, + "epoch": 0.21508272412466334, + "flos": 442850156544.0, + "grad_norm": 0.024517194331867692, + "language_loss": 0.93784142, + "learning_rate": 0.0009128350373224665, + "loss": 0.9493891, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.96484375, + "step": 1118, + "time_per_iteration": 2.5384151935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169045, + "balance_loss_mlp": 1.07348633, + "epoch": 0.2152751058099269, + "flos": 1499232242688.0, + "grad_norm": 0.019396990855708212, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82625473, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.95507812, + "step": 1119, + "time_per_iteration": 4.644891262054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156688, + "balance_loss_mlp": 1.05989027, + "epoch": 0.21546748749519046, + "flos": 494991985152.0, + "grad_norm": 0.030440112014221473, + "language_loss": 0.9407053, + "learning_rate": 0.0009124832016254005, + "loss": 0.95227218, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.96777344, + "step": 1120, + "time_per_iteration": 2.588834285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163526, + "balance_loss_mlp": 1.06691861, + "epoch": 0.21565986918045402, + "flos": 635694167040.0, + "grad_norm": 0.030206495794058562, + "language_loss": 0.96966755, + "learning_rate": 0.0009123070435324316, + "loss": 0.98130286, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.96582031, + "step": 1121, + "time_per_iteration": 2.786072015762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170601, + "balance_loss_mlp": 1.07542419, + "epoch": 0.21585225086571758, + "flos": 1586798939136.0, + "grad_norm": 0.013013152417503263, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.79046386, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.95117188, + "step": 1122, + "time_per_iteration": 4.946362733840942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.0685885, + "epoch": 0.21604463255098114, + "flos": 685322202624.0, + "grad_norm": 0.027822137906457534, + "language_loss": 0.94040322, + "learning_rate": 0.0009119542471995752, + "loss": 0.95205426, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.96484375, + "step": 1123, + "time_per_iteration": 2.8613343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162625, + "balance_loss_mlp": 1.0660181, + "epoch": 0.2162370142362447, + "flos": 782307528192.0, + "grad_norm": 0.029561600436113455, + "language_loss": 0.90709835, + "learning_rate": 0.0009117776090966554, + "loss": 0.9187246, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.96582031, + "step": 1124, + "time_per_iteration": 2.9557414054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170148, + "balance_loss_mlp": 1.07344532, + "epoch": 0.21642939592150828, + "flos": 1003761441792.0, + "grad_norm": 0.032145354222626064, + "language_loss": 0.98171163, + "learning_rate": 0.0009116008111274899, + "loss": 0.99341309, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.96679688, + "step": 1125, + "time_per_iteration": 3.253286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175423, + "balance_loss_mlp": 1.0798645, + "epoch": 0.21662177760677184, + "flos": 1485762220032.0, + "grad_norm": 0.016361962696647775, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80282342, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.95507812, + "step": 1126, + "time_per_iteration": 4.832986831665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168208, + "balance_loss_mlp": 1.07150567, + "epoch": 0.2168141592920354, + "flos": 888859929600.0, + "grad_norm": 0.027606671666099106, + "language_loss": 0.94760346, + "learning_rate": 0.0009112467358650396, + "loss": 0.9592855, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.96679688, + "step": 1127, + "time_per_iteration": 3.1373836994171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164208, + "balance_loss_mlp": 1.06741047, + "epoch": 0.21700654097729896, + "flos": 547084148736.0, + "grad_norm": 0.025712027239217825, + "language_loss": 0.95734817, + "learning_rate": 0.0009110694587092192, + "loss": 0.96899021, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.96777344, + "step": 1128, + "time_per_iteration": 2.752166986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162506, + "balance_loss_mlp": 1.06580317, + "epoch": 0.21719892266256252, + "flos": 510535368192.0, + "grad_norm": 0.02739880514200537, + "language_loss": 0.95310479, + "learning_rate": 0.0009108920219620815, + "loss": 0.96472991, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.96679688, + "step": 1129, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164125, + "balance_loss_mlp": 1.06742299, + "epoch": 0.21739130434782608, + "flos": 544461566976.0, + "grad_norm": 0.023064586598143682, + "language_loss": 0.97784394, + "learning_rate": 0.0009107144256925133, + "loss": 0.9894852, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.96679688, + "step": 1130, + "time_per_iteration": 2.73559308052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165938, + "balance_loss_mlp": 1.06923568, + "epoch": 0.21758368603308964, + "flos": 617982096384.0, + "grad_norm": 0.027176951765382908, + "language_loss": 0.9233678, + "learning_rate": 0.0009105366699694638, + "loss": 0.93502718, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.96679688, + "step": 1131, + "time_per_iteration": 2.7653839588165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166281, + "balance_loss_mlp": 1.06957853, + "epoch": 0.2177760677183532, + "flos": 636334712832.0, + "grad_norm": 0.021107298895209785, + "language_loss": 0.91459304, + "learning_rate": 0.0009103587548619439, + "loss": 0.92625588, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.96679688, + "step": 1132, + "time_per_iteration": 2.8519365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160184, + "balance_loss_mlp": 1.06367195, + "epoch": 0.2179684494036168, + "flos": 533596661760.0, + "grad_norm": 0.022551614427290693, + "language_loss": 0.95995569, + "learning_rate": 0.0009101806804390261, + "loss": 0.97155756, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.96484375, + "step": 1133, + "time_per_iteration": 2.8218026161193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163206, + "balance_loss_mlp": 1.06664658, + "epoch": 0.21816083108888035, + "flos": 476181471744.0, + "grad_norm": 0.0250418684782295, + "language_loss": 1.00355339, + "learning_rate": 0.0009100024467698453, + "loss": 1.01518536, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.96533203, + "step": 1134, + "time_per_iteration": 2.5639142990112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167151, + "balance_loss_mlp": 1.07059181, + "epoch": 0.2183532127741439, + "flos": 578546219520.0, + "grad_norm": 0.029194142239697657, + "language_loss": 0.95151818, + "learning_rate": 0.0009098240539235981, + "loss": 0.96318972, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.96533203, + "step": 1135, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162362, + "balance_loss_mlp": 1.06565976, + "epoch": 0.21854559445940747, + "flos": 595279369728.0, + "grad_norm": 0.022714398939090653, + "language_loss": 0.96190184, + "learning_rate": 0.0009096455019695423, + "loss": 0.9735254, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.96679688, + "step": 1136, + "time_per_iteration": 2.829479217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166866, + "balance_loss_mlp": 1.06997275, + "epoch": 0.21873797614467103, + "flos": 409549040640.0, + "grad_norm": 0.027737994351600712, + "language_loss": 1.01424551, + "learning_rate": 0.000909466790976998, + "loss": 1.02591419, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.96875, + "step": 1137, + "time_per_iteration": 2.4491164684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165251, + "balance_loss_mlp": 1.06869149, + "epoch": 0.21893035782993459, + "flos": 895654702080.0, + "grad_norm": 0.022710058353260835, + "language_loss": 0.90594929, + "learning_rate": 0.0009092879210153473, + "loss": 0.91760182, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.96533203, + "step": 1138, + "time_per_iteration": 3.155076503753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168627, + "balance_loss_mlp": 1.07192433, + "epoch": 0.21912273951519814, + "flos": 468568233984.0, + "grad_norm": 0.024281064631586205, + "language_loss": 0.97427768, + "learning_rate": 0.0009091088921540333, + "loss": 0.98596388, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.96679688, + "step": 1139, + "time_per_iteration": 2.5309600830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172859, + "balance_loss_mlp": 1.07711029, + "epoch": 0.2193151212004617, + "flos": 1535177407488.0, + "grad_norm": 0.009496329971255709, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76681536, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.95703125, + "step": 1140, + "time_per_iteration": 4.911335229873657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172401, + "balance_loss_mlp": 1.07569873, + "epoch": 0.2195075028857253, + "flos": 592274752512.0, + "grad_norm": 0.033335232647672346, + "language_loss": 0.95078719, + "learning_rate": 0.0009087503580104985, + "loss": 0.96251118, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.96679688, + "step": 1141, + "time_per_iteration": 2.7083888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169701, + "balance_loss_mlp": 1.07295096, + "epoch": 0.21969988457098885, + "flos": 637517749248.0, + "grad_norm": 0.02859165000671714, + "language_loss": 0.90439236, + "learning_rate": 0.0009085708528674728, + "loss": 0.91608942, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.96728516, + "step": 1142, + "time_per_iteration": 2.786891222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162201, + "balance_loss_mlp": 1.06549823, + "epoch": 0.2198922662562524, + "flos": 913859598336.0, + "grad_norm": 0.0328462843269242, + "language_loss": 0.98848528, + "learning_rate": 0.0009083911891031745, + "loss": 1.00010729, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.96679688, + "step": 1143, + "time_per_iteration": 3.1019930839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116483, + "balance_loss_mlp": 1.06793654, + "epoch": 0.22008464794151597, + "flos": 824494241280.0, + "grad_norm": 0.023913565571636344, + "language_loss": 1.01496291, + "learning_rate": 0.0009082113667873553, + "loss": 1.02661121, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.96875, + "step": 1144, + "time_per_iteration": 3.104292869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170855, + "balance_loss_mlp": 1.07405746, + "epoch": 0.22027702962677953, + "flos": 460618622976.0, + "grad_norm": 0.029355186834356364, + "language_loss": 1.00543249, + "learning_rate": 0.0009080313859898283, + "loss": 1.0171411, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.96777344, + "step": 1145, + "time_per_iteration": 2.552457332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170139, + "balance_loss_mlp": 1.07343698, + "epoch": 0.2204694113120431, + "flos": 532287372288.0, + "grad_norm": 0.025362278251747628, + "language_loss": 1.01871562, + "learning_rate": 0.0009078512467804684, + "loss": 1.03041708, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.96679688, + "step": 1146, + "time_per_iteration": 2.6138763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170493, + "balance_loss_mlp": 1.07379043, + "epoch": 0.22066179299730665, + "flos": 523686481920.0, + "grad_norm": 0.02553067563602684, + "language_loss": 1.00136042, + "learning_rate": 0.0009076709492292119, + "loss": 1.01306534, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.96679688, + "step": 1147, + "time_per_iteration": 2.6107985973358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163104, + "balance_loss_mlp": 1.0664016, + "epoch": 0.2208541746825702, + "flos": 547505115648.0, + "grad_norm": 0.02505349531569444, + "language_loss": 0.99364072, + "learning_rate": 0.0009074904934060562, + "loss": 1.00527167, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.96679688, + "step": 1148, + "time_per_iteration": 2.680250644683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166008, + "balance_loss_mlp": 1.06873322, + "epoch": 0.22104655636783377, + "flos": 710059358208.0, + "grad_norm": 0.023468083856487864, + "language_loss": 0.93112767, + "learning_rate": 0.0009073098793810607, + "loss": 0.94278765, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.97265625, + "step": 1149, + "time_per_iteration": 2.9064676761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165673, + "balance_loss_mlp": 1.06882739, + "epoch": 0.22123893805309736, + "flos": 585964073472.0, + "grad_norm": 0.028202445852463846, + "language_loss": 0.98436809, + "learning_rate": 0.000907129107224346, + "loss": 0.99602491, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.96826172, + "step": 1150, + "time_per_iteration": 2.670436382293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165906, + "balance_loss_mlp": 1.06901312, + "epoch": 0.22143131973836092, + "flos": 493250995200.0, + "grad_norm": 0.02267098136900654, + "language_loss": 0.95673937, + "learning_rate": 0.0009069481770060939, + "loss": 0.96839839, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.96875, + "step": 1151, + "time_per_iteration": 2.650136947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167632, + "balance_loss_mlp": 1.07092977, + "epoch": 0.22162370142362448, + "flos": 1081467623424.0, + "grad_norm": 0.023887201965423828, + "language_loss": 0.92357147, + "learning_rate": 0.000906767088796548, + "loss": 0.93524778, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.96679688, + "step": 1152, + "time_per_iteration": 3.4331767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174048, + "balance_loss_mlp": 1.07734585, + "epoch": 0.22181608310888803, + "flos": 493511506944.0, + "grad_norm": 0.021211000774135545, + "language_loss": 0.94297695, + "learning_rate": 0.0009065858426660127, + "loss": 0.9547174, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.96679688, + "step": 1153, + "time_per_iteration": 2.6492207050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171336, + "balance_loss_mlp": 1.07458591, + "epoch": 0.2220084647941516, + "flos": 725324765184.0, + "grad_norm": 0.02806046891368227, + "language_loss": 0.95655924, + "learning_rate": 0.0009064044386848543, + "loss": 0.96827257, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.96728516, + "step": 1154, + "time_per_iteration": 2.9135258197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116775, + "balance_loss_mlp": 1.07090425, + "epoch": 0.22220084647941515, + "flos": 490244376576.0, + "grad_norm": 0.029776005734579798, + "language_loss": 1.00600004, + "learning_rate": 0.0009062228769234997, + "loss": 1.01767755, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.96826172, + "step": 1155, + "time_per_iteration": 2.597781181335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171214, + "balance_loss_mlp": 1.07451141, + "epoch": 0.2223932281646787, + "flos": 537295492608.0, + "grad_norm": 0.030445586519746, + "language_loss": 0.93354964, + "learning_rate": 0.0009060411574524376, + "loss": 0.94526184, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.96679688, + "step": 1156, + "time_per_iteration": 2.7325634956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168314, + "balance_loss_mlp": 1.07151604, + "epoch": 0.22258560984994227, + "flos": 932967553536.0, + "grad_norm": 0.0275078677514356, + "language_loss": 0.98614538, + "learning_rate": 0.0009058592803422178, + "loss": 0.99782854, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.96777344, + "step": 1157, + "time_per_iteration": 3.156981945037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169861, + "balance_loss_mlp": 1.0739212, + "epoch": 0.22277799153520586, + "flos": 1202395286016.0, + "grad_norm": 0.00950920896526599, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79880148, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.95898438, + "step": 1158, + "time_per_iteration": 4.7935662269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.07421494, + "epoch": 0.22297037322046942, + "flos": 502316513280.0, + "grad_norm": 0.05502374006765337, + "language_loss": 0.97024429, + "learning_rate": 0.00090549505348681, + "loss": 0.98195159, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.96484375, + "step": 1159, + "time_per_iteration": 2.579418659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167768, + "balance_loss_mlp": 1.07135153, + "epoch": 0.22316275490573298, + "flos": 754112587776.0, + "grad_norm": 0.025312842068973822, + "language_loss": 0.9244132, + "learning_rate": 0.0009053127038830275, + "loss": 0.93609083, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.96386719, + "step": 1160, + "time_per_iteration": 2.970240592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169788, + "balance_loss_mlp": 1.07346714, + "epoch": 0.22335513659099654, + "flos": 515804000256.0, + "grad_norm": 0.02702757021011719, + "language_loss": 0.97474223, + "learning_rate": 0.000905130196922898, + "loss": 0.98644012, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.96289062, + "step": 1161, + "time_per_iteration": 2.558567762374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175493, + "balance_loss_mlp": 1.07917213, + "epoch": 0.2235475182762601, + "flos": 485507501568.0, + "grad_norm": 0.024760780359754056, + "language_loss": 0.947945, + "learning_rate": 0.0009049475326772769, + "loss": 0.95969993, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.96289062, + "step": 1162, + "time_per_iteration": 2.5948867797851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168008, + "balance_loss_mlp": 1.0716871, + "epoch": 0.22373989996152366, + "flos": 471067290624.0, + "grad_norm": 0.0243609738761747, + "language_loss": 0.92091036, + "learning_rate": 0.0009047647112170811, + "loss": 0.93259048, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.96289062, + "step": 1163, + "time_per_iteration": 2.7958250045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165002, + "balance_loss_mlp": 1.06868088, + "epoch": 0.22393228164678722, + "flos": 1273017807360.0, + "grad_norm": 0.0269563070164892, + "language_loss": 0.98098505, + "learning_rate": 0.0009045817326132876, + "loss": 0.99263507, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.96289062, + "step": 1164, + "time_per_iteration": 3.64853835105896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165462, + "balance_loss_mlp": 1.06914091, + "epoch": 0.22412466333205078, + "flos": 597467523072.0, + "grad_norm": 0.02771003139242203, + "language_loss": 0.94602239, + "learning_rate": 0.0009043985969369357, + "loss": 0.95767695, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.96289062, + "step": 1165, + "time_per_iteration": 2.8231425285339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175209, + "balance_loss_mlp": 1.07860184, + "epoch": 0.22431704501731436, + "flos": 609630984192.0, + "grad_norm": 0.02516811505749033, + "language_loss": 0.93514198, + "learning_rate": 0.0009042153042591245, + "loss": 0.94689411, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.96582031, + "step": 1166, + "time_per_iteration": 2.755671501159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174563, + "balance_loss_mlp": 1.07819414, + "epoch": 0.22450942670257792, + "flos": 908106872832.0, + "grad_norm": 0.024247493396408124, + "language_loss": 0.93277276, + "learning_rate": 0.0009040318546510146, + "loss": 0.94451833, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.96337891, + "step": 1167, + "time_per_iteration": 3.126707077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174214, + "balance_loss_mlp": 1.07770181, + "epoch": 0.22470180838784148, + "flos": 566380756992.0, + "grad_norm": 0.02335770706345326, + "language_loss": 0.94522464, + "learning_rate": 0.0009038482481838275, + "loss": 0.95696682, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.96484375, + "step": 1168, + "time_per_iteration": 2.6482362747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171371, + "balance_loss_mlp": 1.07485878, + "epoch": 0.22489419007310504, + "flos": 835917100032.0, + "grad_norm": 0.021740410096357694, + "language_loss": 0.9467479, + "learning_rate": 0.0009036644849288455, + "loss": 0.95846164, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.96484375, + "step": 1169, + "time_per_iteration": 3.0959203243255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168691, + "balance_loss_mlp": 1.07217908, + "epoch": 0.2250865717583686, + "flos": 582138989568.0, + "grad_norm": 0.028400846177611044, + "language_loss": 0.95971251, + "learning_rate": 0.0009034805649574118, + "loss": 0.97139943, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.96484375, + "step": 1170, + "time_per_iteration": 2.65209698677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171761, + "balance_loss_mlp": 1.07515407, + "epoch": 0.22527895344363216, + "flos": 601670639616.0, + "grad_norm": 0.021879369323455276, + "language_loss": 0.92857611, + "learning_rate": 0.0009032964883409308, + "loss": 0.94029367, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.96582031, + "step": 1171, + "time_per_iteration": 2.8586626052856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175461, + "balance_loss_mlp": 1.07990265, + "epoch": 0.22547133512889572, + "flos": 1443731959296.0, + "grad_norm": 0.011387534292379292, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74225998, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.95507812, + "step": 1172, + "time_per_iteration": 4.9882895946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171774, + "balance_loss_mlp": 1.07526255, + "epoch": 0.22566371681415928, + "flos": 491585866752.0, + "grad_norm": 0.025801800464723818, + "language_loss": 0.97062689, + "learning_rate": 0.0009029278654587462, + "loss": 0.98234463, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.96484375, + "step": 1173, + "time_per_iteration": 2.595419406890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171429, + "balance_loss_mlp": 1.07491696, + "epoch": 0.22585609849942284, + "flos": 605751505920.0, + "grad_norm": 0.02576863859493135, + "language_loss": 0.92400688, + "learning_rate": 0.0009027433193361548, + "loss": 0.93572116, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.96484375, + "step": 1174, + "time_per_iteration": 2.738267183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117476, + "balance_loss_mlp": 1.07824779, + "epoch": 0.22604848018468643, + "flos": 636727481856.0, + "grad_norm": 0.028952390928102957, + "language_loss": 0.97668821, + "learning_rate": 0.00090255861685474, + "loss": 0.98843575, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.96484375, + "step": 1175, + "time_per_iteration": 2.7286014556884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117152, + "balance_loss_mlp": 1.07481766, + "epoch": 0.22624086186995, + "flos": 480844486656.0, + "grad_norm": 0.027877026454804697, + "language_loss": 1.02366519, + "learning_rate": 0.0009023737580862095, + "loss": 1.03538048, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.96679688, + "step": 1176, + "time_per_iteration": 2.553281307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170774, + "balance_loss_mlp": 1.07388091, + "epoch": 0.22643324355521355, + "flos": 496806835200.0, + "grad_norm": 0.02249634447584531, + "language_loss": 0.90840948, + "learning_rate": 0.0009021887431023321, + "loss": 0.92011726, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.96875, + "step": 1177, + "time_per_iteration": 2.5862364768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172189, + "balance_loss_mlp": 1.07539093, + "epoch": 0.2266256252404771, + "flos": 562683927552.0, + "grad_norm": 0.02041789434880362, + "language_loss": 0.95725513, + "learning_rate": 0.0009020035719749369, + "loss": 0.96897697, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.96777344, + "step": 1178, + "time_per_iteration": 2.7553560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176357, + "balance_loss_mlp": 1.0796541, + "epoch": 0.22681800692574067, + "flos": 581032541184.0, + "grad_norm": 0.026733278329428435, + "language_loss": 0.89533567, + "learning_rate": 0.0009018182447759136, + "loss": 0.90709925, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.96679688, + "step": 1179, + "time_per_iteration": 3.012024402618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175508, + "balance_loss_mlp": 1.07904434, + "epoch": 0.22701038861100423, + "flos": 741465033216.0, + "grad_norm": 0.025064804828048133, + "language_loss": 0.90941453, + "learning_rate": 0.0009016327615772126, + "loss": 0.92116958, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.96435547, + "step": 1180, + "time_per_iteration": 2.969684600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172378, + "balance_loss_mlp": 1.07577109, + "epoch": 0.2272027702962678, + "flos": 578305173504.0, + "grad_norm": 0.036813558231106436, + "language_loss": 1.00164366, + "learning_rate": 0.0009014471224508451, + "loss": 1.01336741, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.96582031, + "step": 1181, + "time_per_iteration": 2.664487361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173976, + "balance_loss_mlp": 1.0774641, + "epoch": 0.22739515198153135, + "flos": 545290765824.0, + "grad_norm": 0.028585613124224512, + "language_loss": 0.95647848, + "learning_rate": 0.0009012613274688823, + "loss": 0.96821827, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.96484375, + "step": 1182, + "time_per_iteration": 2.647608518600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177492, + "balance_loss_mlp": 1.08078945, + "epoch": 0.22758753366679493, + "flos": 441091702272.0, + "grad_norm": 0.02755397132508441, + "language_loss": 1.00651419, + "learning_rate": 0.0009010753767034565, + "loss": 1.01828909, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.96679688, + "step": 1183, + "time_per_iteration": 2.528580904006958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176952, + "balance_loss_mlp": 1.08053601, + "epoch": 0.2277799153520585, + "flos": 730823709696.0, + "grad_norm": 0.024484618665474616, + "language_loss": 0.90051508, + "learning_rate": 0.0009008892702267599, + "loss": 0.91228461, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.96386719, + "step": 1184, + "time_per_iteration": 2.990344285964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_mlp": 1.08100891, + "epoch": 0.22797229703732205, + "flos": 527913067008.0, + "grad_norm": 0.030622621699729128, + "language_loss": 1.01022232, + "learning_rate": 0.0009007030081110457, + "loss": 1.02199566, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.96289062, + "step": 1185, + "time_per_iteration": 2.5795140266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172726, + "balance_loss_mlp": 1.07592821, + "epoch": 0.2281646787225856, + "flos": 536520688128.0, + "grad_norm": 0.026616575931436976, + "language_loss": 0.93079567, + "learning_rate": 0.000900516590428627, + "loss": 0.942523, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.96777344, + "step": 1186, + "time_per_iteration": 2.6647558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.07628405, + "epoch": 0.22835706040784917, + "flos": 542477529600.0, + "grad_norm": 0.02522496809839962, + "language_loss": 0.99033505, + "learning_rate": 0.0009003300172518778, + "loss": 1.00206637, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.96826172, + "step": 1187, + "time_per_iteration": 2.7046303749084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177624, + "balance_loss_mlp": 1.08073056, + "epoch": 0.22854944209311273, + "flos": 792004859904.0, + "grad_norm": 0.026332453075710083, + "language_loss": 0.94325852, + "learning_rate": 0.0009001432886532321, + "loss": 0.95503473, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.96875, + "step": 1188, + "time_per_iteration": 2.9583094120025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179036, + "balance_loss_mlp": 1.08233392, + "epoch": 0.2287418237783763, + "flos": 470215898112.0, + "grad_norm": 0.025775869396212594, + "language_loss": 0.97465944, + "learning_rate": 0.0008999564047051843, + "loss": 0.98644984, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.96679688, + "step": 1189, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178388, + "balance_loss_mlp": 1.08154237, + "epoch": 0.22893420546363985, + "flos": 469004663808.0, + "grad_norm": 0.023763579929190374, + "language_loss": 0.94691694, + "learning_rate": 0.0008997693654802894, + "loss": 0.95870078, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.96826172, + "step": 1190, + "time_per_iteration": 2.6276731491088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178257, + "balance_loss_mlp": 1.08145857, + "epoch": 0.22912658714890344, + "flos": 627401452032.0, + "grad_norm": 0.023724149848154047, + "language_loss": 0.95182133, + "learning_rate": 0.0008995821710511625, + "loss": 0.96360391, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.96777344, + "step": 1191, + "time_per_iteration": 2.756840705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117993, + "balance_loss_mlp": 1.08308399, + "epoch": 0.229318968834167, + "flos": 504020573184.0, + "grad_norm": 0.024708694220473774, + "language_loss": 0.93247074, + "learning_rate": 0.0008993948214904786, + "loss": 0.94427001, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.96826172, + "step": 1192, + "time_per_iteration": 2.577340602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190514, + "balance_loss_mlp": 1.09533691, + "epoch": 0.22951135051943056, + "flos": 1377713877504.0, + "grad_norm": 0.021264094300491608, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79612726, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.95117188, + "step": 1193, + "time_per_iteration": 4.850237607955933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179442, + "balance_loss_mlp": 1.08316851, + "epoch": 0.22970373220469412, + "flos": 645549952512.0, + "grad_norm": 0.02667568465905087, + "language_loss": 0.92540175, + "learning_rate": 0.0008990196572654427, + "loss": 0.93719625, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.96240234, + "step": 1194, + "time_per_iteration": 2.8638381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180858, + "balance_loss_mlp": 1.08453715, + "epoch": 0.22989611388995768, + "flos": 501272464896.0, + "grad_norm": 0.02416134539694475, + "language_loss": 0.95937514, + "learning_rate": 0.0008988318427467426, + "loss": 0.97118378, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.96289062, + "step": 1195, + "time_per_iteration": 2.7063868045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182741, + "balance_loss_mlp": 1.08589542, + "epoch": 0.23008849557522124, + "flos": 1098333030912.0, + "grad_norm": 0.02922856270819412, + "language_loss": 0.9667449, + "learning_rate": 0.0008986438733877887, + "loss": 0.97857237, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.96826172, + "step": 1196, + "time_per_iteration": 3.4508113861083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.08043683, + "epoch": 0.2302808772604848, + "flos": 684992560128.0, + "grad_norm": 0.022228440588834414, + "language_loss": 0.91545051, + "learning_rate": 0.0008984557492615576, + "loss": 0.92721808, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.96289062, + "step": 1197, + "time_per_iteration": 2.93611741065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08269298, + "epoch": 0.23047325894574835, + "flos": 529960230912.0, + "grad_norm": 0.026499525382426087, + "language_loss": 0.99148774, + "learning_rate": 0.0008982674704410854, + "loss": 1.0032779, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.96289062, + "step": 1198, + "time_per_iteration": 2.7032008171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180823, + "balance_loss_mlp": 1.08450174, + "epoch": 0.23066564063101191, + "flos": 684126431232.0, + "grad_norm": 0.025326379221325218, + "language_loss": 0.86113322, + "learning_rate": 0.0008980790369994682, + "loss": 0.87294143, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.96289062, + "step": 1199, + "time_per_iteration": 2.9629056453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173246, + "balance_loss_mlp": 1.07673466, + "epoch": 0.2308580223162755, + "flos": 559631646720.0, + "grad_norm": 0.02469990042405053, + "language_loss": 0.95889735, + "learning_rate": 0.000897890449009863, + "loss": 0.97062981, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.96484375, + "step": 1200, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178191, + "balance_loss_mlp": 1.08167911, + "epoch": 0.23105040400153906, + "flos": 556729087488.0, + "grad_norm": 0.021551459012756572, + "language_loss": 0.97633696, + "learning_rate": 0.0008977017065454853, + "loss": 0.98811877, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.96484375, + "step": 1201, + "time_per_iteration": 2.6586263179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176954, + "balance_loss_mlp": 1.08048964, + "epoch": 0.23124278568680262, + "flos": 706049624064.0, + "grad_norm": 0.025666519973580538, + "language_loss": 0.89963996, + "learning_rate": 0.0008975128096796121, + "loss": 0.9114095, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.96435547, + "step": 1202, + "time_per_iteration": 2.8599958419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175929, + "balance_loss_mlp": 1.07989419, + "epoch": 0.23143516737206618, + "flos": 613968359424.0, + "grad_norm": 0.02791489713026627, + "language_loss": 0.96485001, + "learning_rate": 0.0008973237584855794, + "loss": 0.97660929, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.95996094, + "step": 1203, + "time_per_iteration": 2.8814125061035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117493, + "balance_loss_mlp": 1.07903779, + "epoch": 0.23162754905732974, + "flos": 390095980032.0, + "grad_norm": 0.02381480195735972, + "language_loss": 0.91340852, + "learning_rate": 0.0008971345530367832, + "loss": 0.92515785, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.95849609, + "step": 1204, + "time_per_iteration": 2.513951301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176133, + "balance_loss_mlp": 1.08024144, + "epoch": 0.2318199307425933, + "flos": 668969086464.0, + "grad_norm": 0.024943516104182908, + "language_loss": 0.94778013, + "learning_rate": 0.0008969451934066799, + "loss": 0.95954144, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.95849609, + "step": 1205, + "time_per_iteration": 2.80454421043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173068, + "balance_loss_mlp": 1.07712853, + "epoch": 0.23201231242785686, + "flos": 667627596288.0, + "grad_norm": 0.029617322009159303, + "language_loss": 0.92493355, + "learning_rate": 0.0008967556796687854, + "loss": 0.93666422, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.95898438, + "step": 1206, + "time_per_iteration": 2.89932918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173146, + "balance_loss_mlp": 1.07720602, + "epoch": 0.23220469411312042, + "flos": 750094121472.0, + "grad_norm": 0.024264467100448908, + "language_loss": 0.94343531, + "learning_rate": 0.0008965660118966752, + "loss": 0.95516682, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.95898438, + "step": 1207, + "time_per_iteration": 2.9768385887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08307481, + "epoch": 0.232397075798384, + "flos": 668261411328.0, + "grad_norm": 0.02512248807118796, + "language_loss": 0.97498, + "learning_rate": 0.0008963761901639851, + "loss": 0.98677015, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.95898438, + "step": 1208, + "time_per_iteration": 2.8175342082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177539, + "balance_loss_mlp": 1.081599, + "epoch": 0.23258945748364757, + "flos": 611345777664.0, + "grad_norm": 0.025244332610569246, + "language_loss": 0.93465042, + "learning_rate": 0.0008961862145444103, + "loss": 0.9464258, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.95898438, + "step": 1209, + "time_per_iteration": 2.707583427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117491, + "balance_loss_mlp": 1.07901847, + "epoch": 0.23278183916891113, + "flos": 490672074240.0, + "grad_norm": 0.025133767455437463, + "language_loss": 0.96175104, + "learning_rate": 0.0008959960851117059, + "loss": 0.97350019, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.95849609, + "step": 1210, + "time_per_iteration": 2.5783777236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174943, + "balance_loss_mlp": 1.07895589, + "epoch": 0.23297422085417469, + "flos": 512673856512.0, + "grad_norm": 0.027877077505007057, + "language_loss": 0.94183683, + "learning_rate": 0.0008958058019396868, + "loss": 0.95358628, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.95947266, + "step": 1211, + "time_per_iteration": 2.7695388793945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118178, + "balance_loss_mlp": 1.08560216, + "epoch": 0.23316660253943824, + "flos": 547531312128.0, + "grad_norm": 0.0259067341075638, + "language_loss": 0.95459378, + "learning_rate": 0.0008956153651022274, + "loss": 0.96641153, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.96142578, + "step": 1212, + "time_per_iteration": 2.7088377475738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.08181643, + "epoch": 0.2333589842247018, + "flos": 511288705536.0, + "grad_norm": 0.023917692799316066, + "language_loss": 0.93208623, + "learning_rate": 0.0008954247746732618, + "loss": 0.94386959, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.96484375, + "step": 1213, + "time_per_iteration": 2.6319668292999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172909, + "balance_loss_mlp": 1.0766834, + "epoch": 0.23355136590996536, + "flos": 664406128128.0, + "grad_norm": 0.02356648487739955, + "language_loss": 0.98858505, + "learning_rate": 0.0008952340307267837, + "loss": 1.00031424, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.96191406, + "step": 1214, + "time_per_iteration": 2.891026735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172334, + "balance_loss_mlp": 1.07629859, + "epoch": 0.23374374759522892, + "flos": 509465123328.0, + "grad_norm": 0.027978905734491046, + "language_loss": 0.94424212, + "learning_rate": 0.0008950431333368468, + "loss": 0.95596552, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.95996094, + "step": 1215, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173288, + "balance_loss_mlp": 1.07730114, + "epoch": 0.2339361292804925, + "flos": 1296428209152.0, + "grad_norm": 0.026145796218117214, + "language_loss": 0.94705772, + "learning_rate": 0.0008948520825775634, + "loss": 0.95879066, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.95947266, + "step": 1216, + "time_per_iteration": 3.6343605518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174216, + "balance_loss_mlp": 1.07808566, + "epoch": 0.23412851096575607, + "flos": 707176264704.0, + "grad_norm": 0.02578801546488365, + "language_loss": 0.93516719, + "learning_rate": 0.0008946608785231067, + "loss": 0.94690937, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.9609375, + "step": 1217, + "time_per_iteration": 2.8923676013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174557, + "balance_loss_mlp": 1.07842624, + "epoch": 0.23432089265101963, + "flos": 439174794240.0, + "grad_norm": 0.024987781095147748, + "language_loss": 0.94467312, + "learning_rate": 0.0008944695212477084, + "loss": 0.95641869, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.9609375, + "step": 1218, + "time_per_iteration": 2.47641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176273, + "balance_loss_mlp": 1.08028615, + "epoch": 0.2345132743362832, + "flos": 481914731520.0, + "grad_norm": 0.02187031641141441, + "language_loss": 0.9320662, + "learning_rate": 0.0008942780108256599, + "loss": 0.94382894, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.95947266, + "step": 1219, + "time_per_iteration": 2.585204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176397, + "balance_loss_mlp": 1.07993269, + "epoch": 0.23470565602154675, + "flos": 412340809728.0, + "grad_norm": 0.02314471919225668, + "language_loss": 0.95930934, + "learning_rate": 0.0008940863473313121, + "loss": 0.97107327, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.96435547, + "step": 1220, + "time_per_iteration": 2.461904764175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174627, + "balance_loss_mlp": 1.07811534, + "epoch": 0.2348980377068103, + "flos": 546499998720.0, + "grad_norm": 0.029389735884218435, + "language_loss": 0.99771547, + "learning_rate": 0.0008938945308390756, + "loss": 1.00946164, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.96484375, + "step": 1221, + "time_per_iteration": 2.6403567790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179462, + "balance_loss_mlp": 1.08295047, + "epoch": 0.23509041939207387, + "flos": 576842159616.0, + "grad_norm": 0.023502241620232074, + "language_loss": 0.96374851, + "learning_rate": 0.00089370256142342, + "loss": 0.97554314, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.96484375, + "step": 1222, + "time_per_iteration": 2.7148585319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178637, + "balance_loss_mlp": 1.08198178, + "epoch": 0.23528280107733743, + "flos": 589947611136.0, + "grad_norm": 0.022852016666186668, + "language_loss": 0.93682569, + "learning_rate": 0.0008935104391588746, + "loss": 0.94861209, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.96630859, + "step": 1223, + "time_per_iteration": 2.7302677631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179799, + "balance_loss_mlp": 1.08338237, + "epoch": 0.235475182762601, + "flos": 824856811008.0, + "grad_norm": 0.02091323276417278, + "language_loss": 0.91087663, + "learning_rate": 0.0008933181641200276, + "loss": 0.9226746, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.96386719, + "step": 1224, + "time_per_iteration": 3.120337724685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183017, + "balance_loss_mlp": 1.08650565, + "epoch": 0.23566756444786457, + "flos": 681366862848.0, + "grad_norm": 0.027323039985709546, + "language_loss": 0.94355077, + "learning_rate": 0.0008931257363815271, + "loss": 0.95538092, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.96484375, + "step": 1225, + "time_per_iteration": 2.893202543258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178928, + "balance_loss_mlp": 1.08251154, + "epoch": 0.23585994613312813, + "flos": 703134329856.0, + "grad_norm": 0.022860929740297704, + "language_loss": 0.96590424, + "learning_rate": 0.0008929331560180798, + "loss": 0.97769356, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.96386719, + "step": 1226, + "time_per_iteration": 2.913858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176734, + "balance_loss_mlp": 1.08017468, + "epoch": 0.2360523278183917, + "flos": 525195158016.0, + "grad_norm": 0.02227272458953822, + "language_loss": 0.99194574, + "learning_rate": 0.0008927404231044525, + "loss": 1.00371313, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.96533203, + "step": 1227, + "time_per_iteration": 2.7194507122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175869, + "balance_loss_mlp": 1.07921374, + "epoch": 0.23624470950365525, + "flos": 525442934784.0, + "grad_norm": 0.02071878597098496, + "language_loss": 0.89412713, + "learning_rate": 0.0008925475377154703, + "loss": 0.90588582, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.96630859, + "step": 1228, + "time_per_iteration": 2.742506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175669, + "balance_loss_mlp": 1.07896686, + "epoch": 0.2364370911889188, + "flos": 597960348672.0, + "grad_norm": 0.023166098266421232, + "language_loss": 0.90900964, + "learning_rate": 0.0008923544999260183, + "loss": 0.92076635, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.96679688, + "step": 1229, + "time_per_iteration": 2.809842109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177841, + "balance_loss_mlp": 1.08113885, + "epoch": 0.23662947287418237, + "flos": 758171986944.0, + "grad_norm": 0.02725464196132968, + "language_loss": 1.00227833, + "learning_rate": 0.00089216130981104, + "loss": 1.0140568, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.96679688, + "step": 1230, + "time_per_iteration": 3.0096282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178297, + "balance_loss_mlp": 1.08159423, + "epoch": 0.23682185455944593, + "flos": 547207673856.0, + "grad_norm": 0.024713012089740163, + "language_loss": 0.91807795, + "learning_rate": 0.000891967967445539, + "loss": 0.92986089, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.96679688, + "step": 1231, + "time_per_iteration": 2.7001702785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185987, + "balance_loss_mlp": 1.08928442, + "epoch": 0.2370142362447095, + "flos": 663522534912.0, + "grad_norm": 0.02265672956199411, + "language_loss": 0.96654546, + "learning_rate": 0.0008917744729045772, + "loss": 0.97840536, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.96679688, + "step": 1232, + "time_per_iteration": 2.8703036308288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184505, + "balance_loss_mlp": 1.08789778, + "epoch": 0.23720661792997308, + "flos": 684911969280.0, + "grad_norm": 0.02632145570598456, + "language_loss": 0.93737417, + "learning_rate": 0.0008915808262632757, + "loss": 0.94921923, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.96582031, + "step": 1233, + "time_per_iteration": 2.839534044265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185605, + "balance_loss_mlp": 1.08928347, + "epoch": 0.23739899961523664, + "flos": 560022414336.0, + "grad_norm": 0.027552675935845497, + "language_loss": 1.01508975, + "learning_rate": 0.0008913870275968148, + "loss": 1.02694583, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.96289062, + "step": 1234, + "time_per_iteration": 2.7176129817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182161, + "balance_loss_mlp": 1.08545852, + "epoch": 0.2375913813005002, + "flos": 891163602432.0, + "grad_norm": 0.02404650352203449, + "language_loss": 0.9583261, + "learning_rate": 0.0008911930769804342, + "loss": 0.97014773, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.96679688, + "step": 1235, + "time_per_iteration": 3.244257688522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179697, + "balance_loss_mlp": 1.08289862, + "epoch": 0.23778376298576376, + "flos": 642365414400.0, + "grad_norm": 0.020226791074773265, + "language_loss": 0.99461335, + "learning_rate": 0.0008909989744894318, + "loss": 1.00641024, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.96777344, + "step": 1236, + "time_per_iteration": 2.8618855476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179012, + "balance_loss_mlp": 1.08230948, + "epoch": 0.23797614467102732, + "flos": 617945166336.0, + "grad_norm": 0.025060145140963254, + "language_loss": 0.91887248, + "learning_rate": 0.0008908047201991649, + "loss": 0.93066257, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.96679688, + "step": 1237, + "time_per_iteration": 2.7335665225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177715, + "balance_loss_mlp": 1.08120298, + "epoch": 0.23816852635629088, + "flos": 625463076864.0, + "grad_norm": 0.02188809519195417, + "language_loss": 0.92642158, + "learning_rate": 0.0008906103141850502, + "loss": 0.93819869, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.96484375, + "step": 1238, + "time_per_iteration": 2.9244723320007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178141, + "balance_loss_mlp": 1.0816294, + "epoch": 0.23836090804155444, + "flos": 522440318976.0, + "grad_norm": 0.025638098136730073, + "language_loss": 0.97356987, + "learning_rate": 0.0008904157565225621, + "loss": 0.98535126, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.96484375, + "step": 1239, + "time_per_iteration": 2.6046018600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186867, + "balance_loss_mlp": 1.09059334, + "epoch": 0.238553289726818, + "flos": 1155854281728.0, + "grad_norm": 0.0279922632366243, + "language_loss": 0.91224372, + "learning_rate": 0.000890221047287235, + "loss": 0.92411238, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.96240234, + "step": 1240, + "time_per_iteration": 3.503387928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.09512079, + "epoch": 0.23874567141208156, + "flos": 500909895168.0, + "grad_norm": 0.02294407067471098, + "language_loss": 0.98687088, + "learning_rate": 0.0008900261865546615, + "loss": 0.99878532, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.96289062, + "step": 1241, + "time_per_iteration": 2.6329948902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188291, + "balance_loss_mlp": 1.09197009, + "epoch": 0.23893805309734514, + "flos": 558049110528.0, + "grad_norm": 0.02727719764566138, + "language_loss": 0.96105886, + "learning_rate": 0.0008898311744004936, + "loss": 0.97294176, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.96289062, + "step": 1242, + "time_per_iteration": 2.6852729320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011866, + "balance_loss_mlp": 1.0902791, + "epoch": 0.2391304347826087, + "flos": 550316350464.0, + "grad_norm": 0.023767912183342704, + "language_loss": 0.95555472, + "learning_rate": 0.0008896360109004414, + "loss": 0.9674207, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.96289062, + "step": 1243, + "time_per_iteration": 2.6607675552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181953, + "balance_loss_mlp": 1.08558464, + "epoch": 0.23932281646787226, + "flos": 517078361088.0, + "grad_norm": 0.022492500831292953, + "language_loss": 0.92156398, + "learning_rate": 0.0008894406961302742, + "loss": 0.93338358, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.96337891, + "step": 1244, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180796, + "balance_loss_mlp": 1.0844276, + "epoch": 0.23951519815313582, + "flos": 745001407488.0, + "grad_norm": 0.0220414301985699, + "language_loss": 0.9171226, + "learning_rate": 0.0008892452301658201, + "loss": 0.92893052, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.96337891, + "step": 1245, + "time_per_iteration": 2.987859010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189345, + "balance_loss_mlp": 1.09302354, + "epoch": 0.23970757983839938, + "flos": 555174749184.0, + "grad_norm": 0.02624868476300941, + "language_loss": 0.92775297, + "learning_rate": 0.0008890496130829653, + "loss": 0.93964636, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.96289062, + "step": 1246, + "time_per_iteration": 2.7285211086273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011891, + "balance_loss_mlp": 1.09287417, + "epoch": 0.23989996152366294, + "flos": 481617289728.0, + "grad_norm": 0.024405638758005322, + "language_loss": 0.93939734, + "learning_rate": 0.0008888538449576555, + "loss": 0.95128834, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.96191406, + "step": 1247, + "time_per_iteration": 2.603447675704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181648, + "balance_loss_mlp": 1.08532703, + "epoch": 0.2400923432089265, + "flos": 486280304640.0, + "grad_norm": 0.02551404288502155, + "language_loss": 0.9456799, + "learning_rate": 0.0008886579258658944, + "loss": 0.9574964, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.96289062, + "step": 1248, + "time_per_iteration": 2.6195995807647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183672, + "balance_loss_mlp": 1.08735096, + "epoch": 0.24028472489419006, + "flos": 624792331776.0, + "grad_norm": 0.02192042043345247, + "language_loss": 0.93244678, + "learning_rate": 0.0008884618558837446, + "loss": 0.94428349, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.96289062, + "step": 1249, + "time_per_iteration": 2.830350399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187022, + "balance_loss_mlp": 1.09113026, + "epoch": 0.24047710657945365, + "flos": 602808013824.0, + "grad_norm": 0.023766863499936387, + "language_loss": 0.96457344, + "learning_rate": 0.0008882656350873273, + "loss": 0.97644365, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.95849609, + "step": 1250, + "time_per_iteration": 2.8691956996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119127, + "balance_loss_mlp": 1.09547377, + "epoch": 0.2406694882647172, + "flos": 843000582144.0, + "grad_norm": 0.03001641023469985, + "language_loss": 1.00300837, + "learning_rate": 0.0008880692635528219, + "loss": 1.01492119, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.95751953, + "step": 1251, + "time_per_iteration": 3.066152572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187351, + "balance_loss_mlp": 1.09155416, + "epoch": 0.24086186994998077, + "flos": 528134647296.0, + "grad_norm": 0.026461260661865858, + "language_loss": 0.98557454, + "learning_rate": 0.0008878727413564669, + "loss": 0.99744809, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.95751953, + "step": 1252, + "time_per_iteration": 2.7665653228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.11519623, + "epoch": 0.24105425163524433, + "flos": 1341459262464.0, + "grad_norm": 0.018061169603452644, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81344825, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.93945312, + "step": 1253, + "time_per_iteration": 4.899695634841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182732, + "balance_loss_mlp": 1.08679259, + "epoch": 0.24124663332050789, + "flos": 615227257344.0, + "grad_norm": 0.02599071752574661, + "language_loss": 0.90657973, + "learning_rate": 0.0008874792452834528, + "loss": 0.91840708, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.95898438, + "step": 1254, + "time_per_iteration": 2.7407760620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179855, + "balance_loss_mlp": 1.08401072, + "epoch": 0.24143901500577145, + "flos": 576592381440.0, + "grad_norm": 0.0285281411485809, + "language_loss": 0.99380314, + "learning_rate": 0.0008872822715595626, + "loss": 1.00560164, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.95800781, + "step": 1255, + "time_per_iteration": 2.7094287872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176059, + "balance_loss_mlp": 1.08007157, + "epoch": 0.241631396691035, + "flos": 496146823680.0, + "grad_norm": 0.026934202036951318, + "language_loss": 0.98012596, + "learning_rate": 0.0008870851474793598, + "loss": 0.9918865, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.95947266, + "step": 1256, + "time_per_iteration": 2.5717930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180992, + "balance_loss_mlp": 1.08500445, + "epoch": 0.24182377837629856, + "flos": 637396225536.0, + "grad_norm": 0.02721147411023071, + "language_loss": 0.97604549, + "learning_rate": 0.0008868878731193752, + "loss": 0.98785543, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.95947266, + "step": 1257, + "time_per_iteration": 2.835613965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180736, + "balance_loss_mlp": 1.08460534, + "epoch": 0.24201616006156215, + "flos": 516349218816.0, + "grad_norm": 0.023847715865297152, + "language_loss": 0.9613235, + "learning_rate": 0.0008866904485561973, + "loss": 0.97313088, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.9609375, + "step": 1258, + "time_per_iteration": 2.697693347930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182815, + "balance_loss_mlp": 1.08682752, + "epoch": 0.2422085417468257, + "flos": 616378093056.0, + "grad_norm": 0.023106527532664196, + "language_loss": 0.92363685, + "learning_rate": 0.000886492873866473, + "loss": 0.93546498, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.95947266, + "step": 1259, + "time_per_iteration": 2.8120577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118033, + "balance_loss_mlp": 1.08424771, + "epoch": 0.24240092343208927, + "flos": 586912794624.0, + "grad_norm": 0.025402415625288076, + "language_loss": 0.9586736, + "learning_rate": 0.000886295149126908, + "loss": 0.97047698, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.96044922, + "step": 1260, + "time_per_iteration": 2.7276840209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184073, + "balance_loss_mlp": 1.08813286, + "epoch": 0.24259330511735283, + "flos": 763570874880.0, + "grad_norm": 0.0207328591517146, + "language_loss": 0.94417751, + "learning_rate": 0.0008860972744142655, + "loss": 0.95601827, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.95898438, + "step": 1261, + "time_per_iteration": 2.898794412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184052, + "balance_loss_mlp": 1.08816016, + "epoch": 0.2427856868026164, + "flos": 628133322240.0, + "grad_norm": 0.02409331705070074, + "language_loss": 0.89591467, + "learning_rate": 0.0008858992498053671, + "loss": 0.90775526, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.95849609, + "step": 1262, + "time_per_iteration": 2.8477351665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183746, + "balance_loss_mlp": 1.08952332, + "epoch": 0.24297806848787995, + "flos": 1514919343104.0, + "grad_norm": 0.012580587939111834, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77772498, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.94140625, + "step": 1263, + "time_per_iteration": 4.826787710189819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180613, + "balance_loss_mlp": 1.0848639, + "epoch": 0.2431704501731435, + "flos": 543072413184.0, + "grad_norm": 0.025826560533695943, + "language_loss": 0.92586392, + "learning_rate": 0.0008855027512063817, + "loss": 0.93767005, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.95703125, + "step": 1264, + "time_per_iteration": 2.722557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179682, + "balance_loss_mlp": 1.08364689, + "epoch": 0.24336283185840707, + "flos": 524878250496.0, + "grad_norm": 0.025894380889017608, + "language_loss": 0.95614499, + "learning_rate": 0.0008853042773702292, + "loss": 0.96794176, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.95996094, + "step": 1265, + "time_per_iteration": 2.7258307933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118145, + "balance_loss_mlp": 1.0855577, + "epoch": 0.24355521354367063, + "flos": 538205282304.0, + "grad_norm": 0.022817154468993458, + "language_loss": 0.98287719, + "learning_rate": 0.0008851056539456896, + "loss": 0.99469173, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.95849609, + "step": 1266, + "time_per_iteration": 2.6970114707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182961, + "balance_loss_mlp": 1.08692622, + "epoch": 0.24374759522893422, + "flos": 932108155392.0, + "grad_norm": 0.024066297062525326, + "language_loss": 0.9148944, + "learning_rate": 0.0008849068810098755, + "loss": 0.92672402, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.95996094, + "step": 1267, + "time_per_iteration": 3.326692819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118368, + "balance_loss_mlp": 1.08764458, + "epoch": 0.24393997691419778, + "flos": 428685193728.0, + "grad_norm": 0.027357648838687767, + "language_loss": 0.94001949, + "learning_rate": 0.0008847079586399575, + "loss": 0.95185632, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.95996094, + "step": 1268, + "time_per_iteration": 2.466787099838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180763, + "balance_loss_mlp": 1.08482289, + "epoch": 0.24413235859946134, + "flos": 579942104064.0, + "grad_norm": 0.026150492080556795, + "language_loss": 0.95411992, + "learning_rate": 0.0008845088869131641, + "loss": 0.96592754, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.95898438, + "step": 1269, + "time_per_iteration": 2.7016899585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175832, + "balance_loss_mlp": 1.07989287, + "epoch": 0.2443247402847249, + "flos": 530900219904.0, + "grad_norm": 0.025309414349457434, + "language_loss": 0.98951483, + "learning_rate": 0.0008843096659067818, + "loss": 1.00127316, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.95898438, + "step": 1270, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179701, + "balance_loss_mlp": 1.08366621, + "epoch": 0.24451712196998845, + "flos": 697624651776.0, + "grad_norm": 0.020400222299851913, + "language_loss": 0.92813951, + "learning_rate": 0.000884110295698155, + "loss": 0.93993652, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.95996094, + "step": 1271, + "time_per_iteration": 2.945749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180344, + "balance_loss_mlp": 1.08435643, + "epoch": 0.24470950365525201, + "flos": 530863289856.0, + "grad_norm": 0.02434814436965663, + "language_loss": 0.97428346, + "learning_rate": 0.0008839107763646861, + "loss": 0.98608696, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.95947266, + "step": 1272, + "time_per_iteration": 2.5816495418548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182389, + "balance_loss_mlp": 1.08630657, + "epoch": 0.24490188534051557, + "flos": 492347936256.0, + "grad_norm": 0.027277570267404832, + "language_loss": 1.00778949, + "learning_rate": 0.0008837111079838353, + "loss": 1.0196135, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.96044922, + "step": 1273, + "time_per_iteration": 2.675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182001, + "balance_loss_mlp": 1.08587062, + "epoch": 0.24509426702577913, + "flos": 475111226880.0, + "grad_norm": 0.024851656777491255, + "language_loss": 0.98025054, + "learning_rate": 0.000883511290633121, + "loss": 0.99207056, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.9609375, + "step": 1274, + "time_per_iteration": 2.5230517387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183988, + "balance_loss_mlp": 1.08747613, + "epoch": 0.24528664871104272, + "flos": 551647107072.0, + "grad_norm": 0.02070792437524093, + "language_loss": 1.00507927, + "learning_rate": 0.000883311324390119, + "loss": 1.01691914, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.96484375, + "step": 1275, + "time_per_iteration": 2.690488338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.08887982, + "epoch": 0.24547903039630628, + "flos": 827335675392.0, + "grad_norm": 0.02978995697497926, + "language_loss": 0.95172417, + "learning_rate": 0.0008831112093324629, + "loss": 0.96357232, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.95898438, + "step": 1276, + "time_per_iteration": 3.0883522033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184816, + "balance_loss_mlp": 1.08839917, + "epoch": 0.24567141208156984, + "flos": 592693718016.0, + "grad_norm": 0.026400385967418116, + "language_loss": 0.99731994, + "learning_rate": 0.0008829109455378444, + "loss": 1.00916803, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.96386719, + "step": 1277, + "time_per_iteration": 2.670658588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184585, + "balance_loss_mlp": 1.08812118, + "epoch": 0.2458637937668334, + "flos": 548929198080.0, + "grad_norm": 0.022333419000210953, + "language_loss": 0.95654261, + "learning_rate": 0.000882710533084013, + "loss": 0.96838844, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.96435547, + "step": 1278, + "time_per_iteration": 2.641019344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189057, + "balance_loss_mlp": 1.09244978, + "epoch": 0.24605617545209696, + "flos": 516911175168.0, + "grad_norm": 0.022487969609205835, + "language_loss": 0.97332817, + "learning_rate": 0.0008825099720487755, + "loss": 0.98521876, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.96582031, + "step": 1279, + "time_per_iteration": 2.626079559326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193596, + "balance_loss_mlp": 1.09880066, + "epoch": 0.24624855713736052, + "flos": 1515058331136.0, + "grad_norm": 0.0162275920205478, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76454735, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.94726562, + "step": 1280, + "time_per_iteration": 4.846211671829224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118811, + "balance_loss_mlp": 1.09350586, + "epoch": 0.24644093882262408, + "flos": 1530746706432.0, + "grad_norm": 0.013716798372908724, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79132223, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.9453125, + "step": 1281, + "time_per_iteration": 4.781409025192261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189694, + "balance_loss_mlp": 1.09351575, + "epoch": 0.24663332050788764, + "flos": 660348730368.0, + "grad_norm": 0.028995521048395968, + "language_loss": 0.998649, + "learning_rate": 0.0008819073982335619, + "loss": 1.01054597, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.96142578, + "step": 1282, + "time_per_iteration": 2.873255729675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187163, + "balance_loss_mlp": 1.09098482, + "epoch": 0.24682570219315123, + "flos": 542805170688.0, + "grad_norm": 0.0289675073475646, + "language_loss": 0.92590028, + "learning_rate": 0.0008817062436519235, + "loss": 0.93777192, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.96142578, + "step": 1283, + "time_per_iteration": 2.6918435096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08852112, + "epoch": 0.24701808387841478, + "flos": 441658387968.0, + "grad_norm": 0.027350099061339322, + "language_loss": 1.00939846, + "learning_rate": 0.0008815049408787788, + "loss": 1.02124548, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.96142578, + "step": 1284, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190183, + "balance_loss_mlp": 1.09443462, + "epoch": 0.24721046556367834, + "flos": 469032861696.0, + "grad_norm": 0.028209143321693456, + "language_loss": 0.95635927, + "learning_rate": 0.0008813034899922805, + "loss": 0.96826112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.95703125, + "step": 1285, + "time_per_iteration": 2.5152530670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193087, + "balance_loss_mlp": 1.09729075, + "epoch": 0.2474028472489419, + "flos": 505407725568.0, + "grad_norm": 0.027111907557838905, + "language_loss": 1.01196301, + "learning_rate": 0.0008811018910706387, + "loss": 1.02389383, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.95751953, + "step": 1286, + "time_per_iteration": 2.5593316555023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_mlp": 1.09255612, + "epoch": 0.24759522893420546, + "flos": 480955276800.0, + "grad_norm": 0.03276846828627927, + "language_loss": 0.9498859, + "learning_rate": 0.0008809001441921211, + "loss": 0.96176893, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.95703125, + "step": 1287, + "time_per_iteration": 2.7347421646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181619, + "balance_loss_mlp": 1.08567917, + "epoch": 0.24778761061946902, + "flos": 534753501696.0, + "grad_norm": 0.025262665654883373, + "language_loss": 0.97019696, + "learning_rate": 0.0008806982494350528, + "loss": 0.98201311, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.95898438, + "step": 1288, + "time_per_iteration": 2.6499245166778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181206, + "balance_loss_mlp": 1.08526671, + "epoch": 0.24797999230473258, + "flos": 560942937600.0, + "grad_norm": 0.021558514258727474, + "language_loss": 0.9849534, + "learning_rate": 0.0008804962068778161, + "loss": 0.99676538, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.95898438, + "step": 1289, + "time_per_iteration": 2.852257490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186476, + "balance_loss_mlp": 1.09053683, + "epoch": 0.24817237398999614, + "flos": 625480541184.0, + "grad_norm": 0.024913990838324927, + "language_loss": 0.90269625, + "learning_rate": 0.0008802940165988511, + "loss": 0.91456103, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.95898438, + "step": 1290, + "time_per_iteration": 2.846277952194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181135, + "balance_loss_mlp": 1.08471859, + "epoch": 0.2483647556752597, + "flos": 613484265984.0, + "grad_norm": 0.02310813532639645, + "language_loss": 0.96774852, + "learning_rate": 0.000880091678676655, + "loss": 0.97955984, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.96386719, + "step": 1291, + "time_per_iteration": 2.8085777759552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180122, + "balance_loss_mlp": 1.0837059, + "epoch": 0.2485571373605233, + "flos": 584687711232.0, + "grad_norm": 0.021422688776258386, + "language_loss": 0.9855839, + "learning_rate": 0.0008798891931897821, + "loss": 0.99738514, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.96386719, + "step": 1292, + "time_per_iteration": 2.7361133098602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183371, + "balance_loss_mlp": 1.08704984, + "epoch": 0.24874951904578685, + "flos": 495736590336.0, + "grad_norm": 0.02424073807687162, + "language_loss": 0.92916596, + "learning_rate": 0.0008796865602168447, + "loss": 0.94099975, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.96289062, + "step": 1293, + "time_per_iteration": 2.5220131874084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186197, + "balance_loss_mlp": 1.09025729, + "epoch": 0.2489419007310504, + "flos": 457173573120.0, + "grad_norm": 0.023099031146870112, + "language_loss": 0.94818902, + "learning_rate": 0.0008794837798365115, + "loss": 0.96005094, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.95898438, + "step": 1294, + "time_per_iteration": 2.6338109970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187707, + "balance_loss_mlp": 1.09191012, + "epoch": 0.24913428241631397, + "flos": 486565011456.0, + "grad_norm": 0.02215078033303108, + "language_loss": 0.96107936, + "learning_rate": 0.0008792808521275089, + "loss": 0.97295642, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.95751953, + "step": 1295, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182687, + "balance_loss_mlp": 1.0869385, + "epoch": 0.24932666410157753, + "flos": 519917793792.0, + "grad_norm": 0.022601932216391857, + "language_loss": 0.96075213, + "learning_rate": 0.0008790777771686206, + "loss": 0.972579, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.95703125, + "step": 1296, + "time_per_iteration": 2.5746819972991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.08610308, + "epoch": 0.2495190457868411, + "flos": 473556888576.0, + "grad_norm": 0.022656020732285023, + "language_loss": 0.93397439, + "learning_rate": 0.0008788745550386872, + "loss": 0.94579285, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.95703125, + "step": 1297, + "time_per_iteration": 2.55985689163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177725, + "balance_loss_mlp": 1.0820719, + "epoch": 0.24971142747210465, + "flos": 747198292992.0, + "grad_norm": 0.023996141347128058, + "language_loss": 0.88372529, + "learning_rate": 0.0008786711858166063, + "loss": 0.89550251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.95605469, + "step": 1298, + "time_per_iteration": 2.9357082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179743, + "balance_loss_mlp": 1.08399367, + "epoch": 0.2499038091573682, + "flos": 750901853184.0, + "grad_norm": 0.025666304870509565, + "language_loss": 0.93355387, + "learning_rate": 0.0008784676695813332, + "loss": 0.9453513, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.95703125, + "step": 1299, + "time_per_iteration": 2.939739942550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187708, + "balance_loss_mlp": 1.09186363, + "epoch": 0.2500961908426318, + "flos": 746342897664.0, + "grad_norm": 0.02448521774653795, + "language_loss": 0.94308037, + "learning_rate": 0.0008782640064118796, + "loss": 0.95495749, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.95800781, + "step": 1300, + "time_per_iteration": 2.882838249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223068, + "balance_loss_mlp": 1.12808228, + "epoch": 0.2502885725278953, + "flos": 1420523672064.0, + "grad_norm": 0.019515623701574104, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77407825, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.94921875, + "step": 1301, + "time_per_iteration": 5.002445220947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180814, + "balance_loss_mlp": 1.08520806, + "epoch": 0.2504809542131589, + "flos": 516231697920.0, + "grad_norm": 0.028413107884204602, + "language_loss": 0.96116567, + "learning_rate": 0.0008778562395867648, + "loss": 0.97297382, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.95556641, + "step": 1302, + "time_per_iteration": 2.6463139057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183586, + "balance_loss_mlp": 1.08783746, + "epoch": 0.25067333589842244, + "flos": 526851554304.0, + "grad_norm": 0.024791221234372676, + "language_loss": 0.9191972, + "learning_rate": 0.0008776521360894127, + "loss": 0.93103302, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.95703125, + "step": 1303, + "time_per_iteration": 2.60622239112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203766, + "balance_loss_mlp": 1.10897064, + "epoch": 0.25086571758368603, + "flos": 1477157326848.0, + "grad_norm": 0.014632010139538269, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80165827, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.94726562, + "step": 1304, + "time_per_iteration": 4.810328006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188508, + "balance_loss_mlp": 1.09285462, + "epoch": 0.2510580992689496, + "flos": 529402277376.0, + "grad_norm": 0.027485922989720333, + "language_loss": 0.99458921, + "learning_rate": 0.0008772434893213186, + "loss": 1.00647426, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.95605469, + "step": 1305, + "time_per_iteration": 2.6031458377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.09155023, + "epoch": 0.25125048095421315, + "flos": 518465513472.0, + "grad_norm": 0.0302061265456268, + "language_loss": 0.93206942, + "learning_rate": 0.0008770389462092276, + "loss": 0.94393957, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.95410156, + "step": 1306, + "time_per_iteration": 2.636845827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118174, + "balance_loss_mlp": 1.0858953, + "epoch": 0.25144286263947674, + "flos": 621674923008.0, + "grad_norm": 0.026354631998576704, + "language_loss": 0.96568018, + "learning_rate": 0.0008768342567176357, + "loss": 0.97749758, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.95800781, + "step": 1307, + "time_per_iteration": 2.797346591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187952, + "balance_loss_mlp": 1.09220326, + "epoch": 0.25163524432474027, + "flos": 504865234944.0, + "grad_norm": 0.024318536510777332, + "language_loss": 0.99895847, + "learning_rate": 0.0008766294209260107, + "loss": 1.01083803, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.95703125, + "step": 1308, + "time_per_iteration": 2.648099184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180717, + "balance_loss_mlp": 1.0850637, + "epoch": 0.25182762601000386, + "flos": 510079472640.0, + "grad_norm": 0.027727924866539442, + "language_loss": 1.0231359, + "learning_rate": 0.0008764244389138767, + "loss": 1.0349431, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.95605469, + "step": 1309, + "time_per_iteration": 2.575963258743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.08396196, + "epoch": 0.2520200076952674, + "flos": 635097282048.0, + "grad_norm": 0.028356059247082867, + "language_loss": 0.93336231, + "learning_rate": 0.000876219310760815, + "loss": 0.94515896, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.95654297, + "step": 1310, + "time_per_iteration": 2.8647706508636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189244, + "balance_loss_mlp": 1.09330475, + "epoch": 0.252212389380531, + "flos": 495651996672.0, + "grad_norm": 0.024396868749396446, + "language_loss": 0.91954494, + "learning_rate": 0.0008760140365464631, + "loss": 0.93143737, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.95898438, + "step": 1311, + "time_per_iteration": 2.592453718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180261, + "balance_loss_mlp": 1.08451247, + "epoch": 0.2524047710657945, + "flos": 491529470976.0, + "grad_norm": 0.026197758988141227, + "language_loss": 0.97483641, + "learning_rate": 0.0008758086163505156, + "loss": 0.98663902, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.95703125, + "step": 1312, + "time_per_iteration": 2.56319260597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181231, + "balance_loss_mlp": 1.08548176, + "epoch": 0.2525971527510581, + "flos": 648612966912.0, + "grad_norm": 0.0242630752619845, + "language_loss": 0.98733318, + "learning_rate": 0.0008756030502527239, + "loss": 0.99914545, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.95703125, + "step": 1313, + "time_per_iteration": 2.858691930770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180546, + "balance_loss_mlp": 1.08455837, + "epoch": 0.2527895344363217, + "flos": 570373026816.0, + "grad_norm": 0.025539383487616106, + "language_loss": 0.99746555, + "learning_rate": 0.0008753973383328954, + "loss": 1.00927103, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.95947266, + "step": 1314, + "time_per_iteration": 2.6683549880981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180137, + "balance_loss_mlp": 1.0841974, + "epoch": 0.2529819161215852, + "flos": 515068127232.0, + "grad_norm": 0.027266475314614652, + "language_loss": 0.95154297, + "learning_rate": 0.0008751914806708952, + "loss": 0.96334434, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.95898438, + "step": 1315, + "time_per_iteration": 2.6008012294769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178852, + "balance_loss_mlp": 1.08310342, + "epoch": 0.2531742978068488, + "flos": 532350498816.0, + "grad_norm": 0.02508848621911812, + "language_loss": 0.91122246, + "learning_rate": 0.0008749854773466439, + "loss": 0.92301095, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.95703125, + "step": 1316, + "time_per_iteration": 2.6595401763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193828, + "balance_loss_mlp": 1.09822178, + "epoch": 0.25336667949211233, + "flos": 597747500544.0, + "grad_norm": 0.027675397486347803, + "language_loss": 0.92894816, + "learning_rate": 0.0008747793284401192, + "loss": 0.9408865, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.95556641, + "step": 1317, + "time_per_iteration": 2.6975109577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187696, + "balance_loss_mlp": 1.09175622, + "epoch": 0.2535590611773759, + "flos": 603255177216.0, + "grad_norm": 0.02603186041930466, + "language_loss": 0.95462376, + "learning_rate": 0.0008745730340313551, + "loss": 0.96650076, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.95898438, + "step": 1318, + "time_per_iteration": 2.805327892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187328, + "balance_loss_mlp": 1.0915786, + "epoch": 0.25375144286263945, + "flos": 496322741760.0, + "grad_norm": 0.027049333310240738, + "language_loss": 0.95645851, + "learning_rate": 0.0008743665942004422, + "loss": 0.96833169, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.95703125, + "step": 1319, + "time_per_iteration": 2.6340737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185781, + "balance_loss_mlp": 1.0896982, + "epoch": 0.25394382454790304, + "flos": 513476858880.0, + "grad_norm": 0.02784781206620994, + "language_loss": 1.02473438, + "learning_rate": 0.0008741600090275277, + "loss": 1.03659225, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.96044922, + "step": 1320, + "time_per_iteration": 2.573155641555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183427, + "balance_loss_mlp": 1.08763099, + "epoch": 0.25413620623316663, + "flos": 960855045120.0, + "grad_norm": 0.03323105604734599, + "language_loss": 0.94160318, + "learning_rate": 0.0008739532785928151, + "loss": 0.95343745, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.95751953, + "step": 1321, + "time_per_iteration": 3.470245122909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190819, + "balance_loss_mlp": 1.09659576, + "epoch": 0.25432858791843016, + "flos": 1580648715264.0, + "grad_norm": 0.017424496497570757, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76084399, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.94140625, + "step": 1322, + "time_per_iteration": 4.8549723625183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184096, + "balance_loss_mlp": 1.08806074, + "epoch": 0.25452096960369375, + "flos": 584893828608.0, + "grad_norm": 0.025099574916072127, + "language_loss": 0.94150972, + "learning_rate": 0.0008735393822590908, + "loss": 0.95335066, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.95996094, + "step": 1323, + "time_per_iteration": 2.6771461963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187145, + "balance_loss_mlp": 1.0910151, + "epoch": 0.2547133512889573, + "flos": 509641041408.0, + "grad_norm": 0.024104352127734364, + "language_loss": 0.95373654, + "learning_rate": 0.0008733322165207681, + "loss": 0.965608, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.9609375, + "step": 1324, + "time_per_iteration": 2.671187400817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191608, + "balance_loss_mlp": 1.09590697, + "epoch": 0.25490573297422087, + "flos": 784035783168.0, + "grad_norm": 0.02719192919889817, + "language_loss": 0.93181324, + "learning_rate": 0.0008731249058420247, + "loss": 0.94372928, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.95654297, + "step": 1325, + "time_per_iteration": 3.0272371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189078, + "balance_loss_mlp": 1.09332883, + "epoch": 0.2550981146594844, + "flos": 510952332288.0, + "grad_norm": 0.024872253546531747, + "language_loss": 1.00651383, + "learning_rate": 0.0008729174503033459, + "loss": 1.0184046, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.95703125, + "step": 1326, + "time_per_iteration": 2.6320900917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187412, + "balance_loss_mlp": 1.09166288, + "epoch": 0.255290496344748, + "flos": 677930545152.0, + "grad_norm": 0.02807770436691079, + "language_loss": 0.93655276, + "learning_rate": 0.0008727098499852728, + "loss": 0.9484269, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.95703125, + "step": 1327, + "time_per_iteration": 2.8246335983276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187202, + "balance_loss_mlp": 1.09116733, + "epoch": 0.2554828780300115, + "flos": 538984816128.0, + "grad_norm": 0.02304152562423393, + "language_loss": 0.97811985, + "learning_rate": 0.0008725021049684034, + "loss": 0.9899919, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.95996094, + "step": 1328, + "time_per_iteration": 2.783276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.08924699, + "epoch": 0.2556752597152751, + "flos": 825622883328.0, + "grad_norm": 0.024322773499976656, + "language_loss": 0.90949428, + "learning_rate": 0.000872294215333391, + "loss": 0.92134333, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.95605469, + "step": 1329, + "time_per_iteration": 3.1658623218536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184378, + "balance_loss_mlp": 1.08867729, + "epoch": 0.2558676414005387, + "flos": 571890435072.0, + "grad_norm": 0.026114012927401953, + "language_loss": 0.91800833, + "learning_rate": 0.0008720861811609457, + "loss": 0.92985213, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.95654297, + "step": 1330, + "time_per_iteration": 2.725680112838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185897, + "balance_loss_mlp": 1.09024334, + "epoch": 0.2560600230858022, + "flos": 487748047872.0, + "grad_norm": 0.02457760145285043, + "language_loss": 0.93800515, + "learning_rate": 0.0008718780025318338, + "loss": 0.94986409, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.95605469, + "step": 1331, + "time_per_iteration": 2.730424404144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08904529, + "epoch": 0.2562524047710658, + "flos": 514119406080.0, + "grad_norm": 0.027688932662206074, + "language_loss": 0.94349414, + "learning_rate": 0.0008716696795268771, + "loss": 0.9553411, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.95605469, + "step": 1332, + "time_per_iteration": 2.6572844982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183855, + "balance_loss_mlp": 1.0881542, + "epoch": 0.25644478645632934, + "flos": 636109129728.0, + "grad_norm": 0.025705757243887913, + "language_loss": 0.96553451, + "learning_rate": 0.0008714612122269538, + "loss": 0.97737306, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.95654297, + "step": 1333, + "time_per_iteration": 2.867598295211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184376, + "balance_loss_mlp": 1.0888176, + "epoch": 0.25663716814159293, + "flos": 437544594432.0, + "grad_norm": 0.025955971973603553, + "language_loss": 1.00358891, + "learning_rate": 0.0008712526007129982, + "loss": 1.01543272, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.95507812, + "step": 1334, + "time_per_iteration": 2.516052484512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186528, + "balance_loss_mlp": 1.0908742, + "epoch": 0.25682954982685646, + "flos": 499242765312.0, + "grad_norm": 0.021880143416013124, + "language_loss": 0.98599482, + "learning_rate": 0.0008710438450660003, + "loss": 0.99786019, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.95605469, + "step": 1335, + "time_per_iteration": 2.659489870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184319, + "balance_loss_mlp": 1.08861768, + "epoch": 0.25702193151212005, + "flos": 458627854848.0, + "grad_norm": 0.028869593177541276, + "language_loss": 0.98979777, + "learning_rate": 0.0008708349453670064, + "loss": 1.00164104, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.95654297, + "step": 1336, + "time_per_iteration": 2.5267841815948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185282, + "balance_loss_mlp": 1.08953345, + "epoch": 0.2572143131973836, + "flos": 599403896832.0, + "grad_norm": 0.021342480544698176, + "language_loss": 0.99445975, + "learning_rate": 0.0008706259016971185, + "loss": 1.00631261, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.95703125, + "step": 1337, + "time_per_iteration": 2.7561397552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118469, + "balance_loss_mlp": 1.08884537, + "epoch": 0.25740669488264717, + "flos": 699526096896.0, + "grad_norm": 0.032203199948080075, + "language_loss": 0.96320713, + "learning_rate": 0.0008704167141374944, + "loss": 0.97505397, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.95800781, + "step": 1338, + "time_per_iteration": 2.7987895011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118993, + "balance_loss_mlp": 1.09432399, + "epoch": 0.25759907656791076, + "flos": 503378025984.0, + "grad_norm": 0.024717846020590344, + "language_loss": 0.97755861, + "learning_rate": 0.0008702073827693482, + "loss": 0.98945785, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.95556641, + "step": 1339, + "time_per_iteration": 2.694470167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186155, + "balance_loss_mlp": 1.0904057, + "epoch": 0.2577914582531743, + "flos": 775241510400.0, + "grad_norm": 0.025036220674882887, + "language_loss": 0.97113985, + "learning_rate": 0.0008699979076739494, + "loss": 0.98300135, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.95703125, + "step": 1340, + "time_per_iteration": 2.962740421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184068, + "balance_loss_mlp": 1.08836627, + "epoch": 0.2579838399384379, + "flos": 460609890816.0, + "grad_norm": 0.026880962232798965, + "language_loss": 0.99139833, + "learning_rate": 0.0008697882889326234, + "loss": 1.00323892, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.95654297, + "step": 1341, + "time_per_iteration": 2.517382860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185483, + "balance_loss_mlp": 1.08987677, + "epoch": 0.2581762216237014, + "flos": 570262236672.0, + "grad_norm": 0.0242955377416103, + "language_loss": 0.96170259, + "learning_rate": 0.0008695785266267515, + "loss": 0.97355735, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.95556641, + "step": 1342, + "time_per_iteration": 2.6961281299591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118536, + "balance_loss_mlp": 1.08961082, + "epoch": 0.258368603308965, + "flos": 605386934784.0, + "grad_norm": 0.023671890991135848, + "language_loss": 0.9337616, + "learning_rate": 0.0008693686208377704, + "loss": 0.94561517, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.95703125, + "step": 1343, + "time_per_iteration": 2.8561604022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184784, + "balance_loss_mlp": 1.08908272, + "epoch": 0.2585609849942285, + "flos": 492486924288.0, + "grad_norm": 0.022133881226187983, + "language_loss": 0.96849036, + "learning_rate": 0.0008691585716471733, + "loss": 0.98033822, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.95654297, + "step": 1344, + "time_per_iteration": 2.6443324089050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185279, + "balance_loss_mlp": 1.08952987, + "epoch": 0.2587533666794921, + "flos": 641957182464.0, + "grad_norm": 0.02305984249039353, + "language_loss": 0.94482636, + "learning_rate": 0.0008689483791365079, + "loss": 0.95667922, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.95703125, + "step": 1345, + "time_per_iteration": 2.8541483879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185515, + "balance_loss_mlp": 1.08976638, + "epoch": 0.2589457483647557, + "flos": 577994996736.0, + "grad_norm": 0.022382124417400225, + "language_loss": 0.97831523, + "learning_rate": 0.0008687380433873786, + "loss": 0.99017042, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.95703125, + "step": 1346, + "time_per_iteration": 2.8148868083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186141, + "balance_loss_mlp": 1.09048796, + "epoch": 0.25913813005001923, + "flos": 536466293760.0, + "grad_norm": 0.024690786073415343, + "language_loss": 0.93800229, + "learning_rate": 0.0008685275644814448, + "loss": 0.94986367, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.95605469, + "step": 1347, + "time_per_iteration": 2.6872267723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188569, + "balance_loss_mlp": 1.0930109, + "epoch": 0.2593305117352828, + "flos": 722346344448.0, + "grad_norm": 0.028015192621825148, + "language_loss": 0.944291, + "learning_rate": 0.0008683169425004216, + "loss": 0.95617664, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.95507812, + "step": 1348, + "time_per_iteration": 2.9036293029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187318, + "balance_loss_mlp": 1.09171176, + "epoch": 0.25952289342054635, + "flos": 711355186176.0, + "grad_norm": 0.028695706473352366, + "language_loss": 0.9867608, + "learning_rate": 0.0008681061775260799, + "loss": 0.99863392, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.95556641, + "step": 1349, + "time_per_iteration": 2.8635356426239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185365, + "balance_loss_mlp": 1.08942509, + "epoch": 0.25971527510580994, + "flos": 456849934848.0, + "grad_norm": 0.028158951385379896, + "language_loss": 1.01652539, + "learning_rate": 0.0008678952696402458, + "loss": 1.02837896, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.95898438, + "step": 1350, + "time_per_iteration": 2.4997899532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184224, + "balance_loss_mlp": 1.08847523, + "epoch": 0.25990765679107347, + "flos": 613753509888.0, + "grad_norm": 0.022929201317296435, + "language_loss": 0.944794, + "learning_rate": 0.000867684218924801, + "loss": 0.95663619, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.95703125, + "step": 1351, + "time_per_iteration": 2.8553221225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190399, + "balance_loss_mlp": 1.09655762, + "epoch": 0.26010003847633706, + "flos": 1541404219392.0, + "grad_norm": 0.011373150433568688, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80137491, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.9375, + "step": 1352, + "time_per_iteration": 4.894901752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185829, + "balance_loss_mlp": 1.0900805, + "epoch": 0.2602924201616006, + "flos": 717544341504.0, + "grad_norm": 0.021521520095987904, + "language_loss": 0.9327749, + "learning_rate": 0.0008672616893328834, + "loss": 0.94463313, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.95703125, + "step": 1353, + "time_per_iteration": 2.9336133003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181557, + "balance_loss_mlp": 1.08571243, + "epoch": 0.2604848018468642, + "flos": 644685825024.0, + "grad_norm": 0.026147354827328006, + "language_loss": 0.99375951, + "learning_rate": 0.0008670502106204512, + "loss": 1.00557506, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.95800781, + "step": 1354, + "time_per_iteration": 2.828476667404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182712, + "balance_loss_mlp": 1.08677256, + "epoch": 0.26067718353212777, + "flos": 518037815808.0, + "grad_norm": 0.024264679119450936, + "language_loss": 0.92830276, + "learning_rate": 0.0008668385894064892, + "loss": 0.94012988, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.95898438, + "step": 1355, + "time_per_iteration": 2.627603054046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183025, + "balance_loss_mlp": 1.08708537, + "epoch": 0.2608695652173913, + "flos": 824224997376.0, + "grad_norm": 0.021603697394371835, + "language_loss": 0.98353279, + "learning_rate": 0.0008666268257731562, + "loss": 0.995363, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.95898438, + "step": 1356, + "time_per_iteration": 3.104410409927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185288, + "balance_loss_mlp": 1.0894438, + "epoch": 0.2610619469026549, + "flos": 1009449039360.0, + "grad_norm": 0.029063247039842262, + "language_loss": 0.98633218, + "learning_rate": 0.0008664149198026662, + "loss": 0.99818504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.95800781, + "step": 1357, + "time_per_iteration": 3.2552602291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184981, + "balance_loss_mlp": 1.08932745, + "epoch": 0.2612543285879184, + "flos": 537825248256.0, + "grad_norm": 0.02677910773484977, + "language_loss": 0.99748302, + "learning_rate": 0.0008662028715772883, + "loss": 1.00933278, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.95605469, + "step": 1358, + "time_per_iteration": 2.6044809818267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186466, + "balance_loss_mlp": 1.09095597, + "epoch": 0.261446710273182, + "flos": 520438817280.0, + "grad_norm": 0.024887857022763207, + "language_loss": 0.95091379, + "learning_rate": 0.0008659906811793467, + "loss": 0.96277845, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.95458984, + "step": 1359, + "time_per_iteration": 2.660039186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118844, + "balance_loss_mlp": 1.09297669, + "epoch": 0.26163909195844554, + "flos": 584399001600.0, + "grad_norm": 0.02478490455868915, + "language_loss": 0.99414921, + "learning_rate": 0.0008657783486912215, + "loss": 1.00603366, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.95410156, + "step": 1360, + "time_per_iteration": 2.710707187652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189735, + "balance_loss_mlp": 1.09412944, + "epoch": 0.2618314736437091, + "flos": 960368223744.0, + "grad_norm": 0.025390417969386195, + "language_loss": 0.99146813, + "learning_rate": 0.0008655658741953472, + "loss": 1.00336552, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.95556641, + "step": 1361, + "time_per_iteration": 3.2610023021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187461, + "balance_loss_mlp": 1.0919987, + "epoch": 0.26202385532897265, + "flos": 575902170624.0, + "grad_norm": 0.01965876060868175, + "language_loss": 0.95685869, + "learning_rate": 0.0008653532577742136, + "loss": 0.96873331, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.95410156, + "step": 1362, + "time_per_iteration": 2.753920793533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190509, + "balance_loss_mlp": 1.09509337, + "epoch": 0.26221623701423624, + "flos": 446397264384.0, + "grad_norm": 0.024702919408059576, + "language_loss": 0.95440364, + "learning_rate": 0.0008651404995103659, + "loss": 0.96630871, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.95361328, + "step": 1363, + "time_per_iteration": 2.532839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184254, + "balance_loss_mlp": 1.088696, + "epoch": 0.26240861869949983, + "flos": 536755003392.0, + "grad_norm": 0.021936659097783043, + "language_loss": 0.95658946, + "learning_rate": 0.0008649275994864041, + "loss": 0.96843195, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.95507812, + "step": 1364, + "time_per_iteration": 2.6723499298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182727, + "balance_loss_mlp": 1.08735919, + "epoch": 0.26260100038476336, + "flos": 566487544320.0, + "grad_norm": 0.02057443182875544, + "language_loss": 0.93747735, + "learning_rate": 0.0008647145577849834, + "loss": 0.94930464, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.953125, + "step": 1365, + "time_per_iteration": 2.817335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184888, + "balance_loss_mlp": 1.089378, + "epoch": 0.26279338207002695, + "flos": 614320195584.0, + "grad_norm": 0.02000370099851243, + "language_loss": 0.90110707, + "learning_rate": 0.0008645013744888139, + "loss": 0.912956, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.95458984, + "step": 1366, + "time_per_iteration": 2.889956474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190369, + "balance_loss_mlp": 1.09452498, + "epoch": 0.2629857637552905, + "flos": 523944992256.0, + "grad_norm": 0.02433762343961203, + "language_loss": 0.96272296, + "learning_rate": 0.0008642880496806607, + "loss": 0.97462666, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.95800781, + "step": 1367, + "time_per_iteration": 2.7868857383728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186128, + "balance_loss_mlp": 1.09028387, + "epoch": 0.26317814544055407, + "flos": 535654559232.0, + "grad_norm": 0.022945771924384736, + "language_loss": 0.9318915, + "learning_rate": 0.0008640745834433437, + "loss": 0.94375277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.95800781, + "step": 1368, + "time_per_iteration": 2.7556509971618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182695, + "balance_loss_mlp": 1.08718467, + "epoch": 0.2633705271258176, + "flos": 556779479040.0, + "grad_norm": 0.024336346931206027, + "language_loss": 0.96858466, + "learning_rate": 0.000863860975859738, + "loss": 0.98041165, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.95458984, + "step": 1369, + "time_per_iteration": 2.9069716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184914, + "balance_loss_mlp": 1.08945167, + "epoch": 0.2635629088110812, + "flos": 553461957120.0, + "grad_norm": 0.02843668952404612, + "language_loss": 1.00276971, + "learning_rate": 0.0008636472270127733, + "loss": 1.01461875, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.95410156, + "step": 1370, + "time_per_iteration": 2.626201868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185086, + "balance_loss_mlp": 1.08952749, + "epoch": 0.2637552904963448, + "flos": 456915062784.0, + "grad_norm": 0.02826867423240315, + "language_loss": 1.01819849, + "learning_rate": 0.0008634333369854345, + "loss": 1.03004944, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.95507812, + "step": 1371, + "time_per_iteration": 2.5906460285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183664, + "balance_loss_mlp": 1.08820105, + "epoch": 0.2639476721816083, + "flos": 614259070464.0, + "grad_norm": 0.024066040008067748, + "language_loss": 0.95210433, + "learning_rate": 0.0008632193058607608, + "loss": 0.96394098, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.95410156, + "step": 1372, + "time_per_iteration": 2.7260935306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180244, + "balance_loss_mlp": 1.08487642, + "epoch": 0.2641400538668719, + "flos": 573025807872.0, + "grad_norm": 0.02730663798923432, + "language_loss": 0.93146777, + "learning_rate": 0.0008630051337218466, + "loss": 0.94327021, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.953125, + "step": 1373, + "time_per_iteration": 2.7155323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193282, + "balance_loss_mlp": 1.09777129, + "epoch": 0.2643324355521354, + "flos": 583339490304.0, + "grad_norm": 0.02802871933703498, + "language_loss": 0.91373825, + "learning_rate": 0.0008627908206518409, + "loss": 0.9256711, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.95458984, + "step": 1374, + "time_per_iteration": 2.7118475437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189674, + "balance_loss_mlp": 1.09621429, + "epoch": 0.264524817237399, + "flos": 1548025075200.0, + "grad_norm": 0.008601814223210932, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76340932, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.93359375, + "step": 1375, + "time_per_iteration": 4.9838175773620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192464, + "balance_loss_mlp": 1.09709656, + "epoch": 0.26471719892266254, + "flos": 519042932736.0, + "grad_norm": 0.024634755338573868, + "language_loss": 0.99606347, + "learning_rate": 0.0008623617720514241, + "loss": 1.0079881, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.953125, + "step": 1376, + "time_per_iteration": 2.5836029052734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191563, + "balance_loss_mlp": 1.09586143, + "epoch": 0.26490958060792613, + "flos": 518205001728.0, + "grad_norm": 0.02740625444526412, + "language_loss": 0.95827538, + "learning_rate": 0.0008621470366875848, + "loss": 0.97019094, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.95654297, + "step": 1377, + "time_per_iteration": 2.574557304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190438, + "balance_loss_mlp": 1.09507096, + "epoch": 0.26510196229318966, + "flos": 597682372608.0, + "grad_norm": 0.02552910213335578, + "language_loss": 0.96441573, + "learning_rate": 0.0008619321607257966, + "loss": 0.97632015, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.953125, + "step": 1378, + "time_per_iteration": 2.680574655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187734, + "balance_loss_mlp": 1.09227157, + "epoch": 0.26529434397845325, + "flos": 687052459008.0, + "grad_norm": 0.024630390251990656, + "language_loss": 0.90670931, + "learning_rate": 0.000861717144249482, + "loss": 0.91858661, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.95410156, + "step": 1379, + "time_per_iteration": 2.8311944007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181951, + "balance_loss_mlp": 1.08672631, + "epoch": 0.26548672566371684, + "flos": 425259609600.0, + "grad_norm": 0.02240925569996582, + "language_loss": 0.98143864, + "learning_rate": 0.0008615019873421175, + "loss": 0.99325812, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.95166016, + "step": 1380, + "time_per_iteration": 2.472280263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182344, + "balance_loss_mlp": 1.08716714, + "epoch": 0.26567910734898037, + "flos": 490849993728.0, + "grad_norm": 0.024166031959674275, + "language_loss": 0.9586165, + "learning_rate": 0.0008612866900872349, + "loss": 0.97043991, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.95117188, + "step": 1381, + "time_per_iteration": 2.5671043395996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181037, + "balance_loss_mlp": 1.08586013, + "epoch": 0.26587148903424396, + "flos": 535228862976.0, + "grad_norm": 0.024625622440273682, + "language_loss": 0.97316492, + "learning_rate": 0.0008610712525684197, + "loss": 0.98497522, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.95117188, + "step": 1382, + "time_per_iteration": 2.6394782066345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179642, + "balance_loss_mlp": 1.08446515, + "epoch": 0.2660638707195075, + "flos": 1019055046656.0, + "grad_norm": 0.02944222863828147, + "language_loss": 0.96464765, + "learning_rate": 0.0008608556748693121, + "loss": 0.97644401, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.95117188, + "step": 1383, + "time_per_iteration": 3.2514846324920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184353, + "balance_loss_mlp": 1.08941519, + "epoch": 0.2662562524047711, + "flos": 525062900736.0, + "grad_norm": 0.024003921212174706, + "language_loss": 0.95956504, + "learning_rate": 0.000860639957073607, + "loss": 0.97140861, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.94873047, + "step": 1384, + "time_per_iteration": 2.6759448051452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190743, + "balance_loss_mlp": 1.09594798, + "epoch": 0.2664486340900346, + "flos": 553479421440.0, + "grad_norm": 0.02584009515603871, + "language_loss": 0.97059226, + "learning_rate": 0.0008604240992650534, + "loss": 0.98249966, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.94726562, + "step": 1385, + "time_per_iteration": 2.6880476474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187786, + "balance_loss_mlp": 1.09260905, + "epoch": 0.2666410157752982, + "flos": 471208280064.0, + "grad_norm": 0.023709316387392747, + "language_loss": 0.98021734, + "learning_rate": 0.0008602081015274545, + "loss": 0.99209523, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.95117188, + "step": 1386, + "time_per_iteration": 2.71233868598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187602, + "balance_loss_mlp": 1.0924257, + "epoch": 0.2668333974605617, + "flos": 571015574016.0, + "grad_norm": 0.021121239598078063, + "language_loss": 0.90840185, + "learning_rate": 0.0008599919639446684, + "loss": 0.92027789, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.95117188, + "step": 1387, + "time_per_iteration": 2.6656363010406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183674, + "balance_loss_mlp": 1.08840239, + "epoch": 0.2670257791458253, + "flos": 399895369728.0, + "grad_norm": 0.029257146370583235, + "language_loss": 0.92911923, + "learning_rate": 0.000859775686600607, + "loss": 0.940956, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.95214844, + "step": 1388, + "time_per_iteration": 2.5366902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186225, + "balance_loss_mlp": 1.09104884, + "epoch": 0.2672181608310889, + "flos": 516891709440.0, + "grad_norm": 0.02488439836403737, + "language_loss": 0.94369394, + "learning_rate": 0.0008595592695792367, + "loss": 0.95555621, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.95117188, + "step": 1389, + "time_per_iteration": 2.6710469722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184466, + "balance_loss_mlp": 1.08928883, + "epoch": 0.26741054251635243, + "flos": 508525134336.0, + "grad_norm": 0.024055725628873734, + "language_loss": 0.99442971, + "learning_rate": 0.0008593427129645778, + "loss": 1.00627434, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.95117188, + "step": 1390, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184607, + "balance_loss_mlp": 1.08919191, + "epoch": 0.267602924201616, + "flos": 577808345088.0, + "grad_norm": 0.025635319637122064, + "language_loss": 0.93523198, + "learning_rate": 0.0008591260168407052, + "loss": 0.94707805, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.95361328, + "step": 1391, + "time_per_iteration": 2.766150712966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118642, + "balance_loss_mlp": 1.09095728, + "epoch": 0.26779530588687955, + "flos": 524999774208.0, + "grad_norm": 0.02196829508666122, + "language_loss": 0.92168128, + "learning_rate": 0.0008589091812917479, + "loss": 0.93354547, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.95410156, + "step": 1392, + "time_per_iteration": 2.6208953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119079, + "balance_loss_mlp": 1.09580445, + "epoch": 0.26798768757214314, + "flos": 557827530240.0, + "grad_norm": 0.02442636530887492, + "language_loss": 0.95854455, + "learning_rate": 0.0008586922064018887, + "loss": 0.97045243, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.94921875, + "step": 1393, + "time_per_iteration": 2.6643927097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190751, + "balance_loss_mlp": 1.09581244, + "epoch": 0.2681800692574067, + "flos": 932094693888.0, + "grad_norm": 0.0254733622090453, + "language_loss": 0.99184585, + "learning_rate": 0.0008584750922553651, + "loss": 1.00375342, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.94873047, + "step": 1394, + "time_per_iteration": 3.1305503845214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192347, + "balance_loss_mlp": 1.09712303, + "epoch": 0.26837245094267026, + "flos": 702317865984.0, + "grad_norm": 0.023340973249423663, + "language_loss": 0.92753315, + "learning_rate": 0.0008582578389364677, + "loss": 0.93945664, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.95166016, + "step": 1395, + "time_per_iteration": 2.8527095317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184756, + "balance_loss_mlp": 1.08953142, + "epoch": 0.26856483262793385, + "flos": 594393775104.0, + "grad_norm": 0.020526468408011762, + "language_loss": 1.00206113, + "learning_rate": 0.0008580404465295422, + "loss": 1.01390874, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.95166016, + "step": 1396, + "time_per_iteration": 2.784592866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184595, + "balance_loss_mlp": 1.08922791, + "epoch": 0.2687572143131974, + "flos": 715588502016.0, + "grad_norm": 0.024818089102904728, + "language_loss": 0.9790895, + "learning_rate": 0.0008578229151189876, + "loss": 0.99093544, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.953125, + "step": 1397, + "time_per_iteration": 2.901818037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185216, + "balance_loss_mlp": 1.0896579, + "epoch": 0.26894959599846097, + "flos": 468670291968.0, + "grad_norm": 0.028086023154021946, + "language_loss": 0.91012216, + "learning_rate": 0.0008576052447892573, + "loss": 0.92197436, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.95507812, + "step": 1398, + "time_per_iteration": 2.5849812030792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09082139, + "epoch": 0.2691419776837245, + "flos": 469629746688.0, + "grad_norm": 0.022530608820729603, + "language_loss": 0.95147502, + "learning_rate": 0.000857387435624858, + "loss": 0.96333838, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.95458984, + "step": 1399, + "time_per_iteration": 2.5274569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011908, + "balance_loss_mlp": 1.09567106, + "epoch": 0.2693343593689881, + "flos": 939284963328.0, + "grad_norm": 0.02095039568010189, + "language_loss": 0.95472848, + "learning_rate": 0.0008571694877103513, + "loss": 0.96663648, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.95068359, + "step": 1400, + "time_per_iteration": 3.2558727264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190554, + "balance_loss_mlp": 1.09542465, + "epoch": 0.2695267410542516, + "flos": 578793996288.0, + "grad_norm": 0.0241215692671091, + "language_loss": 0.95762217, + "learning_rate": 0.0008569514011303515, + "loss": 0.96952766, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.95068359, + "step": 1401, + "time_per_iteration": 2.8175997734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193641, + "balance_loss_mlp": 1.09846401, + "epoch": 0.2697191227395152, + "flos": 557964516864.0, + "grad_norm": 0.02413892998134183, + "language_loss": 0.96554017, + "learning_rate": 0.0008567331759695277, + "loss": 0.97747654, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.95117188, + "step": 1402, + "time_per_iteration": 2.7052927017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192424, + "balance_loss_mlp": 1.09729552, + "epoch": 0.26991150442477874, + "flos": 530314068480.0, + "grad_norm": 0.024237100625486396, + "language_loss": 0.97319567, + "learning_rate": 0.0008565148123126023, + "loss": 0.98511994, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.95068359, + "step": 1403, + "time_per_iteration": 2.6399028301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187922, + "balance_loss_mlp": 1.09274554, + "epoch": 0.2701038861100423, + "flos": 533086371840.0, + "grad_norm": 0.021620674049761555, + "language_loss": 0.93398714, + "learning_rate": 0.0008562963102443516, + "loss": 0.94586635, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.95117188, + "step": 1404, + "time_per_iteration": 2.6793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185578, + "balance_loss_mlp": 1.09035325, + "epoch": 0.2702962677953059, + "flos": 736504576512.0, + "grad_norm": 0.026106257639691363, + "language_loss": 0.94497591, + "learning_rate": 0.0008560776698496056, + "loss": 0.95683169, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.95166016, + "step": 1405, + "time_per_iteration": 2.8884029388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186883, + "balance_loss_mlp": 1.09170628, + "epoch": 0.27048864948056944, + "flos": 576000225792.0, + "grad_norm": 0.025611862530653208, + "language_loss": 0.95929742, + "learning_rate": 0.0008558588912132481, + "loss": 0.97116625, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.95117188, + "step": 1406, + "time_per_iteration": 2.8396451473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190124, + "balance_loss_mlp": 1.09666443, + "epoch": 0.27068103116583303, + "flos": 1426910212608.0, + "grad_norm": 0.014531874927713828, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77649117, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.93359375, + "step": 1407, + "time_per_iteration": 4.898139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119097, + "balance_loss_mlp": 1.09603214, + "epoch": 0.27087341285109656, + "flos": 533031977472.0, + "grad_norm": 0.024689522623330563, + "language_loss": 0.90804136, + "learning_rate": 0.0008554209195555016, + "loss": 0.91995108, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.94873047, + "step": 1408, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189645, + "balance_loss_mlp": 1.09446859, + "epoch": 0.27106579453636015, + "flos": 582464629248.0, + "grad_norm": 0.0247795195650599, + "language_loss": 0.98232609, + "learning_rate": 0.0008552017267041483, + "loss": 0.99422252, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.95117188, + "step": 1409, + "time_per_iteration": 2.6904594898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118886, + "balance_loss_mlp": 1.09368336, + "epoch": 0.2712581762216237, + "flos": 507880585728.0, + "grad_norm": 0.024309295256612126, + "language_loss": 0.90687084, + "learning_rate": 0.0008549823959512549, + "loss": 0.91875941, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.95117188, + "step": 1410, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189943, + "balance_loss_mlp": 1.09481394, + "epoch": 0.27145055790688727, + "flos": 999142087680.0, + "grad_norm": 0.023895808714677214, + "language_loss": 0.95848304, + "learning_rate": 0.0008547629273819728, + "loss": 0.97038245, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.95068359, + "step": 1411, + "time_per_iteration": 3.36985182762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186274, + "balance_loss_mlp": 1.09109735, + "epoch": 0.2716429395921508, + "flos": 547728697344.0, + "grad_norm": 0.02712613780862537, + "language_loss": 0.93229926, + "learning_rate": 0.0008545433210815074, + "loss": 0.94416201, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.95117188, + "step": 1412, + "time_per_iteration": 2.601452350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182035, + "balance_loss_mlp": 1.08685839, + "epoch": 0.2718353212774144, + "flos": 574310902272.0, + "grad_norm": 0.02439507328911507, + "language_loss": 0.95137858, + "learning_rate": 0.0008543235771351176, + "loss": 0.96319902, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.95117188, + "step": 1413, + "time_per_iteration": 2.7132034301757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197126, + "balance_loss_mlp": 1.10209203, + "epoch": 0.272027702962678, + "flos": 645584881152.0, + "grad_norm": 0.02257567173785872, + "language_loss": 0.91220462, + "learning_rate": 0.0008541036956281154, + "loss": 0.92417586, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.94970703, + "step": 1414, + "time_per_iteration": 2.871951103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187874, + "balance_loss_mlp": 1.09284067, + "epoch": 0.2722200846479415, + "flos": 654995504640.0, + "grad_norm": 0.026411231013774135, + "language_loss": 0.93374348, + "learning_rate": 0.0008538836766458665, + "loss": 0.94562221, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.94970703, + "step": 1415, + "time_per_iteration": 2.8673384189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183666, + "balance_loss_mlp": 1.08868039, + "epoch": 0.2724124663332051, + "flos": 580778033664.0, + "grad_norm": 0.027862690716265133, + "language_loss": 0.96171892, + "learning_rate": 0.0008536635202737897, + "loss": 0.97355556, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.94921875, + "step": 1416, + "time_per_iteration": 2.7829935550689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183251, + "balance_loss_mlp": 1.08831298, + "epoch": 0.2726048480184686, + "flos": 538467795456.0, + "grad_norm": 0.025077003090708358, + "language_loss": 0.93469489, + "learning_rate": 0.0008534432265973573, + "loss": 0.94652736, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.94873047, + "step": 1417, + "time_per_iteration": 2.593364715576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183107, + "balance_loss_mlp": 1.08793056, + "epoch": 0.2727972297037322, + "flos": 997548817920.0, + "grad_norm": 0.025553987949566613, + "language_loss": 0.99255168, + "learning_rate": 0.000853222795702095, + "loss": 1.00438273, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.95117188, + "step": 1418, + "time_per_iteration": 3.387162685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119173, + "balance_loss_mlp": 1.09712589, + "epoch": 0.27298961138899575, + "flos": 607334042112.0, + "grad_norm": 0.02541700118612174, + "language_loss": 0.93465757, + "learning_rate": 0.0008530022276735813, + "loss": 0.94657481, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.9453125, + "step": 1419, + "time_per_iteration": 2.7426016330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.0965513, + "epoch": 0.27318199307425933, + "flos": 530396660736.0, + "grad_norm": 0.025702548257077976, + "language_loss": 0.9374572, + "learning_rate": 0.0008527815225974489, + "loss": 0.94937015, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.94677734, + "step": 1420, + "time_per_iteration": 2.6544342041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118326, + "balance_loss_mlp": 1.08865511, + "epoch": 0.2733743747595229, + "flos": 409911610368.0, + "grad_norm": 0.028874111022423956, + "language_loss": 0.99327809, + "learning_rate": 0.0008525606805593829, + "loss": 1.00511074, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.9453125, + "step": 1421, + "time_per_iteration": 2.4215376377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182106, + "balance_loss_mlp": 1.08721578, + "epoch": 0.27356675644478645, + "flos": 517228082688.0, + "grad_norm": 0.026406413504372096, + "language_loss": 0.92442018, + "learning_rate": 0.0008523397016451213, + "loss": 0.93624127, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.94824219, + "step": 1422, + "time_per_iteration": 2.5680603981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184812, + "balance_loss_mlp": 1.09011269, + "epoch": 0.27375913813005004, + "flos": 1054058221056.0, + "grad_norm": 0.02228341429952914, + "language_loss": 0.94973963, + "learning_rate": 0.0008521185859404564, + "loss": 0.96158779, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.94628906, + "step": 1423, + "time_per_iteration": 3.37345814704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179884, + "balance_loss_mlp": 1.08485043, + "epoch": 0.27395151981531357, + "flos": 626003566080.0, + "grad_norm": 0.02387683630357993, + "language_loss": 0.97909242, + "learning_rate": 0.0008518973335312326, + "loss": 0.99089128, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.94970703, + "step": 1424, + "time_per_iteration": 2.8314859867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184763, + "balance_loss_mlp": 1.08982456, + "epoch": 0.27414390150057716, + "flos": 551414793216.0, + "grad_norm": 0.028545098094769822, + "language_loss": 0.95577884, + "learning_rate": 0.0008516759445033477, + "loss": 0.96762645, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.94873047, + "step": 1425, + "time_per_iteration": 2.6086578369140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.08705389, + "epoch": 0.2743362831858407, + "flos": 540951389184.0, + "grad_norm": 0.02677358847245462, + "language_loss": 0.96958816, + "learning_rate": 0.0008514544189427526, + "loss": 0.9814086, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.94921875, + "step": 1426, + "time_per_iteration": 2.6927483081817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.09713852, + "epoch": 0.2745286648711043, + "flos": 469545153024.0, + "grad_norm": 0.025998263163597202, + "language_loss": 0.95807564, + "learning_rate": 0.0008512327569354511, + "loss": 0.96999258, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.94482422, + "step": 1427, + "time_per_iteration": 2.5617682933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119268, + "balance_loss_mlp": 1.09764659, + "epoch": 0.2747210465563678, + "flos": 473871794688.0, + "grad_norm": 0.02733358796633043, + "language_loss": 0.93333006, + "learning_rate": 0.0008510109585675001, + "loss": 0.94525683, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.94970703, + "step": 1428, + "time_per_iteration": 2.7269434928894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205208, + "balance_loss_mlp": 1.11193848, + "epoch": 0.2749134282416314, + "flos": 1318056866304.0, + "grad_norm": 0.019809968329655446, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82358551, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.93164062, + "step": 1429, + "time_per_iteration": 4.731899738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190948, + "balance_loss_mlp": 1.0958662, + "epoch": 0.275105809926895, + "flos": 972531684864.0, + "grad_norm": 0.03147414200634365, + "language_loss": 0.91184711, + "learning_rate": 0.0008505669530941415, + "loss": 0.92375666, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.95019531, + "step": 1430, + "time_per_iteration": 3.3260724544525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189047, + "balance_loss_mlp": 1.09387004, + "epoch": 0.2752981916121585, + "flos": 528368962560.0, + "grad_norm": 0.025580193945061114, + "language_loss": 0.95012403, + "learning_rate": 0.000850344746161112, + "loss": 0.96201456, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.95117188, + "step": 1431, + "time_per_iteration": 2.5820231437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186021, + "balance_loss_mlp": 1.09093964, + "epoch": 0.2754905732974221, + "flos": 454598654976.0, + "grad_norm": 0.024219881250434897, + "language_loss": 0.962569, + "learning_rate": 0.0008501224032121894, + "loss": 0.97442919, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.95019531, + "step": 1432, + "time_per_iteration": 2.501572847366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188894, + "balance_loss_mlp": 1.09362173, + "epoch": 0.27568295498268564, + "flos": 498508893696.0, + "grad_norm": 0.02427263624604226, + "language_loss": 0.90960014, + "learning_rate": 0.0008498999243336946, + "loss": 0.921489, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.95214844, + "step": 1433, + "time_per_iteration": 2.6212003231048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192375, + "balance_loss_mlp": 1.09715116, + "epoch": 0.2758753366679492, + "flos": 609416134656.0, + "grad_norm": 0.024278981864862804, + "language_loss": 0.95570171, + "learning_rate": 0.0008496773096120021, + "loss": 0.9676255, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.95166016, + "step": 1434, + "time_per_iteration": 2.804689407348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118926, + "balance_loss_mlp": 1.09370184, + "epoch": 0.27606771835321275, + "flos": 741436835328.0, + "grad_norm": 0.025697024392157108, + "language_loss": 0.95037985, + "learning_rate": 0.0008494545591335381, + "loss": 0.96227252, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.95507812, + "step": 1435, + "time_per_iteration": 2.9329347610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195816, + "balance_loss_mlp": 1.10068655, + "epoch": 0.27626010003847634, + "flos": 555748165632.0, + "grad_norm": 0.0206290639721941, + "language_loss": 0.927001, + "learning_rate": 0.0008492316729847823, + "loss": 0.93895912, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.95068359, + "step": 1436, + "time_per_iteration": 2.820913553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09245288, + "epoch": 0.2764524817237399, + "flos": 543695494656.0, + "grad_norm": 0.02424730092158954, + "language_loss": 0.88914406, + "learning_rate": 0.0008490086512522664, + "loss": 0.90102232, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.953125, + "step": 1437, + "time_per_iteration": 2.7454309463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186593, + "balance_loss_mlp": 1.09127319, + "epoch": 0.27664486340900346, + "flos": 407128573440.0, + "grad_norm": 0.024912305575595636, + "language_loss": 0.99286187, + "learning_rate": 0.0008487854940225755, + "loss": 1.00472784, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.95263672, + "step": 1438, + "time_per_iteration": 2.4809510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183239, + "balance_loss_mlp": 1.08834839, + "epoch": 0.27683724509426705, + "flos": 523156726272.0, + "grad_norm": 0.025259333782437998, + "language_loss": 0.98154646, + "learning_rate": 0.0008485622013823466, + "loss": 0.99337876, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.94824219, + "step": 1439, + "time_per_iteration": 2.65401554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183688, + "balance_loss_mlp": 1.08865404, + "epoch": 0.2770296267795306, + "flos": 536409897984.0, + "grad_norm": 0.02898674716386243, + "language_loss": 0.9318651, + "learning_rate": 0.00084833877341827, + "loss": 0.94370198, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.94970703, + "step": 1440, + "time_per_iteration": 2.6294455528259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192537, + "balance_loss_mlp": 1.09755075, + "epoch": 0.27722200846479417, + "flos": 488970015744.0, + "grad_norm": 0.027244615130064133, + "language_loss": 0.90653217, + "learning_rate": 0.000848115210217088, + "loss": 0.91845751, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.94921875, + "step": 1441, + "time_per_iteration": 2.5394957065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118987, + "balance_loss_mlp": 1.09493196, + "epoch": 0.2774143901500577, + "flos": 619443108864.0, + "grad_norm": 0.024388639686817183, + "language_loss": 0.9228884, + "learning_rate": 0.0008478915118655952, + "loss": 0.93478709, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.94873047, + "step": 1442, + "time_per_iteration": 2.7634968757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119119, + "balance_loss_mlp": 1.0962522, + "epoch": 0.2776067718353213, + "flos": 514844545536.0, + "grad_norm": 0.021441164984372, + "language_loss": 0.94525409, + "learning_rate": 0.0008476676784506393, + "loss": 0.95716596, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.94873047, + "step": 1443, + "time_per_iteration": 2.6474499702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.09678042, + "epoch": 0.2777991535205848, + "flos": 1006040919552.0, + "grad_norm": 0.026818715625153876, + "language_loss": 0.93016809, + "learning_rate": 0.0008474437100591201, + "loss": 0.94208288, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.94628906, + "step": 1444, + "time_per_iteration": 3.311842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189789, + "balance_loss_mlp": 1.09494591, + "epoch": 0.2779915352058484, + "flos": 551375861760.0, + "grad_norm": 0.021641305677188864, + "language_loss": 0.95129728, + "learning_rate": 0.0008472196067779898, + "loss": 0.96319526, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.94775391, + "step": 1445, + "time_per_iteration": 2.667910575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186263, + "balance_loss_mlp": 1.091277, + "epoch": 0.278183916891112, + "flos": 875215990272.0, + "grad_norm": 0.030449834007814664, + "language_loss": 0.98351109, + "learning_rate": 0.0008469953686942531, + "loss": 0.99537361, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.94921875, + "step": 1446, + "time_per_iteration": 3.100473403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187264, + "balance_loss_mlp": 1.09246826, + "epoch": 0.2783762985763755, + "flos": 625195834368.0, + "grad_norm": 0.025904191205549917, + "language_loss": 0.93646944, + "learning_rate": 0.0008467709958949668, + "loss": 0.94834208, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.94726562, + "step": 1447, + "time_per_iteration": 2.7201731204986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09333074, + "epoch": 0.2785686802616391, + "flos": 582911792640.0, + "grad_norm": 0.026760771702797625, + "language_loss": 0.94447374, + "learning_rate": 0.0008465464884672403, + "loss": 0.9563536, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.94580078, + "step": 1448, + "time_per_iteration": 2.7300403118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118631, + "balance_loss_mlp": 1.09180129, + "epoch": 0.27876106194690264, + "flos": 588538991616.0, + "grad_norm": 0.0212290178255441, + "language_loss": 0.93077391, + "learning_rate": 0.0008463218464982348, + "loss": 0.94263697, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.94433594, + "step": 1449, + "time_per_iteration": 2.86130952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190148, + "balance_loss_mlp": 1.09520972, + "epoch": 0.27895344363216623, + "flos": 877430340096.0, + "grad_norm": 0.02756647509109648, + "language_loss": 0.96903402, + "learning_rate": 0.0008460970700751645, + "loss": 0.98093557, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.94873047, + "step": 1450, + "time_per_iteration": 3.069391965866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188227, + "balance_loss_mlp": 1.0932883, + "epoch": 0.27914582531742976, + "flos": 605035098624.0, + "grad_norm": 0.025261876769304706, + "language_loss": 0.97766632, + "learning_rate": 0.000845872159285295, + "loss": 0.98954856, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.94873047, + "step": 1451, + "time_per_iteration": 2.748164653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197098, + "balance_loss_mlp": 1.10325623, + "epoch": 0.27933820700269335, + "flos": 1501130411520.0, + "grad_norm": 0.012982305827020523, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78963947, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.9375, + "step": 1452, + "time_per_iteration": 4.906180143356323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198876, + "balance_loss_mlp": 1.10408044, + "epoch": 0.2795305886879569, + "flos": 1033517451264.0, + "grad_norm": 0.027093914793319178, + "language_loss": 0.95323974, + "learning_rate": 0.0008454219349544836, + "loss": 0.9652285, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.94726562, + "step": 1453, + "time_per_iteration": 3.333178758621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194793, + "balance_loss_mlp": 1.10014069, + "epoch": 0.27972297037322047, + "flos": 608226367488.0, + "grad_norm": 0.025225525542022995, + "language_loss": 0.8972255, + "learning_rate": 0.000845196621588334, + "loss": 0.90917349, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.94580078, + "step": 1454, + "time_per_iteration": 2.7425026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191631, + "balance_loss_mlp": 1.09697926, + "epoch": 0.27991535205848406, + "flos": 631560907776.0, + "grad_norm": 0.023908777965609074, + "language_loss": 0.86623406, + "learning_rate": 0.0008449711742049706, + "loss": 0.87815034, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.94580078, + "step": 1455, + "time_per_iteration": 2.8148674964904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188728, + "balance_loss_mlp": 1.09369469, + "epoch": 0.2801077337437476, + "flos": 550353280512.0, + "grad_norm": 0.02989232443782136, + "language_loss": 0.94001353, + "learning_rate": 0.0008447455928919196, + "loss": 0.95190072, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.94970703, + "step": 1456, + "time_per_iteration": 2.6030025482177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186748, + "balance_loss_mlp": 1.09166706, + "epoch": 0.2803001154290112, + "flos": 487741317120.0, + "grad_norm": 0.023726139763527557, + "language_loss": 0.95883709, + "learning_rate": 0.0008445198777367595, + "loss": 0.97070462, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.95019531, + "step": 1457, + "time_per_iteration": 2.598212718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188426, + "balance_loss_mlp": 1.09344053, + "epoch": 0.2804924971142747, + "flos": 523091598336.0, + "grad_norm": 0.027291046925092925, + "language_loss": 0.9210875, + "learning_rate": 0.0008442940288271208, + "loss": 0.93297172, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.94921875, + "step": 1458, + "time_per_iteration": 2.617572069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189438, + "balance_loss_mlp": 1.09473801, + "epoch": 0.2806848787995383, + "flos": 528849053184.0, + "grad_norm": 0.02378106137707509, + "language_loss": 0.95258486, + "learning_rate": 0.0008440680462506856, + "loss": 0.96447927, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.94628906, + "step": 1459, + "time_per_iteration": 2.7465641498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191591, + "balance_loss_mlp": 1.09660506, + "epoch": 0.2808772604848018, + "flos": 486484420608.0, + "grad_norm": 0.02248739277997059, + "language_loss": 0.9351486, + "learning_rate": 0.0008438419300951883, + "loss": 0.94706452, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.94921875, + "step": 1460, + "time_per_iteration": 2.6331160068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188162, + "balance_loss_mlp": 1.09303284, + "epoch": 0.2810696421700654, + "flos": 619339049472.0, + "grad_norm": 0.024684272432392865, + "language_loss": 0.96464884, + "learning_rate": 0.0008436156804484148, + "loss": 0.97653049, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.95068359, + "step": 1461, + "time_per_iteration": 2.7740418910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188616, + "balance_loss_mlp": 1.09358263, + "epoch": 0.28126202385532895, + "flos": 455686364160.0, + "grad_norm": 0.026728942288464865, + "language_loss": 0.99464989, + "learning_rate": 0.0008433892973982031, + "loss": 1.00653601, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.94970703, + "step": 1462, + "time_per_iteration": 2.5151000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188441, + "balance_loss_mlp": 1.09345496, + "epoch": 0.28145440554059253, + "flos": 531738150912.0, + "grad_norm": 0.02863032020985732, + "language_loss": 0.95777607, + "learning_rate": 0.0008431627810324431, + "loss": 0.96966046, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.94921875, + "step": 1463, + "time_per_iteration": 2.64477801322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.09298646, + "epoch": 0.2816467872258561, + "flos": 453163838976.0, + "grad_norm": 0.025052425157320847, + "language_loss": 0.90961307, + "learning_rate": 0.000842936131439076, + "loss": 0.92149282, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.94921875, + "step": 1464, + "time_per_iteration": 2.5910096168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186267, + "balance_loss_mlp": 1.09147155, + "epoch": 0.28183916891111965, + "flos": 473704608768.0, + "grad_norm": 0.02627501463847235, + "language_loss": 0.97073281, + "learning_rate": 0.0008427093487060951, + "loss": 0.98259544, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.94726562, + "step": 1465, + "time_per_iteration": 2.6250505447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187944, + "balance_loss_mlp": 1.09300542, + "epoch": 0.28203155059638324, + "flos": 558188098560.0, + "grad_norm": 0.02108937585301408, + "language_loss": 0.91709232, + "learning_rate": 0.000842482432921545, + "loss": 0.92897177, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.94873047, + "step": 1466, + "time_per_iteration": 2.809101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.09139562, + "epoch": 0.28222393228164677, + "flos": 417878685696.0, + "grad_norm": 0.025824876793605126, + "language_loss": 0.96517414, + "learning_rate": 0.0008422553841735225, + "loss": 0.97703695, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.94824219, + "step": 1467, + "time_per_iteration": 2.468773365020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184331, + "balance_loss_mlp": 1.08963072, + "epoch": 0.28241631396691036, + "flos": 606040215552.0, + "grad_norm": 0.02479925640814435, + "language_loss": 0.92490911, + "learning_rate": 0.0008420282025501757, + "loss": 0.93675244, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.94628906, + "step": 1468, + "time_per_iteration": 2.7617123126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184258, + "balance_loss_mlp": 1.08960581, + "epoch": 0.2826086956521739, + "flos": 574050390528.0, + "grad_norm": 0.023359152371130017, + "language_loss": 0.93868291, + "learning_rate": 0.0008418008881397043, + "loss": 0.95052546, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.94580078, + "step": 1469, + "time_per_iteration": 2.681727886199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185359, + "balance_loss_mlp": 1.09056342, + "epoch": 0.2828010773374375, + "flos": 844318603776.0, + "grad_norm": 0.02469333041166596, + "language_loss": 0.92646587, + "learning_rate": 0.0008415734410303595, + "loss": 0.93831944, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.94726562, + "step": 1470, + "time_per_iteration": 3.1949617862701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186089, + "balance_loss_mlp": 1.09124613, + "epoch": 0.28299345902270107, + "flos": 543771356160.0, + "grad_norm": 0.022743934694793657, + "language_loss": 0.98454034, + "learning_rate": 0.0008413458613104444, + "loss": 0.99640119, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.94775391, + "step": 1471, + "time_per_iteration": 2.679994583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184615, + "balance_loss_mlp": 1.08972394, + "epoch": 0.2831858407079646, + "flos": 572754562560.0, + "grad_norm": 0.02381851847695354, + "language_loss": 0.91435039, + "learning_rate": 0.0008411181490683129, + "loss": 0.92619658, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.94824219, + "step": 1472, + "time_per_iteration": 2.7178077697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186226, + "balance_loss_mlp": 1.09152639, + "epoch": 0.2833782223932282, + "flos": 765170875392.0, + "grad_norm": 0.023393787071714342, + "language_loss": 0.92628008, + "learning_rate": 0.0008408903043923707, + "loss": 0.9381423, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.94628906, + "step": 1473, + "time_per_iteration": 3.0261785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184462, + "balance_loss_mlp": 1.0899055, + "epoch": 0.2835706040784917, + "flos": 540087261696.0, + "grad_norm": 0.026141956799832673, + "language_loss": 0.93214488, + "learning_rate": 0.0008406623273710754, + "loss": 0.94398952, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.94482422, + "step": 1474, + "time_per_iteration": 2.62430739402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118759, + "balance_loss_mlp": 1.09312844, + "epoch": 0.2837629857637553, + "flos": 531653557248.0, + "grad_norm": 0.026627011980012938, + "language_loss": 0.91140723, + "learning_rate": 0.0008404342180929351, + "loss": 0.9232831, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.94384766, + "step": 1475, + "time_per_iteration": 2.6201882362365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191029, + "balance_loss_mlp": 1.09666264, + "epoch": 0.28395536744901884, + "flos": 541109842944.0, + "grad_norm": 0.026942213566754976, + "language_loss": 0.91036892, + "learning_rate": 0.00084020597664651, + "loss": 0.92227924, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.94287109, + "step": 1476, + "time_per_iteration": 2.792515516281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191806, + "balance_loss_mlp": 1.09743977, + "epoch": 0.2841477491342824, + "flos": 574801726464.0, + "grad_norm": 0.0281069748307863, + "language_loss": 0.94561875, + "learning_rate": 0.0008399776031204111, + "loss": 0.95753682, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.94287109, + "step": 1477, + "time_per_iteration": 2.7592930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189206, + "balance_loss_mlp": 1.09479237, + "epoch": 0.28434013081954596, + "flos": 573138599424.0, + "grad_norm": 0.025578880464706598, + "language_loss": 0.90985346, + "learning_rate": 0.0008397490976033009, + "loss": 0.92174542, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.94335938, + "step": 1478, + "time_per_iteration": 2.72312331199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193047, + "balance_loss_mlp": 1.10015869, + "epoch": 0.28453251250480954, + "flos": 1556673629184.0, + "grad_norm": 0.009281527310597816, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.7907269, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.92773438, + "step": 1479, + "time_per_iteration": 4.714428901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188304, + "balance_loss_mlp": 1.0943675, + "epoch": 0.28472489419007313, + "flos": 750426491904.0, + "grad_norm": 0.023822673694276757, + "language_loss": 0.93367732, + "learning_rate": 0.0008392916909509525, + "loss": 0.94556034, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.93847656, + "step": 1480, + "time_per_iteration": 3.0365796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183623, + "balance_loss_mlp": 1.08930516, + "epoch": 0.28491727587533666, + "flos": 491138703360.0, + "grad_norm": 0.028675048847138535, + "language_loss": 0.94468164, + "learning_rate": 0.0008390627899932954, + "loss": 0.95651788, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.94238281, + "step": 1481, + "time_per_iteration": 2.562316656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187714, + "balance_loss_mlp": 1.09353888, + "epoch": 0.28510965756060025, + "flos": 730359081984.0, + "grad_norm": 0.028797322451775676, + "language_loss": 0.96514452, + "learning_rate": 0.000838833757399789, + "loss": 0.97702163, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.94091797, + "step": 1482, + "time_per_iteration": 2.955920696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189825, + "balance_loss_mlp": 1.09593546, + "epoch": 0.2853020392458638, + "flos": 552669688320.0, + "grad_norm": 0.027781834693451857, + "language_loss": 0.92148101, + "learning_rate": 0.0008386045932593515, + "loss": 0.93337923, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.93798828, + "step": 1483, + "time_per_iteration": 2.6609442234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185409, + "balance_loss_mlp": 1.09151959, + "epoch": 0.28549442093112737, + "flos": 756096625152.0, + "grad_norm": 0.023489805753692042, + "language_loss": 0.9365592, + "learning_rate": 0.0008383752976609525, + "loss": 0.94841331, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.93798828, + "step": 1484, + "time_per_iteration": 2.914872646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.09480286, + "epoch": 0.2856868026163909, + "flos": 539703224832.0, + "grad_norm": 0.026354969281760218, + "language_loss": 0.9020288, + "learning_rate": 0.0008381458706936123, + "loss": 0.91391522, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.9375, + "step": 1485, + "time_per_iteration": 2.7100982666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190691, + "balance_loss_mlp": 1.09675431, + "epoch": 0.2858791843016545, + "flos": 584920025088.0, + "grad_norm": 0.026556247425645045, + "language_loss": 0.97539783, + "learning_rate": 0.0008379163124464025, + "loss": 0.98730469, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.93847656, + "step": 1486, + "time_per_iteration": 2.7065536975860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192022, + "balance_loss_mlp": 1.0979898, + "epoch": 0.286071565986918, + "flos": 646051510272.0, + "grad_norm": 0.03147840332437955, + "language_loss": 0.84533966, + "learning_rate": 0.0008376866230084452, + "loss": 0.85725987, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.93945312, + "step": 1487, + "time_per_iteration": 2.818673849105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186798, + "balance_loss_mlp": 1.09295619, + "epoch": 0.2862639476721816, + "flos": 492330471936.0, + "grad_norm": 0.02612625436823832, + "language_loss": 0.963471, + "learning_rate": 0.000837456802468914, + "loss": 0.975339, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.9375, + "step": 1488, + "time_per_iteration": 2.5766210556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185712, + "balance_loss_mlp": 1.09187043, + "epoch": 0.2864563293574452, + "flos": 522744491520.0, + "grad_norm": 0.023875595461199783, + "language_loss": 0.96454561, + "learning_rate": 0.0008372268509170331, + "loss": 0.9764027, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.9375, + "step": 1489, + "time_per_iteration": 2.7241337299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117946, + "balance_loss_mlp": 1.08537972, + "epoch": 0.2866487110427087, + "flos": 548256451584.0, + "grad_norm": 0.022999113981848278, + "language_loss": 0.93815279, + "learning_rate": 0.0008369967684420779, + "loss": 0.94994742, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.93994141, + "step": 1490, + "time_per_iteration": 2.7358930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180309, + "balance_loss_mlp": 1.08656251, + "epoch": 0.2868410927279723, + "flos": 483217290240.0, + "grad_norm": 0.024118055050044187, + "language_loss": 0.93676293, + "learning_rate": 0.0008367665551333736, + "loss": 0.94856608, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.93652344, + "step": 1491, + "time_per_iteration": 2.6094913482666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181201, + "balance_loss_mlp": 1.08731139, + "epoch": 0.28703347441323585, + "flos": 726136499712.0, + "grad_norm": 0.03204326630579906, + "language_loss": 0.96034807, + "learning_rate": 0.0008365362110802977, + "loss": 0.9721601, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.93798828, + "step": 1492, + "time_per_iteration": 2.862281322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180339, + "balance_loss_mlp": 1.08630645, + "epoch": 0.28722585609849943, + "flos": 636213189120.0, + "grad_norm": 0.024948941988181064, + "language_loss": 0.92257547, + "learning_rate": 0.0008363057363722773, + "loss": 0.93437886, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.93945312, + "step": 1493, + "time_per_iteration": 2.8364765644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.08695745, + "epoch": 0.28741823778376296, + "flos": 511251775488.0, + "grad_norm": 0.026788978355157977, + "language_loss": 0.94388151, + "learning_rate": 0.0008360751310987906, + "loss": 0.9556905, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.93847656, + "step": 1494, + "time_per_iteration": 2.5825915336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.09244919, + "epoch": 0.28761061946902655, + "flos": 604931039232.0, + "grad_norm": 0.023099591474152015, + "language_loss": 0.92881125, + "learning_rate": 0.0008358443953493666, + "loss": 0.94067132, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.93457031, + "step": 1495, + "time_per_iteration": 2.8426852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190116, + "balance_loss_mlp": 1.09617913, + "epoch": 0.28780300115429014, + "flos": 408059830272.0, + "grad_norm": 0.026469370193436835, + "language_loss": 0.97524667, + "learning_rate": 0.0008356135292135851, + "loss": 0.98714793, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.93847656, + "step": 1496, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187979, + "balance_loss_mlp": 1.09356499, + "epoch": 0.28799538283955367, + "flos": 375744365568.0, + "grad_norm": 0.028081335314896084, + "language_loss": 1.02447343, + "learning_rate": 0.0008353825327810758, + "loss": 1.03635335, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.94335938, + "step": 1497, + "time_per_iteration": 2.4137980937957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188393, + "balance_loss_mlp": 1.09416974, + "epoch": 0.28818776452481726, + "flos": 593019357696.0, + "grad_norm": 0.027570910872340922, + "language_loss": 0.91214752, + "learning_rate": 0.00083515140614152, + "loss": 0.9240315, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.94140625, + "step": 1498, + "time_per_iteration": 2.7084319591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188877, + "balance_loss_mlp": 1.0943675, + "epoch": 0.2883801462100808, + "flos": 536103724032.0, + "grad_norm": 0.024692508476740448, + "language_loss": 0.97239816, + "learning_rate": 0.0008349201493846485, + "loss": 0.9842869, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.94433594, + "step": 1499, + "time_per_iteration": 2.6401236057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190398, + "balance_loss_mlp": 1.09617448, + "epoch": 0.2885725278953444, + "flos": 481076800512.0, + "grad_norm": 0.026282906035864008, + "language_loss": 0.98523659, + "learning_rate": 0.0008346887626002432, + "loss": 0.99714065, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.94140625, + "step": 1500, + "time_per_iteration": 2.52458119392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.09863722, + "epoch": 0.2887649095806079, + "flos": 465029858304.0, + "grad_norm": 0.024051725112114657, + "language_loss": 0.95880306, + "learning_rate": 0.000834457245878137, + "loss": 0.970734, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.94384766, + "step": 1501, + "time_per_iteration": 2.629535436630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192018, + "balance_loss_mlp": 1.09765196, + "epoch": 0.2889572912658715, + "flos": 932639912448.0, + "grad_norm": 0.02596355901590014, + "language_loss": 0.90450358, + "learning_rate": 0.000834225599308212, + "loss": 0.9164238, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.94287109, + "step": 1502, + "time_per_iteration": 3.2340567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189718, + "balance_loss_mlp": 1.09568572, + "epoch": 0.28914967295113503, + "flos": 571256620032.0, + "grad_norm": 0.02412179831144176, + "language_loss": 0.9487462, + "learning_rate": 0.0008339938229804016, + "loss": 0.96064335, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.93945312, + "step": 1503, + "time_per_iteration": 2.710339069366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193321, + "balance_loss_mlp": 1.10081482, + "epoch": 0.2893420546363986, + "flos": 1489872010752.0, + "grad_norm": 0.01509287591883609, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76628143, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.92382812, + "step": 1504, + "time_per_iteration": 4.937675714492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189832, + "balance_loss_mlp": 1.09579968, + "epoch": 0.2895344363216622, + "flos": 471182083584.0, + "grad_norm": 0.02978733186062401, + "language_loss": 0.95586789, + "learning_rate": 0.0008335298814111094, + "loss": 0.96776623, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.93945312, + "step": 1505, + "time_per_iteration": 2.5757808685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.10075009, + "epoch": 0.28972681800692573, + "flos": 649340107776.0, + "grad_norm": 0.024998045510076724, + "language_loss": 0.95390272, + "learning_rate": 0.0008332977163497455, + "loss": 0.96585107, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.93994141, + "step": 1506, + "time_per_iteration": 2.8062288761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190367, + "balance_loss_mlp": 1.09638238, + "epoch": 0.2899191996921893, + "flos": 573305785344.0, + "grad_norm": 0.023440576211443395, + "language_loss": 0.92864263, + "learning_rate": 0.0008330654218907325, + "loss": 0.94054627, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.93896484, + "step": 1507, + "time_per_iteration": 2.6871397495269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195663, + "balance_loss_mlp": 1.10158336, + "epoch": 0.29011158137745285, + "flos": 662636940288.0, + "grad_norm": 0.026311762315396375, + "language_loss": 0.90949756, + "learning_rate": 0.0008328329981242548, + "loss": 0.92145419, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.93994141, + "step": 1508, + "time_per_iteration": 2.870436906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189885, + "balance_loss_mlp": 1.09585261, + "epoch": 0.29030396306271644, + "flos": 537402279936.0, + "grad_norm": 0.02293974263799261, + "language_loss": 0.95641714, + "learning_rate": 0.0008326004451405475, + "loss": 0.96831596, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.93945312, + "step": 1509, + "time_per_iteration": 2.7639336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191857, + "balance_loss_mlp": 1.09815872, + "epoch": 0.29049634474798, + "flos": 512955835392.0, + "grad_norm": 0.025710607890434264, + "language_loss": 0.93112034, + "learning_rate": 0.0008323677630298957, + "loss": 0.94303894, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.93603516, + "step": 1510, + "time_per_iteration": 2.561455726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118953, + "balance_loss_mlp": 1.09592652, + "epoch": 0.29068872643324356, + "flos": 614982208512.0, + "grad_norm": 0.023671610956976636, + "language_loss": 0.92362118, + "learning_rate": 0.0008321349518826345, + "loss": 0.93551642, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.93505859, + "step": 1511, + "time_per_iteration": 2.807711362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191488, + "balance_loss_mlp": 1.09736073, + "epoch": 0.2908811081185071, + "flos": 547468185600.0, + "grad_norm": 0.029262624151918007, + "language_loss": 1.03824317, + "learning_rate": 0.0008319020117891491, + "loss": 1.05015802, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.94042969, + "step": 1512, + "time_per_iteration": 2.626357316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192195, + "balance_loss_mlp": 1.09840155, + "epoch": 0.2910734898037707, + "flos": 605901227520.0, + "grad_norm": 0.026098769068304807, + "language_loss": 0.96355087, + "learning_rate": 0.0008316689428398751, + "loss": 0.97547281, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.93701172, + "step": 1513, + "time_per_iteration": 2.6982998847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190959, + "balance_loss_mlp": 1.09721279, + "epoch": 0.29126587148903427, + "flos": 575835041280.0, + "grad_norm": 0.02240755749123148, + "language_loss": 0.95587385, + "learning_rate": 0.0008314357451252979, + "loss": 0.96778345, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.93652344, + "step": 1514, + "time_per_iteration": 2.7506277561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185358, + "balance_loss_mlp": 1.09170711, + "epoch": 0.2914582531742978, + "flos": 572133482496.0, + "grad_norm": 0.030106635879309524, + "language_loss": 0.98758858, + "learning_rate": 0.0008312024187359527, + "loss": 0.99944222, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.93554688, + "step": 1515, + "time_per_iteration": 2.6389546394348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186161, + "balance_loss_mlp": 1.09265339, + "epoch": 0.2916506348595614, + "flos": 732302186496.0, + "grad_norm": 0.023105382424412787, + "language_loss": 0.95643955, + "learning_rate": 0.000830968963762425, + "loss": 0.96830118, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.93408203, + "step": 1516, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183995, + "balance_loss_mlp": 1.09048688, + "epoch": 0.2918430165448249, + "flos": 511466625024.0, + "grad_norm": 0.027481799845478876, + "language_loss": 0.92072952, + "learning_rate": 0.0008307353802953497, + "loss": 0.93256938, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.93408203, + "step": 1517, + "time_per_iteration": 2.6852073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188929, + "balance_loss_mlp": 1.09546912, + "epoch": 0.2920353982300885, + "flos": 631606569984.0, + "grad_norm": 0.024841994736450757, + "language_loss": 0.95207542, + "learning_rate": 0.0008305016684254125, + "loss": 0.9639647, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.93359375, + "step": 1518, + "time_per_iteration": 2.78326678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185623, + "balance_loss_mlp": 1.0920676, + "epoch": 0.29222777991535204, + "flos": 502670350848.0, + "grad_norm": 0.02442081482663903, + "language_loss": 0.96402657, + "learning_rate": 0.0008302678282433479, + "loss": 0.97588277, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.93457031, + "step": 1519, + "time_per_iteration": 2.580885887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186077, + "balance_loss_mlp": 1.09261727, + "epoch": 0.2924201616006156, + "flos": 487841373696.0, + "grad_norm": 0.025531334181834578, + "language_loss": 0.92434102, + "learning_rate": 0.0008300338598399411, + "loss": 0.93620181, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.93359375, + "step": 1520, + "time_per_iteration": 2.60040020942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182574, + "balance_loss_mlp": 1.08911419, + "epoch": 0.2926125432858792, + "flos": 477410170368.0, + "grad_norm": 0.025034871095789283, + "language_loss": 1.04410791, + "learning_rate": 0.0008297997633060263, + "loss": 1.05593348, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.93359375, + "step": 1521, + "time_per_iteration": 2.5479507446289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184296, + "balance_loss_mlp": 1.09083581, + "epoch": 0.29280492497114274, + "flos": 677867418624.0, + "grad_norm": 0.023158831925944874, + "language_loss": 0.93757105, + "learning_rate": 0.0008295655387324883, + "loss": 0.94941401, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.93359375, + "step": 1522, + "time_per_iteration": 2.80924916267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184597, + "balance_loss_mlp": 1.09113646, + "epoch": 0.29299730665640633, + "flos": 459344262144.0, + "grad_norm": 0.024881330364852117, + "language_loss": 0.95369709, + "learning_rate": 0.0008293311862102609, + "loss": 0.96554303, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.93359375, + "step": 1523, + "time_per_iteration": 2.5006909370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183918, + "balance_loss_mlp": 1.09055364, + "epoch": 0.29318968834166986, + "flos": 447495707136.0, + "grad_norm": 0.027757525537519354, + "language_loss": 0.99242002, + "learning_rate": 0.0008290967058303275, + "loss": 1.00425935, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.93261719, + "step": 1524, + "time_per_iteration": 2.472071409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.09098816, + "epoch": 0.29338207002693345, + "flos": 451255663104.0, + "grad_norm": 0.024483324027042522, + "language_loss": 0.93697757, + "learning_rate": 0.0008288620976837219, + "loss": 0.9488225, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.93408203, + "step": 1525, + "time_per_iteration": 2.486726760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183678, + "balance_loss_mlp": 1.08997941, + "epoch": 0.293574451712197, + "flos": 503284700160.0, + "grad_norm": 0.025672010983446535, + "language_loss": 0.92014909, + "learning_rate": 0.000828627361861527, + "loss": 0.93198591, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.93603516, + "step": 1526, + "time_per_iteration": 2.557725429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183155, + "balance_loss_mlp": 1.089504, + "epoch": 0.29376683339746057, + "flos": 697683048960.0, + "grad_norm": 0.028193197708561973, + "language_loss": 0.94158876, + "learning_rate": 0.0008283924984548752, + "loss": 0.95342028, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.93554688, + "step": 1527, + "time_per_iteration": 2.866138219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182528, + "balance_loss_mlp": 1.08882964, + "epoch": 0.2939592150827241, + "flos": 479541927936.0, + "grad_norm": 0.024215116577050826, + "language_loss": 0.92182994, + "learning_rate": 0.0008281575075549485, + "loss": 0.93365526, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.93603516, + "step": 1528, + "time_per_iteration": 2.5585758686065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202408, + "balance_loss_mlp": 1.1108551, + "epoch": 0.2941515967679877, + "flos": 1488386803200.0, + "grad_norm": 0.02007823063587109, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78555101, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.9140625, + "step": 1529, + "time_per_iteration": 4.658870697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186281, + "balance_loss_mlp": 1.09267783, + "epoch": 0.2943439784532513, + "flos": 675399287808.0, + "grad_norm": 0.027761434636537758, + "language_loss": 0.99164081, + "learning_rate": 0.0008276871436402469, + "loss": 1.00350356, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.93505859, + "step": 1530, + "time_per_iteration": 2.897517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182983, + "balance_loss_mlp": 1.08909357, + "epoch": 0.2945363601385148, + "flos": 577382648832.0, + "grad_norm": 0.025208295044921922, + "language_loss": 0.95561033, + "learning_rate": 0.000827451770808083, + "loss": 0.96744013, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.93798828, + "step": 1531, + "time_per_iteration": 2.667419910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183127, + "balance_loss_mlp": 1.08923733, + "epoch": 0.2947287418237784, + "flos": 481617289728.0, + "grad_norm": 0.0238323033403859, + "language_loss": 0.92856085, + "learning_rate": 0.0008272162708478674, + "loss": 0.94039214, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.93798828, + "step": 1532, + "time_per_iteration": 2.532593250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190087, + "balance_loss_mlp": 1.09638822, + "epoch": 0.2949211235090419, + "flos": 559260344832.0, + "grad_norm": 0.023856250691152107, + "language_loss": 0.9573307, + "learning_rate": 0.000826980643851029, + "loss": 0.96923155, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.93603516, + "step": 1533, + "time_per_iteration": 2.648393154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190115, + "balance_loss_mlp": 1.09665465, + "epoch": 0.2951135051943055, + "flos": 484856222208.0, + "grad_norm": 0.02761517479674983, + "language_loss": 0.9290787, + "learning_rate": 0.0008267448899090464, + "loss": 0.94097984, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.93359375, + "step": 1534, + "time_per_iteration": 2.5158579349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185677, + "balance_loss_mlp": 1.09226477, + "epoch": 0.29530588687956905, + "flos": 551421523968.0, + "grad_norm": 0.024001584155810263, + "language_loss": 0.90244222, + "learning_rate": 0.0008265090091134473, + "loss": 0.91429895, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.93310547, + "step": 1535, + "time_per_iteration": 2.8246946334838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185762, + "balance_loss_mlp": 1.09234965, + "epoch": 0.29549826856483263, + "flos": 674309577216.0, + "grad_norm": 0.021562014940098434, + "language_loss": 0.8727591, + "learning_rate": 0.0008262730015558088, + "loss": 0.88461667, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.93310547, + "step": 1536, + "time_per_iteration": 2.8568825721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189062, + "balance_loss_mlp": 1.09560144, + "epoch": 0.29569065025009617, + "flos": 766135059456.0, + "grad_norm": 0.0253531059084562, + "language_loss": 0.89567208, + "learning_rate": 0.0008260368673277574, + "loss": 0.90756267, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.93359375, + "step": 1537, + "time_per_iteration": 3.1248908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181656, + "balance_loss_mlp": 1.08781409, + "epoch": 0.29588303193535975, + "flos": 544830867456.0, + "grad_norm": 0.02589470547450269, + "language_loss": 0.93808746, + "learning_rate": 0.0008258006065209682, + "loss": 0.94990402, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.9375, + "step": 1538, + "time_per_iteration": 2.7405824661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.0892235, + "epoch": 0.29607541362062334, + "flos": 598144998912.0, + "grad_norm": 0.02499469713889481, + "language_loss": 0.9045589, + "learning_rate": 0.0008255642192271657, + "loss": 0.91638815, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.93603516, + "step": 1539, + "time_per_iteration": 2.7654454708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183976, + "balance_loss_mlp": 1.09032559, + "epoch": 0.29626779530588687, + "flos": 611037602304.0, + "grad_norm": 0.024707919738005703, + "language_loss": 0.92616487, + "learning_rate": 0.0008253277055381241, + "loss": 0.93800461, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.93554688, + "step": 1540, + "time_per_iteration": 2.803755760192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186228, + "balance_loss_mlp": 1.09252918, + "epoch": 0.29646017699115046, + "flos": 868957704192.0, + "grad_norm": 0.02707124240628881, + "language_loss": 0.95315254, + "learning_rate": 0.0008250910655456658, + "loss": 0.96501482, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.93603516, + "step": 1541, + "time_per_iteration": 3.11143159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181572, + "balance_loss_mlp": 1.08787382, + "epoch": 0.296652558676414, + "flos": 496880695296.0, + "grad_norm": 0.02670504880571787, + "language_loss": 0.9343757, + "learning_rate": 0.0008248542993416625, + "loss": 0.94619143, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.93603516, + "step": 1542, + "time_per_iteration": 2.5893712043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181697, + "balance_loss_mlp": 1.08790362, + "epoch": 0.2968449403616776, + "flos": 572626308096.0, + "grad_norm": 0.02711797813063544, + "language_loss": 0.9310621, + "learning_rate": 0.0008246174070180352, + "loss": 0.94287908, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.93701172, + "step": 1543, + "time_per_iteration": 2.677011489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189648, + "balance_loss_mlp": 1.09614003, + "epoch": 0.2970373220469411, + "flos": 795650022912.0, + "grad_norm": 0.029629985597633038, + "language_loss": 0.9263432, + "learning_rate": 0.0008243803886667537, + "loss": 0.93823969, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.93408203, + "step": 1544, + "time_per_iteration": 3.1022729873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188285, + "balance_loss_mlp": 1.09472907, + "epoch": 0.2972297037322047, + "flos": 662248174080.0, + "grad_norm": 0.0271995559284498, + "language_loss": 0.89610922, + "learning_rate": 0.0008241432443798364, + "loss": 0.90799212, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.93457031, + "step": 1545, + "time_per_iteration": 2.8079423904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181998, + "balance_loss_mlp": 1.08868086, + "epoch": 0.29742208541746823, + "flos": 598231593984.0, + "grad_norm": 0.02196679377417612, + "language_loss": 0.91743886, + "learning_rate": 0.0008239059742493512, + "loss": 0.92925882, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.93212891, + "step": 1546, + "time_per_iteration": 2.703385353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182095, + "balance_loss_mlp": 1.08868301, + "epoch": 0.2976144671027318, + "flos": 771338563584.0, + "grad_norm": 0.02555387631372138, + "language_loss": 0.94145298, + "learning_rate": 0.0008236685783674142, + "loss": 0.95327395, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.93310547, + "step": 1547, + "time_per_iteration": 3.0583412647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221115, + "balance_loss_mlp": 1.12822723, + "epoch": 0.2978068487879954, + "flos": 1487911441920.0, + "grad_norm": 0.023679675459363107, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77442312, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.92773438, + "step": 1548, + "time_per_iteration": 4.846614360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192812, + "balance_loss_mlp": 1.09925652, + "epoch": 0.29799923047325894, + "flos": 476329191936.0, + "grad_norm": 0.02691026692614136, + "language_loss": 0.91868371, + "learning_rate": 0.0008231934097178955, + "loss": 0.93061185, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.93457031, + "step": 1549, + "time_per_iteration": 2.600588798522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189437, + "balance_loss_mlp": 1.09573877, + "epoch": 0.2981916121585225, + "flos": 761167872000.0, + "grad_norm": 0.02304182660847759, + "language_loss": 0.93441629, + "learning_rate": 0.0008229556371347903, + "loss": 0.94631064, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.93603516, + "step": 1550, + "time_per_iteration": 2.9500393867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196641, + "balance_loss_mlp": 1.10256064, + "epoch": 0.29838399384378606, + "flos": 876516547584.0, + "grad_norm": 0.029531977965095095, + "language_loss": 0.90478379, + "learning_rate": 0.0008227177391691874, + "loss": 0.91675019, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.93994141, + "step": 1551, + "time_per_iteration": 3.117060422897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192501, + "balance_loss_mlp": 1.09870708, + "epoch": 0.29857637552904964, + "flos": 580751837184.0, + "grad_norm": 0.026349497602305087, + "language_loss": 0.9813534, + "learning_rate": 0.0008224797159134463, + "loss": 0.99327838, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.93701172, + "step": 1552, + "time_per_iteration": 2.694382429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185823, + "balance_loss_mlp": 1.09212494, + "epoch": 0.2987687572143132, + "flos": 837807811584.0, + "grad_norm": 0.022207279660822626, + "language_loss": 0.8985877, + "learning_rate": 0.0008222415674599765, + "loss": 0.91044593, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.93603516, + "step": 1553, + "time_per_iteration": 3.074347972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186024, + "balance_loss_mlp": 1.09203923, + "epoch": 0.29896113889957676, + "flos": 568167409152.0, + "grad_norm": 0.026892838709900748, + "language_loss": 0.93768913, + "learning_rate": 0.0008220032939012349, + "loss": 0.94954944, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.93896484, + "step": 1554, + "time_per_iteration": 2.6793601512908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190641, + "balance_loss_mlp": 1.0965606, + "epoch": 0.29915352058484035, + "flos": 499835647488.0, + "grad_norm": 0.021647779244158522, + "language_loss": 0.95223451, + "learning_rate": 0.0008217648953297277, + "loss": 0.96414095, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.93994141, + "step": 1555, + "time_per_iteration": 2.836775779724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189405, + "balance_loss_mlp": 1.09546852, + "epoch": 0.2993459022701039, + "flos": 593214741504.0, + "grad_norm": 0.03843372955580003, + "language_loss": 0.88026905, + "learning_rate": 0.0008215263718380095, + "loss": 0.89216304, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.93847656, + "step": 1556, + "time_per_iteration": 2.6840782165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192028, + "balance_loss_mlp": 1.09790027, + "epoch": 0.29953828395536747, + "flos": 573472971264.0, + "grad_norm": 0.02697506762846426, + "language_loss": 0.95771539, + "learning_rate": 0.0008212877235186833, + "loss": 0.96963573, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.94042969, + "step": 1557, + "time_per_iteration": 2.649303674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216583, + "balance_loss_mlp": 1.12350464, + "epoch": 0.299730665640631, + "flos": 1508083637760.0, + "grad_norm": 0.01733611069553414, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78954148, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.9296875, + "step": 1558, + "time_per_iteration": 4.920740365982056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191809, + "balance_loss_mlp": 1.09772909, + "epoch": 0.2999230473258946, + "flos": 514807615488.0, + "grad_norm": 0.03091345134541536, + "language_loss": 0.92723, + "learning_rate": 0.0008208100527678611, + "loss": 0.93914807, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.93994141, + "step": 1559, + "time_per_iteration": 2.628755807876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191055, + "balance_loss_mlp": 1.09692788, + "epoch": 0.3001154290111581, + "flos": 835853973504.0, + "grad_norm": 0.03027255896835194, + "language_loss": 0.86836946, + "learning_rate": 0.0008205710305218135, + "loss": 0.88028002, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.94042969, + "step": 1560, + "time_per_iteration": 3.0076475143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188346, + "balance_loss_mlp": 1.09431422, + "epoch": 0.3003078106964217, + "flos": 557945051136.0, + "grad_norm": 0.023845762720508586, + "language_loss": 0.96495396, + "learning_rate": 0.0008203318838190541, + "loss": 0.9768374, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.93945312, + "step": 1561, + "time_per_iteration": 2.7329952716827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118952, + "balance_loss_mlp": 1.09548759, + "epoch": 0.30050019238168524, + "flos": 527168461824.0, + "grad_norm": 0.030147848994798797, + "language_loss": 0.95915771, + "learning_rate": 0.0008200926127524281, + "loss": 0.97105289, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.93945312, + "step": 1562, + "time_per_iteration": 2.625941753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186113, + "balance_loss_mlp": 1.09217656, + "epoch": 0.3006925740669488, + "flos": 578936987136.0, + "grad_norm": 0.02860364820877459, + "language_loss": 0.92538679, + "learning_rate": 0.0008198532174148289, + "loss": 0.93724799, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.93847656, + "step": 1563, + "time_per_iteration": 2.725884199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207901, + "balance_loss_mlp": 1.11539459, + "epoch": 0.3008849557522124, + "flos": 1493610499584.0, + "grad_norm": 0.014785027254047896, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8189407, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.92382812, + "step": 1564, + "time_per_iteration": 4.830730438232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.10398376, + "epoch": 0.30107733743747594, + "flos": 510824077824.0, + "grad_norm": 0.03423038852538926, + "language_loss": 0.994165, + "learning_rate": 0.0008193740542985244, + "loss": 1.00614524, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.93945312, + "step": 1565, + "time_per_iteration": 2.578756809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194051, + "balance_loss_mlp": 1.10020983, + "epoch": 0.30126971912273953, + "flos": 588820970496.0, + "grad_norm": 0.027351016206119898, + "language_loss": 0.95914042, + "learning_rate": 0.0008191342867058467, + "loss": 0.97108096, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.9375, + "step": 1566, + "time_per_iteration": 2.7046890258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192822, + "balance_loss_mlp": 1.09898102, + "epoch": 0.30146210080800306, + "flos": 603220248576.0, + "grad_norm": 0.029722715632080093, + "language_loss": 0.93181753, + "learning_rate": 0.0008188943952142509, + "loss": 0.94374579, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.9375, + "step": 1567, + "time_per_iteration": 2.7784945964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189204, + "balance_loss_mlp": 1.09588659, + "epoch": 0.30165448249326665, + "flos": 919286684160.0, + "grad_norm": 0.02698998287866622, + "language_loss": 0.91980577, + "learning_rate": 0.0008186543799168711, + "loss": 0.93169785, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.93212891, + "step": 1568, + "time_per_iteration": 3.1082897186279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188766, + "balance_loss_mlp": 1.09530556, + "epoch": 0.3018468641785302, + "flos": 778630164480.0, + "grad_norm": 0.02791954193910651, + "language_loss": 0.98386627, + "learning_rate": 0.0008184142409068892, + "loss": 0.99575394, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.93359375, + "step": 1569, + "time_per_iteration": 3.0047945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187793, + "balance_loss_mlp": 1.09433293, + "epoch": 0.30203924586379377, + "flos": 523389040128.0, + "grad_norm": 0.023468489537567368, + "language_loss": 0.94207543, + "learning_rate": 0.000818173978277536, + "loss": 0.95395339, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.93359375, + "step": 1570, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119455, + "balance_loss_mlp": 1.10094678, + "epoch": 0.3022316275490573, + "flos": 525649052160.0, + "grad_norm": 0.028721303316250762, + "language_loss": 0.92132497, + "learning_rate": 0.000817933592122089, + "loss": 0.93327045, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.93505859, + "step": 1571, + "time_per_iteration": 2.683819055557251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119426, + "balance_loss_mlp": 1.10037029, + "epoch": 0.3024240092343209, + "flos": 480872684544.0, + "grad_norm": 0.028034832338571278, + "language_loss": 0.93476671, + "learning_rate": 0.0008176930825338749, + "loss": 0.94670928, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.93798828, + "step": 1572, + "time_per_iteration": 2.5472469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.09605432, + "epoch": 0.3026163909195845, + "flos": 688430879232.0, + "grad_norm": 0.025848261804373458, + "language_loss": 0.98155606, + "learning_rate": 0.0008174524496062679, + "loss": 0.9934541, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.93652344, + "step": 1573, + "time_per_iteration": 2.90840482711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.0922308, + "epoch": 0.302808772604848, + "flos": 544086262272.0, + "grad_norm": 0.023993082839652336, + "language_loss": 0.9423182, + "learning_rate": 0.0008172116934326894, + "loss": 0.95417649, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.93505859, + "step": 1574, + "time_per_iteration": 2.735853433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197529, + "balance_loss_mlp": 1.10349655, + "epoch": 0.3030011542901116, + "flos": 476051215872.0, + "grad_norm": 0.025758910941944917, + "language_loss": 0.96492219, + "learning_rate": 0.0008169708141066097, + "loss": 0.97689748, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.93945312, + "step": 1575, + "time_per_iteration": 2.5468080043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195411, + "balance_loss_mlp": 1.10123575, + "epoch": 0.30319353597537513, + "flos": 482472685056.0, + "grad_norm": 0.02368764088299644, + "language_loss": 0.97863203, + "learning_rate": 0.0008167298117215465, + "loss": 0.99058616, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.94091797, + "step": 1576, + "time_per_iteration": 2.5703070163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191699, + "balance_loss_mlp": 1.09747636, + "epoch": 0.3033859176606387, + "flos": 706112750592.0, + "grad_norm": 0.02517452757559557, + "language_loss": 0.96809077, + "learning_rate": 0.0008164886863710649, + "loss": 0.98000777, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.94140625, + "step": 1577, + "time_per_iteration": 2.9235777854919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194461, + "balance_loss_mlp": 1.09990454, + "epoch": 0.30357829934590225, + "flos": 766108862976.0, + "grad_norm": 0.022389524212240816, + "language_loss": 0.93041158, + "learning_rate": 0.0008162474381487783, + "loss": 0.94235623, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.94482422, + "step": 1578, + "time_per_iteration": 3.0875654220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198648, + "balance_loss_mlp": 1.10399556, + "epoch": 0.30377068103116583, + "flos": 533448941568.0, + "grad_norm": 0.026496061930467673, + "language_loss": 0.94202471, + "learning_rate": 0.0008160060671483475, + "loss": 0.9540112, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.94580078, + "step": 1579, + "time_per_iteration": 2.69014048576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198759, + "balance_loss_mlp": 1.10415483, + "epoch": 0.3039630627164294, + "flos": 511223577600.0, + "grad_norm": 0.03174839578716906, + "language_loss": 0.93386602, + "learning_rate": 0.0008157645734634809, + "loss": 0.94585359, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.9453125, + "step": 1580, + "time_per_iteration": 2.602752923965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221184, + "balance_loss_mlp": 1.12791443, + "epoch": 0.30415544440169295, + "flos": 1509188084736.0, + "grad_norm": 0.0221653057193215, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78117669, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.93164062, + "step": 1581, + "time_per_iteration": 4.895219802856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.10334778, + "epoch": 0.30434782608695654, + "flos": 1461787133952.0, + "grad_norm": 0.012004742936218659, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74410546, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.92578125, + "step": 1582, + "time_per_iteration": 4.860503196716309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199297, + "balance_loss_mlp": 1.10526431, + "epoch": 0.3045402077722201, + "flos": 483534197760.0, + "grad_norm": 0.030796945736395555, + "language_loss": 0.93027633, + "learning_rate": 0.000815039357240067, + "loss": 0.94226933, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.93945312, + "step": 1583, + "time_per_iteration": 2.6209895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200124, + "balance_loss_mlp": 1.10613978, + "epoch": 0.30473258945748366, + "flos": 544626751488.0, + "grad_norm": 0.03019985050023197, + "language_loss": 0.95277119, + "learning_rate": 0.0008147973737554952, + "loss": 0.9647724, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.93896484, + "step": 1584, + "time_per_iteration": 2.7421703338623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194047, + "balance_loss_mlp": 1.10039604, + "epoch": 0.3049249711427472, + "flos": 568121746944.0, + "grad_norm": 0.05356410902969654, + "language_loss": 0.96138752, + "learning_rate": 0.000814555268055744, + "loss": 0.97332799, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.93554688, + "step": 1585, + "time_per_iteration": 2.632770299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191549, + "balance_loss_mlp": 1.09804094, + "epoch": 0.3051173528280108, + "flos": 529289485824.0, + "grad_norm": 0.02648444030223836, + "language_loss": 0.96492249, + "learning_rate": 0.0008143130402348073, + "loss": 0.97683799, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.93408203, + "step": 1586, + "time_per_iteration": 2.67673659324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201208, + "balance_loss_mlp": 1.10746217, + "epoch": 0.3053097345132743, + "flos": 587599002624.0, + "grad_norm": 0.026229801397330138, + "language_loss": 0.86860031, + "learning_rate": 0.0008140706903867265, + "loss": 0.88061237, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.93652344, + "step": 1587, + "time_per_iteration": 2.800891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198977, + "balance_loss_mlp": 1.10518289, + "epoch": 0.3055021161985379, + "flos": 608200171008.0, + "grad_norm": 0.031935519152889405, + "language_loss": 1.00360334, + "learning_rate": 0.0008138282186055897, + "loss": 1.01559317, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.93701172, + "step": 1588, + "time_per_iteration": 2.735144853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119001, + "balance_loss_mlp": 1.09645426, + "epoch": 0.3056944978838015, + "flos": 574962181632.0, + "grad_norm": 0.02354328369726863, + "language_loss": 0.90634608, + "learning_rate": 0.0008135856249855331, + "loss": 0.91824615, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.93457031, + "step": 1589, + "time_per_iteration": 2.676589012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193478, + "balance_loss_mlp": 1.0996846, + "epoch": 0.305886879569065, + "flos": 635071085568.0, + "grad_norm": 0.031037281782467684, + "language_loss": 0.99387443, + "learning_rate": 0.0008133429096207398, + "loss": 1.00580931, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.93701172, + "step": 1590, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232346, + "balance_loss_mlp": 1.14117432, + "epoch": 0.3060792612543286, + "flos": 1372131065856.0, + "grad_norm": 0.03086145734446917, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76544607, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.91015625, + "step": 1591, + "time_per_iteration": 4.945107460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194266, + "balance_loss_mlp": 1.10051942, + "epoch": 0.30627164293959214, + "flos": 519618350592.0, + "grad_norm": 0.024964882972055902, + "language_loss": 0.95062864, + "learning_rate": 0.0008128571140339123, + "loss": 0.96257126, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.93652344, + "step": 1592, + "time_per_iteration": 2.6392171382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201642, + "balance_loss_mlp": 1.10780036, + "epoch": 0.3064640246248557, + "flos": 456533027328.0, + "grad_norm": 0.029487227531667784, + "language_loss": 0.98122042, + "learning_rate": 0.0008126140340004805, + "loss": 0.9932369, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.9375, + "step": 1593, + "time_per_iteration": 2.504150629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199461, + "balance_loss_mlp": 1.10561943, + "epoch": 0.30665640631011926, + "flos": 851608203264.0, + "grad_norm": 0.026956571268616787, + "language_loss": 0.91923594, + "learning_rate": 0.0008123708325995172, + "loss": 0.93123049, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.9375, + "step": 1594, + "time_per_iteration": 3.184525489807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190831, + "balance_loss_mlp": 1.09713268, + "epoch": 0.30684878799538284, + "flos": 759615535104.0, + "grad_norm": 0.022474213305982697, + "language_loss": 0.88990366, + "learning_rate": 0.0008121275099254414, + "loss": 0.90181196, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.93603516, + "step": 1595, + "time_per_iteration": 2.892902374267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200579, + "balance_loss_mlp": 1.10668933, + "epoch": 0.3070411696806464, + "flos": 518595769344.0, + "grad_norm": 0.025855927391394404, + "language_loss": 0.96650064, + "learning_rate": 0.0008118840660727194, + "loss": 0.97850645, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.93798828, + "step": 1596, + "time_per_iteration": 2.696312665939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191708, + "balance_loss_mlp": 1.09805715, + "epoch": 0.30723355136590996, + "flos": 845790349824.0, + "grad_norm": 0.023513083336694603, + "language_loss": 0.94521677, + "learning_rate": 0.0008116405011358644, + "loss": 0.95713389, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.93554688, + "step": 1597, + "time_per_iteration": 3.1500890254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118938, + "balance_loss_mlp": 1.09572959, + "epoch": 0.30742593305117355, + "flos": 467079023616.0, + "grad_norm": 0.024597056369147573, + "language_loss": 0.89059556, + "learning_rate": 0.0008113968152094369, + "loss": 0.90248942, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.93554688, + "step": 1598, + "time_per_iteration": 2.502336263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191327, + "balance_loss_mlp": 1.09781969, + "epoch": 0.3076183147364371, + "flos": 687816529920.0, + "grad_norm": 0.025330429780868927, + "language_loss": 0.90385377, + "learning_rate": 0.0008111530083880438, + "loss": 0.91576707, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.93408203, + "step": 1599, + "time_per_iteration": 2.8846051692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192126, + "balance_loss_mlp": 1.09847498, + "epoch": 0.30781069642170067, + "flos": 615179593728.0, + "grad_norm": 0.02627563558110635, + "language_loss": 0.95310938, + "learning_rate": 0.0008109090807663399, + "loss": 0.96503073, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.93554688, + "step": 1600, + "time_per_iteration": 2.8132736682891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119763, + "balance_loss_mlp": 1.10402679, + "epoch": 0.3080030781069642, + "flos": 591508680192.0, + "grad_norm": 0.027223292643472258, + "language_loss": 0.96310741, + "learning_rate": 0.0008106650324390257, + "loss": 0.97508371, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.93505859, + "step": 1601, + "time_per_iteration": 2.8477296829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188215, + "balance_loss_mlp": 1.0948981, + "epoch": 0.3081954597922278, + "flos": 563691045888.0, + "grad_norm": 0.027322987260225157, + "language_loss": 0.89918464, + "learning_rate": 0.0008104208635008493, + "loss": 0.91106677, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.93212891, + "step": 1602, + "time_per_iteration": 2.6639676094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192245, + "balance_loss_mlp": 1.09859383, + "epoch": 0.3083878414774913, + "flos": 448761335808.0, + "grad_norm": 0.031035394068971153, + "language_loss": 0.93496901, + "learning_rate": 0.0008101765740466058, + "loss": 0.94689143, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.93554688, + "step": 1603, + "time_per_iteration": 2.4892899990081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.09465039, + "epoch": 0.3085802231627549, + "flos": 494544821760.0, + "grad_norm": 0.029709960428380106, + "language_loss": 0.93853128, + "learning_rate": 0.0008099321641711364, + "loss": 0.95041513, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.93652344, + "step": 1604, + "time_per_iteration": 2.638798952102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011875, + "balance_loss_mlp": 1.09380174, + "epoch": 0.3087726048480185, + "flos": 488690038272.0, + "grad_norm": 0.02367908107469003, + "language_loss": 0.91951108, + "learning_rate": 0.0008096876339693295, + "loss": 0.93138611, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.93603516, + "step": 1605, + "time_per_iteration": 2.6115643978118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189736, + "balance_loss_mlp": 1.09603786, + "epoch": 0.308964986533282, + "flos": 731887223808.0, + "grad_norm": 0.029121548764615916, + "language_loss": 0.90058184, + "learning_rate": 0.0008094429835361206, + "loss": 0.91247922, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.93603516, + "step": 1606, + "time_per_iteration": 2.9361119270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185725, + "balance_loss_mlp": 1.09226441, + "epoch": 0.3091573682185456, + "flos": 606515576832.0, + "grad_norm": 0.024539043330914945, + "language_loss": 0.94318593, + "learning_rate": 0.0008091982129664908, + "loss": 0.95504314, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.93359375, + "step": 1607, + "time_per_iteration": 2.750641345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191863, + "balance_loss_mlp": 1.09821212, + "epoch": 0.30934974990380915, + "flos": 461306832384.0, + "grad_norm": 0.02635007664096696, + "language_loss": 0.92281848, + "learning_rate": 0.0008089533223554687, + "loss": 0.93473709, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.93554688, + "step": 1608, + "time_per_iteration": 2.733422040939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187457, + "balance_loss_mlp": 1.09380579, + "epoch": 0.30954213158907273, + "flos": 554567130624.0, + "grad_norm": 0.025571984513822792, + "language_loss": 0.94345558, + "learning_rate": 0.0008087083117981294, + "loss": 0.95533013, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.93554688, + "step": 1609, + "time_per_iteration": 2.919583797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189683, + "balance_loss_mlp": 1.09665251, + "epoch": 0.30973451327433627, + "flos": 554113236480.0, + "grad_norm": 0.028700236773969223, + "language_loss": 0.98730469, + "learning_rate": 0.0008084631813895943, + "loss": 0.99920154, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.92919922, + "step": 1610, + "time_per_iteration": 2.7721197605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192773, + "balance_loss_mlp": 1.09955156, + "epoch": 0.30992689495959985, + "flos": 566762792448.0, + "grad_norm": 0.027612542910463767, + "language_loss": 0.93469882, + "learning_rate": 0.0008082179312250315, + "loss": 0.94662654, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.93115234, + "step": 1611, + "time_per_iteration": 2.658564805984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219437, + "balance_loss_mlp": 1.12769318, + "epoch": 0.3101192766448634, + "flos": 1445560270848.0, + "grad_norm": 0.021240149379623804, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81075287, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.91601562, + "step": 1612, + "time_per_iteration": 4.8431174755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.13497162, + "epoch": 0.31031165833012697, + "flos": 1535127742464.0, + "grad_norm": 0.019393089292119553, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77856624, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.921875, + "step": 1613, + "time_per_iteration": 5.043596029281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191397, + "balance_loss_mlp": 1.09850931, + "epoch": 0.31050404001539056, + "flos": 993632409600.0, + "grad_norm": 0.029090005547288914, + "language_loss": 0.90590245, + "learning_rate": 0.0008074814631475545, + "loss": 0.91781646, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.92773438, + "step": 1614, + "time_per_iteration": 3.3308844566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011972, + "balance_loss_mlp": 1.10450339, + "epoch": 0.3106964217006541, + "flos": 446972682240.0, + "grad_norm": 0.029174032275502568, + "language_loss": 0.8959738, + "learning_rate": 0.0008072357349114907, + "loss": 0.90794587, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.92578125, + "step": 1615, + "time_per_iteration": 2.660557746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194484, + "balance_loss_mlp": 1.10169172, + "epoch": 0.3108888033859177, + "flos": 511494822912.0, + "grad_norm": 0.027617375290548026, + "language_loss": 0.9836188, + "learning_rate": 0.0008069898873959363, + "loss": 0.99556363, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.92675781, + "step": 1616, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203555, + "balance_loss_mlp": 1.11076295, + "epoch": 0.3110811850711812, + "flos": 521778306048.0, + "grad_norm": 0.027380341091067188, + "language_loss": 0.94434142, + "learning_rate": 0.0008067439206963375, + "loss": 0.95637697, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.92675781, + "step": 1617, + "time_per_iteration": 2.6584017276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120371, + "balance_loss_mlp": 1.11082232, + "epoch": 0.3112735667564448, + "flos": 687729934848.0, + "grad_norm": 0.029016410329411102, + "language_loss": 0.95023614, + "learning_rate": 0.0008064978349081873, + "loss": 0.96227324, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.92773438, + "step": 1618, + "time_per_iteration": 2.911677122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199948, + "balance_loss_mlp": 1.10720289, + "epoch": 0.31146594844170833, + "flos": 534165348864.0, + "grad_norm": 0.025439718165996668, + "language_loss": 0.95660365, + "learning_rate": 0.0008062516301270245, + "loss": 0.96860307, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.92626953, + "step": 1619, + "time_per_iteration": 2.669111490249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196196, + "balance_loss_mlp": 1.10388064, + "epoch": 0.3116583301269719, + "flos": 680841836544.0, + "grad_norm": 0.024218225399572888, + "language_loss": 0.96279341, + "learning_rate": 0.0008060053064484343, + "loss": 0.97475541, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.921875, + "step": 1620, + "time_per_iteration": 2.924476385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189886, + "balance_loss_mlp": 1.09733212, + "epoch": 0.31185071181223545, + "flos": 587329758720.0, + "grad_norm": 0.02529679167102671, + "language_loss": 0.92711556, + "learning_rate": 0.0008057588639680482, + "loss": 0.93901443, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.92431641, + "step": 1621, + "time_per_iteration": 2.74631667137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119125, + "balance_loss_mlp": 1.09817135, + "epoch": 0.31204309349749904, + "flos": 726657523200.0, + "grad_norm": 0.03522846239796161, + "language_loss": 0.93884659, + "learning_rate": 0.0008055123027815434, + "loss": 0.95075905, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.9296875, + "step": 1622, + "time_per_iteration": 2.90444016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189249, + "balance_loss_mlp": 1.09631383, + "epoch": 0.3122354751827626, + "flos": 577894940160.0, + "grad_norm": 0.026492717763192643, + "language_loss": 0.93252558, + "learning_rate": 0.0008052656229846436, + "loss": 0.94441813, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.92822266, + "step": 1623, + "time_per_iteration": 2.680220603942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09519064, + "epoch": 0.31242785686802615, + "flos": 577028811264.0, + "grad_norm": 0.026617450345468772, + "language_loss": 1.00026262, + "learning_rate": 0.0008050188246731182, + "loss": 1.01214242, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.92675781, + "step": 1624, + "time_per_iteration": 2.6526694297790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190099, + "balance_loss_mlp": 1.09711611, + "epoch": 0.31262023855328974, + "flos": 738195901440.0, + "grad_norm": 0.023806346866415393, + "language_loss": 0.9048847, + "learning_rate": 0.0008047719079427834, + "loss": 0.91678566, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.92871094, + "step": 1625, + "time_per_iteration": 3.0077152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119944, + "balance_loss_mlp": 1.108078, + "epoch": 0.3128126202385533, + "flos": 1562591539200.0, + "grad_norm": 0.020013754894949238, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.7555114, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.91210938, + "step": 1626, + "time_per_iteration": 4.793031215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194528, + "balance_loss_mlp": 1.10111523, + "epoch": 0.31300500192381686, + "flos": 515942988288.0, + "grad_norm": 0.023349922932092686, + "language_loss": 0.95821261, + "learning_rate": 0.0008042777196091757, + "loss": 0.97015792, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.93310547, + "step": 1627, + "time_per_iteration": 2.679588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196127, + "balance_loss_mlp": 1.10281038, + "epoch": 0.3131973836090804, + "flos": 527661287424.0, + "grad_norm": 0.026058472156191805, + "language_loss": 0.91163933, + "learning_rate": 0.0008040304481977643, + "loss": 0.92360055, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.93212891, + "step": 1628, + "time_per_iteration": 2.6339213848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.11335361, + "epoch": 0.313389765294344, + "flos": 824209534464.0, + "grad_norm": 0.028324849871922998, + "language_loss": 0.96729648, + "learning_rate": 0.0008037830587512649, + "loss": 0.97936368, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.93261719, + "step": 1629, + "time_per_iteration": 3.052304744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191904, + "balance_loss_mlp": 1.09896827, + "epoch": 0.31358214697960757, + "flos": 394702599168.0, + "grad_norm": 0.026724204555937114, + "language_loss": 0.89292234, + "learning_rate": 0.0008035355513657224, + "loss": 0.90484136, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.92822266, + "step": 1630, + "time_per_iteration": 2.470526695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198859, + "balance_loss_mlp": 1.1059711, + "epoch": 0.3137745286648711, + "flos": 573097666560.0, + "grad_norm": 0.025006494531642755, + "language_loss": 1.00651205, + "learning_rate": 0.0008032879261372279, + "loss": 1.01850057, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.92773438, + "step": 1631, + "time_per_iteration": 2.7967746257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194023, + "balance_loss_mlp": 1.10418701, + "epoch": 0.3139669103501347, + "flos": 1501629241344.0, + "grad_norm": 0.01894627505164378, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80829865, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.89648438, + "step": 1632, + "time_per_iteration": 5.690793991088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187718, + "balance_loss_mlp": 1.09478259, + "epoch": 0.3141592920353982, + "flos": 526358728704.0, + "grad_norm": 0.023739615719740217, + "language_loss": 0.94780874, + "learning_rate": 0.0008027923225359748, + "loss": 0.95968592, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.92822266, + "step": 1633, + "time_per_iteration": 2.619640827178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182027, + "balance_loss_mlp": 1.08894837, + "epoch": 0.3143516737206618, + "flos": 594387044352.0, + "grad_norm": 0.024020227962995952, + "language_loss": 0.97166598, + "learning_rate": 0.0008025443443556267, + "loss": 0.98348624, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.9296875, + "step": 1634, + "time_per_iteration": 2.7105367183685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187192, + "balance_loss_mlp": 1.09397042, + "epoch": 0.31454405540592534, + "flos": 649679208960.0, + "grad_norm": 0.024579905610689918, + "language_loss": 0.95561564, + "learning_rate": 0.000802296248717147, + "loss": 0.96748757, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.93115234, + "step": 1635, + "time_per_iteration": 2.954427480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189389, + "balance_loss_mlp": 1.09616756, + "epoch": 0.3147364370911889, + "flos": 644069474304.0, + "grad_norm": 0.026460377875643523, + "language_loss": 0.89723325, + "learning_rate": 0.0008020480357168554, + "loss": 0.90912724, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.93115234, + "step": 1636, + "time_per_iteration": 2.7983195781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118902, + "balance_loss_mlp": 1.09575093, + "epoch": 0.31492881877645246, + "flos": 472821015552.0, + "grad_norm": 0.024118652497695542, + "language_loss": 0.95980144, + "learning_rate": 0.0008017997054511165, + "loss": 0.97169161, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.93164062, + "step": 1637, + "time_per_iteration": 2.543381690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188761, + "balance_loss_mlp": 1.09544361, + "epoch": 0.31512120046171604, + "flos": 630629650944.0, + "grad_norm": 0.026442486928658162, + "language_loss": 0.94192296, + "learning_rate": 0.0008015512580163407, + "loss": 0.95381057, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.93212891, + "step": 1638, + "time_per_iteration": 2.8069217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189537, + "balance_loss_mlp": 1.09645832, + "epoch": 0.31531358214697963, + "flos": 705053239296.0, + "grad_norm": 0.0247809696854931, + "language_loss": 0.89687169, + "learning_rate": 0.0008013026935089838, + "loss": 0.9087671, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.9296875, + "step": 1639, + "time_per_iteration": 2.8575150966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189099, + "balance_loss_mlp": 1.09592521, + "epoch": 0.31550596383224316, + "flos": 573631425024.0, + "grad_norm": 0.026868409426578303, + "language_loss": 0.92173505, + "learning_rate": 0.0008010540120255472, + "loss": 0.93362606, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.93066406, + "step": 1640, + "time_per_iteration": 2.6781005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118909, + "balance_loss_mlp": 1.09591639, + "epoch": 0.31569834551750675, + "flos": 659512800768.0, + "grad_norm": 0.03030176261580671, + "language_loss": 0.95734656, + "learning_rate": 0.0008008052136625774, + "loss": 0.96923745, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.93066406, + "step": 1641, + "time_per_iteration": 2.8858654499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192627, + "balance_loss_mlp": 1.09950101, + "epoch": 0.3158907272027703, + "flos": 567403338240.0, + "grad_norm": 0.026165343030711524, + "language_loss": 0.94310361, + "learning_rate": 0.0008005562985166666, + "loss": 0.9550299, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.93017578, + "step": 1642, + "time_per_iteration": 2.7097506523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193912, + "balance_loss_mlp": 1.10102403, + "epoch": 0.31608310888803387, + "flos": 537972968448.0, + "grad_norm": 0.020568762002796243, + "language_loss": 0.9172346, + "learning_rate": 0.0008003072666844524, + "loss": 0.92917377, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.92773438, + "step": 1643, + "time_per_iteration": 2.6982197761535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194419, + "balance_loss_mlp": 1.10181749, + "epoch": 0.3162754905732974, + "flos": 487639259136.0, + "grad_norm": 0.02816029335024998, + "language_loss": 0.90344775, + "learning_rate": 0.0008000581182626173, + "loss": 0.91539198, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.92480469, + "step": 1644, + "time_per_iteration": 2.546762466430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193569, + "balance_loss_mlp": 1.10048997, + "epoch": 0.316467872258561, + "flos": 531095603712.0, + "grad_norm": 0.024394566764596542, + "language_loss": 0.93082815, + "learning_rate": 0.0007998088533478894, + "loss": 0.94276381, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.9296875, + "step": 1645, + "time_per_iteration": 2.6320817470550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188922, + "balance_loss_mlp": 1.09622455, + "epoch": 0.3166602539438245, + "flos": 444413227008.0, + "grad_norm": 0.029455070645316363, + "language_loss": 0.9479661, + "learning_rate": 0.000799559472037042, + "loss": 0.95985526, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.92578125, + "step": 1646, + "time_per_iteration": 2.535414457321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187182, + "balance_loss_mlp": 1.09458041, + "epoch": 0.3168526356290881, + "flos": 647102289408.0, + "grad_norm": 0.02168302123393663, + "language_loss": 0.94649625, + "learning_rate": 0.0007993099744268932, + "loss": 0.95836812, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.92480469, + "step": 1647, + "time_per_iteration": 2.912095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182437, + "balance_loss_mlp": 1.08988261, + "epoch": 0.3170450173143517, + "flos": 587257900032.0, + "grad_norm": 0.023943172344495993, + "language_loss": 0.96008313, + "learning_rate": 0.000799060360614307, + "loss": 0.97190744, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.92431641, + "step": 1648, + "time_per_iteration": 2.6763339042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.09482586, + "epoch": 0.3172373989996152, + "flos": 828573106176.0, + "grad_norm": 0.025050943971751935, + "language_loss": 0.91967106, + "learning_rate": 0.0007988106306961917, + "loss": 0.93154484, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.92431641, + "step": 1649, + "time_per_iteration": 3.1265392303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183645, + "balance_loss_mlp": 1.09151971, + "epoch": 0.3174297806848788, + "flos": 528434090496.0, + "grad_norm": 0.026893421102733506, + "language_loss": 0.92866611, + "learning_rate": 0.0007985607847695014, + "loss": 0.94050252, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.91992188, + "step": 1650, + "time_per_iteration": 2.640529155731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184032, + "balance_loss_mlp": 1.09152567, + "epoch": 0.31762216237014235, + "flos": 714481327104.0, + "grad_norm": 0.024008942139765378, + "language_loss": 0.9102264, + "learning_rate": 0.0007983108229312345, + "loss": 0.92206669, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.92382812, + "step": 1651, + "time_per_iteration": 2.890881299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183971, + "balance_loss_mlp": 1.09170341, + "epoch": 0.31781454405540593, + "flos": 484799826432.0, + "grad_norm": 0.027702532543066302, + "language_loss": 0.9509185, + "learning_rate": 0.0007980607452784351, + "loss": 0.96275818, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.92138672, + "step": 1652, + "time_per_iteration": 2.5693578720092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118418, + "balance_loss_mlp": 1.09186423, + "epoch": 0.31800692574066947, + "flos": 549804059136.0, + "grad_norm": 0.028510736103347943, + "language_loss": 0.99507928, + "learning_rate": 0.0007978105519081919, + "loss": 1.00692105, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.921875, + "step": 1653, + "time_per_iteration": 2.674062967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181597, + "balance_loss_mlp": 1.08947253, + "epoch": 0.31819930742593305, + "flos": 517916292096.0, + "grad_norm": 0.029899238666621586, + "language_loss": 0.96953475, + "learning_rate": 0.0007975602429176385, + "loss": 0.98135078, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.91992188, + "step": 1654, + "time_per_iteration": 2.595107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011812, + "balance_loss_mlp": 1.08907461, + "epoch": 0.31839168911119664, + "flos": 456969457152.0, + "grad_norm": 0.02327460697487094, + "language_loss": 0.90136862, + "learning_rate": 0.0007973098184039536, + "loss": 0.91318059, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.91992188, + "step": 1655, + "time_per_iteration": 2.654873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184047, + "balance_loss_mlp": 1.09192252, + "epoch": 0.3185840707964602, + "flos": 627295391232.0, + "grad_norm": 0.025652000789891626, + "language_loss": 0.955365, + "learning_rate": 0.0007970592784643602, + "loss": 0.96720552, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.91992188, + "step": 1656, + "time_per_iteration": 2.8485612869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183486, + "balance_loss_mlp": 1.09107482, + "epoch": 0.31877645248172376, + "flos": 568540712448.0, + "grad_norm": 0.02977939264047221, + "language_loss": 0.94253254, + "learning_rate": 0.0007968086231961272, + "loss": 0.9543674, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.92285156, + "step": 1657, + "time_per_iteration": 2.6949312686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182357, + "balance_loss_mlp": 1.09004128, + "epoch": 0.3189688341669873, + "flos": 490552551936.0, + "grad_norm": 0.03598298081414456, + "language_loss": 0.95643866, + "learning_rate": 0.0007965578526965671, + "loss": 0.96826226, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.921875, + "step": 1658, + "time_per_iteration": 2.5717341899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182583, + "balance_loss_mlp": 1.09012401, + "epoch": 0.3191612158522509, + "flos": 577380647424.0, + "grad_norm": 0.02594626841132509, + "language_loss": 0.93226576, + "learning_rate": 0.0007963069670630377, + "loss": 0.94409156, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.92333984, + "step": 1659, + "time_per_iteration": 2.7431960105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187517, + "balance_loss_mlp": 1.09486747, + "epoch": 0.3193535975375144, + "flos": 539192934912.0, + "grad_norm": 0.026552556196046555, + "language_loss": 0.97412628, + "learning_rate": 0.0007960559663929416, + "loss": 0.98600149, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.92529297, + "step": 1660, + "time_per_iteration": 2.631037473678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09382606, + "epoch": 0.319545979222778, + "flos": 735627714048.0, + "grad_norm": 0.022912970149823363, + "language_loss": 0.94840437, + "learning_rate": 0.0007958048507837259, + "loss": 0.96026772, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.92382812, + "step": 1661, + "time_per_iteration": 2.925752878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191424, + "balance_loss_mlp": 1.09872651, + "epoch": 0.31973836090804153, + "flos": 765767760384.0, + "grad_norm": 0.030797304976158044, + "language_loss": 0.98320282, + "learning_rate": 0.0007955536203328822, + "loss": 0.99511707, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.92578125, + "step": 1662, + "time_per_iteration": 2.9076955318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187513, + "balance_loss_mlp": 1.09486389, + "epoch": 0.3199307425933051, + "flos": 561741937152.0, + "grad_norm": 0.02511010738984868, + "language_loss": 0.90468192, + "learning_rate": 0.0007953022751379469, + "loss": 0.91655713, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.92529297, + "step": 1663, + "time_per_iteration": 2.7703394889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188156, + "balance_loss_mlp": 1.09564936, + "epoch": 0.3201231242785687, + "flos": 752671041024.0, + "grad_norm": 0.029121282383782986, + "language_loss": 0.92101777, + "learning_rate": 0.000795050815296501, + "loss": 0.93289936, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.92382812, + "step": 1664, + "time_per_iteration": 2.966632843017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188504, + "balance_loss_mlp": 1.0960933, + "epoch": 0.32031550596383224, + "flos": 497384254464.0, + "grad_norm": 0.02307975398987516, + "language_loss": 1.00050378, + "learning_rate": 0.0007947992409061695, + "loss": 1.01238883, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.92285156, + "step": 1665, + "time_per_iteration": 2.6264171600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193124, + "balance_loss_mlp": 1.10080826, + "epoch": 0.3205078876490958, + "flos": 732874876416.0, + "grad_norm": 0.02454331261307917, + "language_loss": 0.93550396, + "learning_rate": 0.0007945475520646226, + "loss": 0.9474352, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.921875, + "step": 1666, + "time_per_iteration": 2.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191587, + "balance_loss_mlp": 1.09941399, + "epoch": 0.32070026933435936, + "flos": 550474804224.0, + "grad_norm": 0.02796219722650757, + "language_loss": 0.9429689, + "learning_rate": 0.0007942957488695743, + "loss": 0.95488477, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.92041016, + "step": 1667, + "time_per_iteration": 2.621396780014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186724, + "balance_loss_mlp": 1.09421742, + "epoch": 0.32089265101962294, + "flos": 746684000256.0, + "grad_norm": 0.022875326013334737, + "language_loss": 0.87680244, + "learning_rate": 0.0007940438314187833, + "loss": 0.88866973, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.92382812, + "step": 1668, + "time_per_iteration": 3.0475997924804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187112, + "balance_loss_mlp": 1.0947485, + "epoch": 0.3210850327048865, + "flos": 495196101120.0, + "grad_norm": 0.03400858364934581, + "language_loss": 0.88502395, + "learning_rate": 0.0007937917998100529, + "loss": 0.89689511, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.92236328, + "step": 1669, + "time_per_iteration": 2.6158430576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188853, + "balance_loss_mlp": 1.09658515, + "epoch": 0.32127741439015006, + "flos": 531673022976.0, + "grad_norm": 0.029937804889017615, + "language_loss": 0.92354518, + "learning_rate": 0.0007935396541412302, + "loss": 0.93543375, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.92138672, + "step": 1670, + "time_per_iteration": 2.6148414611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188159, + "balance_loss_mlp": 1.09589148, + "epoch": 0.3214697960754136, + "flos": 502223187456.0, + "grad_norm": 0.027719397006423088, + "language_loss": 0.94146281, + "learning_rate": 0.0007932873945102068, + "loss": 0.95334446, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.92138672, + "step": 1671, + "time_per_iteration": 2.5756680965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189911, + "balance_loss_mlp": 1.09950256, + "epoch": 0.3216621777606772, + "flos": 1386402089472.0, + "grad_norm": 0.015471737686433536, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76951689, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.90234375, + "step": 1672, + "time_per_iteration": 4.848818778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.08975732, + "epoch": 0.32185455944594077, + "flos": 572635040256.0, + "grad_norm": 0.021338606013939526, + "language_loss": 0.94597888, + "learning_rate": 0.0007927825337533461, + "loss": 0.95779347, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.91552734, + "step": 1673, + "time_per_iteration": 2.6742517948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181114, + "balance_loss_mlp": 1.08975172, + "epoch": 0.3220469411312043, + "flos": 544936928256.0, + "grad_norm": 0.029706455848313437, + "language_loss": 0.9645716, + "learning_rate": 0.0007925299328235131, + "loss": 0.97638273, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.91210938, + "step": 1674, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182375, + "balance_loss_mlp": 1.09101272, + "epoch": 0.3222393228164679, + "flos": 492161284608.0, + "grad_norm": 0.02873592636128419, + "language_loss": 0.969607, + "learning_rate": 0.000792277218323488, + "loss": 0.98143071, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.91210938, + "step": 1675, + "time_per_iteration": 2.589118719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182718, + "balance_loss_mlp": 1.0914042, + "epoch": 0.3224317045017314, + "flos": 491362285056.0, + "grad_norm": 0.026517432951267347, + "language_loss": 0.94174361, + "learning_rate": 0.0007920243903513833, + "loss": 0.95357084, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.91162109, + "step": 1676, + "time_per_iteration": 2.5541775226593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08832622, + "epoch": 0.322624086186995, + "flos": 576870357504.0, + "grad_norm": 0.028460659829427477, + "language_loss": 0.94868386, + "learning_rate": 0.0007917714490053556, + "loss": 0.96047986, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.91113281, + "step": 1677, + "time_per_iteration": 2.685833215713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.10454535, + "epoch": 0.32281646787225854, + "flos": 630571253760.0, + "grad_norm": 0.02861547850998442, + "language_loss": 0.93624204, + "learning_rate": 0.0007915183943836055, + "loss": 0.94820398, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.91503906, + "step": 1678, + "time_per_iteration": 2.8957157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184806, + "balance_loss_mlp": 1.09363461, + "epoch": 0.3230088495575221, + "flos": 782807084544.0, + "grad_norm": 0.029736135795599906, + "language_loss": 0.92990124, + "learning_rate": 0.0007912652265843773, + "loss": 0.94174933, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.91015625, + "step": 1679, + "time_per_iteration": 3.0256145000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187663, + "balance_loss_mlp": 1.09620523, + "epoch": 0.3232012312427857, + "flos": 537200165376.0, + "grad_norm": 0.0299548546326655, + "language_loss": 0.88938797, + "learning_rate": 0.0007910119457059597, + "loss": 0.90126455, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.91308594, + "step": 1680, + "time_per_iteration": 2.7195773124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118719, + "balance_loss_mlp": 1.09601843, + "epoch": 0.32339361292804925, + "flos": 706232272896.0, + "grad_norm": 0.03079987155163935, + "language_loss": 0.89790422, + "learning_rate": 0.0007907585518466849, + "loss": 0.90977609, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.91015625, + "step": 1681, + "time_per_iteration": 2.9635961055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186411, + "balance_loss_mlp": 1.09523988, + "epoch": 0.32358599461331283, + "flos": 453257164800.0, + "grad_norm": 0.027692195030378806, + "language_loss": 0.99450397, + "learning_rate": 0.000790505045104929, + "loss": 1.00636816, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.91015625, + "step": 1682, + "time_per_iteration": 2.5084030628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186896, + "balance_loss_mlp": 1.09553456, + "epoch": 0.32377837629857636, + "flos": 602091606528.0, + "grad_norm": 0.028152445524849662, + "language_loss": 0.96712899, + "learning_rate": 0.0007902514255791125, + "loss": 0.97899795, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.91210938, + "step": 1683, + "time_per_iteration": 2.7732536792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185338, + "balance_loss_mlp": 1.09388101, + "epoch": 0.32397075798383995, + "flos": 808898465280.0, + "grad_norm": 0.02645952871958238, + "language_loss": 0.9579218, + "learning_rate": 0.0007899976933676986, + "loss": 0.9697752, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.91308594, + "step": 1684, + "time_per_iteration": 2.985987424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184012, + "balance_loss_mlp": 1.09274495, + "epoch": 0.3241631396691035, + "flos": 602792550912.0, + "grad_norm": 0.02682215462305332, + "language_loss": 0.96423018, + "learning_rate": 0.0007897438485691955, + "loss": 0.97607034, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.91113281, + "step": 1685, + "time_per_iteration": 2.673083543777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185177, + "balance_loss_mlp": 1.09386301, + "epoch": 0.32435552135436707, + "flos": 475176354816.0, + "grad_norm": 0.030260846574811467, + "language_loss": 0.93327641, + "learning_rate": 0.0007894898912821542, + "loss": 0.9451282, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.91162109, + "step": 1686, + "time_per_iteration": 2.526704788208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181419, + "balance_loss_mlp": 1.09015274, + "epoch": 0.3245479030396306, + "flos": 539219131392.0, + "grad_norm": 0.02519584895765407, + "language_loss": 0.95407552, + "learning_rate": 0.0007892358216051695, + "loss": 0.96588969, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.91113281, + "step": 1687, + "time_per_iteration": 2.718292713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186611, + "balance_loss_mlp": 1.09543955, + "epoch": 0.3247402847248942, + "flos": 548696884224.0, + "grad_norm": 0.02873183694146744, + "language_loss": 1.00761271, + "learning_rate": 0.0007889816396368803, + "loss": 1.0194788, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.91015625, + "step": 1688, + "time_per_iteration": 2.6112852096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179714, + "balance_loss_mlp": 1.08835161, + "epoch": 0.3249326664101578, + "flos": 378992030208.0, + "grad_norm": 0.0263136625306578, + "language_loss": 0.95246112, + "learning_rate": 0.0007887273454759687, + "loss": 0.96425825, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.91210938, + "step": 1689, + "time_per_iteration": 2.466093063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185248, + "balance_loss_mlp": 1.09407663, + "epoch": 0.3251250480954213, + "flos": 529122299904.0, + "grad_norm": 0.02633136368880149, + "language_loss": 0.91763788, + "learning_rate": 0.0007884729392211603, + "loss": 0.92949039, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.91015625, + "step": 1690, + "time_per_iteration": 2.633387804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182102, + "balance_loss_mlp": 1.09054887, + "epoch": 0.3253174297806849, + "flos": 450558721536.0, + "grad_norm": 0.03256384134880849, + "language_loss": 0.96271229, + "learning_rate": 0.0007882184209712245, + "loss": 0.97453332, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.9140625, + "step": 1691, + "time_per_iteration": 2.511629104614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183951, + "balance_loss_mlp": 1.09239864, + "epoch": 0.32550981146594843, + "flos": 705489669120.0, + "grad_norm": 0.02306884235196454, + "language_loss": 0.92818689, + "learning_rate": 0.000787963790824974, + "loss": 0.9400264, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.9140625, + "step": 1692, + "time_per_iteration": 2.953939914703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118506, + "balance_loss_mlp": 1.0935545, + "epoch": 0.325702193151212, + "flos": 393558494208.0, + "grad_norm": 0.026666894987577915, + "language_loss": 0.98025191, + "learning_rate": 0.0007877090488812651, + "loss": 0.9921025, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.91357422, + "step": 1693, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178009, + "balance_loss_mlp": 1.08659911, + "epoch": 0.32589457483647555, + "flos": 578583149568.0, + "grad_norm": 0.029080232987036207, + "language_loss": 0.92532402, + "learning_rate": 0.0007874541952389973, + "loss": 0.93710411, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.91259766, + "step": 1694, + "time_per_iteration": 2.660390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179003, + "balance_loss_mlp": 1.08792675, + "epoch": 0.32608695652173914, + "flos": 499329360384.0, + "grad_norm": 0.023433013698769337, + "language_loss": 0.93903476, + "learning_rate": 0.0007871992299971136, + "loss": 0.9508248, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.90917969, + "step": 1695, + "time_per_iteration": 2.5506269931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179394, + "balance_loss_mlp": 1.08822274, + "epoch": 0.32627933820700267, + "flos": 592300948992.0, + "grad_norm": 0.02355558557065364, + "language_loss": 0.91491008, + "learning_rate": 0.0007869441532546001, + "loss": 0.92670405, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.91015625, + "step": 1696, + "time_per_iteration": 2.7493326663970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177542, + "balance_loss_mlp": 1.08618009, + "epoch": 0.32647171989226625, + "flos": 610273531392.0, + "grad_norm": 0.02705729718991907, + "language_loss": 0.87004846, + "learning_rate": 0.0007866889651104867, + "loss": 0.8818239, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.91210938, + "step": 1697, + "time_per_iteration": 2.7824432849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179221, + "balance_loss_mlp": 1.08785892, + "epoch": 0.32666410157752984, + "flos": 478189704192.0, + "grad_norm": 0.028152017440838794, + "language_loss": 0.94142878, + "learning_rate": 0.000786433665663846, + "loss": 0.95322108, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.91210938, + "step": 1698, + "time_per_iteration": 2.6674411296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187877, + "balance_loss_mlp": 1.09670568, + "epoch": 0.3268564832627934, + "flos": 719693563392.0, + "grad_norm": 0.040459779361444057, + "language_loss": 0.95728016, + "learning_rate": 0.0007861782550137942, + "loss": 0.96915889, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.91015625, + "step": 1699, + "time_per_iteration": 2.923370599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187429, + "balance_loss_mlp": 1.09625793, + "epoch": 0.32704886494805696, + "flos": 770105135616.0, + "grad_norm": 0.025720199745930695, + "language_loss": 0.93479955, + "learning_rate": 0.0007859227332594901, + "loss": 0.94667387, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.91015625, + "step": 1700, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191948, + "balance_loss_mlp": 1.10120583, + "epoch": 0.3272412466333205, + "flos": 851404087296.0, + "grad_norm": 0.0329500691508657, + "language_loss": 0.94768298, + "learning_rate": 0.0007856671005001365, + "loss": 0.95960248, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.90576172, + "step": 1701, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118211, + "balance_loss_mlp": 1.09065294, + "epoch": 0.3274336283185841, + "flos": 833040737280.0, + "grad_norm": 0.029774404200988806, + "language_loss": 0.90405869, + "learning_rate": 0.0007854113568349787, + "loss": 0.91587985, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.91308594, + "step": 1702, + "time_per_iteration": 3.107083559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186026, + "balance_loss_mlp": 1.09471202, + "epoch": 0.3276260100038476, + "flos": 693252347904.0, + "grad_norm": 0.029328613393929583, + "language_loss": 0.89606428, + "learning_rate": 0.0007851555023633052, + "loss": 0.90792453, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.91162109, + "step": 1703, + "time_per_iteration": 2.8335254192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011877, + "balance_loss_mlp": 1.09643364, + "epoch": 0.3278183916891112, + "flos": 436977908736.0, + "grad_norm": 0.03479764223743197, + "language_loss": 0.91987431, + "learning_rate": 0.0007848995371844474, + "loss": 0.93175125, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.91113281, + "step": 1704, + "time_per_iteration": 2.51261043548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118827, + "balance_loss_mlp": 1.09728956, + "epoch": 0.3280107733743748, + "flos": 462016508928.0, + "grad_norm": 0.027955151013136243, + "language_loss": 0.90236068, + "learning_rate": 0.0007846434613977801, + "loss": 0.91424334, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.90820312, + "step": 1705, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185335, + "balance_loss_mlp": 1.09464061, + "epoch": 0.3282031550596383, + "flos": 680528931840.0, + "grad_norm": 0.0285448105624817, + "language_loss": 0.86403298, + "learning_rate": 0.0007843872751027203, + "loss": 0.87588632, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.90527344, + "step": 1706, + "time_per_iteration": 2.7977733612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183945, + "balance_loss_mlp": 1.0931555, + "epoch": 0.3283955367449019, + "flos": 546254949888.0, + "grad_norm": 0.024438576566567966, + "language_loss": 0.93906903, + "learning_rate": 0.0007841309783987287, + "loss": 0.95090854, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.90625, + "step": 1707, + "time_per_iteration": 2.737680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178748, + "balance_loss_mlp": 1.08757639, + "epoch": 0.32858791843016544, + "flos": 482240371200.0, + "grad_norm": 0.027193371904651382, + "language_loss": 0.97315758, + "learning_rate": 0.0007838745713853084, + "loss": 0.98494506, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.91015625, + "step": 1708, + "time_per_iteration": 2.5702459812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189964, + "balance_loss_mlp": 1.09879303, + "epoch": 0.328780300115429, + "flos": 567915629568.0, + "grad_norm": 0.029427091701823335, + "language_loss": 0.93208408, + "learning_rate": 0.0007836180541620053, + "loss": 0.94398379, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.91015625, + "step": 1709, + "time_per_iteration": 2.7365195751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189596, + "balance_loss_mlp": 1.09852052, + "epoch": 0.32897268180069256, + "flos": 476991204864.0, + "grad_norm": 0.02924752300223344, + "language_loss": 0.94609785, + "learning_rate": 0.0007833614268284082, + "loss": 0.95799387, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.90917969, + "step": 1710, + "time_per_iteration": 2.575416326522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186913, + "balance_loss_mlp": 1.09745789, + "epoch": 0.32916506348595614, + "flos": 1580450603520.0, + "grad_norm": 0.014653073497659498, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75296688, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.89257812, + "step": 1711, + "time_per_iteration": 4.8569114208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117837, + "balance_loss_mlp": 1.08681703, + "epoch": 0.3293574451712197, + "flos": 483851105280.0, + "grad_norm": 0.027096123044633498, + "language_loss": 0.8678506, + "learning_rate": 0.0007828478422289016, + "loss": 0.87963432, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.9140625, + "step": 1712, + "time_per_iteration": 2.5748305320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181971, + "balance_loss_mlp": 1.09041798, + "epoch": 0.32954982685648326, + "flos": 623724088320.0, + "grad_norm": 0.027491608740018197, + "language_loss": 0.97854888, + "learning_rate": 0.0007825908851623833, + "loss": 0.99036855, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.9140625, + "step": 1713, + "time_per_iteration": 2.7387707233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180742, + "balance_loss_mlp": 1.0893327, + "epoch": 0.32974220854174685, + "flos": 546070299648.0, + "grad_norm": 0.028986059756107307, + "language_loss": 0.93660253, + "learning_rate": 0.0007823338183843533, + "loss": 0.94840991, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.91259766, + "step": 1714, + "time_per_iteration": 2.7061285972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.10341084, + "epoch": 0.3299345902270104, + "flos": 983822286336.0, + "grad_norm": 0.02918308821255402, + "language_loss": 0.89344442, + "learning_rate": 0.0007820766419946141, + "loss": 0.90539211, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.91210938, + "step": 1715, + "time_per_iteration": 3.2698333263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119133, + "balance_loss_mlp": 1.10206604, + "epoch": 0.33012697191227397, + "flos": 1406901926400.0, + "grad_norm": 0.008988097140154246, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.8086381, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.890625, + "step": 1716, + "time_per_iteration": 4.931420564651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193588, + "balance_loss_mlp": 1.10213029, + "epoch": 0.3303193535975375, + "flos": 506169795072.0, + "grad_norm": 0.03043585823380059, + "language_loss": 0.87317824, + "learning_rate": 0.0007815619607794288, + "loss": 0.88511419, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.91308594, + "step": 1717, + "time_per_iteration": 2.611924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198413, + "balance_loss_mlp": 1.10676467, + "epoch": 0.3305117352828011, + "flos": 939484349952.0, + "grad_norm": 0.029759763631388395, + "language_loss": 0.92828202, + "learning_rate": 0.0007813044561538001, + "loss": 0.94026613, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.91503906, + "step": 1718, + "time_per_iteration": 3.188633680343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186368, + "balance_loss_mlp": 1.09495842, + "epoch": 0.3307041169680646, + "flos": 722793507840.0, + "grad_norm": 0.027827869889066197, + "language_loss": 0.97286105, + "learning_rate": 0.0007810468423160958, + "loss": 0.9847247, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.91259766, + "step": 1719, + "time_per_iteration": 2.8963494300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179653, + "balance_loss_mlp": 1.08829057, + "epoch": 0.3308964986533282, + "flos": 584815965696.0, + "grad_norm": 0.0232486528054596, + "language_loss": 0.89203978, + "learning_rate": 0.0007807891193663306, + "loss": 0.90383637, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.91210938, + "step": 1720, + "time_per_iteration": 2.784005880355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188579, + "balance_loss_mlp": 1.09712148, + "epoch": 0.33108888033859174, + "flos": 474525075456.0, + "grad_norm": 0.03234593548431852, + "language_loss": 0.92577451, + "learning_rate": 0.0007805312874045614, + "loss": 0.93766028, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.91308594, + "step": 1721, + "time_per_iteration": 2.5072579383850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187856, + "balance_loss_mlp": 1.09635103, + "epoch": 0.3312812620238553, + "flos": 386996035584.0, + "grad_norm": 0.030880666413309405, + "language_loss": 0.96009982, + "learning_rate": 0.0007802733465308874, + "loss": 0.97197837, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.91357422, + "step": 1722, + "time_per_iteration": 2.460878372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193288, + "balance_loss_mlp": 1.10173571, + "epoch": 0.3314736437091189, + "flos": 495604333056.0, + "grad_norm": 0.02871647017272099, + "language_loss": 0.9219079, + "learning_rate": 0.0007800152968454501, + "loss": 0.93384075, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.9140625, + "step": 1723, + "time_per_iteration": 2.6537680625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185112, + "balance_loss_mlp": 1.09365499, + "epoch": 0.33166602539438245, + "flos": 654930376704.0, + "grad_norm": 0.0223046700763118, + "language_loss": 0.96869862, + "learning_rate": 0.0007797571384484334, + "loss": 0.98054969, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.91308594, + "step": 1724, + "time_per_iteration": 2.8509135246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180603, + "balance_loss_mlp": 1.08909798, + "epoch": 0.33185840707964603, + "flos": 521834701824.0, + "grad_norm": 0.02731483808063424, + "language_loss": 1.00636935, + "learning_rate": 0.0007794988714400633, + "loss": 1.01817536, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.91357422, + "step": 1725, + "time_per_iteration": 2.5883586406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180377, + "balance_loss_mlp": 1.08901501, + "epoch": 0.33205078876490957, + "flos": 437898432000.0, + "grad_norm": 0.028871117282170154, + "language_loss": 0.94438303, + "learning_rate": 0.0007792404959206079, + "loss": 0.95618677, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.91210938, + "step": 1726, + "time_per_iteration": 2.522392988204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196305, + "balance_loss_mlp": 1.10499096, + "epoch": 0.33224317045017315, + "flos": 770094402048.0, + "grad_norm": 0.026417182809826974, + "language_loss": 0.89548182, + "learning_rate": 0.0007789820119903774, + "loss": 0.90744483, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.91162109, + "step": 1727, + "time_per_iteration": 3.015399217605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119368, + "balance_loss_mlp": 1.10441589, + "epoch": 0.3324355521354367, + "flos": 1469293584384.0, + "grad_norm": 0.009201187704085647, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79686344, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.890625, + "step": 1728, + "time_per_iteration": 4.849627494812012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187682, + "balance_loss_mlp": 1.09641564, + "epoch": 0.3326279338207003, + "flos": 497799217152.0, + "grad_norm": 0.02618775195690524, + "language_loss": 0.91979456, + "learning_rate": 0.0007784647192990428, + "loss": 0.93167138, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.91113281, + "step": 1729, + "time_per_iteration": 2.6944785118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178599, + "balance_loss_mlp": 1.08761811, + "epoch": 0.33282031550596386, + "flos": 637053121536.0, + "grad_norm": 0.02771760173732663, + "language_loss": 0.88792735, + "learning_rate": 0.0007782059107387696, + "loss": 0.89971334, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.90820312, + "step": 1730, + "time_per_iteration": 2.8583710193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179548, + "balance_loss_mlp": 1.0887109, + "epoch": 0.3330126971912274, + "flos": 690721090560.0, + "grad_norm": 0.027739782699759397, + "language_loss": 0.98025161, + "learning_rate": 0.0007779469941693826, + "loss": 0.99204707, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.90673828, + "step": 1731, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184359, + "balance_loss_mlp": 1.09361696, + "epoch": 0.333205078876491, + "flos": 567553059840.0, + "grad_norm": 0.03096728777448764, + "language_loss": 0.86715639, + "learning_rate": 0.0007776879696914029, + "loss": 0.87899995, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.90576172, + "step": 1732, + "time_per_iteration": 2.8331797122955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179804, + "balance_loss_mlp": 1.08906233, + "epoch": 0.3333974605617545, + "flos": 642170030592.0, + "grad_norm": 0.024377484958938406, + "language_loss": 0.95668435, + "learning_rate": 0.000777428837405392, + "loss": 0.96848238, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.90576172, + "step": 1733, + "time_per_iteration": 2.8495984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.087345, + "epoch": 0.3335898422470181, + "flos": 462778578432.0, + "grad_norm": 0.02888991438897714, + "language_loss": 0.96001673, + "learning_rate": 0.0007771695974119544, + "loss": 0.97179955, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.90771484, + "step": 1734, + "time_per_iteration": 2.581843614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193993, + "balance_loss_mlp": 1.10267842, + "epoch": 0.33378222393228163, + "flos": 854336845824.0, + "grad_norm": 0.031032438471150628, + "language_loss": 0.84453082, + "learning_rate": 0.0007769102498117359, + "loss": 0.85647076, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.91162109, + "step": 1735, + "time_per_iteration": 3.092892646789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118579, + "balance_loss_mlp": 1.09471452, + "epoch": 0.3339746056175452, + "flos": 956308824576.0, + "grad_norm": 0.02638013374987503, + "language_loss": 0.87690091, + "learning_rate": 0.000776650794705424, + "loss": 0.88875878, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.90917969, + "step": 1736, + "time_per_iteration": 3.26749587059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188294, + "balance_loss_mlp": 1.09693241, + "epoch": 0.33416698730280875, + "flos": 545894381568.0, + "grad_norm": 0.025194797458818457, + "language_loss": 0.89670336, + "learning_rate": 0.0007763912321937483, + "loss": 0.90858638, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.91210938, + "step": 1737, + "time_per_iteration": 2.680321455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.09522188, + "epoch": 0.33435936898807234, + "flos": 1015875237888.0, + "grad_norm": 0.02847992800895855, + "language_loss": 0.91932124, + "learning_rate": 0.0007761315623774799, + "loss": 0.93118894, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.9140625, + "step": 1738, + "time_per_iteration": 3.3992278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.10014248, + "epoch": 0.3345517506733359, + "flos": 616371362304.0, + "grad_norm": 0.027566762490977777, + "language_loss": 0.97487831, + "learning_rate": 0.0007758717853574313, + "loss": 0.9867962, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.91503906, + "step": 1739, + "time_per_iteration": 2.7331244945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195023, + "balance_loss_mlp": 1.10327947, + "epoch": 0.33474413235859946, + "flos": 495569404416.0, + "grad_norm": 0.027457607023843998, + "language_loss": 0.9961037, + "learning_rate": 0.0007756119012344571, + "loss": 1.00805402, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.91601562, + "step": 1740, + "time_per_iteration": 2.5305063724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189378, + "balance_loss_mlp": 1.09772944, + "epoch": 0.33493651404386304, + "flos": 629487547392.0, + "grad_norm": 0.029043894294382887, + "language_loss": 0.93616855, + "learning_rate": 0.0007753519101094535, + "loss": 0.9480623, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.91503906, + "step": 1741, + "time_per_iteration": 2.7408056259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177762, + "balance_loss_mlp": 1.08630431, + "epoch": 0.3351288957291266, + "flos": 514742487552.0, + "grad_norm": 0.027889242250670986, + "language_loss": 0.95720202, + "learning_rate": 0.0007750918120833575, + "loss": 0.96897966, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.91308594, + "step": 1742, + "time_per_iteration": 2.5787625312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08818376, + "epoch": 0.33532127741439016, + "flos": 648482711040.0, + "grad_norm": 0.029208114264274002, + "language_loss": 0.95614851, + "learning_rate": 0.0007748316072571485, + "loss": 0.96794444, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.91259766, + "step": 1743, + "time_per_iteration": 2.751394033432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178526, + "balance_loss_mlp": 1.08764088, + "epoch": 0.3355136590996537, + "flos": 769788228096.0, + "grad_norm": 0.02678280054581141, + "language_loss": 0.86505532, + "learning_rate": 0.0007745712957318467, + "loss": 0.87684047, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.90722656, + "step": 1744, + "time_per_iteration": 2.9703569412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179715, + "balance_loss_mlp": 1.088925, + "epoch": 0.3357060407849173, + "flos": 596649057792.0, + "grad_norm": 0.023433474800662903, + "language_loss": 0.94101429, + "learning_rate": 0.0007743108776085141, + "loss": 0.95281148, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.90625, + "step": 1745, + "time_per_iteration": 2.7529683113098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184954, + "balance_loss_mlp": 1.09435499, + "epoch": 0.3358984224701808, + "flos": 599801395200.0, + "grad_norm": 0.02538707782704008, + "language_loss": 0.88967884, + "learning_rate": 0.0007740503529882543, + "loss": 0.9015283, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.90429688, + "step": 1746, + "time_per_iteration": 2.79131817817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188552, + "balance_loss_mlp": 1.09780991, + "epoch": 0.3360908041554444, + "flos": 579429812736.0, + "grad_norm": 0.028485119021284356, + "language_loss": 0.99668056, + "learning_rate": 0.0007737897219722114, + "loss": 1.00856614, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.90576172, + "step": 1747, + "time_per_iteration": 2.685925006866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189008, + "balance_loss_mlp": 1.09836173, + "epoch": 0.336283185840708, + "flos": 514620963840.0, + "grad_norm": 0.027318502045144608, + "language_loss": 0.90481317, + "learning_rate": 0.0007735289846615716, + "loss": 0.91670322, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.90478516, + "step": 1748, + "time_per_iteration": 2.62443470954895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189766, + "balance_loss_mlp": 1.09902358, + "epoch": 0.3364755675259715, + "flos": 526013623296.0, + "grad_norm": 0.026723032477842582, + "language_loss": 0.90137696, + "learning_rate": 0.0007732681411575621, + "loss": 0.91327465, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.90576172, + "step": 1749, + "time_per_iteration": 2.646358013153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182694, + "balance_loss_mlp": 1.09209466, + "epoch": 0.3366679492112351, + "flos": 555973748736.0, + "grad_norm": 0.023573972968583972, + "language_loss": 0.93333745, + "learning_rate": 0.0007730071915614514, + "loss": 0.94516432, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.90429688, + "step": 1750, + "time_per_iteration": 2.6758012771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08901942, + "epoch": 0.33686033089649864, + "flos": 428164170240.0, + "grad_norm": 0.030830494146199924, + "language_loss": 0.97502697, + "learning_rate": 0.0007727461359745489, + "loss": 0.98682547, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.90673828, + "step": 1751, + "time_per_iteration": 2.4563541412353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182248, + "balance_loss_mlp": 1.09145832, + "epoch": 0.3370527125817622, + "flos": 542840099328.0, + "grad_norm": 0.023246790346845608, + "language_loss": 0.93729055, + "learning_rate": 0.0007724849744982056, + "loss": 0.94911301, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.90625, + "step": 1752, + "time_per_iteration": 2.668113946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179422, + "balance_loss_mlp": 1.08858418, + "epoch": 0.33724509426702576, + "flos": 543230866944.0, + "grad_norm": 0.02371236203418416, + "language_loss": 0.90932786, + "learning_rate": 0.0007722237072338131, + "loss": 0.92112207, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.90673828, + "step": 1753, + "time_per_iteration": 2.69787335395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.08753431, + "epoch": 0.33743747595228935, + "flos": 473752272384.0, + "grad_norm": 0.029898359882718887, + "language_loss": 0.95709926, + "learning_rate": 0.0007719623342828046, + "loss": 0.96888256, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.90625, + "step": 1754, + "time_per_iteration": 2.4994091987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183652, + "balance_loss_mlp": 1.09295714, + "epoch": 0.33762985763755293, + "flos": 470836978176.0, + "grad_norm": 0.02665869511949433, + "language_loss": 0.93777692, + "learning_rate": 0.000771700855746654, + "loss": 0.94961339, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.90527344, + "step": 1755, + "time_per_iteration": 2.58086895942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178715, + "balance_loss_mlp": 1.08792567, + "epoch": 0.33782223932281646, + "flos": 493250995200.0, + "grad_norm": 0.024252070816233498, + "language_loss": 0.95916575, + "learning_rate": 0.0007714392717268763, + "loss": 0.97095293, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.90625, + "step": 1756, + "time_per_iteration": 2.5631322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180772, + "balance_loss_mlp": 1.08988702, + "epoch": 0.33801462100808005, + "flos": 466017510912.0, + "grad_norm": 0.025388958299120416, + "language_loss": 0.95127004, + "learning_rate": 0.0007711775823250273, + "loss": 0.96307778, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.90722656, + "step": 1757, + "time_per_iteration": 2.5053045749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178431, + "balance_loss_mlp": 1.08754551, + "epoch": 0.3382070026933436, + "flos": 797067374592.0, + "grad_norm": 0.024419621343361942, + "language_loss": 0.92107689, + "learning_rate": 0.0007709157876427039, + "loss": 0.93286121, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.90722656, + "step": 1758, + "time_per_iteration": 3.1007301807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178269, + "balance_loss_mlp": 1.08738351, + "epoch": 0.33839938437860717, + "flos": 509428193280.0, + "grad_norm": 0.024832384176200758, + "language_loss": 0.94253516, + "learning_rate": 0.0007706538877815439, + "loss": 0.95431781, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.90722656, + "step": 1759, + "time_per_iteration": 2.588744640350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178646, + "balance_loss_mlp": 1.0878557, + "epoch": 0.3385917660638707, + "flos": 485273186304.0, + "grad_norm": 0.02369115174437829, + "language_loss": 0.89945841, + "learning_rate": 0.0007703918828432259, + "loss": 0.91124481, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.90625, + "step": 1760, + "time_per_iteration": 2.5859875679016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178403, + "balance_loss_mlp": 1.08770907, + "epoch": 0.3387841477491343, + "flos": 546415405056.0, + "grad_norm": 0.02534991906570622, + "language_loss": 0.96946132, + "learning_rate": 0.000770129772929469, + "loss": 0.9812454, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.90527344, + "step": 1761, + "time_per_iteration": 2.633229970932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117744, + "balance_loss_mlp": 1.08684063, + "epoch": 0.3389765294343978, + "flos": 721063251456.0, + "grad_norm": 0.027907228809642075, + "language_loss": 0.96886694, + "learning_rate": 0.0007698675581420334, + "loss": 0.98064131, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.90429688, + "step": 1762, + "time_per_iteration": 2.8309946060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190138, + "balance_loss_mlp": 1.09987259, + "epoch": 0.3391689111196614, + "flos": 701263084032.0, + "grad_norm": 0.028701846645649853, + "language_loss": 0.87853253, + "learning_rate": 0.0007696052385827199, + "loss": 0.89043397, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.90087891, + "step": 1763, + "time_per_iteration": 2.9673497676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183311, + "balance_loss_mlp": 1.09304607, + "epoch": 0.339361292804925, + "flos": 628248115200.0, + "grad_norm": 0.027144566695111814, + "language_loss": 0.85910845, + "learning_rate": 0.00076934281435337, + "loss": 0.87094158, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.90087891, + "step": 1764, + "time_per_iteration": 2.7069530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011791, + "balance_loss_mlp": 1.08869135, + "epoch": 0.33955367449018853, + "flos": 610794554880.0, + "grad_norm": 0.025973604998757366, + "language_loss": 0.94002628, + "learning_rate": 0.0007690802855558658, + "loss": 0.95181727, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.90234375, + "step": 1765, + "time_per_iteration": 2.8596885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198868, + "balance_loss_mlp": 1.11151123, + "epoch": 0.3397460561754521, + "flos": 1456586357760.0, + "grad_norm": 0.018873382807181687, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77573818, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.87109375, + "step": 1766, + "time_per_iteration": 4.900039434432983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183458, + "balance_loss_mlp": 1.09304976, + "epoch": 0.33993843786071565, + "flos": 488290538496.0, + "grad_norm": 0.033631077459875626, + "language_loss": 1.00266671, + "learning_rate": 0.0007685549146641262, + "loss": 1.01450121, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.90234375, + "step": 1767, + "time_per_iteration": 2.521587610244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176512, + "balance_loss_mlp": 1.08557928, + "epoch": 0.34013081954597923, + "flos": 418232523264.0, + "grad_norm": 0.024531175575557927, + "language_loss": 0.95696396, + "learning_rate": 0.0007682920727738579, + "loss": 0.96872908, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.90771484, + "step": 1768, + "time_per_iteration": 2.4606878757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177177, + "balance_loss_mlp": 1.08614898, + "epoch": 0.34032320123124277, + "flos": 438430189056.0, + "grad_norm": 0.027457130501572214, + "language_loss": 0.93990809, + "learning_rate": 0.000768029126723369, + "loss": 0.95167989, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.90869141, + "step": 1769, + "time_per_iteration": 2.494699478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.09077609, + "epoch": 0.34051558291650635, + "flos": 458543261184.0, + "grad_norm": 0.027949795017340132, + "language_loss": 0.90377855, + "learning_rate": 0.0007677660766147447, + "loss": 0.91559708, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.90917969, + "step": 1770, + "time_per_iteration": 2.5302748680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183578, + "balance_loss_mlp": 1.09469604, + "epoch": 0.3407079646017699, + "flos": 1562137645056.0, + "grad_norm": 0.011444512115251876, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73654521, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.88671875, + "step": 1771, + "time_per_iteration": 4.913311004638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188847, + "balance_loss_mlp": 1.09758055, + "epoch": 0.3409003462870335, + "flos": 493530972672.0, + "grad_norm": 0.032062498304007335, + "language_loss": 0.91194993, + "learning_rate": 0.0007672396646316306, + "loss": 0.92383844, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.91113281, + "step": 1772, + "time_per_iteration": 2.539181709289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.08885825, + "epoch": 0.34109272797229706, + "flos": 809820989952.0, + "grad_norm": 0.028470010979029077, + "language_loss": 0.88439053, + "learning_rate": 0.000766976302961512, + "loss": 0.89618981, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.90917969, + "step": 1773, + "time_per_iteration": 3.006547212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181829, + "balance_loss_mlp": 1.09094357, + "epoch": 0.3412851096575606, + "flos": 471099491328.0, + "grad_norm": 0.02901021255147234, + "language_loss": 0.91066158, + "learning_rate": 0.0007667128376420003, + "loss": 0.92247993, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.90722656, + "step": 1774, + "time_per_iteration": 2.534266233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118318, + "balance_loss_mlp": 1.09253371, + "epoch": 0.3414774913428242, + "flos": 596770581504.0, + "grad_norm": 0.02876896591079206, + "language_loss": 0.92739397, + "learning_rate": 0.0007664492687753817, + "loss": 0.93922579, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.90478516, + "step": 1775, + "time_per_iteration": 2.671475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181574, + "balance_loss_mlp": 1.09102285, + "epoch": 0.3416698730280877, + "flos": 528507950592.0, + "grad_norm": 0.025483549401886952, + "language_loss": 0.89018893, + "learning_rate": 0.000766185596463983, + "loss": 0.90200466, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.90380859, + "step": 1776, + "time_per_iteration": 2.6099884510040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177935, + "balance_loss_mlp": 1.08719325, + "epoch": 0.3418622547133513, + "flos": 876117047808.0, + "grad_norm": 0.026020404961979337, + "language_loss": 0.84743214, + "learning_rate": 0.0007659218208101706, + "loss": 0.8592115, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.90576172, + "step": 1777, + "time_per_iteration": 3.1272366046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118093, + "balance_loss_mlp": 1.08994997, + "epoch": 0.34205463639861483, + "flos": 604876644864.0, + "grad_norm": 0.024068405360429687, + "language_loss": 0.91582745, + "learning_rate": 0.0007656579419163515, + "loss": 0.92763674, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.90820312, + "step": 1778, + "time_per_iteration": 2.7243831157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.0894556, + "epoch": 0.3422470180838784, + "flos": 464714952192.0, + "grad_norm": 0.02739040164484414, + "language_loss": 0.86445272, + "learning_rate": 0.0007653939598849724, + "loss": 0.87625706, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.90820312, + "step": 1779, + "time_per_iteration": 2.4913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180695, + "balance_loss_mlp": 1.09143066, + "epoch": 0.34243939976914195, + "flos": 1589816291328.0, + "grad_norm": 0.01051605552964957, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84060901, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.890625, + "step": 1780, + "time_per_iteration": 4.891184091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176554, + "balance_loss_mlp": 1.085621, + "epoch": 0.34263178145440554, + "flos": 874443187200.0, + "grad_norm": 0.026322112436007235, + "language_loss": 0.88782489, + "learning_rate": 0.000764865686819522, + "loss": 0.89959043, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.90771484, + "step": 1781, + "time_per_iteration": 3.048123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176352, + "balance_loss_mlp": 1.08551466, + "epoch": 0.3428241631396691, + "flos": 507873854976.0, + "grad_norm": 0.024622696081698998, + "language_loss": 0.93515933, + "learning_rate": 0.0007646013959905449, + "loss": 0.94692284, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.90673828, + "step": 1782, + "time_per_iteration": 2.565661907196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176257, + "balance_loss_mlp": 1.08565772, + "epoch": 0.34301654482493266, + "flos": 881524667904.0, + "grad_norm": 0.0252118274748732, + "language_loss": 0.880337, + "learning_rate": 0.0007643370024341949, + "loss": 0.89209956, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.90429688, + "step": 1783, + "time_per_iteration": 3.0695888996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180625, + "balance_loss_mlp": 1.08959711, + "epoch": 0.34320892651019624, + "flos": 432668731392.0, + "grad_norm": 0.024350173092139916, + "language_loss": 0.89407057, + "learning_rate": 0.0007640725062531195, + "loss": 0.90587682, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.90869141, + "step": 1784, + "time_per_iteration": 2.5120832920074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184023, + "balance_loss_mlp": 1.09294736, + "epoch": 0.3434013081954598, + "flos": 464593428480.0, + "grad_norm": 0.02877111448667641, + "language_loss": 0.95969987, + "learning_rate": 0.0007638079075500047, + "loss": 0.97154009, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.90917969, + "step": 1785, + "time_per_iteration": 2.5176198482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194351, + "balance_loss_mlp": 1.10546875, + "epoch": 0.34359368988072336, + "flos": 1560674631168.0, + "grad_norm": 0.01088995253456435, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.7637502, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.88671875, + "step": 1786, + "time_per_iteration": 5.021549463272095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183341, + "balance_loss_mlp": 1.09278917, + "epoch": 0.3437860715659869, + "flos": 496572519936.0, + "grad_norm": 0.024204144242014246, + "language_loss": 0.90540475, + "learning_rate": 0.0007632784029886026, + "loss": 0.91723818, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.90380859, + "step": 1787, + "time_per_iteration": 2.6350793838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178894, + "balance_loss_mlp": 1.08791375, + "epoch": 0.3439784532512505, + "flos": 719608969728.0, + "grad_norm": 0.025958683961259412, + "language_loss": 0.93068433, + "learning_rate": 0.0007630134973358873, + "loss": 0.94247323, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.90820312, + "step": 1788, + "time_per_iteration": 2.93084454536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178793, + "balance_loss_mlp": 1.08785999, + "epoch": 0.34417083493651407, + "flos": 566921246208.0, + "grad_norm": 0.025032512144454056, + "language_loss": 0.92506206, + "learning_rate": 0.0007627484895722763, + "loss": 0.93685007, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.90771484, + "step": 1789, + "time_per_iteration": 2.649689197540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177857, + "balance_loss_mlp": 1.08706772, + "epoch": 0.3443632166217776, + "flos": 797701189632.0, + "grad_norm": 0.027302991531117576, + "language_loss": 0.89870507, + "learning_rate": 0.0007624833798006552, + "loss": 0.9104836, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.90625, + "step": 1790, + "time_per_iteration": 3.0469179153442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117862, + "balance_loss_mlp": 1.08811665, + "epoch": 0.3445555983070412, + "flos": 570392492544.0, + "grad_norm": 0.0288389056738737, + "language_loss": 0.92729777, + "learning_rate": 0.0007622181681239483, + "loss": 0.93908393, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.90332031, + "step": 1791, + "time_per_iteration": 2.6440184116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178949, + "balance_loss_mlp": 1.08849263, + "epoch": 0.3447479799923047, + "flos": 569980257792.0, + "grad_norm": 0.022982775931836206, + "language_loss": 0.91584516, + "learning_rate": 0.0007619528546451202, + "loss": 0.9276346, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.90283203, + "step": 1792, + "time_per_iteration": 2.797133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177091, + "balance_loss_mlp": 1.08673048, + "epoch": 0.3449403616775683, + "flos": 969331683840.0, + "grad_norm": 0.02628926210615307, + "language_loss": 0.90923131, + "learning_rate": 0.0007616874394671745, + "loss": 0.92100227, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.90185547, + "step": 1793, + "time_per_iteration": 3.3191378116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178301, + "balance_loss_mlp": 1.08784556, + "epoch": 0.34513274336283184, + "flos": 569676085248.0, + "grad_norm": 0.03267712320672132, + "language_loss": 0.9558928, + "learning_rate": 0.0007614219226931547, + "loss": 0.96767581, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.90283203, + "step": 1794, + "time_per_iteration": 2.677525043487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178051, + "balance_loss_mlp": 1.0875473, + "epoch": 0.3453251250480954, + "flos": 461858055168.0, + "grad_norm": 0.024689469906648515, + "language_loss": 0.92397773, + "learning_rate": 0.0007611563044261435, + "loss": 0.93575823, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.90332031, + "step": 1795, + "time_per_iteration": 2.5183908939361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178812, + "balance_loss_mlp": 1.08835602, + "epoch": 0.34551750673335896, + "flos": 416519731200.0, + "grad_norm": 0.027710199676415265, + "language_loss": 0.96473086, + "learning_rate": 0.0007608905847692631, + "loss": 0.97651899, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.90283203, + "step": 1796, + "time_per_iteration": 2.4600772857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182482, + "balance_loss_mlp": 1.09212101, + "epoch": 0.34570988841862255, + "flos": 589114409472.0, + "grad_norm": 0.023363368939277738, + "language_loss": 0.92555124, + "learning_rate": 0.0007606247638256749, + "loss": 0.93737608, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.90185547, + "step": 1797, + "time_per_iteration": 2.8326525688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183395, + "balance_loss_mlp": 1.09565735, + "epoch": 0.34590227010388613, + "flos": 1571142764544.0, + "grad_norm": 0.009651567236440416, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79353684, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.875, + "step": 1798, + "time_per_iteration": 4.921091794967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.09259033, + "epoch": 0.34609465178914967, + "flos": 1540928131584.0, + "grad_norm": 0.004186018133500934, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.8050791, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.87890625, + "step": 1799, + "time_per_iteration": 4.76463508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177428, + "balance_loss_mlp": 1.08692396, + "epoch": 0.34628703347441325, + "flos": 610516578816.0, + "grad_norm": 0.027319297321258894, + "language_loss": 0.94778776, + "learning_rate": 0.0007598266943068686, + "loss": 0.95956194, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.90332031, + "step": 1800, + "time_per_iteration": 2.741830348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180421, + "balance_loss_mlp": 1.0898217, + "epoch": 0.3464794151596768, + "flos": 474264563712.0, + "grad_norm": 0.0268607754896097, + "language_loss": 0.91417915, + "learning_rate": 0.0007595604692488507, + "loss": 0.92598337, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.90429688, + "step": 1801, + "time_per_iteration": 2.5253777503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117756, + "balance_loss_mlp": 1.08719921, + "epoch": 0.34667179684494037, + "flos": 606821750784.0, + "grad_norm": 0.0251267071243342, + "language_loss": 0.907076, + "learning_rate": 0.0007592941434205215, + "loss": 0.91885161, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.90185547, + "step": 1802, + "time_per_iteration": 2.7729735374450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175873, + "balance_loss_mlp": 1.0877533, + "epoch": 0.3468641785302039, + "flos": 1568359727616.0, + "grad_norm": 0.004114808875680539, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74746931, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.87890625, + "step": 1803, + "time_per_iteration": 5.036771774291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178076, + "balance_loss_mlp": 1.08776271, + "epoch": 0.3470565602154675, + "flos": 908723223552.0, + "grad_norm": 0.03174792037748739, + "language_loss": 0.90712535, + "learning_rate": 0.0007587611898665566, + "loss": 0.91890609, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.90136719, + "step": 1804, + "time_per_iteration": 3.0725910663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177414, + "balance_loss_mlp": 1.08719671, + "epoch": 0.347248941900731, + "flos": 640059740160.0, + "grad_norm": 0.023310551488003612, + "language_loss": 0.90306699, + "learning_rate": 0.0007584945623478315, + "loss": 0.91484118, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.90039062, + "step": 1805, + "time_per_iteration": 2.8080646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176916, + "balance_loss_mlp": 1.08655512, + "epoch": 0.3474413235859946, + "flos": 848781505536.0, + "grad_norm": 0.027596494202169034, + "language_loss": 0.90514499, + "learning_rate": 0.000758227834472617, + "loss": 0.91691411, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.90185547, + "step": 1806, + "time_per_iteration": 3.0443291664123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179899, + "balance_loss_mlp": 1.08972931, + "epoch": 0.3476337052712582, + "flos": 516696325632.0, + "grad_norm": 0.02724510251762829, + "language_loss": 0.86438924, + "learning_rate": 0.0007579610063444664, + "loss": 0.87618828, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.89990234, + "step": 1807, + "time_per_iteration": 2.716522455215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177066, + "balance_loss_mlp": 1.08694386, + "epoch": 0.34782608695652173, + "flos": 915114493440.0, + "grad_norm": 0.02927822844999151, + "language_loss": 0.96424794, + "learning_rate": 0.0007576940780669712, + "loss": 0.97601861, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.89941406, + "step": 1808, + "time_per_iteration": 3.21464204788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08734941, + "epoch": 0.3480184686417853, + "flos": 775083056640.0, + "grad_norm": 0.026376675364870938, + "language_loss": 0.91835052, + "learning_rate": 0.0007574270497437624, + "loss": 0.93012476, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.89892578, + "step": 1809, + "time_per_iteration": 2.965306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177298, + "balance_loss_mlp": 1.0874145, + "epoch": 0.34821085032704885, + "flos": 578003728896.0, + "grad_norm": 0.024336980271772477, + "language_loss": 0.95592844, + "learning_rate": 0.000757159921478509, + "loss": 0.96770144, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.89697266, + "step": 1810, + "time_per_iteration": 2.781496047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177093, + "balance_loss_mlp": 1.088974, + "epoch": 0.34840323201231244, + "flos": 1528039531008.0, + "grad_norm": 0.007178450494277746, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75627732, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.87890625, + "step": 1811, + "time_per_iteration": 4.719515562057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176704, + "balance_loss_mlp": 1.08691561, + "epoch": 0.34859561369757597, + "flos": 510181530624.0, + "grad_norm": 0.02648580139398905, + "language_loss": 0.96071857, + "learning_rate": 0.0007566253655367423, + "loss": 0.97248554, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.89599609, + "step": 1812, + "time_per_iteration": 2.5699198246002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177921, + "balance_loss_mlp": 1.08822834, + "epoch": 0.34878799538283956, + "flos": 549756395520.0, + "grad_norm": 0.036663453377328174, + "language_loss": 0.96810794, + "learning_rate": 0.000756357938067762, + "loss": 0.97988713, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.89501953, + "step": 1813, + "time_per_iteration": 2.6622092723846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179077, + "balance_loss_mlp": 1.08885992, + "epoch": 0.34898037706810314, + "flos": 985193975808.0, + "grad_norm": 0.026013801782247825, + "language_loss": 0.90032709, + "learning_rate": 0.0007560904110718033, + "loss": 0.91211784, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.90039062, + "step": 1814, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.08639514, + "epoch": 0.3491727587533667, + "flos": 682836607488.0, + "grad_norm": 0.025025787643359835, + "language_loss": 0.91824377, + "learning_rate": 0.0007558227846527297, + "loss": 0.93000984, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.90039062, + "step": 1815, + "time_per_iteration": 2.870858907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176853, + "balance_loss_mlp": 1.08673084, + "epoch": 0.34936514043863026, + "flos": 394889250816.0, + "grad_norm": 0.0291076708707547, + "language_loss": 0.91979998, + "learning_rate": 0.0007555550589144429, + "loss": 0.9315685, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.89941406, + "step": 1816, + "time_per_iteration": 2.4363009929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08739722, + "epoch": 0.3495575221238938, + "flos": 462340147200.0, + "grad_norm": 0.02440335273431038, + "language_loss": 0.92281306, + "learning_rate": 0.000755287233960883, + "loss": 0.9345873, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.8984375, + "step": 1817, + "time_per_iteration": 2.538250207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117706, + "balance_loss_mlp": 1.08693826, + "epoch": 0.3497499038091574, + "flos": 725428824576.0, + "grad_norm": 0.028430093115180927, + "language_loss": 0.88002723, + "learning_rate": 0.0007550193098960292, + "loss": 0.89179784, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.89941406, + "step": 1818, + "time_per_iteration": 2.8685545921325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08411181, + "epoch": 0.3499422854944209, + "flos": 829196187648.0, + "grad_norm": 0.021653398091314287, + "language_loss": 0.92103571, + "learning_rate": 0.0007547512868238988, + "loss": 0.93277991, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.90136719, + "step": 1819, + "time_per_iteration": 3.115814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.092013, + "epoch": 0.3501346671796845, + "flos": 494542820352.0, + "grad_norm": 0.026515438979626053, + "language_loss": 0.9198699, + "learning_rate": 0.0007544831648485473, + "loss": 0.93169028, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.8984375, + "step": 1820, + "time_per_iteration": 2.6666150093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178247, + "balance_loss_mlp": 1.08783865, + "epoch": 0.35032704886494803, + "flos": 579848778240.0, + "grad_norm": 0.026574936148936048, + "language_loss": 0.89372301, + "learning_rate": 0.0007542149440740694, + "loss": 0.90550542, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.90234375, + "step": 1821, + "time_per_iteration": 2.6776442527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178869, + "balance_loss_mlp": 1.08841276, + "epoch": 0.3505194305502116, + "flos": 585831816192.0, + "grad_norm": 0.02674162112947977, + "language_loss": 0.9602831, + "learning_rate": 0.000753946624604597, + "loss": 0.97207189, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.90283203, + "step": 1822, + "time_per_iteration": 2.746363639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175368, + "balance_loss_mlp": 1.08491182, + "epoch": 0.3507118122354752, + "flos": 527978194944.0, + "grad_norm": 0.02703682960411951, + "language_loss": 0.95658362, + "learning_rate": 0.0007536782065443015, + "loss": 0.9683373, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.90283203, + "step": 1823, + "time_per_iteration": 2.5945184230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175188, + "balance_loss_mlp": 1.08458936, + "epoch": 0.35090419392073874, + "flos": 512545602048.0, + "grad_norm": 0.03278557538641046, + "language_loss": 0.86822712, + "learning_rate": 0.0007534096899973919, + "loss": 0.87997901, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.90429688, + "step": 1824, + "time_per_iteration": 2.56933331489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184456, + "balance_loss_mlp": 1.0944289, + "epoch": 0.3510965756060023, + "flos": 565195719168.0, + "grad_norm": 0.023191753507183704, + "language_loss": 0.89392567, + "learning_rate": 0.0007531410750681154, + "loss": 0.90577018, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.8984375, + "step": 1825, + "time_per_iteration": 2.7223169803619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186327, + "balance_loss_mlp": 1.09630024, + "epoch": 0.35128895729126586, + "flos": 1022253046272.0, + "grad_norm": 0.026424599574572643, + "language_loss": 0.93470478, + "learning_rate": 0.0007528723618607575, + "loss": 0.94656801, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.8984375, + "step": 1826, + "time_per_iteration": 3.404395580291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182394, + "balance_loss_mlp": 1.09236717, + "epoch": 0.35148133897652944, + "flos": 589424586240.0, + "grad_norm": 0.02767542011563751, + "language_loss": 0.89242589, + "learning_rate": 0.0007526035504796422, + "loss": 0.90424991, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.8984375, + "step": 1827, + "time_per_iteration": 2.820510149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117853, + "balance_loss_mlp": 1.08850324, + "epoch": 0.351673720661793, + "flos": 496285811712.0, + "grad_norm": 0.02845608163714707, + "language_loss": 0.94670665, + "learning_rate": 0.0007523346410291312, + "loss": 0.95849192, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.8984375, + "step": 1828, + "time_per_iteration": 2.763277053833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177518, + "balance_loss_mlp": 1.08753836, + "epoch": 0.35186610234705656, + "flos": 763998572544.0, + "grad_norm": 0.028566964886064136, + "language_loss": 0.91855693, + "learning_rate": 0.0007520656336136245, + "loss": 0.93033206, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.89794922, + "step": 1829, + "time_per_iteration": 2.9501917362213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179113, + "balance_loss_mlp": 1.08908641, + "epoch": 0.3520584840323201, + "flos": 627388717056.0, + "grad_norm": 0.0235814228834027, + "language_loss": 0.94624627, + "learning_rate": 0.0007517965283375599, + "loss": 0.95803738, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.8984375, + "step": 1830, + "time_per_iteration": 2.8197402954101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08992577, + "epoch": 0.3522508657175837, + "flos": 538448329728.0, + "grad_norm": 0.025024391475303026, + "language_loss": 0.97205818, + "learning_rate": 0.0007515273253054132, + "loss": 0.9838568, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.89746094, + "step": 1831, + "time_per_iteration": 2.6376330852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191124, + "balance_loss_mlp": 1.10109711, + "epoch": 0.35244324740284727, + "flos": 568501780992.0, + "grad_norm": 0.029882616882314406, + "language_loss": 0.9266001, + "learning_rate": 0.0007512580246216988, + "loss": 0.93851131, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.8984375, + "step": 1832, + "time_per_iteration": 2.708432912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179716, + "balance_loss_mlp": 1.08964145, + "epoch": 0.3526356290881108, + "flos": 514054278144.0, + "grad_norm": 0.030813246422457925, + "language_loss": 0.91671479, + "learning_rate": 0.000750988626390968, + "loss": 0.92851192, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.89892578, + "step": 1833, + "time_per_iteration": 2.592047929763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179987, + "balance_loss_mlp": 1.09010315, + "epoch": 0.3528280107733744, + "flos": 596972696064.0, + "grad_norm": 0.024705197674389605, + "language_loss": 0.91622353, + "learning_rate": 0.0007507191307178108, + "loss": 0.9280234, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.89697266, + "step": 1834, + "time_per_iteration": 2.7884535789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176506, + "balance_loss_mlp": 1.08652651, + "epoch": 0.3530203924586379, + "flos": 552298386432.0, + "grad_norm": 0.0302975798262418, + "language_loss": 0.83893424, + "learning_rate": 0.0007504495377068543, + "loss": 0.85069931, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.89794922, + "step": 1835, + "time_per_iteration": 2.7751786708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175764, + "balance_loss_mlp": 1.08573675, + "epoch": 0.3532127741439015, + "flos": 654305293824.0, + "grad_norm": 0.027517554164180617, + "language_loss": 0.90655488, + "learning_rate": 0.0007501798474627642, + "loss": 0.91831255, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.8984375, + "step": 1836, + "time_per_iteration": 2.9638845920562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179149, + "balance_loss_mlp": 1.08926523, + "epoch": 0.35340515582916504, + "flos": 724150460928.0, + "grad_norm": 0.024568481275515953, + "language_loss": 0.91140759, + "learning_rate": 0.0007499100600902433, + "loss": 0.92319906, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.89697266, + "step": 1837, + "time_per_iteration": 2.9948322772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184038, + "balance_loss_mlp": 1.09396327, + "epoch": 0.35359753751442863, + "flos": 595997778432.0, + "grad_norm": 0.031821297821065, + "language_loss": 0.92654896, + "learning_rate": 0.0007496401756940324, + "loss": 0.9383893, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.89892578, + "step": 1838, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176486, + "balance_loss_mlp": 1.08665001, + "epoch": 0.3537899191996922, + "flos": 633805456896.0, + "grad_norm": 0.02718368250353396, + "language_loss": 0.91091663, + "learning_rate": 0.0007493701943789098, + "loss": 0.92268145, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.89648438, + "step": 1839, + "time_per_iteration": 2.779574155807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175825, + "balance_loss_mlp": 1.08608413, + "epoch": 0.35398230088495575, + "flos": 507352831488.0, + "grad_norm": 0.028671493841357993, + "language_loss": 0.91863656, + "learning_rate": 0.000749100116249692, + "loss": 0.93039483, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.89550781, + "step": 1840, + "time_per_iteration": 2.607614755630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189406, + "balance_loss_mlp": 1.09980869, + "epoch": 0.35417468257021933, + "flos": 509046157824.0, + "grad_norm": 0.03229862826848899, + "language_loss": 0.95953786, + "learning_rate": 0.0007488299414112321, + "loss": 0.97143197, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.89404297, + "step": 1841, + "time_per_iteration": 2.566596746444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181321, + "balance_loss_mlp": 1.09210455, + "epoch": 0.35436706425548287, + "flos": 657659019264.0, + "grad_norm": 0.02732135002339032, + "language_loss": 0.86453879, + "learning_rate": 0.0007485596699684215, + "loss": 0.87635195, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.89013672, + "step": 1842, + "time_per_iteration": 2.8111371994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185021, + "balance_loss_mlp": 1.09575689, + "epoch": 0.35455944594074645, + "flos": 653888329728.0, + "grad_norm": 0.026686949506238997, + "language_loss": 0.92940086, + "learning_rate": 0.000748289302026189, + "loss": 0.94125104, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.890625, + "step": 1843, + "time_per_iteration": 2.8244054317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187203, + "balance_loss_mlp": 1.09793901, + "epoch": 0.35475182762601, + "flos": 850010204160.0, + "grad_norm": 0.02649701564047654, + "language_loss": 0.9307664, + "learning_rate": 0.0007480188376895004, + "loss": 0.94263846, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.890625, + "step": 1844, + "time_per_iteration": 3.041001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187935, + "balance_loss_mlp": 1.10115051, + "epoch": 0.3549442093112736, + "flos": 1524775128576.0, + "grad_norm": 0.01173136965559212, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74999273, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.86914062, + "step": 1845, + "time_per_iteration": 4.865761756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183261, + "balance_loss_mlp": 1.09390223, + "epoch": 0.3551365909965371, + "flos": 652714025472.0, + "grad_norm": 0.028658093872898062, + "language_loss": 0.85614175, + "learning_rate": 0.0007474776202528074, + "loss": 0.8679744, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.89160156, + "step": 1846, + "time_per_iteration": 2.9342904090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184977, + "balance_loss_mlp": 1.0954746, + "epoch": 0.3553289726818007, + "flos": 898921832448.0, + "grad_norm": 0.03609141350995601, + "language_loss": 0.89849555, + "learning_rate": 0.000747206867362922, + "loss": 0.91034532, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.89306641, + "step": 1847, + "time_per_iteration": 3.1089484691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185041, + "balance_loss_mlp": 1.09553862, + "epoch": 0.3555213543670643, + "flos": 689733437952.0, + "grad_norm": 0.0286779566522822, + "language_loss": 0.9096849, + "learning_rate": 0.0007469360184988194, + "loss": 0.92153525, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.89306641, + "step": 1848, + "time_per_iteration": 2.820265293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183493, + "balance_loss_mlp": 1.09399033, + "epoch": 0.3557137360523278, + "flos": 539603168256.0, + "grad_norm": 0.02648998316664428, + "language_loss": 0.93967247, + "learning_rate": 0.0007466650737656518, + "loss": 0.95150745, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.89306641, + "step": 1849, + "time_per_iteration": 2.596639394760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183541, + "balance_loss_mlp": 1.09427702, + "epoch": 0.3559061177375914, + "flos": 403153767936.0, + "grad_norm": 0.02765421607491624, + "language_loss": 0.97574586, + "learning_rate": 0.0007463940332686098, + "loss": 0.98758125, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.890625, + "step": 1850, + "time_per_iteration": 2.478158473968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177245, + "balance_loss_mlp": 1.08764756, + "epoch": 0.35609849942285493, + "flos": 697893895680.0, + "grad_norm": 0.023379973164811964, + "language_loss": 0.90857208, + "learning_rate": 0.0007461228971129205, + "loss": 0.92034447, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.89404297, + "step": 1851, + "time_per_iteration": 2.9202487468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179211, + "balance_loss_mlp": 1.08966124, + "epoch": 0.3562908811081185, + "flos": 570001724928.0, + "grad_norm": 0.028863121832353986, + "language_loss": 0.92692959, + "learning_rate": 0.0007458516654038483, + "loss": 0.93872178, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.89355469, + "step": 1852, + "time_per_iteration": 2.658867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179202, + "balance_loss_mlp": 1.08936572, + "epoch": 0.35648326279338205, + "flos": 683609410560.0, + "grad_norm": 0.028040747176241956, + "language_loss": 0.94642723, + "learning_rate": 0.0007455803382466946, + "loss": 0.95821923, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.89648438, + "step": 1853, + "time_per_iteration": 2.86330509185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183408, + "balance_loss_mlp": 1.09376252, + "epoch": 0.35667564447864564, + "flos": 630340941312.0, + "grad_norm": 0.02553826751691769, + "language_loss": 0.94946796, + "learning_rate": 0.0007453089157467979, + "loss": 0.96130198, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.89453125, + "step": 1854, + "time_per_iteration": 2.792577028274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180437, + "balance_loss_mlp": 1.09093451, + "epoch": 0.35686802616390917, + "flos": 815504584704.0, + "grad_norm": 0.02468703395074296, + "language_loss": 0.8986901, + "learning_rate": 0.0007450373980095341, + "loss": 0.91049451, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.89306641, + "step": 1855, + "time_per_iteration": 3.0555014610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182657, + "balance_loss_mlp": 1.09334552, + "epoch": 0.35706040784917276, + "flos": 527205391872.0, + "grad_norm": 0.02890256158864057, + "language_loss": 0.93639445, + "learning_rate": 0.0007447657851403155, + "loss": 0.94822103, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.89111328, + "step": 1856, + "time_per_iteration": 2.589708089828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182935, + "balance_loss_mlp": 1.09367096, + "epoch": 0.35725278953443634, + "flos": 513064624128.0, + "grad_norm": 0.032008561774258475, + "language_loss": 0.88987339, + "learning_rate": 0.0007444940772445915, + "loss": 0.9017027, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.890625, + "step": 1857, + "time_per_iteration": 2.7185556888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180668, + "balance_loss_mlp": 1.09169042, + "epoch": 0.3574451712196999, + "flos": 488492653056.0, + "grad_norm": 0.02708223160327311, + "language_loss": 0.88387084, + "learning_rate": 0.0007442222744278484, + "loss": 0.89567751, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.88769531, + "step": 1858, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182567, + "balance_loss_mlp": 1.09339869, + "epoch": 0.35763755290496346, + "flos": 551821023744.0, + "grad_norm": 0.023402609147138306, + "language_loss": 0.90506786, + "learning_rate": 0.0007439503767956099, + "loss": 0.91689354, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.88964844, + "step": 1859, + "time_per_iteration": 2.7072699069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180801, + "balance_loss_mlp": 1.09249115, + "epoch": 0.357829934590227, + "flos": 1507225514496.0, + "grad_norm": 0.010565166743096084, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80852401, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.88085938, + "step": 1860, + "time_per_iteration": 4.9006147384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177337, + "balance_loss_mlp": 1.08835948, + "epoch": 0.3580223162754906, + "flos": 569841269760.0, + "grad_norm": 0.022894220472823423, + "language_loss": 0.92520916, + "learning_rate": 0.000743406297506922, + "loss": 0.93698251, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.88769531, + "step": 1861, + "time_per_iteration": 2.7065579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09741747, + "epoch": 0.3582146979607541, + "flos": 627760018944.0, + "grad_norm": 0.02759787968542248, + "language_loss": 0.91638815, + "learning_rate": 0.0007431341160617031, + "loss": 0.92825067, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.88623047, + "step": 1862, + "time_per_iteration": 2.9316203594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179684, + "balance_loss_mlp": 1.09089661, + "epoch": 0.3584070796460177, + "flos": 508319016960.0, + "grad_norm": 0.024526236298265516, + "language_loss": 0.95309365, + "learning_rate": 0.0007428618402234491, + "loss": 0.96489048, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.88574219, + "step": 1863, + "time_per_iteration": 2.648061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179939, + "balance_loss_mlp": 1.09129453, + "epoch": 0.3585994613312813, + "flos": 607640216064.0, + "grad_norm": 0.026400757424935653, + "language_loss": 0.88735509, + "learning_rate": 0.0007425894700978668, + "loss": 0.89915442, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.88427734, + "step": 1864, + "time_per_iteration": 2.7512128353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178956, + "balance_loss_mlp": 1.0905509, + "epoch": 0.3587918430165448, + "flos": 1415087675904.0, + "grad_norm": 0.025937088976099313, + "language_loss": 0.86489892, + "learning_rate": 0.0007423170057906996, + "loss": 0.87668848, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.88183594, + "step": 1865, + "time_per_iteration": 3.8491222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181386, + "balance_loss_mlp": 1.0926944, + "epoch": 0.3589842247018084, + "flos": 479513730048.0, + "grad_norm": 0.0296684402619103, + "language_loss": 0.94328964, + "learning_rate": 0.0007420444474077275, + "loss": 0.95510352, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.88476562, + "step": 1866, + "time_per_iteration": 2.5396502017974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.09458029, + "epoch": 0.35917660638707194, + "flos": 505705167360.0, + "grad_norm": 0.030930075238968464, + "language_loss": 0.98337018, + "learning_rate": 0.0007417717950547671, + "loss": 0.99520147, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.88330078, + "step": 1867, + "time_per_iteration": 2.562638759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182945, + "balance_loss_mlp": 1.09654236, + "epoch": 0.3593689880723355, + "flos": 1495481745408.0, + "grad_norm": 0.008554058370081398, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77179551, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.86523438, + "step": 1868, + "time_per_iteration": 4.885401487350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184482, + "balance_loss_mlp": 1.09583843, + "epoch": 0.35956136975759906, + "flos": 529671521280.0, + "grad_norm": 0.02257875970711003, + "language_loss": 0.91369003, + "learning_rate": 0.0007412262088623299, + "loss": 0.92553484, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.88427734, + "step": 1869, + "time_per_iteration": 2.755620241165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184303, + "balance_loss_mlp": 1.09584975, + "epoch": 0.35975375144286265, + "flos": 535999664640.0, + "grad_norm": 0.02945163599469251, + "language_loss": 0.8810817, + "learning_rate": 0.0007409532752346684, + "loss": 0.89292467, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.88232422, + "step": 1870, + "time_per_iteration": 2.6426498889923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09860992, + "epoch": 0.3599461331281262, + "flos": 505928749056.0, + "grad_norm": 0.025692069404306732, + "language_loss": 0.95194697, + "learning_rate": 0.0007406802480606491, + "loss": 0.96382141, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.88623047, + "step": 1871, + "time_per_iteration": 2.6156716346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180117, + "balance_loss_mlp": 1.09123456, + "epoch": 0.36013851481338977, + "flos": 512536869888.0, + "grad_norm": 0.029138864413584674, + "language_loss": 0.9874596, + "learning_rate": 0.0007404071274462707, + "loss": 0.99926078, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.88671875, + "step": 1872, + "time_per_iteration": 2.5790889263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179425, + "balance_loss_mlp": 1.09054244, + "epoch": 0.36033089649865335, + "flos": 548631756288.0, + "grad_norm": 0.029675252163234106, + "language_loss": 0.91584998, + "learning_rate": 0.0007401339134975682, + "loss": 0.92764425, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.88671875, + "step": 1873, + "time_per_iteration": 2.6279983520507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185016, + "balance_loss_mlp": 1.09613371, + "epoch": 0.3605232781839169, + "flos": 459613506048.0, + "grad_norm": 0.030657976300352024, + "language_loss": 0.92556155, + "learning_rate": 0.0007398606063206122, + "loss": 0.93741173, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.88671875, + "step": 1874, + "time_per_iteration": 2.5750958919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178477, + "balance_loss_mlp": 1.0895946, + "epoch": 0.36071565986918047, + "flos": 510563566080.0, + "grad_norm": 0.029863822651947862, + "language_loss": 0.87000763, + "learning_rate": 0.0007395872060215101, + "loss": 0.88179243, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.88671875, + "step": 1875, + "time_per_iteration": 2.599595546722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180043, + "balance_loss_mlp": 1.09101713, + "epoch": 0.360908041554444, + "flos": 560256729600.0, + "grad_norm": 0.02914010843617622, + "language_loss": 0.95866597, + "learning_rate": 0.0007393137127064056, + "loss": 0.97046638, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.88818359, + "step": 1876, + "time_per_iteration": 2.629855155944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179718, + "balance_loss_mlp": 1.09064531, + "epoch": 0.3611004232397076, + "flos": 524878250496.0, + "grad_norm": 0.029199641876594032, + "language_loss": 0.93452048, + "learning_rate": 0.0007390401264814779, + "loss": 0.94631773, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.88867188, + "step": 1877, + "time_per_iteration": 2.6057403087615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182123, + "balance_loss_mlp": 1.0932405, + "epoch": 0.3612928049249711, + "flos": 542032367616.0, + "grad_norm": 0.029384759310162312, + "language_loss": 0.93887711, + "learning_rate": 0.0007387664474529427, + "loss": 0.95069838, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.88671875, + "step": 1878, + "time_per_iteration": 2.612924814224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181149, + "balance_loss_mlp": 1.09207559, + "epoch": 0.3614851866102347, + "flos": 553629143040.0, + "grad_norm": 0.028847856052759763, + "language_loss": 0.99400896, + "learning_rate": 0.0007384926757270518, + "loss": 1.00582051, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.88867188, + "step": 1879, + "time_per_iteration": 2.631417751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183007, + "balance_loss_mlp": 1.09364784, + "epoch": 0.36167756829549824, + "flos": 773426660352.0, + "grad_norm": 0.027790454764264987, + "language_loss": 0.87101346, + "learning_rate": 0.0007382188114100924, + "loss": 0.88284349, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.89160156, + "step": 1880, + "time_per_iteration": 3.0146212577819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182663, + "balance_loss_mlp": 1.09330404, + "epoch": 0.36186994998076183, + "flos": 713187500544.0, + "grad_norm": 0.025874200926848077, + "language_loss": 0.89437282, + "learning_rate": 0.0007379448546083884, + "loss": 0.90619946, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.89160156, + "step": 1881, + "time_per_iteration": 2.9882314205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182414, + "balance_loss_mlp": 1.09305489, + "epoch": 0.3620623316660254, + "flos": 748900351488.0, + "grad_norm": 0.028120122690860328, + "language_loss": 0.95218164, + "learning_rate": 0.0007376708054282992, + "loss": 0.96400583, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.89160156, + "step": 1882, + "time_per_iteration": 2.937251329421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185259, + "balance_loss_mlp": 1.09609008, + "epoch": 0.36225471335128895, + "flos": 483534197760.0, + "grad_norm": 0.025051425069896712, + "language_loss": 0.90089262, + "learning_rate": 0.0007373966639762201, + "loss": 0.91274524, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.88964844, + "step": 1883, + "time_per_iteration": 2.5956366062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189104, + "balance_loss_mlp": 1.09964943, + "epoch": 0.36244709503655254, + "flos": 507910785024.0, + "grad_norm": 0.028814908336841725, + "language_loss": 0.97620124, + "learning_rate": 0.0007371224303585822, + "loss": 0.9880923, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.89257812, + "step": 1884, + "time_per_iteration": 2.5689563751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188454, + "balance_loss_mlp": 1.10205078, + "epoch": 0.36263947672181607, + "flos": 1397052145152.0, + "grad_norm": 0.012535477100621303, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8154552, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.86523438, + "step": 1885, + "time_per_iteration": 4.708393573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184768, + "balance_loss_mlp": 1.09531295, + "epoch": 0.36283185840707965, + "flos": 654522144768.0, + "grad_norm": 0.026882878095346403, + "language_loss": 0.90798199, + "learning_rate": 0.0007365736870525335, + "loss": 0.91982961, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.89257812, + "step": 1886, + "time_per_iteration": 2.8096718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188121, + "balance_loss_mlp": 1.09842801, + "epoch": 0.3630242400923432, + "flos": 489844876800.0, + "grad_norm": 0.028488669634490066, + "language_loss": 0.90766525, + "learning_rate": 0.000736299177577164, + "loss": 0.91954637, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.89501953, + "step": 1887, + "time_per_iteration": 2.5731940269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184527, + "balance_loss_mlp": 1.09488153, + "epoch": 0.3632166217776068, + "flos": 518231198208.0, + "grad_norm": 0.0291282657352475, + "language_loss": 0.90900671, + "learning_rate": 0.0007360245763623174, + "loss": 0.92085195, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.89453125, + "step": 1888, + "time_per_iteration": 2.6255550384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184122, + "balance_loss_mlp": 1.09457171, + "epoch": 0.36340900346287036, + "flos": 647347338240.0, + "grad_norm": 0.024297388169127104, + "language_loss": 0.96519047, + "learning_rate": 0.0007357498835146039, + "loss": 0.97703171, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.89355469, + "step": 1889, + "time_per_iteration": 2.8253488540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183322, + "balance_loss_mlp": 1.09386766, + "epoch": 0.3636013851481339, + "flos": 554410678272.0, + "grad_norm": 0.02538543495771105, + "language_loss": 0.93937147, + "learning_rate": 0.0007354750991406684, + "loss": 0.95120472, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.89257812, + "step": 1890, + "time_per_iteration": 2.692335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182823, + "balance_loss_mlp": 1.09336889, + "epoch": 0.3637937668333975, + "flos": 547691767296.0, + "grad_norm": 0.028084450652072174, + "language_loss": 0.88223994, + "learning_rate": 0.0007352002233471919, + "loss": 0.89406812, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.89257812, + "step": 1891, + "time_per_iteration": 2.620753765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181153, + "balance_loss_mlp": 1.09212756, + "epoch": 0.363986148518661, + "flos": 539210399232.0, + "grad_norm": 0.027970426809957948, + "language_loss": 0.87592262, + "learning_rate": 0.0007349252562408906, + "loss": 0.88773412, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.88818359, + "step": 1892, + "time_per_iteration": 2.6963558197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186893, + "balance_loss_mlp": 1.09762907, + "epoch": 0.3641785302039246, + "flos": 661510299648.0, + "grad_norm": 0.026164868426956554, + "language_loss": 0.89186442, + "learning_rate": 0.0007346501979285158, + "loss": 0.90373337, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.890625, + "step": 1893, + "time_per_iteration": 2.880326747894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_mlp": 1.10150909, + "epoch": 0.36437091188918813, + "flos": 1472082077184.0, + "grad_norm": 0.013556454199407954, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81727207, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.8671875, + "step": 1894, + "time_per_iteration": 4.7823100090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189424, + "balance_loss_mlp": 1.10011292, + "epoch": 0.3645632935744517, + "flos": 598444442112.0, + "grad_norm": 0.028411509484180794, + "language_loss": 0.93676329, + "learning_rate": 0.0007340998081127308, + "loss": 0.94865751, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.89111328, + "step": 1895, + "time_per_iteration": 2.7800211906433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179101, + "balance_loss_mlp": 1.08998048, + "epoch": 0.36475567525971525, + "flos": 600695721984.0, + "grad_norm": 0.025932670803143428, + "language_loss": 0.98669052, + "learning_rate": 0.0007338244768230007, + "loss": 0.99848151, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.88916016, + "step": 1896, + "time_per_iteration": 2.7945594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180722, + "balance_loss_mlp": 1.09169638, + "epoch": 0.36494805694497884, + "flos": 799830945792.0, + "grad_norm": 0.022772977260465788, + "language_loss": 0.94548512, + "learning_rate": 0.0007335490547545578, + "loss": 0.95729244, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.88818359, + "step": 1897, + "time_per_iteration": 3.031527280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182826, + "balance_loss_mlp": 1.09389579, + "epoch": 0.3651404386302424, + "flos": 638477203968.0, + "grad_norm": 0.024439781626348547, + "language_loss": 0.90189934, + "learning_rate": 0.0007332735420143308, + "loss": 0.91372758, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.88720703, + "step": 1898, + "time_per_iteration": 2.743051767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118252, + "balance_loss_mlp": 1.09363747, + "epoch": 0.36533282031550596, + "flos": 492562785792.0, + "grad_norm": 0.03052059755540218, + "language_loss": 0.95941794, + "learning_rate": 0.0007329979387092826, + "loss": 0.97124314, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.88671875, + "step": 1899, + "time_per_iteration": 2.5555779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181449, + "balance_loss_mlp": 1.09247124, + "epoch": 0.36552520200076954, + "flos": 857508648960.0, + "grad_norm": 0.02266050351879182, + "language_loss": 0.89947438, + "learning_rate": 0.0007327222449464124, + "loss": 0.91128886, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.88769531, + "step": 1900, + "time_per_iteration": 3.2362029552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181183, + "balance_loss_mlp": 1.09206235, + "epoch": 0.3657175836860331, + "flos": 484715232768.0, + "grad_norm": 0.026374750280255838, + "language_loss": 0.95288622, + "learning_rate": 0.0007324464608327538, + "loss": 0.96469808, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.88916016, + "step": 1901, + "time_per_iteration": 2.5933730602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179798, + "balance_loss_mlp": 1.09058213, + "epoch": 0.36590996537129666, + "flos": 435721012224.0, + "grad_norm": 0.02685373461110618, + "language_loss": 0.96213037, + "learning_rate": 0.0007321705864753758, + "loss": 0.97392833, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.89013672, + "step": 1902, + "time_per_iteration": 2.6981201171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180605, + "balance_loss_mlp": 1.09124577, + "epoch": 0.3661023470565602, + "flos": 713513140224.0, + "grad_norm": 0.022756571637903334, + "language_loss": 0.91225153, + "learning_rate": 0.0007318946219813823, + "loss": 0.9240576, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.89160156, + "step": 1903, + "time_per_iteration": 2.992624044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183651, + "balance_loss_mlp": 1.09443474, + "epoch": 0.3662947287418238, + "flos": 565822803456.0, + "grad_norm": 0.027935940535232063, + "language_loss": 0.96619356, + "learning_rate": 0.000731618567457912, + "loss": 0.97803003, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.89013672, + "step": 1904, + "time_per_iteration": 2.685476064682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183785, + "balance_loss_mlp": 1.09433067, + "epoch": 0.3664871104270873, + "flos": 791201857536.0, + "grad_norm": 0.029459392082425068, + "language_loss": 0.95166355, + "learning_rate": 0.000731342423012139, + "loss": 0.96350139, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.89257812, + "step": 1905, + "time_per_iteration": 3.0574183464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184501, + "balance_loss_mlp": 1.09480846, + "epoch": 0.3666794921123509, + "flos": 753980330496.0, + "grad_norm": 0.028631588758117728, + "language_loss": 0.89661896, + "learning_rate": 0.0007310661887512722, + "loss": 0.90846401, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.89501953, + "step": 1906, + "time_per_iteration": 3.024423122406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183077, + "balance_loss_mlp": 1.09343171, + "epoch": 0.3668718737976145, + "flos": 524607005184.0, + "grad_norm": 0.02900954708937733, + "language_loss": 0.89823443, + "learning_rate": 0.0007307898647825549, + "loss": 0.91006529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.89453125, + "step": 1907, + "time_per_iteration": 2.6485068798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182186, + "balance_loss_mlp": 1.09277892, + "epoch": 0.367064255482878, + "flos": 573045273600.0, + "grad_norm": 0.031417651983294596, + "language_loss": 0.98967636, + "learning_rate": 0.0007305134512132659, + "loss": 1.00149822, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.89208984, + "step": 1908, + "time_per_iteration": 2.646838903427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180724, + "balance_loss_mlp": 1.09107888, + "epoch": 0.3672566371681416, + "flos": 448053660672.0, + "grad_norm": 0.03289649974011927, + "language_loss": 0.93253779, + "learning_rate": 0.0007302369481507183, + "loss": 0.94434512, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.89453125, + "step": 1909, + "time_per_iteration": 2.562856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_mlp": 1.10011292, + "epoch": 0.36744901885340514, + "flos": 1543364061696.0, + "grad_norm": 0.010877058892954462, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81150377, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.8828125, + "step": 1910, + "time_per_iteration": 4.90735387802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011789, + "balance_loss_mlp": 1.08949292, + "epoch": 0.36764140053866873, + "flos": 564761290752.0, + "grad_norm": 0.024499581587470617, + "language_loss": 0.92626876, + "learning_rate": 0.000729683673975274, + "loss": 0.93805778, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.89208984, + "step": 1911, + "time_per_iteration": 2.6646595001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182116, + "balance_loss_mlp": 1.09285223, + "epoch": 0.36783378222393226, + "flos": 1218650895360.0, + "grad_norm": 0.021973130552363645, + "language_loss": 0.89050859, + "learning_rate": 0.0007294069030771774, + "loss": 0.90232974, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.890625, + "step": 1912, + "time_per_iteration": 3.6834843158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189865, + "balance_loss_mlp": 1.10021913, + "epoch": 0.36802616390919585, + "flos": 499720128000.0, + "grad_norm": 0.028676866730684987, + "language_loss": 0.97328013, + "learning_rate": 0.0007291300431154224, + "loss": 0.98517883, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.89453125, + "step": 1913, + "time_per_iteration": 2.587052822113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195931, + "balance_loss_mlp": 1.10838318, + "epoch": 0.36821854559445943, + "flos": 1585615902720.0, + "grad_norm": 0.013013835157786544, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71585667, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.87695312, + "step": 1914, + "time_per_iteration": 4.952203989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185283, + "balance_loss_mlp": 1.09582841, + "epoch": 0.36841092727972297, + "flos": 837089402880.0, + "grad_norm": 0.02834339080565921, + "language_loss": 0.8768307, + "learning_rate": 0.0007285760564309179, + "loss": 0.88868356, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.89257812, + "step": 1915, + "time_per_iteration": 3.100893974304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185476, + "balance_loss_mlp": 1.09602106, + "epoch": 0.36860330896498655, + "flos": 691209913344.0, + "grad_norm": 0.028423235038061073, + "language_loss": 0.92041719, + "learning_rate": 0.0007282989299232448, + "loss": 0.93227196, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.89257812, + "step": 1916, + "time_per_iteration": 3.0683393478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.10048962, + "epoch": 0.3687956906502501, + "flos": 555239877120.0, + "grad_norm": 0.03332088686108748, + "language_loss": 0.92434603, + "learning_rate": 0.0007280217147820668, + "loss": 0.93624407, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.89111328, + "step": 1917, + "time_per_iteration": 2.635451078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188211, + "balance_loss_mlp": 1.09894717, + "epoch": 0.3689880723355137, + "flos": 577819078656.0, + "grad_norm": 0.027623597033391085, + "language_loss": 0.8697632, + "learning_rate": 0.0007277444111150079, + "loss": 0.88164532, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.890625, + "step": 1918, + "time_per_iteration": 2.810635805130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184664, + "balance_loss_mlp": 1.09540033, + "epoch": 0.3691804540207772, + "flos": 529886370816.0, + "grad_norm": 0.029489830132381867, + "language_loss": 0.91299617, + "learning_rate": 0.0007274670190297272, + "loss": 0.92484283, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.890625, + "step": 1919, + "time_per_iteration": 2.615386486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118238, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3693728357060408, + "flos": 562180368384.0, + "grad_norm": 0.025570373781710027, + "language_loss": 0.90037912, + "learning_rate": 0.0007271895386339179, + "loss": 0.91220295, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.88476562, + "step": 1920, + "time_per_iteration": 2.7868921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192586, + "balance_loss_mlp": 1.10375118, + "epoch": 0.3695652173913043, + "flos": 580899557376.0, + "grad_norm": 0.02893533685872539, + "language_loss": 0.90819347, + "learning_rate": 0.0007269119700353073, + "loss": 0.92011935, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.88623047, + "step": 1921, + "time_per_iteration": 2.7836573123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178636, + "balance_loss_mlp": 1.09023082, + "epoch": 0.3697575990765679, + "flos": 514059007488.0, + "grad_norm": 0.024390447267758214, + "language_loss": 0.90977228, + "learning_rate": 0.0007266343133416571, + "loss": 0.92155862, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.8828125, + "step": 1922, + "time_per_iteration": 2.800387382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173615, + "balance_loss_mlp": 1.08816528, + "epoch": 0.3699499807618315, + "flos": 1573903607808.0, + "grad_norm": 0.0066311072211368925, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78290522, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.85546875, + "step": 1923, + "time_per_iteration": 4.845300912857056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176045, + "balance_loss_mlp": 1.08844995, + "epoch": 0.37014236244709503, + "flos": 498324243456.0, + "grad_norm": 0.031949393340513096, + "language_loss": 0.9351213, + "learning_rate": 0.0007260787361004556, + "loss": 0.94688171, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.87744141, + "step": 1924, + "time_per_iteration": 2.5984597206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175598, + "balance_loss_mlp": 1.0905304, + "epoch": 0.3703347441323586, + "flos": 1447605433344.0, + "grad_norm": 0.008500773473990196, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74937099, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.8515625, + "step": 1925, + "time_per_iteration": 4.886027097702026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197031, + "balance_loss_mlp": 1.10862505, + "epoch": 0.37052712581762215, + "flos": 564713627136.0, + "grad_norm": 0.03178088368953176, + "language_loss": 0.94516188, + "learning_rate": 0.0007255228077730903, + "loss": 0.95713222, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.88183594, + "step": 1926, + "time_per_iteration": 2.6847593784332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185383, + "balance_loss_mlp": 1.09731126, + "epoch": 0.37071950750288574, + "flos": 927570667008.0, + "grad_norm": 0.029564625514678724, + "language_loss": 0.89603549, + "learning_rate": 0.0007252447122218632, + "loss": 0.90788931, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.88037109, + "step": 1927, + "time_per_iteration": 3.106748342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179784, + "balance_loss_mlp": 1.0919987, + "epoch": 0.37091188918814927, + "flos": 419200710144.0, + "grad_norm": 0.03402230349378661, + "language_loss": 0.98334146, + "learning_rate": 0.0007249665292228834, + "loss": 0.99513936, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.87939453, + "step": 1928, + "time_per_iteration": 2.5786120891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186321, + "balance_loss_mlp": 1.09801054, + "epoch": 0.37110427087341286, + "flos": 464146265088.0, + "grad_norm": 0.029271450765855984, + "language_loss": 0.9102214, + "learning_rate": 0.000724688258884151, + "loss": 0.92208457, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.88183594, + "step": 1929, + "time_per_iteration": 2.5388894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185686, + "balance_loss_mlp": 1.09780467, + "epoch": 0.3712966525586764, + "flos": 851080449024.0, + "grad_norm": 0.02435916983518334, + "language_loss": 0.9136247, + "learning_rate": 0.0007244099013137002, + "loss": 0.92548156, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.88037109, + "step": 1930, + "time_per_iteration": 3.0708000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.09159458, + "epoch": 0.37148903424394, + "flos": 927557932032.0, + "grad_norm": 0.024720397528266293, + "language_loss": 0.95256186, + "learning_rate": 0.0007241314566195993, + "loss": 0.96435952, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.88232422, + "step": 1931, + "time_per_iteration": 3.2293543815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179876, + "balance_loss_mlp": 1.09180403, + "epoch": 0.37168141592920356, + "flos": 520820852736.0, + "grad_norm": 0.029266961451931986, + "language_loss": 0.92750597, + "learning_rate": 0.0007238529249099496, + "loss": 0.93930471, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.88232422, + "step": 1932, + "time_per_iteration": 2.6091582775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.10263062, + "epoch": 0.3718737976144671, + "flos": 1449059715072.0, + "grad_norm": 0.015165360012205364, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79045337, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.859375, + "step": 1933, + "time_per_iteration": 4.854676246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184357, + "balance_loss_mlp": 1.09614182, + "epoch": 0.3720661792997307, + "flos": 760953022464.0, + "grad_norm": 0.028795817149727888, + "language_loss": 0.88381398, + "learning_rate": 0.000723295600876581, + "loss": 0.89565754, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.8828125, + "step": 1934, + "time_per_iteration": 2.9830405712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118189, + "balance_loss_mlp": 1.09396136, + "epoch": 0.3722585609849942, + "flos": 518044546560.0, + "grad_norm": 0.028690096062057496, + "language_loss": 0.95446575, + "learning_rate": 0.0007230168087692344, + "loss": 0.96628463, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.88085938, + "step": 1935, + "time_per_iteration": 2.651982307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181923, + "balance_loss_mlp": 1.09404159, + "epoch": 0.3724509426702578, + "flos": 783868597248.0, + "grad_norm": 0.02900654324264667, + "language_loss": 0.88952625, + "learning_rate": 0.0007227379300790839, + "loss": 0.90134549, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.88037109, + "step": 1936, + "time_per_iteration": 3.0127265453338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177948, + "balance_loss_mlp": 1.09006691, + "epoch": 0.37264332435552133, + "flos": 392599039488.0, + "grad_norm": 0.02836050450865214, + "language_loss": 0.94049299, + "learning_rate": 0.0007224589649143997, + "loss": 0.95227242, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.88037109, + "step": 1937, + "time_per_iteration": 2.5600061416625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178201, + "balance_loss_mlp": 1.09074926, + "epoch": 0.3728357060407849, + "flos": 543912345600.0, + "grad_norm": 0.027673862011078548, + "language_loss": 0.89373219, + "learning_rate": 0.0007221799133834861, + "loss": 0.90551418, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.87597656, + "step": 1938, + "time_per_iteration": 2.646632671356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011797, + "balance_loss_mlp": 1.0919621, + "epoch": 0.3730280877260485, + "flos": 434483581440.0, + "grad_norm": 0.03019004471989451, + "language_loss": 0.90666437, + "learning_rate": 0.00072190077559468, + "loss": 0.91846132, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.87890625, + "step": 1939, + "time_per_iteration": 2.5193679332733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118304, + "balance_loss_mlp": 1.0957315, + "epoch": 0.37322046941131204, + "flos": 532510953984.0, + "grad_norm": 0.02812892901872328, + "language_loss": 0.95514065, + "learning_rate": 0.0007216215516563527, + "loss": 0.96697104, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.87451172, + "step": 1940, + "time_per_iteration": 2.6975200176239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184025, + "balance_loss_mlp": 1.09666896, + "epoch": 0.3734128510965756, + "flos": 532576081920.0, + "grad_norm": 0.028733495674926814, + "language_loss": 0.91960251, + "learning_rate": 0.0007213422416769083, + "loss": 0.93144274, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.875, + "step": 1941, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183262, + "balance_loss_mlp": 1.09561944, + "epoch": 0.37360523278183916, + "flos": 501432920064.0, + "grad_norm": 0.028111058318233337, + "language_loss": 0.83044219, + "learning_rate": 0.0007210628457647849, + "loss": 0.84227479, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.87792969, + "step": 1942, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182498, + "balance_loss_mlp": 1.09475958, + "epoch": 0.37379761446710275, + "flos": 549111846912.0, + "grad_norm": 0.03172951338735415, + "language_loss": 0.86608446, + "learning_rate": 0.000720783364028453, + "loss": 0.87790942, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.87890625, + "step": 1943, + "time_per_iteration": 2.7782797813415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176645, + "balance_loss_mlp": 1.08909822, + "epoch": 0.3739899961523663, + "flos": 476739425280.0, + "grad_norm": 0.0265564263320471, + "language_loss": 0.94348681, + "learning_rate": 0.0007205037965764177, + "loss": 0.95525324, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.87695312, + "step": 1944, + "time_per_iteration": 2.5670034885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198539, + "balance_loss_mlp": 1.11003804, + "epoch": 0.37418237783762986, + "flos": 613076034048.0, + "grad_norm": 0.032068934234115415, + "language_loss": 0.94037992, + "learning_rate": 0.0007202241435172161, + "loss": 0.95236534, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.8828125, + "step": 1945, + "time_per_iteration": 2.7505762577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119095, + "balance_loss_mlp": 1.10283065, + "epoch": 0.3743747595228934, + "flos": 767628272640.0, + "grad_norm": 0.02891432689626354, + "language_loss": 0.95249915, + "learning_rate": 0.0007199444049594198, + "loss": 0.9644087, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.88085938, + "step": 1946, + "time_per_iteration": 2.9690663814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179721, + "balance_loss_mlp": 1.09188759, + "epoch": 0.374567141208157, + "flos": 525490598400.0, + "grad_norm": 0.029648083740235674, + "language_loss": 0.90769064, + "learning_rate": 0.0007196645810116322, + "loss": 0.91948783, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.87988281, + "step": 1947, + "time_per_iteration": 2.690214157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178535, + "balance_loss_mlp": 1.09065437, + "epoch": 0.37475952289342057, + "flos": 682613025792.0, + "grad_norm": 0.029716110952303924, + "language_loss": 0.91939867, + "learning_rate": 0.0007193846717824912, + "loss": 0.93118405, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.88037109, + "step": 1948, + "time_per_iteration": 2.9668121337890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179187, + "balance_loss_mlp": 1.09140122, + "epoch": 0.3749519045786841, + "flos": 461215507968.0, + "grad_norm": 0.032662314662123194, + "language_loss": 0.97396064, + "learning_rate": 0.0007191046773806669, + "loss": 0.98575246, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.87939453, + "step": 1949, + "time_per_iteration": 2.5580427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189402, + "balance_loss_mlp": 1.10166442, + "epoch": 0.3751442862639477, + "flos": 956386687488.0, + "grad_norm": 0.03764484603893814, + "language_loss": 0.94282359, + "learning_rate": 0.0007188245979148631, + "loss": 0.95471758, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.87890625, + "step": 1950, + "time_per_iteration": 3.1307644844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185097, + "balance_loss_mlp": 1.09678674, + "epoch": 0.3753366679492112, + "flos": 528805392384.0, + "grad_norm": 0.0321726971318772, + "language_loss": 0.95554888, + "learning_rate": 0.0007185444334938157, + "loss": 0.96739984, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.8828125, + "step": 1951, + "time_per_iteration": 2.7235019207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181124, + "balance_loss_mlp": 1.09324276, + "epoch": 0.3755290496344748, + "flos": 522848550912.0, + "grad_norm": 0.029170285322497422, + "language_loss": 0.91979843, + "learning_rate": 0.0007182641842262947, + "loss": 0.93160963, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.88037109, + "step": 1952, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179821, + "balance_loss_mlp": 1.09193957, + "epoch": 0.37572143131973834, + "flos": 622371864576.0, + "grad_norm": 0.029206332986401715, + "language_loss": 0.85116351, + "learning_rate": 0.0007179838502211022, + "loss": 0.86296165, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.88037109, + "step": 1953, + "time_per_iteration": 2.8308520317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185603, + "balance_loss_mlp": 1.0973407, + "epoch": 0.37591381300500193, + "flos": 772273823232.0, + "grad_norm": 0.030259488278154622, + "language_loss": 0.94510454, + "learning_rate": 0.0007177034315870738, + "loss": 0.9569605, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.88232422, + "step": 1954, + "time_per_iteration": 2.966627359390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09908688, + "epoch": 0.37610619469026546, + "flos": 521480864256.0, + "grad_norm": 0.02960656624392615, + "language_loss": 0.99060822, + "learning_rate": 0.0007174229284330773, + "loss": 1.00248265, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.88330078, + "step": 1955, + "time_per_iteration": 2.642186403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182076, + "balance_loss_mlp": 1.09338391, + "epoch": 0.37629857637552905, + "flos": 599970582528.0, + "grad_norm": 0.025408092842649905, + "language_loss": 0.92700577, + "learning_rate": 0.0007171423408680141, + "loss": 0.93882644, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.88671875, + "step": 1956, + "time_per_iteration": 2.8501906394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180409, + "balance_loss_mlp": 1.09138381, + "epoch": 0.37649095806079264, + "flos": 566018187264.0, + "grad_norm": 0.027446848492574977, + "language_loss": 0.96095192, + "learning_rate": 0.0007168616690008176, + "loss": 0.97275609, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.88818359, + "step": 1957, + "time_per_iteration": 2.658282995223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183288, + "balance_loss_mlp": 1.09440601, + "epoch": 0.37668333974605617, + "flos": 593568579072.0, + "grad_norm": 0.029268558303355535, + "language_loss": 0.93381131, + "learning_rate": 0.0007165809129404545, + "loss": 0.9456442, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.88671875, + "step": 1958, + "time_per_iteration": 2.738896608352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185047, + "balance_loss_mlp": 1.09621239, + "epoch": 0.37687572143131975, + "flos": 420364280832.0, + "grad_norm": 0.028940223287944336, + "language_loss": 0.94791234, + "learning_rate": 0.0007163000727959239, + "loss": 0.95976275, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.88623047, + "step": 1959, + "time_per_iteration": 2.5175514221191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122541, + "balance_loss_mlp": 1.14034271, + "epoch": 0.3770681031165833, + "flos": 1360384568832.0, + "grad_norm": 0.031863979933265396, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79184484, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.8515625, + "step": 1960, + "time_per_iteration": 4.834294557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187625, + "balance_loss_mlp": 1.0985992, + "epoch": 0.3772604848018469, + "flos": 646153568256.0, + "grad_norm": 0.027699188267120346, + "language_loss": 0.9236567, + "learning_rate": 0.00071573814069052, + "loss": 0.93553299, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.88818359, + "step": 1961, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195985, + "balance_loss_mlp": 1.10681665, + "epoch": 0.3774528664871104, + "flos": 903200810496.0, + "grad_norm": 0.025601029742712816, + "language_loss": 0.93588847, + "learning_rate": 0.0007154570489478081, + "loss": 0.94784832, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.88964844, + "step": 1962, + "time_per_iteration": 3.2312510013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198663, + "balance_loss_mlp": 1.1095897, + "epoch": 0.377645248172374, + "flos": 789462868992.0, + "grad_norm": 0.028157211525065163, + "language_loss": 0.92405236, + "learning_rate": 0.0007151758735572514, + "loss": 0.93603897, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.88867188, + "step": 1963, + "time_per_iteration": 3.0338857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192995, + "balance_loss_mlp": 1.10396981, + "epoch": 0.3778376298576376, + "flos": 587924642304.0, + "grad_norm": 0.030822839560022956, + "language_loss": 0.89740217, + "learning_rate": 0.0007148946146280119, + "loss": 0.90933216, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.88818359, + "step": 1964, + "time_per_iteration": 2.795830488204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193161, + "balance_loss_mlp": 1.10656738, + "epoch": 0.3780300115429011, + "flos": 1399669997568.0, + "grad_norm": 0.013238700163895742, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.7338531, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.8671875, + "step": 1965, + "time_per_iteration": 4.866962909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120089, + "balance_loss_mlp": 1.11372375, + "epoch": 0.3782223932281647, + "flos": 1360631619072.0, + "grad_norm": 0.015556792607008025, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76542836, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.87304688, + "step": 1966, + "time_per_iteration": 4.942438364028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179172, + "balance_loss_mlp": 1.09114802, + "epoch": 0.37841477491342823, + "flos": 705515865600.0, + "grad_norm": 0.024767419651172896, + "language_loss": 0.90831983, + "learning_rate": 0.0007140503377003022, + "loss": 0.92011154, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.88183594, + "step": 1967, + "time_per_iteration": 2.9852232933044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118121, + "balance_loss_mlp": 1.09318614, + "epoch": 0.3786071565986918, + "flos": 530155614720.0, + "grad_norm": 0.02676934241732637, + "language_loss": 0.92451024, + "learning_rate": 0.000713768745708599, + "loss": 0.93632239, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.88183594, + "step": 1968, + "time_per_iteration": 2.6276321411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180899, + "balance_loss_mlp": 1.09311283, + "epoch": 0.37879953828395535, + "flos": 994900039680.0, + "grad_norm": 0.026029915049846697, + "language_loss": 0.85207623, + "learning_rate": 0.0007134870707245085, + "loss": 0.86388516, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.87939453, + "step": 1969, + "time_per_iteration": 3.2757370471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118867, + "balance_loss_mlp": 1.10074103, + "epoch": 0.37899191996921894, + "flos": 627792219648.0, + "grad_norm": 0.029282968357198087, + "language_loss": 0.91297084, + "learning_rate": 0.0007132053128573864, + "loss": 0.92485756, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.88085938, + "step": 1970, + "time_per_iteration": 2.713987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184407, + "balance_loss_mlp": 1.09633517, + "epoch": 0.37918430165448247, + "flos": 687519088128.0, + "grad_norm": 0.026716081838251738, + "language_loss": 0.91701669, + "learning_rate": 0.0007129234722166211, + "loss": 0.92886078, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.88232422, + "step": 1971, + "time_per_iteration": 2.830312728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178089, + "balance_loss_mlp": 1.09025514, + "epoch": 0.37937668333974606, + "flos": 476617901568.0, + "grad_norm": 0.023390773702336033, + "language_loss": 0.97041333, + "learning_rate": 0.0007126415489116328, + "loss": 0.98219419, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.87988281, + "step": 1972, + "time_per_iteration": 2.6577088832855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186585, + "balance_loss_mlp": 1.09903812, + "epoch": 0.37956906502500964, + "flos": 708823928832.0, + "grad_norm": 0.02822522227358307, + "language_loss": 0.89341533, + "learning_rate": 0.0007123595430518736, + "loss": 0.90528119, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.87695312, + "step": 1973, + "time_per_iteration": 2.8803040981292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187247, + "balance_loss_mlp": 1.09974778, + "epoch": 0.3797614467102732, + "flos": 427558553088.0, + "grad_norm": 0.030455517002935972, + "language_loss": 0.93240166, + "learning_rate": 0.0007120774547468282, + "loss": 0.94427419, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.87646484, + "step": 1974, + "time_per_iteration": 2.5190658569335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185963, + "balance_loss_mlp": 1.09836841, + "epoch": 0.37995382839553676, + "flos": 482880916992.0, + "grad_norm": 0.028219754054602288, + "language_loss": 0.89357984, + "learning_rate": 0.0007117952841060128, + "loss": 0.9054395, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.87744141, + "step": 1975, + "time_per_iteration": 2.6428894996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184241, + "balance_loss_mlp": 1.09631252, + "epoch": 0.3801462100808003, + "flos": 561670078464.0, + "grad_norm": 0.02907805968320273, + "language_loss": 0.90876186, + "learning_rate": 0.0007115130312389756, + "loss": 0.92060423, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.88085938, + "step": 1976, + "time_per_iteration": 2.669287919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.10066783, + "epoch": 0.3803385917660639, + "flos": 465887255040.0, + "grad_norm": 0.031138982719559682, + "language_loss": 0.88565898, + "learning_rate": 0.0007112306962552973, + "loss": 0.89754546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.88134766, + "step": 1977, + "time_per_iteration": 2.617105007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188488, + "balance_loss_mlp": 1.10055935, + "epoch": 0.3805309734513274, + "flos": 522904946688.0, + "grad_norm": 0.027881475391737562, + "language_loss": 0.92461807, + "learning_rate": 0.0007109482792645896, + "loss": 0.93650293, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.88085938, + "step": 1978, + "time_per_iteration": 2.7350404262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191644, + "balance_loss_mlp": 1.10352468, + "epoch": 0.380723355136591, + "flos": 592552728576.0, + "grad_norm": 0.03010131618310245, + "language_loss": 0.91373634, + "learning_rate": 0.0007106657803764969, + "loss": 0.92565274, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.88183594, + "step": 1979, + "time_per_iteration": 2.7113609313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188099, + "balance_loss_mlp": 1.10007489, + "epoch": 0.38091573682185453, + "flos": 623854344192.0, + "grad_norm": 0.03122566409921124, + "language_loss": 0.90192807, + "learning_rate": 0.0007103831997006948, + "loss": 0.91380906, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.88183594, + "step": 1980, + "time_per_iteration": 2.7460203170776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183293, + "balance_loss_mlp": 1.09507859, + "epoch": 0.3811081185071181, + "flos": 570175641600.0, + "grad_norm": 0.027157726640451497, + "language_loss": 0.92157245, + "learning_rate": 0.0007101005373468908, + "loss": 0.9334054, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.8828125, + "step": 1981, + "time_per_iteration": 2.869722604751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176795, + "balance_loss_mlp": 1.08891392, + "epoch": 0.3813005001923817, + "flos": 585990269952.0, + "grad_norm": 0.026054611177121254, + "language_loss": 0.92786968, + "learning_rate": 0.0007098177934248242, + "loss": 0.9396376, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.88037109, + "step": 1982, + "time_per_iteration": 2.7341668605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179814, + "balance_loss_mlp": 1.09188521, + "epoch": 0.38149288187764524, + "flos": 622810295808.0, + "grad_norm": 0.03120804506271422, + "language_loss": 0.94404829, + "learning_rate": 0.0007095349680442661, + "loss": 0.95584643, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.88085938, + "step": 1983, + "time_per_iteration": 2.845836639404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182966, + "balance_loss_mlp": 1.09522831, + "epoch": 0.3816852635629088, + "flos": 571797109248.0, + "grad_norm": 0.027372063240090748, + "language_loss": 0.86448967, + "learning_rate": 0.0007092520613150188, + "loss": 0.87631935, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.87890625, + "step": 1984, + "time_per_iteration": 2.6740176677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178711, + "balance_loss_mlp": 1.09106863, + "epoch": 0.38187764524817236, + "flos": 566678198784.0, + "grad_norm": 0.03160695384354602, + "language_loss": 0.87573516, + "learning_rate": 0.0007089690733469165, + "loss": 0.88752234, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.87792969, + "step": 1985, + "time_per_iteration": 2.717921733856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178571, + "balance_loss_mlp": 1.09073794, + "epoch": 0.38207002693343595, + "flos": 632398838784.0, + "grad_norm": 0.031031403109496963, + "language_loss": 0.90504575, + "learning_rate": 0.000708686004249825, + "loss": 0.91683149, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.87988281, + "step": 1986, + "time_per_iteration": 2.758554697036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179432, + "balance_loss_mlp": 1.09164619, + "epoch": 0.3822624086186995, + "flos": 549840989184.0, + "grad_norm": 0.025201133141653974, + "language_loss": 0.97533029, + "learning_rate": 0.0007084028541336413, + "loss": 0.98712462, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.87939453, + "step": 1987, + "time_per_iteration": 2.6981115341186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187219, + "balance_loss_mlp": 1.09909916, + "epoch": 0.38245479030396307, + "flos": 615066802176.0, + "grad_norm": 0.02853553744793089, + "language_loss": 0.9291808, + "learning_rate": 0.0007081196231082942, + "loss": 0.94105303, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.8828125, + "step": 1988, + "time_per_iteration": 2.7912278175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.09851646, + "epoch": 0.38264717198922665, + "flos": 669303458304.0, + "grad_norm": 0.029318681320032423, + "language_loss": 0.88455558, + "learning_rate": 0.0007078363112837436, + "loss": 0.89642197, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.8828125, + "step": 1989, + "time_per_iteration": 2.8133885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187352, + "balance_loss_mlp": 1.09927964, + "epoch": 0.3828395536744902, + "flos": 455686364160.0, + "grad_norm": 0.029265262626364436, + "language_loss": 0.9249233, + "learning_rate": 0.000707552918769981, + "loss": 0.93679678, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.88232422, + "step": 1990, + "time_per_iteration": 2.538587808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180802, + "balance_loss_mlp": 1.09277809, + "epoch": 0.3830319353597538, + "flos": 500482197504.0, + "grad_norm": 0.02588536582900798, + "language_loss": 0.91112638, + "learning_rate": 0.000707269445677029, + "loss": 0.92293441, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.88183594, + "step": 1991, + "time_per_iteration": 2.7578041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183391, + "balance_loss_mlp": 1.09536684, + "epoch": 0.3832243170450173, + "flos": 745466035200.0, + "grad_norm": 0.02707218781991338, + "language_loss": 0.91718936, + "learning_rate": 0.0007069858921149416, + "loss": 0.92902327, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.88183594, + "step": 1992, + "time_per_iteration": 2.948418617248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184259, + "balance_loss_mlp": 1.09613955, + "epoch": 0.3834166987302809, + "flos": 579345219072.0, + "grad_norm": 0.02587271093699699, + "language_loss": 0.92343616, + "learning_rate": 0.0007067022581938043, + "loss": 0.93527877, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.8828125, + "step": 1993, + "time_per_iteration": 2.881967782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09965289, + "epoch": 0.3836090804155444, + "flos": 537608397312.0, + "grad_norm": 0.029882536442049617, + "language_loss": 0.91833031, + "learning_rate": 0.0007064185440237334, + "loss": 0.9302085, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.88330078, + "step": 1994, + "time_per_iteration": 2.7481510639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.10189474, + "epoch": 0.383801462100808, + "flos": 603051061248.0, + "grad_norm": 0.027232179622410133, + "language_loss": 0.91516536, + "learning_rate": 0.0007061347497148764, + "loss": 0.92706549, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.8828125, + "step": 1995, + "time_per_iteration": 2.762807846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191619, + "balance_loss_mlp": 1.10321367, + "epoch": 0.38399384378607154, + "flos": 573798610944.0, + "grad_norm": 0.03191203592253993, + "language_loss": 0.9478448, + "learning_rate": 0.0007058508753774122, + "loss": 0.95976096, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.88476562, + "step": 1996, + "time_per_iteration": 2.7208473682403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185202, + "balance_loss_mlp": 1.09708297, + "epoch": 0.38418622547133513, + "flos": 537779586048.0, + "grad_norm": 0.03234926235653744, + "language_loss": 0.93760306, + "learning_rate": 0.0007055669211215505, + "loss": 0.94945514, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.8828125, + "step": 1997, + "time_per_iteration": 2.6605474948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182194, + "balance_loss_mlp": 1.09397876, + "epoch": 0.3843786071565987, + "flos": 574013460480.0, + "grad_norm": 0.03558568539094479, + "language_loss": 0.86620909, + "learning_rate": 0.0007052828870575322, + "loss": 0.87803102, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.88378906, + "step": 1998, + "time_per_iteration": 2.6478962898254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179215, + "balance_loss_mlp": 1.09100008, + "epoch": 0.38457098884186225, + "flos": 730079104512.0, + "grad_norm": 0.027610192556292087, + "language_loss": 0.94167769, + "learning_rate": 0.0007049987732956291, + "loss": 0.95346981, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.88378906, + "step": 1999, + "time_per_iteration": 2.9643850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190926, + "balance_loss_mlp": 1.10199583, + "epoch": 0.38476337052712584, + "flos": 584620581888.0, + "grad_norm": 0.023866575274933036, + "language_loss": 0.8787694, + "learning_rate": 0.0007047145799461439, + "loss": 0.89067864, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.88720703, + "step": 2000, + "time_per_iteration": 2.8542819023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191076, + "balance_loss_mlp": 1.10200322, + "epoch": 0.38495575221238937, + "flos": 554158898688.0, + "grad_norm": 0.025960095413567152, + "language_loss": 0.89154112, + "learning_rate": 0.00070443030711941, + "loss": 0.90345186, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.88867188, + "step": 2001, + "time_per_iteration": 2.770023822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189246, + "balance_loss_mlp": 1.10084057, + "epoch": 0.38514813389765296, + "flos": 655676983296.0, + "grad_norm": 0.026490656569535233, + "language_loss": 0.88696259, + "learning_rate": 0.0007041459549257924, + "loss": 0.89885509, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.88476562, + "step": 2002, + "time_per_iteration": 4.357714414596558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_mlp": 1.09392142, + "epoch": 0.3853405155829165, + "flos": 869645913600.0, + "grad_norm": 0.03138294802585753, + "language_loss": 0.86704218, + "learning_rate": 0.0007038615234756859, + "loss": 0.87886453, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.88476562, + "step": 2003, + "time_per_iteration": 3.154315233230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09135854, + "epoch": 0.3855328972681801, + "flos": 547468185600.0, + "grad_norm": 0.030993794918127784, + "language_loss": 0.91032863, + "learning_rate": 0.000703577012879517, + "loss": 0.92212439, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.88378906, + "step": 2004, + "time_per_iteration": 2.6320230960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184907, + "balance_loss_mlp": 1.09673953, + "epoch": 0.3857252789534436, + "flos": 535098607104.0, + "grad_norm": 0.029525133384240967, + "language_loss": 0.9687134, + "learning_rate": 0.0007032924232477423, + "loss": 0.98056245, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.88330078, + "step": 2005, + "time_per_iteration": 2.650982618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184324, + "balance_loss_mlp": 1.09630013, + "epoch": 0.3859176606387072, + "flos": 492766901760.0, + "grad_norm": 0.029334702789067958, + "language_loss": 0.8823278, + "learning_rate": 0.0007030077546908493, + "loss": 0.89417106, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.88183594, + "step": 2006, + "time_per_iteration": 2.642333745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203979, + "balance_loss_mlp": 1.11700439, + "epoch": 0.3861100423239708, + "flos": 1490155991040.0, + "grad_norm": 0.02217822259323008, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84268641, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.87109375, + "step": 2007, + "time_per_iteration": 4.759521961212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184336, + "balance_loss_mlp": 1.09635913, + "epoch": 0.3863024240092343, + "flos": 474692261376.0, + "grad_norm": 0.030825589148035897, + "language_loss": 0.87378025, + "learning_rate": 0.0007024381812438117, + "loss": 0.88562357, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.88134766, + "step": 2008, + "time_per_iteration": 2.5227372646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184691, + "balance_loss_mlp": 1.09728634, + "epoch": 0.3864948056944979, + "flos": 717978769920.0, + "grad_norm": 0.032935981886219476, + "language_loss": 0.91112518, + "learning_rate": 0.0007021532765747951, + "loss": 0.92297208, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.87548828, + "step": 2009, + "time_per_iteration": 2.963550567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182737, + "balance_loss_mlp": 1.0952853, + "epoch": 0.38668718737976143, + "flos": 728954465280.0, + "grad_norm": 0.030267959416106823, + "language_loss": 0.86631739, + "learning_rate": 0.0007018682934229162, + "loss": 0.87814474, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.87597656, + "step": 2010, + "time_per_iteration": 2.955132246017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179617, + "balance_loss_mlp": 1.09235525, + "epoch": 0.386879569065025, + "flos": 526488984576.0, + "grad_norm": 0.02588052645359636, + "language_loss": 0.89375025, + "learning_rate": 0.0007015832318988152, + "loss": 0.90554643, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.87402344, + "step": 2011, + "time_per_iteration": 2.612443208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117942, + "balance_loss_mlp": 1.09454346, + "epoch": 0.38707195075028855, + "flos": 1530724512768.0, + "grad_norm": 0.010241364382771095, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.75069499, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.84960938, + "step": 2012, + "time_per_iteration": 4.952507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187813, + "balance_loss_mlp": 1.10040927, + "epoch": 0.38726433243555214, + "flos": 558385483776.0, + "grad_norm": 0.026729103388188073, + "language_loss": 0.89776802, + "learning_rate": 0.0007010128741766604, + "loss": 0.90964615, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.87548828, + "step": 2013, + "time_per_iteration": 2.759916067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184734, + "balance_loss_mlp": 1.09756815, + "epoch": 0.38745671412081567, + "flos": 554755783680.0, + "grad_norm": 0.0314384592840016, + "language_loss": 0.91517645, + "learning_rate": 0.0007007275782000391, + "loss": 0.92702377, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.87304688, + "step": 2014, + "time_per_iteration": 2.6659133434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181864, + "balance_loss_mlp": 1.09469819, + "epoch": 0.38764909580607926, + "flos": 459344262144.0, + "grad_norm": 0.028810992523736655, + "language_loss": 0.92611015, + "learning_rate": 0.0007004422042940605, + "loss": 0.9379288, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.87304688, + "step": 2015, + "time_per_iteration": 2.4901411533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180932, + "balance_loss_mlp": 1.09376657, + "epoch": 0.38784147749134285, + "flos": 523258784256.0, + "grad_norm": 0.030339968140386194, + "language_loss": 0.98432136, + "learning_rate": 0.0007001567525695169, + "loss": 0.99613065, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.87304688, + "step": 2016, + "time_per_iteration": 2.605134963989258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182969, + "balance_loss_mlp": 1.09575546, + "epoch": 0.3880338591766064, + "flos": 667400011776.0, + "grad_norm": 0.023304348995526428, + "language_loss": 0.90603948, + "learning_rate": 0.0006998712231372303, + "loss": 0.91786909, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.87353516, + "step": 2017, + "time_per_iteration": 2.9866511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187647, + "balance_loss_mlp": 1.10024321, + "epoch": 0.38822624086186996, + "flos": 595175310336.0, + "grad_norm": 0.027834044235160192, + "language_loss": 0.92810535, + "learning_rate": 0.0006995856161080532, + "loss": 0.93998176, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.87548828, + "step": 2018, + "time_per_iteration": 2.8917806148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181908, + "balance_loss_mlp": 1.09426534, + "epoch": 0.3884186225471335, + "flos": 613681651200.0, + "grad_norm": 0.030912624722110756, + "language_loss": 0.90135586, + "learning_rate": 0.0006992999315928679, + "loss": 0.91317499, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.87792969, + "step": 2019, + "time_per_iteration": 2.821570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179846, + "balance_loss_mlp": 1.0924896, + "epoch": 0.3886110042323971, + "flos": 608243831808.0, + "grad_norm": 0.025167723735071885, + "language_loss": 0.91748118, + "learning_rate": 0.0006990141697025871, + "loss": 0.92927969, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.875, + "step": 2020, + "time_per_iteration": 2.774073600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181915, + "balance_loss_mlp": 1.09684753, + "epoch": 0.3888033859176606, + "flos": 1531193869824.0, + "grad_norm": 0.011544022481713089, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77541554, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.8515625, + "step": 2021, + "time_per_iteration": 4.741650581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174887, + "balance_loss_mlp": 1.08734, + "epoch": 0.3889957676029242, + "flos": 693671313408.0, + "grad_norm": 0.03334226176751645, + "language_loss": 0.90383756, + "learning_rate": 0.0006984424142405392, + "loss": 0.91558647, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.87695312, + "step": 2022, + "time_per_iteration": 2.839838981628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174992, + "balance_loss_mlp": 1.08734977, + "epoch": 0.3891881492881878, + "flos": 516194767872.0, + "grad_norm": 0.031660307701904165, + "language_loss": 0.90829813, + "learning_rate": 0.0006981564208907474, + "loss": 0.92004812, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.87792969, + "step": 2023, + "time_per_iteration": 2.6160523891448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179623, + "balance_loss_mlp": 1.09178972, + "epoch": 0.3893805309734513, + "flos": 630175756800.0, + "grad_norm": 0.02822603249283798, + "language_loss": 0.96692258, + "learning_rate": 0.0006978703506098102, + "loss": 0.97871882, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.87988281, + "step": 2024, + "time_per_iteration": 2.770775556564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177682, + "balance_loss_mlp": 1.08994389, + "epoch": 0.3895729126587149, + "flos": 545206172160.0, + "grad_norm": 0.026225366557941037, + "language_loss": 0.95314252, + "learning_rate": 0.00069758420350879, + "loss": 0.96491939, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.87890625, + "step": 2025, + "time_per_iteration": 2.615687608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179844, + "balance_loss_mlp": 1.09201062, + "epoch": 0.38976529434397844, + "flos": 619406178816.0, + "grad_norm": 0.03181269468531491, + "language_loss": 0.9379099, + "learning_rate": 0.000697297979698779, + "loss": 0.94970834, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.87988281, + "step": 2026, + "time_per_iteration": 2.723860740661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187768, + "balance_loss_mlp": 1.10007727, + "epoch": 0.38995767602924203, + "flos": 836344797696.0, + "grad_norm": 0.025703512313876988, + "language_loss": 0.89683533, + "learning_rate": 0.0006970116792908992, + "loss": 0.90871298, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.87841797, + "step": 2027, + "time_per_iteration": 3.0871434211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117977, + "balance_loss_mlp": 1.09203207, + "epoch": 0.39015005771450556, + "flos": 542646716928.0, + "grad_norm": 0.03022946762166595, + "language_loss": 0.88945854, + "learning_rate": 0.000696725302396302, + "loss": 0.9012562, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.87890625, + "step": 2028, + "time_per_iteration": 2.632178783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174959, + "balance_loss_mlp": 1.0871253, + "epoch": 0.39034243939976915, + "flos": 1009140864000.0, + "grad_norm": 0.026055335602768993, + "language_loss": 0.92111158, + "learning_rate": 0.0006964388491261692, + "loss": 0.93286121, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.87988281, + "step": 2029, + "time_per_iteration": 3.2683680057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174119, + "balance_loss_mlp": 1.08633304, + "epoch": 0.3905348210850327, + "flos": 680240222208.0, + "grad_norm": 0.029787695509808892, + "language_loss": 0.96251416, + "learning_rate": 0.0006961523195917114, + "loss": 0.97425532, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.87939453, + "step": 2030, + "time_per_iteration": 2.807161331176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182527, + "balance_loss_mlp": 1.09459865, + "epoch": 0.39072720277029627, + "flos": 549988709376.0, + "grad_norm": 0.03099080969443711, + "language_loss": 0.86433041, + "learning_rate": 0.0006958657139041696, + "loss": 0.87615567, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.88085938, + "step": 2031, + "time_per_iteration": 2.728208065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119693, + "balance_loss_mlp": 1.11052704, + "epoch": 0.39091958445555985, + "flos": 1551051159552.0, + "grad_norm": 0.01789751173127641, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77909899, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.86523438, + "step": 2032, + "time_per_iteration": 4.911708354949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09193051, + "epoch": 0.3911119661408234, + "flos": 505051886592.0, + "grad_norm": 0.03095157096826047, + "language_loss": 0.85940099, + "learning_rate": 0.0006952922745149434, + "loss": 0.87119675, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.87792969, + "step": 2033, + "time_per_iteration": 2.649538040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_mlp": 1.08903146, + "epoch": 0.391304347826087, + "flos": 558329088000.0, + "grad_norm": 0.028319463440814277, + "language_loss": 0.94666743, + "learning_rate": 0.000695005441035888, + "loss": 0.95843232, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.87597656, + "step": 2034, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178574, + "balance_loss_mlp": 1.09293365, + "epoch": 0.3914967295113505, + "flos": 1502941807104.0, + "grad_norm": 0.0063133772361172544, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7490201, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.85742188, + "step": 2035, + "time_per_iteration": 4.863725423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180506, + "balance_loss_mlp": 1.09338748, + "epoch": 0.3916891111966141, + "flos": 708329101824.0, + "grad_norm": 0.025753563122139746, + "language_loss": 0.86980474, + "learning_rate": 0.0006944315470656863, + "loss": 0.88160974, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.87255859, + "step": 2036, + "time_per_iteration": 2.936588764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188418, + "balance_loss_mlp": 1.10110939, + "epoch": 0.3918814928818776, + "flos": 557408564736.0, + "grad_norm": 0.031943380680049066, + "language_loss": 0.99613088, + "learning_rate": 0.000694144486797345, + "loss": 1.00801504, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.87451172, + "step": 2037, + "time_per_iteration": 2.676107883453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193756, + "balance_loss_mlp": 1.10868835, + "epoch": 0.3920738745671412, + "flos": 1541685471744.0, + "grad_norm": 0.012882287356254449, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8071419, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.8515625, + "step": 2038, + "time_per_iteration": 4.63246750831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178826, + "balance_loss_mlp": 1.0916127, + "epoch": 0.39226625625240474, + "flos": 499804721664.0, + "grad_norm": 0.027391930017631044, + "language_loss": 0.96627682, + "learning_rate": 0.0006935701402514156, + "loss": 0.97806513, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.87353516, + "step": 2039, + "time_per_iteration": 2.5613086223602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177521, + "balance_loss_mlp": 1.092453, + "epoch": 0.39245863793766833, + "flos": 1350450920448.0, + "grad_norm": 0.011737641894846437, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74212414, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.8515625, + "step": 2040, + "time_per_iteration": 4.902123689651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176176, + "balance_loss_mlp": 1.08881962, + "epoch": 0.3926510196229319, + "flos": 1348114142208.0, + "grad_norm": 0.028665962134257456, + "language_loss": 0.92107272, + "learning_rate": 0.0006929954931031422, + "loss": 0.93283451, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.875, + "step": 2041, + "time_per_iteration": 3.7387020587921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.08902013, + "epoch": 0.39284340130819545, + "flos": 500603721216.0, + "grad_norm": 0.024641039111334598, + "language_loss": 0.95021844, + "learning_rate": 0.0006927080570819805, + "loss": 0.96198076, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.87353516, + "step": 2042, + "time_per_iteration": 2.5837514400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117531, + "balance_loss_mlp": 1.08814418, + "epoch": 0.39303578299345904, + "flos": 521341876224.0, + "grad_norm": 0.03605238478740547, + "language_loss": 0.89998531, + "learning_rate": 0.0006924205462449161, + "loss": 0.9117384, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.87304688, + "step": 2043, + "time_per_iteration": 2.560842514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.08664155, + "epoch": 0.39322816467872257, + "flos": 909537686016.0, + "grad_norm": 0.029197625514705252, + "language_loss": 0.89668262, + "learning_rate": 0.0006921329607035702, + "loss": 0.90841925, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.87158203, + "step": 2044, + "time_per_iteration": 3.2215418815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185916, + "balance_loss_mlp": 1.09860718, + "epoch": 0.39342054636398616, + "flos": 518641431552.0, + "grad_norm": 0.026194219642157263, + "language_loss": 0.94294739, + "learning_rate": 0.0006918453005695938, + "loss": 0.95480657, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.87451172, + "step": 2045, + "time_per_iteration": 2.637197732925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183114, + "balance_loss_mlp": 1.09594774, + "epoch": 0.3936129280492497, + "flos": 549011790336.0, + "grad_norm": 0.026944227420126074, + "language_loss": 0.91576457, + "learning_rate": 0.0006915575659546662, + "loss": 0.92759573, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.87304688, + "step": 2046, + "time_per_iteration": 2.7570858001708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185485, + "balance_loss_mlp": 1.098176, + "epoch": 0.3938053097345133, + "flos": 527140263936.0, + "grad_norm": 0.02948359624940754, + "language_loss": 0.88347399, + "learning_rate": 0.0006912697569704959, + "loss": 0.89532876, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.87451172, + "step": 2047, + "time_per_iteration": 2.635467290878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09899104, + "epoch": 0.39399769141977686, + "flos": 472588701696.0, + "grad_norm": 0.02995196024762557, + "language_loss": 0.93503523, + "learning_rate": 0.0006909818737288205, + "loss": 0.94689775, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.87402344, + "step": 2048, + "time_per_iteration": 2.558013916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181668, + "balance_loss_mlp": 1.09488404, + "epoch": 0.3941900731050404, + "flos": 502726746624.0, + "grad_norm": 0.02878603575662113, + "language_loss": 0.88763595, + "learning_rate": 0.000690693916341406, + "loss": 0.89945263, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.86914062, + "step": 2049, + "time_per_iteration": 2.5820720195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178505, + "balance_loss_mlp": 1.09152949, + "epoch": 0.394382454790304, + "flos": 582006732288.0, + "grad_norm": 0.024885306311727563, + "language_loss": 0.90003175, + "learning_rate": 0.0006904058849200475, + "loss": 0.91181684, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.87109375, + "step": 2050, + "time_per_iteration": 2.7304697036743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118427, + "balance_loss_mlp": 1.09700906, + "epoch": 0.3945748364755675, + "flos": 514844545536.0, + "grad_norm": 0.02745844528377672, + "language_loss": 0.91741204, + "learning_rate": 0.0006901177795765683, + "loss": 0.92925465, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.87402344, + "step": 2051, + "time_per_iteration": 2.610621213912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180664, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3947672181608311, + "flos": 595057789440.0, + "grad_norm": 0.03028158635704326, + "language_loss": 0.89240891, + "learning_rate": 0.0006898296004228213, + "loss": 0.90421557, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.87109375, + "step": 2052, + "time_per_iteration": 2.747377395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119046, + "balance_loss_mlp": 1.10634613, + "epoch": 0.39495959984609463, + "flos": 1551049158144.0, + "grad_norm": 0.018267218432335405, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.793172, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.84179688, + "step": 2053, + "time_per_iteration": 4.871596336364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117553, + "balance_loss_mlp": 1.08845937, + "epoch": 0.3951519815313582, + "flos": 497523242496.0, + "grad_norm": 0.028876315996474663, + "language_loss": 0.87133646, + "learning_rate": 0.0006892530211320763, + "loss": 0.88309175, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.87207031, + "step": 2054, + "time_per_iteration": 2.696796417236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117541, + "balance_loss_mlp": 1.08824456, + "epoch": 0.39534436321662175, + "flos": 532222244352.0, + "grad_norm": 0.031248767008087052, + "language_loss": 0.9121244, + "learning_rate": 0.000688964621218926, + "loss": 0.92387855, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.87304688, + "step": 2055, + "time_per_iteration": 2.6398446559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176401, + "balance_loss_mlp": 1.08899677, + "epoch": 0.39553674490188534, + "flos": 703724484096.0, + "grad_norm": 0.031024749515969993, + "language_loss": 0.88066703, + "learning_rate": 0.0006886761479432037, + "loss": 0.89243108, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.87548828, + "step": 2056, + "time_per_iteration": 2.896899700164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184707, + "balance_loss_mlp": 1.09720743, + "epoch": 0.3957291265871489, + "flos": 410656215552.0, + "grad_norm": 0.031805347037857014, + "language_loss": 0.92354834, + "learning_rate": 0.0006883876014169045, + "loss": 0.93539548, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.87646484, + "step": 2057, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118858, + "balance_loss_mlp": 1.10108006, + "epoch": 0.39592150827241246, + "flos": 619638492672.0, + "grad_norm": 0.03245947566344542, + "language_loss": 0.97519982, + "learning_rate": 0.000688098981752052, + "loss": 0.98708564, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.87646484, + "step": 2058, + "time_per_iteration": 2.7079999446868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183973, + "balance_loss_mlp": 1.09642518, + "epoch": 0.39611388995767605, + "flos": 822720324096.0, + "grad_norm": 0.029593298786174956, + "language_loss": 0.88381338, + "learning_rate": 0.0006878102890606982, + "loss": 0.89565313, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.87695312, + "step": 2059, + "time_per_iteration": 3.089268922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182646, + "balance_loss_mlp": 1.09524131, + "epoch": 0.3963062716429396, + "flos": 493214065152.0, + "grad_norm": 0.03350279358204369, + "language_loss": 0.88991904, + "learning_rate": 0.0006875215234549239, + "loss": 0.9017455, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.87548828, + "step": 2060, + "time_per_iteration": 2.538806200027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182648, + "balance_loss_mlp": 1.09533882, + "epoch": 0.39649865332820317, + "flos": 585833817600.0, + "grad_norm": 0.030947291001002426, + "language_loss": 0.93147129, + "learning_rate": 0.0006872326850468376, + "loss": 0.9432978, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.87451172, + "step": 2061, + "time_per_iteration": 2.6593003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179357, + "balance_loss_mlp": 1.09214342, + "epoch": 0.3966910350134667, + "flos": 459511448064.0, + "grad_norm": 0.03264577108022065, + "language_loss": 0.89072591, + "learning_rate": 0.0006869437739485762, + "loss": 0.90251946, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.87353516, + "step": 2062, + "time_per_iteration": 2.605191230773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180604, + "balance_loss_mlp": 1.0932951, + "epoch": 0.3968834166987303, + "flos": 509614844928.0, + "grad_norm": 0.02743430972643364, + "language_loss": 0.9889155, + "learning_rate": 0.0006866547902723053, + "loss": 1.00072145, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.87451172, + "step": 2063, + "time_per_iteration": 2.6466383934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178614, + "balance_loss_mlp": 1.09116209, + "epoch": 0.3970757983839938, + "flos": 573742215168.0, + "grad_norm": 0.030016333454088624, + "language_loss": 0.87640852, + "learning_rate": 0.000686365734130218, + "loss": 0.88819462, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.87597656, + "step": 2064, + "time_per_iteration": 2.6795899868011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178875, + "balance_loss_mlp": 1.09161353, + "epoch": 0.3972681800692574, + "flos": 482585476608.0, + "grad_norm": 0.03115409384976, + "language_loss": 0.90479839, + "learning_rate": 0.000686076605634536, + "loss": 0.91658711, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.87402344, + "step": 2065, + "time_per_iteration": 2.6956639289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176026, + "balance_loss_mlp": 1.0887177, + "epoch": 0.397460561754521, + "flos": 488904887808.0, + "grad_norm": 0.028660372999824147, + "language_loss": 0.91924292, + "learning_rate": 0.0006857874048975088, + "loss": 0.93100321, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.87451172, + "step": 2066, + "time_per_iteration": 2.541707992553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182319, + "balance_loss_mlp": 1.09515274, + "epoch": 0.3976529434397845, + "flos": 422895538176.0, + "grad_norm": 0.03007540042591745, + "language_loss": 0.93814421, + "learning_rate": 0.0006854981320314142, + "loss": 0.94996738, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.87304688, + "step": 2067, + "time_per_iteration": 2.455916166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118284, + "balance_loss_mlp": 1.09586513, + "epoch": 0.3978453251250481, + "flos": 546621522432.0, + "grad_norm": 0.0330596148196893, + "language_loss": 0.94973123, + "learning_rate": 0.0006852087871485579, + "loss": 0.96155965, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.87109375, + "step": 2068, + "time_per_iteration": 2.609492063522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175372, + "balance_loss_mlp": 1.08801544, + "epoch": 0.39803770681031164, + "flos": 652001620992.0, + "grad_norm": 0.0336676185790188, + "language_loss": 0.8912071, + "learning_rate": 0.0006849193703612735, + "loss": 0.90296078, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.875, + "step": 2069, + "time_per_iteration": 2.816309690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.09071827, + "epoch": 0.39823008849557523, + "flos": 741426101760.0, + "grad_norm": 0.026625397702565265, + "language_loss": 0.84925234, + "learning_rate": 0.0006846298817819225, + "loss": 0.86102879, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.87060547, + "step": 2070, + "time_per_iteration": 2.9875504970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175555, + "balance_loss_mlp": 1.088485, + "epoch": 0.39842247018083876, + "flos": 385888860672.0, + "grad_norm": 0.03226539532166374, + "language_loss": 0.89664173, + "learning_rate": 0.0006843403215228945, + "loss": 0.90839732, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.87207031, + "step": 2071, + "time_per_iteration": 2.4326088428497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173604, + "balance_loss_mlp": 1.08648539, + "epoch": 0.39861485186610235, + "flos": 534762233856.0, + "grad_norm": 0.028550920618746804, + "language_loss": 0.88238078, + "learning_rate": 0.0006840506896966065, + "loss": 0.89411676, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.87255859, + "step": 2072, + "time_per_iteration": 2.6961326599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_mlp": 1.09084272, + "epoch": 0.39880723355136594, + "flos": 644412578304.0, + "grad_norm": 0.03366874484709253, + "language_loss": 0.90951228, + "learning_rate": 0.0006837609864155038, + "loss": 0.9212895, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.87011719, + "step": 2073, + "time_per_iteration": 2.8584561347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119321, + "balance_loss_mlp": 1.10623515, + "epoch": 0.39899961523662947, + "flos": 516891709440.0, + "grad_norm": 0.031985803275243696, + "language_loss": 0.90341693, + "learning_rate": 0.0006834712117920592, + "loss": 0.91534901, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.87109375, + "step": 2074, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186501, + "balance_loss_mlp": 1.09933496, + "epoch": 0.39919199692189306, + "flos": 465338033664.0, + "grad_norm": 0.0320663192521817, + "language_loss": 0.92968071, + "learning_rate": 0.0006831813659387729, + "loss": 0.94154572, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.87304688, + "step": 2075, + "time_per_iteration": 2.5216238498687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184926, + "balance_loss_mlp": 1.09785569, + "epoch": 0.3993843786071566, + "flos": 532678139904.0, + "grad_norm": 0.03441409861038799, + "language_loss": 0.91210699, + "learning_rate": 0.0006828914489681733, + "loss": 0.92395616, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.87207031, + "step": 2076, + "time_per_iteration": 2.686810255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186966, + "balance_loss_mlp": 1.10008633, + "epoch": 0.3995767602924202, + "flos": 505023688704.0, + "grad_norm": 0.02837279486305722, + "language_loss": 0.91445708, + "learning_rate": 0.0006826014609928162, + "loss": 0.92632675, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.87011719, + "step": 2077, + "time_per_iteration": 2.6775381565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225517, + "balance_loss_mlp": 1.13892365, + "epoch": 0.3997691419776837, + "flos": 1457471225856.0, + "grad_norm": 0.023004253676312834, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84424907, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.8671875, + "step": 2078, + "time_per_iteration": 4.87092661857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117794, + "balance_loss_mlp": 1.09134626, + "epoch": 0.3999615236629473, + "flos": 531755615232.0, + "grad_norm": 0.028989200184594895, + "language_loss": 0.86860782, + "learning_rate": 0.0006820212724781896, + "loss": 0.88038719, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.8671875, + "step": 2079, + "time_per_iteration": 2.6908116340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176834, + "balance_loss_mlp": 1.09033561, + "epoch": 0.4001539053482108, + "flos": 696361024512.0, + "grad_norm": 0.02837619494351951, + "language_loss": 0.90808308, + "learning_rate": 0.0006817310721641694, + "loss": 0.91985142, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.86621094, + "step": 2080, + "time_per_iteration": 2.8117949962615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190878, + "balance_loss_mlp": 1.10437989, + "epoch": 0.4003462870334744, + "flos": 521378806272.0, + "grad_norm": 0.0346474179870518, + "language_loss": 0.91806537, + "learning_rate": 0.00068144080129589, + "loss": 0.9299742, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.86621094, + "step": 2081, + "time_per_iteration": 2.596397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190824, + "balance_loss_mlp": 1.10422993, + "epoch": 0.400538668718738, + "flos": 493502774784.0, + "grad_norm": 0.03225854359639043, + "language_loss": 0.90241659, + "learning_rate": 0.0006811504599860441, + "loss": 0.91432476, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.8671875, + "step": 2082, + "time_per_iteration": 2.5100014209747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187111, + "balance_loss_mlp": 1.10075557, + "epoch": 0.40073105040400153, + "flos": 491451608064.0, + "grad_norm": 0.02371927790759806, + "language_loss": 0.91368544, + "learning_rate": 0.0006808600483473526, + "loss": 0.92555654, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.86474609, + "step": 2083, + "time_per_iteration": 2.9103221893310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178586, + "balance_loss_mlp": 1.0923264, + "epoch": 0.4009234320892651, + "flos": 563539322880.0, + "grad_norm": 0.025152017879447597, + "language_loss": 0.9285866, + "learning_rate": 0.0006805695664925629, + "loss": 0.94037247, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.86376953, + "step": 2084, + "time_per_iteration": 2.804859161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170802, + "balance_loss_mlp": 1.08444667, + "epoch": 0.40111581377452865, + "flos": 426852879360.0, + "grad_norm": 0.029415551527707178, + "language_loss": 0.90934992, + "learning_rate": 0.0006802790145344506, + "loss": 0.92105794, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.86474609, + "step": 2085, + "time_per_iteration": 2.476952075958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117314, + "balance_loss_mlp": 1.0870235, + "epoch": 0.40130819545979224, + "flos": 613642719744.0, + "grad_norm": 0.028611036161279673, + "language_loss": 0.93620002, + "learning_rate": 0.0006799883925858176, + "loss": 0.94793141, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.86230469, + "step": 2086, + "time_per_iteration": 2.8800101280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.10112, + "epoch": 0.40150057714505577, + "flos": 524450552832.0, + "grad_norm": 0.02956813955479834, + "language_loss": 0.92602348, + "learning_rate": 0.0006796977007594933, + "loss": 0.93789732, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.86376953, + "step": 2087, + "time_per_iteration": 2.6013576984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191969, + "balance_loss_mlp": 1.10537529, + "epoch": 0.40169295883031936, + "flos": 562553671680.0, + "grad_norm": 0.03319927890150985, + "language_loss": 0.92797327, + "learning_rate": 0.0006794069391683345, + "loss": 0.93989295, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.8671875, + "step": 2088, + "time_per_iteration": 2.7359838485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177019, + "balance_loss_mlp": 1.09095037, + "epoch": 0.4018853405155829, + "flos": 520019851776.0, + "grad_norm": 0.03157379152927814, + "language_loss": 0.87612534, + "learning_rate": 0.0006791161079252248, + "loss": 0.88789552, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.86181641, + "step": 2089, + "time_per_iteration": 2.596851348876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118277, + "balance_loss_mlp": 1.09655797, + "epoch": 0.4020777222008465, + "flos": 527287984128.0, + "grad_norm": 0.02654740933555753, + "language_loss": 0.89437628, + "learning_rate": 0.0006788252071430747, + "loss": 0.90620387, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.86328125, + "step": 2090, + "time_per_iteration": 2.8311312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184846, + "balance_loss_mlp": 1.09853876, + "epoch": 0.40227010388611006, + "flos": 526840820736.0, + "grad_norm": 0.026844852664274194, + "language_loss": 0.92195117, + "learning_rate": 0.0006785342369348222, + "loss": 0.93379962, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.86425781, + "step": 2091, + "time_per_iteration": 2.7458736896514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191242, + "balance_loss_mlp": 1.10488725, + "epoch": 0.4024624855713736, + "flos": 433226684928.0, + "grad_norm": 0.031284534475277, + "language_loss": 0.86698365, + "learning_rate": 0.0006782431974134316, + "loss": 0.87889606, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.86474609, + "step": 2092, + "time_per_iteration": 2.607151985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176082, + "balance_loss_mlp": 1.08996522, + "epoch": 0.4026548672566372, + "flos": 768090898944.0, + "grad_norm": 0.02657615147076362, + "language_loss": 0.96284211, + "learning_rate": 0.0006779520886918949, + "loss": 0.97460294, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.86230469, + "step": 2093, + "time_per_iteration": 3.03474760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173331, + "balance_loss_mlp": 1.08711922, + "epoch": 0.4028472489419007, + "flos": 644117137920.0, + "grad_norm": 0.02625373299959776, + "language_loss": 0.87827718, + "learning_rate": 0.0006776609108832301, + "loss": 0.89001048, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.86328125, + "step": 2094, + "time_per_iteration": 2.7667970657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171496, + "balance_loss_mlp": 1.08537877, + "epoch": 0.4030396306271643, + "flos": 492823297536.0, + "grad_norm": 0.02676539061642846, + "language_loss": 0.91710174, + "learning_rate": 0.0006773696641004828, + "loss": 0.92881668, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.86230469, + "step": 2095, + "time_per_iteration": 2.6013715267181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177786, + "balance_loss_mlp": 1.09119189, + "epoch": 0.40323201231242783, + "flos": 903194079744.0, + "grad_norm": 0.03019422222161545, + "language_loss": 0.84170926, + "learning_rate": 0.0006770783484567247, + "loss": 0.85348713, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.8671875, + "step": 2096, + "time_per_iteration": 3.1032629013061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180554, + "balance_loss_mlp": 1.09405565, + "epoch": 0.4034243939976914, + "flos": 571729979904.0, + "grad_norm": 0.026575026001379017, + "language_loss": 0.91571426, + "learning_rate": 0.000676786964065055, + "loss": 0.9275198, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.86621094, + "step": 2097, + "time_per_iteration": 2.8030343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179089, + "balance_loss_mlp": 1.09254348, + "epoch": 0.403616775682955, + "flos": 508460006400.0, + "grad_norm": 0.029415731928054877, + "language_loss": 0.85702783, + "learning_rate": 0.0006764955110385986, + "loss": 0.86881876, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.86669922, + "step": 2098, + "time_per_iteration": 2.7224180698394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175119, + "balance_loss_mlp": 1.08857322, + "epoch": 0.40380915736821854, + "flos": 520410619392.0, + "grad_norm": 0.02850929110585318, + "language_loss": 0.87608683, + "learning_rate": 0.0006762039894905083, + "loss": 0.88783801, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.86669922, + "step": 2099, + "time_per_iteration": 2.5972354412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08313072, + "epoch": 0.40400153905348213, + "flos": 442887086592.0, + "grad_norm": 0.05130464738927161, + "language_loss": 0.88512945, + "learning_rate": 0.000675912399533962, + "loss": 0.89682674, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.8671875, + "step": 2100, + "time_per_iteration": 2.502772808074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168649, + "balance_loss_mlp": 1.08210301, + "epoch": 0.40419392073874566, + "flos": 773704636416.0, + "grad_norm": 0.02210637201548751, + "language_loss": 0.90372586, + "learning_rate": 0.0006756207412821656, + "loss": 0.91541237, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.86669922, + "step": 2101, + "time_per_iteration": 2.991191864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169884, + "balance_loss_mlp": 1.08319497, + "epoch": 0.40438630242400925, + "flos": 767988840960.0, + "grad_norm": 0.03154624750871164, + "language_loss": 0.88513219, + "learning_rate": 0.0006753290148483505, + "loss": 0.89683104, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.86816406, + "step": 2102, + "time_per_iteration": 3.005350112915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166151, + "balance_loss_mlp": 1.07950926, + "epoch": 0.4045786841092728, + "flos": 416128963584.0, + "grad_norm": 0.026413403572192035, + "language_loss": 0.86387646, + "learning_rate": 0.0006750372203457752, + "loss": 0.87553799, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.86767578, + "step": 2103, + "time_per_iteration": 2.4381816387176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168631, + "balance_loss_mlp": 1.08203721, + "epoch": 0.40477106579453637, + "flos": 540308841984.0, + "grad_norm": 0.025857351914300337, + "language_loss": 0.93101668, + "learning_rate": 0.0006747453578877242, + "loss": 0.94270301, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.8671875, + "step": 2104, + "time_per_iteration": 2.7268197536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169336, + "balance_loss_mlp": 1.08269489, + "epoch": 0.4049634474797999, + "flos": 828091014144.0, + "grad_norm": 0.03225143111931073, + "language_loss": 0.91022515, + "learning_rate": 0.0006744534275875085, + "loss": 0.92191851, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.86767578, + "step": 2105, + "time_per_iteration": 3.0087900161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176017, + "balance_loss_mlp": 1.08970928, + "epoch": 0.4051558291650635, + "flos": 573752948736.0, + "grad_norm": 0.02821186929772288, + "language_loss": 0.92500931, + "learning_rate": 0.0006741614295584657, + "loss": 0.93676949, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.86425781, + "step": 2106, + "time_per_iteration": 2.666135787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183174, + "balance_loss_mlp": 1.09691453, + "epoch": 0.4053482108503271, + "flos": 733244176896.0, + "grad_norm": 0.04647201706044112, + "language_loss": 0.85025966, + "learning_rate": 0.0006738693639139595, + "loss": 0.86209136, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.86376953, + "step": 2107, + "time_per_iteration": 2.9633677005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177787, + "balance_loss_mlp": 1.09100294, + "epoch": 0.4055405925355906, + "flos": 1214949336576.0, + "grad_norm": 0.0302025425082437, + "language_loss": 0.85097325, + "learning_rate": 0.0006735772307673796, + "loss": 0.86275113, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.86914062, + "step": 2108, + "time_per_iteration": 3.5333871841430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177556, + "balance_loss_mlp": 1.09105742, + "epoch": 0.4057329742208542, + "flos": 717107911680.0, + "grad_norm": 0.026166055652869804, + "language_loss": 0.8899157, + "learning_rate": 0.0006732850302321421, + "loss": 0.90169132, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.86621094, + "step": 2109, + "time_per_iteration": 2.8610079288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170548, + "balance_loss_mlp": 1.0842886, + "epoch": 0.4059253559061177, + "flos": 565953059328.0, + "grad_norm": 0.026405563608612303, + "language_loss": 0.90377712, + "learning_rate": 0.00067299276242169, + "loss": 0.91548264, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.86376953, + "step": 2110, + "time_per_iteration": 2.709127426147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197311, + "balance_loss_mlp": 1.11319733, + "epoch": 0.4061177375913813, + "flos": 1597186481664.0, + "grad_norm": 0.02594110918583908, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75579476, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.84179688, + "step": 2111, + "time_per_iteration": 4.906593322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117304, + "balance_loss_mlp": 1.08654153, + "epoch": 0.40631011927664484, + "flos": 616621140480.0, + "grad_norm": 0.028870166263774127, + "language_loss": 0.85570323, + "learning_rate": 0.0006724080254290395, + "loss": 0.86743361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.86621094, + "step": 2112, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168033, + "balance_loss_mlp": 1.08134389, + "epoch": 0.40650250096190843, + "flos": 558748053504.0, + "grad_norm": 0.030551496532206422, + "language_loss": 0.96733952, + "learning_rate": 0.0006721155564738566, + "loss": 0.97901982, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.86816406, + "step": 2113, + "time_per_iteration": 2.6917896270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174904, + "balance_loss_mlp": 1.08964539, + "epoch": 0.40669488264717196, + "flos": 1583542542336.0, + "grad_norm": 0.010618058744132962, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79797542, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.85351562, + "step": 2114, + "time_per_iteration": 4.959328651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.08476496, + "epoch": 0.40688726433243555, + "flos": 508655390208.0, + "grad_norm": 0.033503716654157654, + "language_loss": 0.93188733, + "learning_rate": 0.0006715304182135078, + "loss": 0.9436028, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.86914062, + "step": 2115, + "time_per_iteration": 2.6056840419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172073, + "balance_loss_mlp": 1.08528888, + "epoch": 0.40707964601769914, + "flos": 590351840256.0, + "grad_norm": 0.028307470802153102, + "language_loss": 0.95287716, + "learning_rate": 0.0006712377491355127, + "loss": 0.96459788, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.86914062, + "step": 2116, + "time_per_iteration": 2.8985562324523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177825, + "balance_loss_mlp": 1.09146965, + "epoch": 0.40727202770296267, + "flos": 581650893312.0, + "grad_norm": 0.026081347286493965, + "language_loss": 0.86969304, + "learning_rate": 0.0006709450135771274, + "loss": 0.88147128, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.86474609, + "step": 2117, + "time_per_iteration": 2.938913345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116718, + "balance_loss_mlp": 1.08058655, + "epoch": 0.40746440938822626, + "flos": 505108282368.0, + "grad_norm": 0.02500723808493834, + "language_loss": 0.92501736, + "learning_rate": 0.0006706522116520023, + "loss": 0.93668914, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.8671875, + "step": 2118, + "time_per_iteration": 2.6295557022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169158, + "balance_loss_mlp": 1.08246934, + "epoch": 0.4076567910734898, + "flos": 606710960640.0, + "grad_norm": 0.031046149511695622, + "language_loss": 0.91392642, + "learning_rate": 0.0006703593434738127, + "loss": 0.92561805, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.86816406, + "step": 2119, + "time_per_iteration": 2.6925787925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170168, + "balance_loss_mlp": 1.08371782, + "epoch": 0.4078491727587534, + "flos": 480518846976.0, + "grad_norm": 0.026436329156680958, + "language_loss": 0.85361552, + "learning_rate": 0.0006700664091562604, + "loss": 0.86531723, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.86572266, + "step": 2120, + "time_per_iteration": 2.567094087600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177249, + "balance_loss_mlp": 1.09065557, + "epoch": 0.4080415544440169, + "flos": 511418961408.0, + "grad_norm": 0.02549175858454111, + "language_loss": 0.92328954, + "learning_rate": 0.0006697734088130725, + "loss": 0.93506193, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.8671875, + "step": 2121, + "time_per_iteration": 2.618701934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175348, + "balance_loss_mlp": 1.0889926, + "epoch": 0.4082339361292805, + "flos": 735927157248.0, + "grad_norm": 0.030272250235271202, + "language_loss": 0.93378723, + "learning_rate": 0.0006694803425580018, + "loss": 0.94554067, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.86474609, + "step": 2122, + "time_per_iteration": 2.983313798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174826, + "balance_loss_mlp": 1.08851826, + "epoch": 0.4084263178145441, + "flos": 458404273152.0, + "grad_norm": 0.031322708915370194, + "language_loss": 0.925843, + "learning_rate": 0.0006691872105048268, + "loss": 0.93759131, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.86425781, + "step": 2123, + "time_per_iteration": 2.570157766342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171971, + "balance_loss_mlp": 1.08566332, + "epoch": 0.4086186994998076, + "flos": 564025417728.0, + "grad_norm": 0.026602974246623758, + "language_loss": 0.91457534, + "learning_rate": 0.0006688940127673513, + "loss": 0.92629504, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.86425781, + "step": 2124, + "time_per_iteration": 2.6775970458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172213, + "balance_loss_mlp": 1.08609629, + "epoch": 0.4088110811850712, + "flos": 574893050880.0, + "grad_norm": 0.023493992507127005, + "language_loss": 0.90594321, + "learning_rate": 0.0006686007494594049, + "loss": 0.91766536, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.86230469, + "step": 2125, + "time_per_iteration": 2.8212904930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166923, + "balance_loss_mlp": 1.08028209, + "epoch": 0.40900346287033473, + "flos": 457846319616.0, + "grad_norm": 0.03600016157180187, + "language_loss": 0.89846623, + "learning_rate": 0.0006683074206948425, + "loss": 0.91013545, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.86767578, + "step": 2126, + "time_per_iteration": 2.4914121627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165958, + "balance_loss_mlp": 1.07926905, + "epoch": 0.4091958445555983, + "flos": 618594444288.0, + "grad_norm": 0.027616550174826966, + "language_loss": 0.88032037, + "learning_rate": 0.0006680140265875443, + "loss": 0.89197993, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.86816406, + "step": 2127, + "time_per_iteration": 2.8309690952301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164825, + "balance_loss_mlp": 1.07846975, + "epoch": 0.40938822624086185, + "flos": 473370236928.0, + "grad_norm": 0.02755246393115647, + "language_loss": 1.01638341, + "learning_rate": 0.0006677205672514162, + "loss": 1.02803159, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.86474609, + "step": 2128, + "time_per_iteration": 2.716601610183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170358, + "balance_loss_mlp": 1.08395457, + "epoch": 0.40958060792612544, + "flos": 571117632000.0, + "grad_norm": 0.024298637355030545, + "language_loss": 0.93714547, + "learning_rate": 0.000667427042800389, + "loss": 0.94884908, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.86523438, + "step": 2129, + "time_per_iteration": 2.7863857746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181584, + "balance_loss_mlp": 1.09499085, + "epoch": 0.40977298961138897, + "flos": 610470916608.0, + "grad_norm": 0.027297656005279614, + "language_loss": 0.89951032, + "learning_rate": 0.0006671334533484192, + "loss": 0.91132617, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.8671875, + "step": 2130, + "time_per_iteration": 2.7272608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177813, + "balance_loss_mlp": 1.09160113, + "epoch": 0.40996537129665256, + "flos": 582872861184.0, + "grad_norm": 0.02438545141207517, + "language_loss": 0.89143705, + "learning_rate": 0.0006668397990094881, + "loss": 0.90321517, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.86328125, + "step": 2131, + "time_per_iteration": 2.74776554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173755, + "balance_loss_mlp": 1.08739984, + "epoch": 0.41015775298191615, + "flos": 517553722368.0, + "grad_norm": 0.026155362463659675, + "language_loss": 0.91776133, + "learning_rate": 0.0006665460798976027, + "loss": 0.92949885, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.86474609, + "step": 2132, + "time_per_iteration": 2.728180170059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172912, + "balance_loss_mlp": 1.08679533, + "epoch": 0.4103501346671797, + "flos": 511445157888.0, + "grad_norm": 0.02671704384652658, + "language_loss": 0.87880147, + "learning_rate": 0.0006662522961267947, + "loss": 0.89053059, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.86230469, + "step": 2133, + "time_per_iteration": 2.6707494258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172576, + "balance_loss_mlp": 1.08636391, + "epoch": 0.41054251635244327, + "flos": 550926696960.0, + "grad_norm": 0.02310158230225749, + "language_loss": 0.93120432, + "learning_rate": 0.0006659584478111211, + "loss": 0.9429301, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.86328125, + "step": 2134, + "time_per_iteration": 2.7634923458099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167834, + "balance_loss_mlp": 1.08162224, + "epoch": 0.4107348980377068, + "flos": 841298523648.0, + "grad_norm": 0.0323112144897684, + "language_loss": 0.91370595, + "learning_rate": 0.000665664535064664, + "loss": 0.9253844, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.86328125, + "step": 2135, + "time_per_iteration": 3.028343677520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170594, + "balance_loss_mlp": 1.08447671, + "epoch": 0.4109272797229704, + "flos": 504763176960.0, + "grad_norm": 0.026958983372987907, + "language_loss": 0.8977797, + "learning_rate": 0.0006653705580015303, + "loss": 0.90948564, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.86230469, + "step": 2136, + "time_per_iteration": 2.6786246299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173433, + "balance_loss_mlp": 1.08731592, + "epoch": 0.4111196614082339, + "flos": 612023253504.0, + "grad_norm": 0.02687154551301225, + "language_loss": 0.92936879, + "learning_rate": 0.0006650765167358523, + "loss": 0.9411031, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.86230469, + "step": 2137, + "time_per_iteration": 2.765503168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170304, + "balance_loss_mlp": 1.08409154, + "epoch": 0.4113120430934975, + "flos": 454103827968.0, + "grad_norm": 0.029691236683527498, + "language_loss": 0.97143424, + "learning_rate": 0.0006647824113817864, + "loss": 0.98313725, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.86328125, + "step": 2138, + "time_per_iteration": 2.490111827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179698, + "balance_loss_mlp": 1.09329462, + "epoch": 0.41150442477876104, + "flos": 542709843456.0, + "grad_norm": 0.027637209651618533, + "language_loss": 0.88423729, + "learning_rate": 0.000664488242053515, + "loss": 0.89603424, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.86523438, + "step": 2139, + "time_per_iteration": 2.7109243869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193887, + "balance_loss_mlp": 1.10748434, + "epoch": 0.4116968064640246, + "flos": 577391380992.0, + "grad_norm": 0.026757188222196804, + "language_loss": 0.8939023, + "learning_rate": 0.0006641940088652445, + "loss": 0.90584123, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.86523438, + "step": 2140, + "time_per_iteration": 2.7461891174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186164, + "balance_loss_mlp": 1.09952235, + "epoch": 0.4118891881492882, + "flos": 497149939200.0, + "grad_norm": 0.030186458882164903, + "language_loss": 0.90177953, + "learning_rate": 0.0006638997119312065, + "loss": 0.91364121, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.86767578, + "step": 2141, + "time_per_iteration": 2.7632482051849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206482, + "balance_loss_mlp": 1.11969757, + "epoch": 0.41208156983455174, + "flos": 1541570678784.0, + "grad_norm": 0.01865751049600735, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76269788, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.86914062, + "step": 2142, + "time_per_iteration": 4.916187286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117268, + "balance_loss_mlp": 1.0864203, + "epoch": 0.41227395151981533, + "flos": 586057399296.0, + "grad_norm": 0.03006664462158482, + "language_loss": 0.91539335, + "learning_rate": 0.000663310927282877, + "loss": 0.92712009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.86376953, + "step": 2143, + "time_per_iteration": 2.783862829208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178317, + "balance_loss_mlp": 1.09220016, + "epoch": 0.41246633320507886, + "flos": 443892203520.0, + "grad_norm": 0.03021664461702893, + "language_loss": 0.92787349, + "learning_rate": 0.000663016439797172, + "loss": 0.93965667, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.86230469, + "step": 2144, + "time_per_iteration": 2.617626428604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177938, + "balance_loss_mlp": 1.09177303, + "epoch": 0.41265871489034245, + "flos": 581094941184.0, + "grad_norm": 0.031114344129188405, + "language_loss": 0.87895894, + "learning_rate": 0.0006627218890228724, + "loss": 0.89073837, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.86279297, + "step": 2145, + "time_per_iteration": 2.823136329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172469, + "balance_loss_mlp": 1.08611357, + "epoch": 0.412851096575606, + "flos": 762528827904.0, + "grad_norm": 0.03009040753958223, + "language_loss": 0.9065426, + "learning_rate": 0.0006624272750743326, + "loss": 0.91826725, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.86474609, + "step": 2146, + "time_per_iteration": 3.009969472885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172508, + "balance_loss_mlp": 1.08615267, + "epoch": 0.41304347826086957, + "flos": 556520968704.0, + "grad_norm": 0.023356325653820006, + "language_loss": 0.88529593, + "learning_rate": 0.0006621325980659322, + "loss": 0.89702094, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.86474609, + "step": 2147, + "time_per_iteration": 2.7459471225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.09083641, + "epoch": 0.41323585994613315, + "flos": 666893724672.0, + "grad_norm": 0.029406479855093332, + "language_loss": 0.8760705, + "learning_rate": 0.000661837858112075, + "loss": 0.88783997, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.86230469, + "step": 2148, + "time_per_iteration": 2.816408634185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173515, + "balance_loss_mlp": 1.08763647, + "epoch": 0.4134282416313967, + "flos": 549784593408.0, + "grad_norm": 0.02816234486414791, + "language_loss": 0.9661653, + "learning_rate": 0.0006615430553271888, + "loss": 0.97790039, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.85986328, + "step": 2149, + "time_per_iteration": 2.7518115043640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08859468, + "epoch": 0.4136206233166603, + "flos": 647512522752.0, + "grad_norm": 0.025697121170903614, + "language_loss": 0.9133321, + "learning_rate": 0.0006612481898257264, + "loss": 0.92507643, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.859375, + "step": 2150, + "time_per_iteration": 2.841632127761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179143, + "balance_loss_mlp": 1.09364581, + "epoch": 0.4138130050019238, + "flos": 518363455488.0, + "grad_norm": 0.029278566016903075, + "language_loss": 0.9170779, + "learning_rate": 0.000660953261722165, + "loss": 0.92886931, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.85595703, + "step": 2151, + "time_per_iteration": 2.6203365325927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178641, + "balance_loss_mlp": 1.09309638, + "epoch": 0.4140053866871874, + "flos": 610368858624.0, + "grad_norm": 0.02858072061503926, + "language_loss": 0.90138143, + "learning_rate": 0.0006606582711310055, + "loss": 0.91316783, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.85644531, + "step": 2152, + "time_per_iteration": 2.71352481842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167147, + "balance_loss_mlp": 1.08103001, + "epoch": 0.4141977683724509, + "flos": 580845163008.0, + "grad_norm": 0.02998636441804494, + "language_loss": 0.9075436, + "learning_rate": 0.0006603632181667736, + "loss": 0.91921502, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.86230469, + "step": 2153, + "time_per_iteration": 2.766855478286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175224, + "balance_loss_mlp": 1.09034729, + "epoch": 0.4143901500577145, + "flos": 1310176386048.0, + "grad_norm": 0.007725969282803628, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80118549, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.84960938, + "step": 2154, + "time_per_iteration": 4.895019292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175162, + "balance_loss_mlp": 1.08890247, + "epoch": 0.41458253174297804, + "flos": 461122182144.0, + "grad_norm": 0.032062709167589486, + "language_loss": 0.89760709, + "learning_rate": 0.0006597729255773153, + "loss": 0.90935868, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.86376953, + "step": 2155, + "time_per_iteration": 2.5811779499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170487, + "balance_loss_mlp": 1.08413148, + "epoch": 0.41477491342824163, + "flos": 554438876160.0, + "grad_norm": 0.02646748417883587, + "language_loss": 0.88947552, + "learning_rate": 0.0006594776861812608, + "loss": 0.90118033, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.86474609, + "step": 2156, + "time_per_iteration": 2.6486780643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174434, + "balance_loss_mlp": 1.08803129, + "epoch": 0.4149672951135052, + "flos": 699085664256.0, + "grad_norm": 0.02893226937169889, + "language_loss": 0.92862517, + "learning_rate": 0.0006591823848704776, + "loss": 0.94036949, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.86523438, + "step": 2157, + "time_per_iteration": 2.9617741107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175673, + "balance_loss_mlp": 1.08946109, + "epoch": 0.41515967679876875, + "flos": 566836652544.0, + "grad_norm": 0.025963915394380376, + "language_loss": 0.87666786, + "learning_rate": 0.0006588870217596117, + "loss": 0.88842458, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.86328125, + "step": 2158, + "time_per_iteration": 2.7438344955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175578, + "balance_loss_mlp": 1.08927035, + "epoch": 0.41535205848403234, + "flos": 502177525248.0, + "grad_norm": 0.03336248103115958, + "language_loss": 0.93542749, + "learning_rate": 0.0006585915969633334, + "loss": 0.94718325, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.86425781, + "step": 2159, + "time_per_iteration": 2.5621583461761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170555, + "balance_loss_mlp": 1.08429492, + "epoch": 0.41554444016929587, + "flos": 608701728768.0, + "grad_norm": 0.03070944646834424, + "language_loss": 0.95915914, + "learning_rate": 0.0006582961105963366, + "loss": 0.97086465, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.86376953, + "step": 2160, + "time_per_iteration": 2.798051118850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171192, + "balance_loss_mlp": 1.08498013, + "epoch": 0.41573682185455946, + "flos": 530155614720.0, + "grad_norm": 0.02743693152360054, + "language_loss": 0.85023397, + "learning_rate": 0.0006580005627733395, + "loss": 0.86194587, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.86328125, + "step": 2161, + "time_per_iteration": 2.6954233646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168175, + "balance_loss_mlp": 1.08234429, + "epoch": 0.415929203539823, + "flos": 506037537792.0, + "grad_norm": 0.027357224978205523, + "language_loss": 0.88365781, + "learning_rate": 0.0006577049536090838, + "loss": 0.89533949, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.859375, + "step": 2162, + "time_per_iteration": 2.6762402057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167145, + "balance_loss_mlp": 1.08140957, + "epoch": 0.4161215852250866, + "flos": 583823583744.0, + "grad_norm": 0.02816159229600616, + "language_loss": 0.92433643, + "learning_rate": 0.000657409283218335, + "loss": 0.93600792, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.85839844, + "step": 2163, + "time_per_iteration": 2.708815574645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116891, + "balance_loss_mlp": 1.0833174, + "epoch": 0.4163139669103501, + "flos": 491759783424.0, + "grad_norm": 0.02622965675004396, + "language_loss": 0.87195617, + "learning_rate": 0.0006571135517158829, + "loss": 0.8836453, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.85693359, + "step": 2164, + "time_per_iteration": 2.7412045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177162, + "balance_loss_mlp": 1.0930481, + "epoch": 0.4165063485956137, + "flos": 1291020767232.0, + "grad_norm": 0.0113690904759025, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77941221, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.84179688, + "step": 2165, + "time_per_iteration": 4.793722867965698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172447, + "balance_loss_mlp": 1.08680665, + "epoch": 0.4166987302808773, + "flos": 496257613824.0, + "grad_norm": 0.031372404533623194, + "language_loss": 0.90335643, + "learning_rate": 0.0006565219058351444, + "loss": 0.9150809, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.85742188, + "step": 2166, + "time_per_iteration": 2.5605039596557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169955, + "balance_loss_mlp": 1.08412397, + "epoch": 0.4168911119661408, + "flos": 465066788352.0, + "grad_norm": 0.02745374217966413, + "language_loss": 0.89900762, + "learning_rate": 0.0006562259916865553, + "loss": 0.91070712, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.859375, + "step": 2167, + "time_per_iteration": 2.5815963745117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011761, + "balance_loss_mlp": 1.09055507, + "epoch": 0.4170834936514044, + "flos": 537942769152.0, + "grad_norm": 0.0279390150832869, + "language_loss": 0.86569649, + "learning_rate": 0.0006559300168856573, + "loss": 0.8774575, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.85644531, + "step": 2168, + "time_per_iteration": 2.7917275428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181119, + "balance_loss_mlp": 1.09547901, + "epoch": 0.41727587533666793, + "flos": 551749165056.0, + "grad_norm": 0.026888463962073755, + "language_loss": 0.92254919, + "learning_rate": 0.0006556339815473577, + "loss": 0.93436038, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.85742188, + "step": 2169, + "time_per_iteration": 2.640456438064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170658, + "balance_loss_mlp": 1.08492219, + "epoch": 0.4174682570219315, + "flos": 632377371648.0, + "grad_norm": 0.027558904728032622, + "language_loss": 0.91870886, + "learning_rate": 0.000655337885786588, + "loss": 0.93041539, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.85839844, + "step": 2170, + "time_per_iteration": 2.885754108428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170686, + "balance_loss_mlp": 1.08485556, + "epoch": 0.41766063870719505, + "flos": 520755724800.0, + "grad_norm": 0.031037248087189308, + "language_loss": 0.9245193, + "learning_rate": 0.0006550417297183025, + "loss": 0.93622619, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.859375, + "step": 2171, + "time_per_iteration": 2.607590436935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175474, + "balance_loss_mlp": 1.08945298, + "epoch": 0.41785302039245864, + "flos": 559054227456.0, + "grad_norm": 0.02737354340834092, + "language_loss": 0.87721866, + "learning_rate": 0.0006547455134574793, + "loss": 0.88897336, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.86132812, + "step": 2172, + "time_per_iteration": 2.7324562072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184116, + "balance_loss_mlp": 1.09833348, + "epoch": 0.41804540207772223, + "flos": 790027553280.0, + "grad_norm": 0.06230752646239431, + "language_loss": 0.90406793, + "learning_rate": 0.0006544492371191198, + "loss": 0.91590911, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.85888672, + "step": 2173, + "time_per_iteration": 3.1248764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186676, + "balance_loss_mlp": 1.10089302, + "epoch": 0.41823778376298576, + "flos": 905890521600.0, + "grad_norm": 0.03053935653615099, + "language_loss": 0.9052453, + "learning_rate": 0.0006541529008182485, + "loss": 0.91711211, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.85888672, + "step": 2174, + "time_per_iteration": 3.2052760124206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169526, + "balance_loss_mlp": 1.08383834, + "epoch": 0.41843016544824935, + "flos": 512573799936.0, + "grad_norm": 0.02722476190126499, + "language_loss": 0.93815506, + "learning_rate": 0.0006538565046699136, + "loss": 0.94985026, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.85791016, + "step": 2175, + "time_per_iteration": 2.578150987625122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167473, + "balance_loss_mlp": 1.08183265, + "epoch": 0.4186225471335129, + "flos": 654289830912.0, + "grad_norm": 0.03154991846739093, + "language_loss": 0.89587617, + "learning_rate": 0.0006535600487891862, + "loss": 0.90755087, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.85742188, + "step": 2176, + "time_per_iteration": 2.8699960708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167918, + "balance_loss_mlp": 1.08218253, + "epoch": 0.41881492881877647, + "flos": 570225306624.0, + "grad_norm": 0.027441287945076498, + "language_loss": 0.94665354, + "learning_rate": 0.0006532635332911603, + "loss": 0.95833272, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.85839844, + "step": 2177, + "time_per_iteration": 2.695180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168273, + "balance_loss_mlp": 1.08239508, + "epoch": 0.41900731050404, + "flos": 913484293632.0, + "grad_norm": 0.030353783790969455, + "language_loss": 0.86808872, + "learning_rate": 0.0006529669582909541, + "loss": 0.87977153, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.85986328, + "step": 2178, + "time_per_iteration": 3.2746284008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116623, + "balance_loss_mlp": 1.08073354, + "epoch": 0.4191996921893036, + "flos": 536783201280.0, + "grad_norm": 0.031775111638151596, + "language_loss": 0.93350971, + "learning_rate": 0.0006526703239037077, + "loss": 0.94517195, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.85595703, + "step": 2179, + "time_per_iteration": 2.6485140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167238, + "balance_loss_mlp": 1.08159792, + "epoch": 0.4193920738745671, + "flos": 583730257920.0, + "grad_norm": 0.027399178820930566, + "language_loss": 0.92623031, + "learning_rate": 0.0006523736302445851, + "loss": 0.93790269, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.85742188, + "step": 2180, + "time_per_iteration": 2.8337948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.08149683, + "epoch": 0.4195844555598307, + "flos": 1337800459776.0, + "grad_norm": 0.031235958835637387, + "language_loss": 0.83915186, + "learning_rate": 0.0006520768774287728, + "loss": 0.85082471, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.85888672, + "step": 2181, + "time_per_iteration": 3.725524663925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170743, + "balance_loss_mlp": 1.08505547, + "epoch": 0.4197768372450943, + "flos": 599996779008.0, + "grad_norm": 0.025797087070179033, + "language_loss": 0.91158509, + "learning_rate": 0.0006517800655714806, + "loss": 0.92329252, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.85791016, + "step": 2182, + "time_per_iteration": 2.8207623958587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172108, + "balance_loss_mlp": 1.08646846, + "epoch": 0.4199692189303578, + "flos": 736595900928.0, + "grad_norm": 0.0300192342725077, + "language_loss": 0.91644537, + "learning_rate": 0.0006514831947879407, + "loss": 0.92816639, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.85742188, + "step": 2183, + "time_per_iteration": 2.9593582153320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170186, + "balance_loss_mlp": 1.08454573, + "epoch": 0.4201616006156214, + "flos": 751661921280.0, + "grad_norm": 0.02826942186100045, + "language_loss": 0.84773123, + "learning_rate": 0.0006511862651934091, + "loss": 0.85943305, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.85742188, + "step": 2184, + "time_per_iteration": 3.1170709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168703, + "balance_loss_mlp": 1.08301497, + "epoch": 0.42035398230088494, + "flos": 548091267072.0, + "grad_norm": 0.027950639773315498, + "language_loss": 0.89124084, + "learning_rate": 0.0006508892769031638, + "loss": 0.90292788, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.85791016, + "step": 2185, + "time_per_iteration": 2.6419410705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116924, + "balance_loss_mlp": 1.08379054, + "epoch": 0.42054636398614853, + "flos": 618047224320.0, + "grad_norm": 0.03133969262582121, + "language_loss": 0.94198585, + "learning_rate": 0.000650592230032506, + "loss": 0.95367819, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.85546875, + "step": 2186, + "time_per_iteration": 2.7254862785339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175, + "balance_loss_mlp": 1.08935976, + "epoch": 0.42073874567141206, + "flos": 641666471424.0, + "grad_norm": 0.02942747497692904, + "language_loss": 0.9171921, + "learning_rate": 0.0006502951246967595, + "loss": 0.92894208, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.85742188, + "step": 2187, + "time_per_iteration": 2.8912041187286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174567, + "balance_loss_mlp": 1.08897436, + "epoch": 0.42093112735667565, + "flos": 494822797824.0, + "grad_norm": 0.02515329577356359, + "language_loss": 0.92510098, + "learning_rate": 0.0006499979610112706, + "loss": 0.93684661, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.85693359, + "step": 2188, + "time_per_iteration": 2.710610866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119078, + "balance_loss_mlp": 1.1055218, + "epoch": 0.4211235090419392, + "flos": 543436984320.0, + "grad_norm": 0.027549100686041793, + "language_loss": 0.89267701, + "learning_rate": 0.000649700739091409, + "loss": 0.90458483, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.85351562, + "step": 2189, + "time_per_iteration": 2.770158290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.09321594, + "epoch": 0.42131589072720277, + "flos": 1535388254208.0, + "grad_norm": 0.007480893247264192, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.75013411, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.83984375, + "step": 2190, + "time_per_iteration": 4.826355218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168739, + "balance_loss_mlp": 1.08381474, + "epoch": 0.42150827241246636, + "flos": 567935095296.0, + "grad_norm": 0.025807507169531153, + "language_loss": 0.91430855, + "learning_rate": 0.0006491061210101557, + "loss": 0.92599595, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.85009766, + "step": 2191, + "time_per_iteration": 2.6813712120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170756, + "balance_loss_mlp": 1.08568799, + "epoch": 0.4217006540977299, + "flos": 708841393152.0, + "grad_norm": 0.02710796189326301, + "language_loss": 0.90667284, + "learning_rate": 0.0006488087250796157, + "loss": 0.91838038, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.8515625, + "step": 2192, + "time_per_iteration": 2.8864076137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117035, + "balance_loss_mlp": 1.08528221, + "epoch": 0.4218930357829935, + "flos": 628561019904.0, + "grad_norm": 0.0271709214243351, + "language_loss": 0.87769991, + "learning_rate": 0.0006485112713764049, + "loss": 0.8894034, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.8515625, + "step": 2193, + "time_per_iteration": 2.9007742404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170953, + "balance_loss_mlp": 1.08578944, + "epoch": 0.422085417468257, + "flos": 461289368064.0, + "grad_norm": 0.026123872435626132, + "language_loss": 0.89901912, + "learning_rate": 0.0006482137600160051, + "loss": 0.91072869, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.85253906, + "step": 2194, + "time_per_iteration": 2.4960973262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170401, + "balance_loss_mlp": 1.08533287, + "epoch": 0.4222777991535206, + "flos": 474980971008.0, + "grad_norm": 0.02685495955741856, + "language_loss": 0.90204549, + "learning_rate": 0.0006479161911139206, + "loss": 0.91374946, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.8515625, + "step": 2195, + "time_per_iteration": 2.574496030807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170408, + "balance_loss_mlp": 1.08534062, + "epoch": 0.4224701808387841, + "flos": 471844096512.0, + "grad_norm": 0.03212817551635824, + "language_loss": 0.93686366, + "learning_rate": 0.0006476185647856778, + "loss": 0.94856775, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.8515625, + "step": 2196, + "time_per_iteration": 2.558581829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169081, + "balance_loss_mlp": 1.08401346, + "epoch": 0.4226625625240477, + "flos": 678822870528.0, + "grad_norm": 0.034209207392335836, + "language_loss": 0.88652933, + "learning_rate": 0.0006473208811468255, + "loss": 0.89822018, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.8515625, + "step": 2197, + "time_per_iteration": 2.8745005130767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169989, + "balance_loss_mlp": 1.08487344, + "epoch": 0.4228549442093113, + "flos": 504559060992.0, + "grad_norm": 0.02694559660877684, + "language_loss": 0.9045344, + "learning_rate": 0.0006470231403129347, + "loss": 0.91623431, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.85205078, + "step": 2198, + "time_per_iteration": 2.6385552883148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.08594668, + "epoch": 0.42304732589457483, + "flos": 613074032640.0, + "grad_norm": 0.02362792419875934, + "language_loss": 0.86769903, + "learning_rate": 0.0006467253423995988, + "loss": 0.87941062, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.85302734, + "step": 2199, + "time_per_iteration": 2.8800480365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169589, + "balance_loss_mlp": 1.08418751, + "epoch": 0.4232397075798384, + "flos": 516648662016.0, + "grad_norm": 0.0345778065938135, + "language_loss": 0.86613309, + "learning_rate": 0.000646427487522433, + "loss": 0.87782902, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.85498047, + "step": 2200, + "time_per_iteration": 2.658045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170112, + "balance_loss_mlp": 1.08451986, + "epoch": 0.42343208926510195, + "flos": 590933262336.0, + "grad_norm": 0.02424061904629306, + "language_loss": 0.89308071, + "learning_rate": 0.0006461295757970749, + "loss": 0.90478176, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.85693359, + "step": 2201, + "time_per_iteration": 2.8574764728546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170293, + "balance_loss_mlp": 1.08465314, + "epoch": 0.42362447095036554, + "flos": 641818194432.0, + "grad_norm": 0.03053594684877434, + "language_loss": 0.89224029, + "learning_rate": 0.0006458316073391839, + "loss": 0.90394318, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.85742188, + "step": 2202, + "time_per_iteration": 2.932666063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168878, + "balance_loss_mlp": 1.08318996, + "epoch": 0.42381685263562907, + "flos": 513717904896.0, + "grad_norm": 0.025745877239568934, + "language_loss": 0.93694568, + "learning_rate": 0.0006455335822644422, + "loss": 0.94863445, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.85791016, + "step": 2203, + "time_per_iteration": 2.6537110805511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169969, + "balance_loss_mlp": 1.0842818, + "epoch": 0.42400923432089266, + "flos": 547822023168.0, + "grad_norm": 0.028367329203477194, + "language_loss": 0.84440267, + "learning_rate": 0.0006452355006885527, + "loss": 0.85610235, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.85791016, + "step": 2204, + "time_per_iteration": 2.639218330383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169105, + "balance_loss_mlp": 1.08346462, + "epoch": 0.4242016160061562, + "flos": 623287658496.0, + "grad_norm": 0.03537327431533643, + "language_loss": 0.96295106, + "learning_rate": 0.0006449373627272412, + "loss": 0.9746421, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.85742188, + "step": 2205, + "time_per_iteration": 2.728724956512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168771, + "balance_loss_mlp": 1.08317852, + "epoch": 0.4243939976914198, + "flos": 572971413504.0, + "grad_norm": 0.029625174738980242, + "language_loss": 0.88551587, + "learning_rate": 0.0006446391684962553, + "loss": 0.89720356, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.85693359, + "step": 2206, + "time_per_iteration": 2.6687116622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167518, + "balance_loss_mlp": 1.08192575, + "epoch": 0.42458637937668336, + "flos": 449664394752.0, + "grad_norm": 0.02816858253159587, + "language_loss": 0.89565998, + "learning_rate": 0.000644340918111364, + "loss": 0.90733516, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.85693359, + "step": 2207, + "time_per_iteration": 2.620295763015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167512, + "balance_loss_mlp": 1.08206332, + "epoch": 0.4247787610619469, + "flos": 436335361536.0, + "grad_norm": 0.0303416400904182, + "language_loss": 0.92792743, + "learning_rate": 0.0006440426116883585, + "loss": 0.93960261, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.85546875, + "step": 2208, + "time_per_iteration": 2.5411367416381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171139, + "balance_loss_mlp": 1.08602309, + "epoch": 0.4249711427472105, + "flos": 497121741312.0, + "grad_norm": 0.025596497409994177, + "language_loss": 0.92383361, + "learning_rate": 0.0006437442493430519, + "loss": 0.93554503, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.85205078, + "step": 2209, + "time_per_iteration": 2.6431679725646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172012, + "balance_loss_mlp": 1.08694398, + "epoch": 0.425163524432474, + "flos": 657107796480.0, + "grad_norm": 0.030657116246539617, + "language_loss": 0.93065524, + "learning_rate": 0.000643445831191278, + "loss": 0.94237542, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.8515625, + "step": 2210, + "time_per_iteration": 2.9031519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117009, + "balance_loss_mlp": 1.08502185, + "epoch": 0.4253559061177376, + "flos": 651778039296.0, + "grad_norm": 0.031032190975230387, + "language_loss": 0.88729775, + "learning_rate": 0.0006431473573488937, + "loss": 0.89899862, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.8515625, + "step": 2211, + "time_per_iteration": 2.745398759841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170674, + "balance_loss_mlp": 1.08560598, + "epoch": 0.42554828780300114, + "flos": 555202947072.0, + "grad_norm": 0.03338022114707726, + "language_loss": 0.92210639, + "learning_rate": 0.0006428488279317765, + "loss": 0.93381315, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.8515625, + "step": 2212, + "time_per_iteration": 2.6822004318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172615, + "balance_loss_mlp": 1.08797669, + "epoch": 0.4257406694882647, + "flos": 515421964800.0, + "grad_norm": 0.02921339084637532, + "language_loss": 0.9444955, + "learning_rate": 0.0006425502430558259, + "loss": 0.95622164, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.84716797, + "step": 2213, + "time_per_iteration": 2.6147451400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173123, + "balance_loss_mlp": 1.08824575, + "epoch": 0.42593305117352825, + "flos": 516705057792.0, + "grad_norm": 0.028975617453248656, + "language_loss": 0.90705556, + "learning_rate": 0.0006422516028369628, + "loss": 0.91878676, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.84960938, + "step": 2214, + "time_per_iteration": 2.634315013885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169159, + "balance_loss_mlp": 1.08423436, + "epoch": 0.42612543285879184, + "flos": 589237934592.0, + "grad_norm": 0.02737510916321625, + "language_loss": 0.88997841, + "learning_rate": 0.0006419529073911296, + "loss": 0.90166998, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.85009766, + "step": 2215, + "time_per_iteration": 2.934429168701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168143, + "balance_loss_mlp": 1.08321857, + "epoch": 0.42631781454405543, + "flos": 636751676928.0, + "grad_norm": 0.02841677319990709, + "language_loss": 0.91541028, + "learning_rate": 0.0006416541568342901, + "loss": 0.92709166, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.85009766, + "step": 2216, + "time_per_iteration": 2.924881935119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167669, + "balance_loss_mlp": 1.08269632, + "epoch": 0.42651019622931896, + "flos": 542245215744.0, + "grad_norm": 0.024048936266806608, + "language_loss": 0.89849669, + "learning_rate": 0.0006413553512824297, + "loss": 0.91017342, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.85058594, + "step": 2217, + "time_per_iteration": 2.7312259674072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166506, + "balance_loss_mlp": 1.08096182, + "epoch": 0.42670257791458255, + "flos": 559223414784.0, + "grad_norm": 0.030670266673020908, + "language_loss": 0.90927672, + "learning_rate": 0.0006410564908515549, + "loss": 0.92094177, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.85644531, + "step": 2218, + "time_per_iteration": 2.646705389022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165047, + "balance_loss_mlp": 1.07964516, + "epoch": 0.4268949595998461, + "flos": 622449727488.0, + "grad_norm": 0.03126891192332862, + "language_loss": 0.92295194, + "learning_rate": 0.0006407575756576935, + "loss": 0.93460238, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.85498047, + "step": 2219, + "time_per_iteration": 2.750229597091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163243, + "balance_loss_mlp": 1.07769799, + "epoch": 0.42708734128510967, + "flos": 539015015424.0, + "grad_norm": 0.029393225010211587, + "language_loss": 0.93690813, + "learning_rate": 0.0006404586058168951, + "loss": 0.94854057, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.85644531, + "step": 2220, + "time_per_iteration": 2.75992488861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166043, + "balance_loss_mlp": 1.08049834, + "epoch": 0.4272797229703732, + "flos": 503862119424.0, + "grad_norm": 0.0277791101580606, + "language_loss": 0.93672097, + "learning_rate": 0.0006401595814452296, + "loss": 0.94838136, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.85644531, + "step": 2221, + "time_per_iteration": 2.6034135818481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166502, + "balance_loss_mlp": 1.08081436, + "epoch": 0.4274721046556368, + "flos": 493437646848.0, + "grad_norm": 0.028798228067485887, + "language_loss": 0.8755163, + "learning_rate": 0.000639860502658789, + "loss": 0.88718128, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.85791016, + "step": 2222, + "time_per_iteration": 2.6364476680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168114, + "balance_loss_mlp": 1.08242607, + "epoch": 0.4276644863409004, + "flos": 569461235712.0, + "grad_norm": 0.025058965600795662, + "language_loss": 0.90727627, + "learning_rate": 0.0006395613695736853, + "loss": 0.91895741, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.85791016, + "step": 2223, + "time_per_iteration": 2.7128536701202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170105, + "balance_loss_mlp": 1.08432245, + "epoch": 0.4278568680261639, + "flos": 608562740736.0, + "grad_norm": 0.029982203504376047, + "language_loss": 0.88910139, + "learning_rate": 0.0006392621823060529, + "loss": 0.90080237, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.85888672, + "step": 2224, + "time_per_iteration": 2.7404489517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167658, + "balance_loss_mlp": 1.08177996, + "epoch": 0.4280492497114275, + "flos": 561578754048.0, + "grad_norm": 0.03210591854722722, + "language_loss": 0.92597878, + "learning_rate": 0.0006389629409720465, + "loss": 0.93765533, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.85986328, + "step": 2225, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170504, + "balance_loss_mlp": 1.08467305, + "epoch": 0.428241631396691, + "flos": 721901182464.0, + "grad_norm": 0.03010502161811575, + "language_loss": 0.95236158, + "learning_rate": 0.0006386636456878417, + "loss": 0.96406662, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.859375, + "step": 2226, + "time_per_iteration": 2.866391897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168906, + "balance_loss_mlp": 1.08307493, + "epoch": 0.4284340130819546, + "flos": 430369787904.0, + "grad_norm": 0.032531705768225685, + "language_loss": 0.99370027, + "learning_rate": 0.0006383642965696353, + "loss": 1.00538921, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.859375, + "step": 2227, + "time_per_iteration": 2.4586703777313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169599, + "balance_loss_mlp": 1.08376861, + "epoch": 0.42862639476721814, + "flos": 526159342080.0, + "grad_norm": 0.030010487503704626, + "language_loss": 0.90640998, + "learning_rate": 0.000638064893733645, + "loss": 0.91810596, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.859375, + "step": 2228, + "time_per_iteration": 2.71899676322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168473, + "balance_loss_mlp": 1.08269, + "epoch": 0.42881877645248173, + "flos": 466378079232.0, + "grad_norm": 0.029133853286813928, + "language_loss": 0.95973945, + "learning_rate": 0.000637765437296109, + "loss": 0.97142416, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.85888672, + "step": 2229, + "time_per_iteration": 2.6824750900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166344, + "balance_loss_mlp": 1.08075178, + "epoch": 0.42901115813774526, + "flos": 561355172352.0, + "grad_norm": 0.028234307189641095, + "language_loss": 0.92378092, + "learning_rate": 0.000637465927373287, + "loss": 0.93544424, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.85693359, + "step": 2230, + "time_per_iteration": 2.65869402885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166629, + "balance_loss_mlp": 1.08137035, + "epoch": 0.42920353982300885, + "flos": 562527475200.0, + "grad_norm": 0.03139177124565146, + "language_loss": 0.86247277, + "learning_rate": 0.000637166364081459, + "loss": 0.87413907, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.85351562, + "step": 2231, + "time_per_iteration": 2.7071642875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165657, + "balance_loss_mlp": 1.080446, + "epoch": 0.42939592150827244, + "flos": 557315238912.0, + "grad_norm": 0.03049902562345181, + "language_loss": 0.89974546, + "learning_rate": 0.0006368667475369256, + "loss": 0.91140211, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.85302734, + "step": 2232, + "time_per_iteration": 2.74843168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166412, + "balance_loss_mlp": 1.08363342, + "epoch": 0.42958830319353597, + "flos": 1524942314496.0, + "grad_norm": 0.009964168253272706, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79694188, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.828125, + "step": 2233, + "time_per_iteration": 4.862222909927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.08236694, + "epoch": 0.42978068487879956, + "flos": 1498869672960.0, + "grad_norm": 0.007691227120989337, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80060571, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.828125, + "step": 2234, + "time_per_iteration": 4.816195011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167111, + "balance_loss_mlp": 1.08242488, + "epoch": 0.4299730665640631, + "flos": 548063069184.0, + "grad_norm": 0.02593969644103988, + "language_loss": 0.92186785, + "learning_rate": 0.0006359675795504112, + "loss": 0.93353903, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.84765625, + "step": 2235, + "time_per_iteration": 2.6802918910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167601, + "balance_loss_mlp": 1.08300984, + "epoch": 0.4301654482493267, + "flos": 1131115124736.0, + "grad_norm": 0.035304816631346984, + "language_loss": 0.82753956, + "learning_rate": 0.0006356677511584775, + "loss": 0.83921564, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.84667969, + "step": 2236, + "time_per_iteration": 3.444307327270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08522856, + "epoch": 0.4303578299345902, + "flos": 496741707264.0, + "grad_norm": 0.0313639268125667, + "language_loss": 0.9209317, + "learning_rate": 0.0006353678700956511, + "loss": 0.93262899, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.84570312, + "step": 2237, + "time_per_iteration": 2.5677876472473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164152, + "balance_loss_mlp": 1.07965648, + "epoch": 0.4305502116198538, + "flos": 616929315840.0, + "grad_norm": 0.02814766917627989, + "language_loss": 0.90743506, + "learning_rate": 0.0006350679364783569, + "loss": 0.91907656, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.84570312, + "step": 2238, + "time_per_iteration": 2.7363951206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175266, + "balance_loss_mlp": 1.09081805, + "epoch": 0.4307425933051173, + "flos": 560321857536.0, + "grad_norm": 0.032687311784007, + "language_loss": 0.92748511, + "learning_rate": 0.0006347679504230393, + "loss": 0.93923771, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.84521484, + "step": 2239, + "time_per_iteration": 2.6805875301361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172185, + "balance_loss_mlp": 1.08749855, + "epoch": 0.4309349749903809, + "flos": 973816779264.0, + "grad_norm": 0.03249158230487725, + "language_loss": 0.83304834, + "learning_rate": 0.0006344679120461632, + "loss": 0.84477019, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.84765625, + "step": 2240, + "time_per_iteration": 3.4101555347442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166292, + "balance_loss_mlp": 1.08146274, + "epoch": 0.4311273566756445, + "flos": 542972356608.0, + "grad_norm": 0.03524791345855764, + "language_loss": 0.87825459, + "learning_rate": 0.0006341678214642134, + "loss": 0.88991749, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.84912109, + "step": 2241, + "time_per_iteration": 2.625896692276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165486, + "balance_loss_mlp": 1.08041823, + "epoch": 0.43131973836090803, + "flos": 763110976512.0, + "grad_norm": 0.027424867307564667, + "language_loss": 0.89878041, + "learning_rate": 0.0006338676787936963, + "loss": 0.91043526, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.8515625, + "step": 2242, + "time_per_iteration": 3.063455820083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167252, + "balance_loss_mlp": 1.08199346, + "epoch": 0.4315121200461716, + "flos": 555602446848.0, + "grad_norm": 0.031429355894507384, + "language_loss": 0.916659, + "learning_rate": 0.0006335674841511367, + "loss": 0.92833149, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.85351562, + "step": 2243, + "time_per_iteration": 2.666233777999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192352, + "balance_loss_mlp": 1.10804749, + "epoch": 0.43170450173143515, + "flos": 1488686972928.0, + "grad_norm": 0.015912473948710273, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80373514, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.84375, + "step": 2244, + "time_per_iteration": 4.980380535125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183075, + "balance_loss_mlp": 1.09877014, + "epoch": 0.43189688341669874, + "flos": 1476907548672.0, + "grad_norm": 0.014137336443723746, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78548628, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.84375, + "step": 2245, + "time_per_iteration": 4.896914005279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011678, + "balance_loss_mlp": 1.08254158, + "epoch": 0.43208926510196227, + "flos": 493984866816.0, + "grad_norm": 0.02893589890767333, + "language_loss": 0.89212227, + "learning_rate": 0.0006326665895567652, + "loss": 0.90380025, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.85351562, + "step": 2246, + "time_per_iteration": 2.6488964557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169613, + "balance_loss_mlp": 1.08430731, + "epoch": 0.43228164678722586, + "flos": 521302944768.0, + "grad_norm": 0.0351368535627373, + "language_loss": 0.94705987, + "learning_rate": 0.0006323661881916976, + "loss": 0.95875597, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.85400391, + "step": 2247, + "time_per_iteration": 2.7094948291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170289, + "balance_loss_mlp": 1.08522093, + "epoch": 0.4324740284724894, + "flos": 797395015680.0, + "grad_norm": 0.0300569180656374, + "language_loss": 0.88277382, + "learning_rate": 0.0006320657354375179, + "loss": 0.89447677, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.8515625, + "step": 2248, + "time_per_iteration": 2.942108154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166997, + "balance_loss_mlp": 1.08188176, + "epoch": 0.432666410157753, + "flos": 483097767936.0, + "grad_norm": 0.027676603795042543, + "language_loss": 0.93945193, + "learning_rate": 0.0006317652314108726, + "loss": 0.95112193, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.85205078, + "step": 2249, + "time_per_iteration": 2.559255838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167847, + "balance_loss_mlp": 1.08268416, + "epoch": 0.43285879184301657, + "flos": 501209338368.0, + "grad_norm": 0.028764721331973258, + "language_loss": 0.98109567, + "learning_rate": 0.0006314646762284277, + "loss": 0.99277413, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.85253906, + "step": 2250, + "time_per_iteration": 2.6713576316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188225, + "balance_loss_mlp": 1.10582733, + "epoch": 0.4330511735282801, + "flos": 1513790701056.0, + "grad_norm": 0.02095115440391329, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76614058, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.82421875, + "step": 2251, + "time_per_iteration": 4.936391592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170203, + "balance_loss_mlp": 1.08518302, + "epoch": 0.4332435552135437, + "flos": 700837387776.0, + "grad_norm": 0.037779543880407794, + "language_loss": 0.84241956, + "learning_rate": 0.0006308634128629022, + "loss": 0.85412163, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.85107422, + "step": 2252, + "time_per_iteration": 2.890848398208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168176, + "balance_loss_mlp": 1.0830133, + "epoch": 0.4334359368988072, + "flos": 593481984000.0, + "grad_norm": 0.0295787243575072, + "language_loss": 0.93934762, + "learning_rate": 0.0006305627049132531, + "loss": 0.95102942, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.85253906, + "step": 2253, + "time_per_iteration": 2.7571680545806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167414, + "balance_loss_mlp": 1.08220303, + "epoch": 0.4336283185840708, + "flos": 844274942976.0, + "grad_norm": 0.0242542623992157, + "language_loss": 0.90322375, + "learning_rate": 0.0006302619462746662, + "loss": 0.91489786, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.85302734, + "step": 2254, + "time_per_iteration": 3.1296751499176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167279, + "balance_loss_mlp": 1.0821631, + "epoch": 0.43382070026933434, + "flos": 627401452032.0, + "grad_norm": 0.02849659363202695, + "language_loss": 0.96522522, + "learning_rate": 0.0006299611370639069, + "loss": 0.97689807, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.85205078, + "step": 2255, + "time_per_iteration": 2.7125463485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167069, + "balance_loss_mlp": 1.08181024, + "epoch": 0.4340130819545979, + "flos": 592209624576.0, + "grad_norm": 0.029264792527705672, + "language_loss": 0.85361564, + "learning_rate": 0.0006296602773977593, + "loss": 0.86528635, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.85351562, + "step": 2256, + "time_per_iteration": 2.692830801010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166353, + "balance_loss_mlp": 1.0810945, + "epoch": 0.4342054636398615, + "flos": 491955167232.0, + "grad_norm": 0.02531800088280138, + "language_loss": 0.92533612, + "learning_rate": 0.0006293593673930277, + "loss": 0.93699974, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.85351562, + "step": 2257, + "time_per_iteration": 2.6522371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118061, + "balance_loss_mlp": 1.09568477, + "epoch": 0.43439784532512504, + "flos": 700259968512.0, + "grad_norm": 0.028144633410819173, + "language_loss": 0.84340745, + "learning_rate": 0.0006290584071665358, + "loss": 0.85521352, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.85009766, + "step": 2258, + "time_per_iteration": 2.878753662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179592, + "balance_loss_mlp": 1.09452426, + "epoch": 0.43459022701038863, + "flos": 486801328128.0, + "grad_norm": 0.028951325004384125, + "language_loss": 0.88270766, + "learning_rate": 0.0006287573968351266, + "loss": 0.89450359, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.8515625, + "step": 2259, + "time_per_iteration": 2.55161190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173139, + "balance_loss_mlp": 1.08830976, + "epoch": 0.43478260869565216, + "flos": 644266859520.0, + "grad_norm": 0.030714073024811012, + "language_loss": 0.91379642, + "learning_rate": 0.0006284563365156626, + "loss": 0.92552781, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.84912109, + "step": 2260, + "time_per_iteration": 2.778975009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177671, + "balance_loss_mlp": 1.09274662, + "epoch": 0.43497499038091575, + "flos": 427009331712.0, + "grad_norm": 0.03207934204379992, + "language_loss": 0.94470251, + "learning_rate": 0.0006281552263250261, + "loss": 0.95647919, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.85009766, + "step": 2261, + "time_per_iteration": 2.540102005004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175407, + "balance_loss_mlp": 1.09281921, + "epoch": 0.4351673720661793, + "flos": 1541525016576.0, + "grad_norm": 0.010664027023399645, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81866938, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.82617188, + "step": 2262, + "time_per_iteration": 4.828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167291, + "balance_loss_mlp": 1.08260465, + "epoch": 0.43535975375144287, + "flos": 750465423360.0, + "grad_norm": 0.02969029135984414, + "language_loss": 0.88281786, + "learning_rate": 0.0006275528567978593, + "loss": 0.89449072, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.84765625, + "step": 2263, + "time_per_iteration": 2.9683096408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167048, + "balance_loss_mlp": 1.08193278, + "epoch": 0.4355521354367064, + "flos": 862751084544.0, + "grad_norm": 0.03226302104273745, + "language_loss": 0.89985508, + "learning_rate": 0.0006272515976951898, + "loss": 0.91152549, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.85205078, + "step": 2264, + "time_per_iteration": 4.429616689682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166942, + "balance_loss_mlp": 1.08182704, + "epoch": 0.43574451712197, + "flos": 735842563584.0, + "grad_norm": 0.02499576623287147, + "language_loss": 0.84365284, + "learning_rate": 0.0006269502891890687, + "loss": 0.8553223, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.85205078, + "step": 2265, + "time_per_iteration": 3.0444254875183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166782, + "balance_loss_mlp": 1.08214331, + "epoch": 0.4359368988072336, + "flos": 571712515584.0, + "grad_norm": 0.02707186340155289, + "language_loss": 0.93191004, + "learning_rate": 0.0006266489313964743, + "loss": 0.94357783, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.84716797, + "step": 2266, + "time_per_iteration": 2.7227466106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164913, + "balance_loss_mlp": 1.0802747, + "epoch": 0.4361292804924971, + "flos": 556670690304.0, + "grad_norm": 0.03376827968070452, + "language_loss": 0.92200565, + "learning_rate": 0.0006263475244344041, + "loss": 0.93365479, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.84716797, + "step": 2267, + "time_per_iteration": 2.845227003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167657, + "balance_loss_mlp": 1.08335233, + "epoch": 0.4363216621777607, + "flos": 558348553728.0, + "grad_norm": 0.031080273211388402, + "language_loss": 0.91650617, + "learning_rate": 0.0006260460684198746, + "loss": 0.92818272, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.84375, + "step": 2268, + "time_per_iteration": 2.652310371398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165668, + "balance_loss_mlp": 1.08141088, + "epoch": 0.4365140438630242, + "flos": 479196822528.0, + "grad_norm": 0.029843008840560653, + "language_loss": 0.92140841, + "learning_rate": 0.0006257445634699213, + "loss": 0.93306512, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.84326172, + "step": 2269, + "time_per_iteration": 2.5779240131378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164543, + "balance_loss_mlp": 1.08042932, + "epoch": 0.4367064255482878, + "flos": 580007232000.0, + "grad_norm": 0.028296510675920098, + "language_loss": 0.89645165, + "learning_rate": 0.0006254430097015993, + "loss": 0.90809709, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.84179688, + "step": 2270, + "time_per_iteration": 2.6566953659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172028, + "balance_loss_mlp": 1.08963013, + "epoch": 0.43689880723355135, + "flos": 1462271953920.0, + "grad_norm": 0.010844604855090543, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77651119, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.82421875, + "step": 2271, + "time_per_iteration": 4.794802904129028 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170244, + "balance_loss_mlp": 1.08593976, + "epoch": 0.43709118891881493, + "flos": 668873759232.0, + "grad_norm": 0.024959132899117664, + "language_loss": 0.91526961, + "learning_rate": 0.0006248397561781609, + "loss": 0.92697203, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.84375, + "step": 2272, + "time_per_iteration": 2.8676164150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170164, + "balance_loss_mlp": 1.08562064, + "epoch": 0.43728357060407846, + "flos": 545913847296.0, + "grad_norm": 0.033809863548240594, + "language_loss": 0.93834352, + "learning_rate": 0.0006245380566572482, + "loss": 0.95004517, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.84619141, + "step": 2273, + "time_per_iteration": 2.6419596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169646, + "balance_loss_mlp": 1.08519816, + "epoch": 0.43747595228934205, + "flos": 748183944192.0, + "grad_norm": 0.02624268387252208, + "language_loss": 0.83012575, + "learning_rate": 0.0006242363087863744, + "loss": 0.84182227, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.84521484, + "step": 2274, + "time_per_iteration": 2.9927828311920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165646, + "balance_loss_mlp": 1.08057845, + "epoch": 0.43766833397460564, + "flos": 632529094656.0, + "grad_norm": 0.025411969041571628, + "language_loss": 0.92234564, + "learning_rate": 0.0006239345126826878, + "loss": 0.9340021, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.8515625, + "step": 2275, + "time_per_iteration": 2.8180527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164237, + "balance_loss_mlp": 1.07931209, + "epoch": 0.43786071565986917, + "flos": 532098719232.0, + "grad_norm": 0.028730665522240066, + "language_loss": 0.90992379, + "learning_rate": 0.0006236326684633561, + "loss": 0.92156613, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.85009766, + "step": 2276, + "time_per_iteration": 2.828425168991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163177, + "balance_loss_mlp": 1.07810962, + "epoch": 0.43805309734513276, + "flos": 539557506048.0, + "grad_norm": 0.03648062799061939, + "language_loss": 0.82486773, + "learning_rate": 0.0006233307762455658, + "loss": 0.83649945, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.8515625, + "step": 2277, + "time_per_iteration": 2.608886957168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.07909381, + "epoch": 0.4382454790303963, + "flos": 865963820544.0, + "grad_norm": 0.025903790262040906, + "language_loss": 0.90223956, + "learning_rate": 0.0006230288361465216, + "loss": 0.91388112, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.8515625, + "step": 2278, + "time_per_iteration": 3.036163568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171688, + "balance_loss_mlp": 1.08638203, + "epoch": 0.4384378607156599, + "flos": 766801075200.0, + "grad_norm": 0.03187081568607536, + "language_loss": 0.92773926, + "learning_rate": 0.0006227268482834473, + "loss": 0.93945611, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.85400391, + "step": 2279, + "time_per_iteration": 2.9320731163024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176636, + "balance_loss_mlp": 1.09137762, + "epoch": 0.4386302424009234, + "flos": 669796283904.0, + "grad_norm": 0.028047353495827182, + "language_loss": 0.9305023, + "learning_rate": 0.000622424812773585, + "loss": 0.94226873, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.85351562, + "step": 2280, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.08901083, + "epoch": 0.438822624086187, + "flos": 486150048768.0, + "grad_norm": 0.03276492690852342, + "language_loss": 0.87875438, + "learning_rate": 0.000622122729734195, + "loss": 0.89049757, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.85400391, + "step": 2281, + "time_per_iteration": 2.5878114700317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.09008515, + "epoch": 0.4390150057714506, + "flos": 500258615808.0, + "grad_norm": 0.02649151217717187, + "language_loss": 0.92922705, + "learning_rate": 0.0006218205992825566, + "loss": 0.94098091, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.85400391, + "step": 2282, + "time_per_iteration": 2.6129069328308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.08652771, + "epoch": 0.4392073874567141, + "flos": 559351669248.0, + "grad_norm": 0.029077625047839704, + "language_loss": 0.88682199, + "learning_rate": 0.0006215184215359671, + "loss": 0.89853978, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.85351562, + "step": 2283, + "time_per_iteration": 2.7397634983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011712, + "balance_loss_mlp": 1.08594131, + "epoch": 0.4393997691419777, + "flos": 606422251008.0, + "grad_norm": 0.030174398524898192, + "language_loss": 0.92242193, + "learning_rate": 0.0006212161966117425, + "loss": 0.93413389, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.85351562, + "step": 2284, + "time_per_iteration": 2.710947275161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168513, + "balance_loss_mlp": 1.08349264, + "epoch": 0.43959215082724123, + "flos": 805483614720.0, + "grad_norm": 0.03159683391584848, + "language_loss": 0.8931039, + "learning_rate": 0.0006209139246272164, + "loss": 0.90478909, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.85107422, + "step": 2285, + "time_per_iteration": 2.9573750495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167409, + "balance_loss_mlp": 1.08229375, + "epoch": 0.4397845325125048, + "flos": 488607446016.0, + "grad_norm": 0.033192711624055064, + "language_loss": 0.89631027, + "learning_rate": 0.0006206116056997421, + "loss": 0.90798426, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.85205078, + "step": 2286, + "time_per_iteration": 2.5915918350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168495, + "balance_loss_mlp": 1.08380854, + "epoch": 0.43997691419776835, + "flos": 481784475648.0, + "grad_norm": 0.02920198010279229, + "language_loss": 0.88986552, + "learning_rate": 0.0006203092399466892, + "loss": 0.90155041, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.84765625, + "step": 2287, + "time_per_iteration": 2.6179182529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167372, + "balance_loss_mlp": 1.08282888, + "epoch": 0.44016929588303194, + "flos": 484129081344.0, + "grad_norm": 0.024305807708132735, + "language_loss": 0.91028094, + "learning_rate": 0.0006200068274854473, + "loss": 0.92195475, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.84619141, + "step": 2288, + "time_per_iteration": 2.6643898487091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168421, + "balance_loss_mlp": 1.08387816, + "epoch": 0.4403616775682955, + "flos": 573023806464.0, + "grad_norm": 0.025110382343061666, + "language_loss": 0.90969157, + "learning_rate": 0.0006197043684334229, + "loss": 0.92137575, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.84619141, + "step": 2289, + "time_per_iteration": 2.7810122966766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169529, + "balance_loss_mlp": 1.08503318, + "epoch": 0.44055405925355906, + "flos": 631999339008.0, + "grad_norm": 0.03160389670817918, + "language_loss": 0.85855997, + "learning_rate": 0.0006194018629080411, + "loss": 0.87025523, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.84570312, + "step": 2290, + "time_per_iteration": 2.7407448291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165877, + "balance_loss_mlp": 1.08147717, + "epoch": 0.44074644093882265, + "flos": 537825248256.0, + "grad_norm": 0.027939915930863316, + "language_loss": 0.87505877, + "learning_rate": 0.0006190993110267451, + "loss": 0.88671762, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.84472656, + "step": 2291, + "time_per_iteration": 2.7158915996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167062, + "balance_loss_mlp": 1.08280444, + "epoch": 0.4409388226240862, + "flos": 464165730816.0, + "grad_norm": 0.03127864863359821, + "language_loss": 0.91365832, + "learning_rate": 0.0006187967129069958, + "loss": 0.92532897, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.84326172, + "step": 2292, + "time_per_iteration": 2.506866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167494, + "balance_loss_mlp": 1.08337986, + "epoch": 0.44113120430934977, + "flos": 567160290816.0, + "grad_norm": 0.024295125434261364, + "language_loss": 0.92081046, + "learning_rate": 0.0006184940686662722, + "loss": 0.93248534, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.84179688, + "step": 2293, + "time_per_iteration": 2.7406985759735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168054, + "balance_loss_mlp": 1.084131, + "epoch": 0.4413235859946133, + "flos": 544674415104.0, + "grad_norm": 0.02998433601693185, + "language_loss": 0.95718068, + "learning_rate": 0.0006181913784220714, + "loss": 0.96886122, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.83984375, + "step": 2294, + "time_per_iteration": 2.7276971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186783, + "balance_loss_mlp": 1.1034317, + "epoch": 0.4415159676798769, + "flos": 1573302720000.0, + "grad_norm": 0.012177255736314117, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.8174057, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.83398438, + "step": 2295, + "time_per_iteration": 4.898420333862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174829, + "balance_loss_mlp": 1.0908581, + "epoch": 0.4417083493651404, + "flos": 660012357120.0, + "grad_norm": 0.02926637357686751, + "language_loss": 0.86549121, + "learning_rate": 0.0006175858603933146, + "loss": 0.87723947, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.84033203, + "step": 2296, + "time_per_iteration": 2.866745710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166372, + "balance_loss_mlp": 1.08225799, + "epoch": 0.441900731050404, + "flos": 741816869376.0, + "grad_norm": 0.028401827027787777, + "language_loss": 0.8638438, + "learning_rate": 0.0006172830328438416, + "loss": 0.87550759, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.84179688, + "step": 2297, + "time_per_iteration": 2.9731123447418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165335, + "balance_loss_mlp": 1.08088684, + "epoch": 0.44209311273566754, + "flos": 540595550208.0, + "grad_norm": 0.030114194292861593, + "language_loss": 0.93111193, + "learning_rate": 0.0006169801597610572, + "loss": 0.94276524, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.84521484, + "step": 2298, + "time_per_iteration": 2.777326822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163943, + "balance_loss_mlp": 1.07959104, + "epoch": 0.4422854944209311, + "flos": 622729704960.0, + "grad_norm": 0.030043302620551878, + "language_loss": 0.96779996, + "learning_rate": 0.0006166772412625469, + "loss": 0.97943938, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.84423828, + "step": 2299, + "time_per_iteration": 2.8143997192382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164367, + "balance_loss_mlp": 1.08006215, + "epoch": 0.4424778761061947, + "flos": 660060020736.0, + "grad_norm": 0.031086205360051855, + "language_loss": 0.88609374, + "learning_rate": 0.0006163742774659141, + "loss": 0.89773744, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.84375, + "step": 2300, + "time_per_iteration": 2.8234009742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116513, + "balance_loss_mlp": 1.08087325, + "epoch": 0.44267025779145824, + "flos": 569702281728.0, + "grad_norm": 0.02554920530971592, + "language_loss": 0.92150819, + "learning_rate": 0.0006160712684887801, + "loss": 0.93315947, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.84326172, + "step": 2301, + "time_per_iteration": 2.733370542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170443, + "balance_loss_mlp": 1.08623374, + "epoch": 0.44286263947672183, + "flos": 497818682880.0, + "grad_norm": 0.02788747598953172, + "language_loss": 0.88145387, + "learning_rate": 0.0006157682144487832, + "loss": 0.89315832, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.84277344, + "step": 2302, + "time_per_iteration": 2.766334295272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171189, + "balance_loss_mlp": 1.08697963, + "epoch": 0.44305502116198536, + "flos": 610607903232.0, + "grad_norm": 0.028872273370365097, + "language_loss": 0.89961743, + "learning_rate": 0.0006154651154635793, + "loss": 0.91132939, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.84277344, + "step": 2303, + "time_per_iteration": 2.844402313232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08776116, + "epoch": 0.44324740284724895, + "flos": 471742038528.0, + "grad_norm": 0.028372285588360545, + "language_loss": 0.91810459, + "learning_rate": 0.0006151619716508421, + "loss": 0.92982763, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.84619141, + "step": 2304, + "time_per_iteration": 2.545243263244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166666, + "balance_loss_mlp": 1.08197927, + "epoch": 0.4434397845325125, + "flos": 579811848192.0, + "grad_norm": 0.029138508250266412, + "language_loss": 0.93279153, + "learning_rate": 0.0006148587831282625, + "loss": 0.94445825, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.84765625, + "step": 2305, + "time_per_iteration": 2.6743574142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179131, + "balance_loss_mlp": 1.09654236, + "epoch": 0.44363216621777607, + "flos": 1499995038720.0, + "grad_norm": 0.011431210063158581, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80355197, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.82617188, + "step": 2306, + "time_per_iteration": 4.870469570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177668, + "balance_loss_mlp": 1.09298158, + "epoch": 0.44382454790303966, + "flos": 478285031424.0, + "grad_norm": 0.03377230518223979, + "language_loss": 0.94630158, + "learning_rate": 0.0006142522724244255, + "loss": 0.95807827, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.84765625, + "step": 2307, + "time_per_iteration": 2.5165300369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181709, + "balance_loss_mlp": 1.09912109, + "epoch": 0.4440169295883032, + "flos": 1547303938560.0, + "grad_norm": 0.010354849447395944, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77666426, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.82617188, + "step": 2308, + "time_per_iteration": 4.86593222618103 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168989, + "balance_loss_mlp": 1.0843029, + "epoch": 0.4442093112735668, + "flos": 592290215424.0, + "grad_norm": 0.030546908540126056, + "language_loss": 0.84313834, + "learning_rate": 0.000613645584293942, + "loss": 0.85482824, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.84765625, + "step": 2309, + "time_per_iteration": 2.9245197772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179296, + "balance_loss_mlp": 1.09465766, + "epoch": 0.4444016929588303, + "flos": 531327917568.0, + "grad_norm": 0.02954341623225009, + "language_loss": 0.89990199, + "learning_rate": 0.0006133421739881185, + "loss": 0.91169494, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.84716797, + "step": 2310, + "time_per_iteration": 2.6806466579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173935, + "balance_loss_mlp": 1.08958304, + "epoch": 0.4445940746440939, + "flos": 621388214784.0, + "grad_norm": 0.03132503362752706, + "language_loss": 0.89829159, + "learning_rate": 0.0006130387196789605, + "loss": 0.91003096, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.84423828, + "step": 2311, + "time_per_iteration": 2.7674410343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171768, + "balance_loss_mlp": 1.08751106, + "epoch": 0.4447864563293574, + "flos": 630375869952.0, + "grad_norm": 0.024389617188914626, + "language_loss": 0.89820284, + "learning_rate": 0.0006127352214842795, + "loss": 0.90992051, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.84326172, + "step": 2312, + "time_per_iteration": 3.0181000232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170174, + "balance_loss_mlp": 1.08591735, + "epoch": 0.444978838014621, + "flos": 652001620992.0, + "grad_norm": 0.03266392614581568, + "language_loss": 0.92178452, + "learning_rate": 0.0006124316795219041, + "loss": 0.93348622, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.84326172, + "step": 2313, + "time_per_iteration": 2.7772133350372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172939, + "balance_loss_mlp": 1.08911133, + "epoch": 0.44517121969988455, + "flos": 613588325376.0, + "grad_norm": 0.026148577301855224, + "language_loss": 0.88032007, + "learning_rate": 0.0006121280939096794, + "loss": 0.89204955, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.83886719, + "step": 2314, + "time_per_iteration": 2.7472517490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.09010756, + "epoch": 0.44536360138514813, + "flos": 489714620928.0, + "grad_norm": 0.031365562822013526, + "language_loss": 0.94548678, + "learning_rate": 0.000611824464765468, + "loss": 0.95722377, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.83642578, + "step": 2315, + "time_per_iteration": 2.5471882820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188843, + "balance_loss_mlp": 1.10758972, + "epoch": 0.4455559830704117, + "flos": 1519053877248.0, + "grad_norm": 0.020817362108823283, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79783785, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.8125, + "step": 2316, + "time_per_iteration": 4.660900831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.08663106, + "epoch": 0.44574836475567525, + "flos": 616816524288.0, + "grad_norm": 0.03088300803415325, + "language_loss": 0.9123913, + "learning_rate": 0.000611217076352619, + "loss": 0.92409492, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.83789062, + "step": 2317, + "time_per_iteration": 2.7556822299957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171317, + "balance_loss_mlp": 1.08772719, + "epoch": 0.44594074644093884, + "flos": 507433422336.0, + "grad_norm": 0.026331926721779163, + "language_loss": 0.8931551, + "learning_rate": 0.0006109133173197905, + "loss": 0.90486825, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.83642578, + "step": 2318, + "time_per_iteration": 2.720372200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172625, + "balance_loss_mlp": 1.08908355, + "epoch": 0.44613312812620237, + "flos": 728311918080.0, + "grad_norm": 0.030991917971638312, + "language_loss": 0.91262019, + "learning_rate": 0.0006106095152265935, + "loss": 0.92434645, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.8359375, + "step": 2319, + "time_per_iteration": 2.8956825733184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171779, + "balance_loss_mlp": 1.08776009, + "epoch": 0.44632550981146596, + "flos": 637057850880.0, + "grad_norm": 0.02763281666385245, + "language_loss": 0.90440875, + "learning_rate": 0.0006103056701909739, + "loss": 0.91612655, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.84082031, + "step": 2320, + "time_per_iteration": 2.9104726314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175182, + "balance_loss_mlp": 1.09116352, + "epoch": 0.4465178914967295, + "flos": 828616766976.0, + "grad_norm": 0.02413420043376393, + "language_loss": 0.88773656, + "learning_rate": 0.0006100017823308956, + "loss": 0.89948833, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.84082031, + "step": 2321, + "time_per_iteration": 3.1638107299804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176807, + "balance_loss_mlp": 1.0927887, + "epoch": 0.4467102731819931, + "flos": 667032712704.0, + "grad_norm": 0.03201581013716374, + "language_loss": 0.87315178, + "learning_rate": 0.0006096978517643377, + "loss": 0.88491988, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.84082031, + "step": 2322, + "time_per_iteration": 2.7875144481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182039, + "balance_loss_mlp": 1.09792459, + "epoch": 0.4469026548672566, + "flos": 513969684480.0, + "grad_norm": 0.032089815412588485, + "language_loss": 0.90642822, + "learning_rate": 0.0006093938786092968, + "loss": 0.91824853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.84179688, + "step": 2323, + "time_per_iteration": 2.6789090633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181311, + "balance_loss_mlp": 1.097054, + "epoch": 0.4470950365525202, + "flos": 685285272576.0, + "grad_norm": 0.032095192334159584, + "language_loss": 0.95970643, + "learning_rate": 0.0006090898629837857, + "loss": 0.97151959, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.84326172, + "step": 2324, + "time_per_iteration": 2.842829704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174335, + "balance_loss_mlp": 1.08993506, + "epoch": 0.4472874182377838, + "flos": 628534823424.0, + "grad_norm": 0.02542366781046337, + "language_loss": 0.93390518, + "learning_rate": 0.0006087858050058337, + "loss": 0.94564855, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.84472656, + "step": 2325, + "time_per_iteration": 2.798461675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173301, + "balance_loss_mlp": 1.08899629, + "epoch": 0.4474797999230473, + "flos": 548240988672.0, + "grad_norm": 0.026872235695321916, + "language_loss": 0.8790192, + "learning_rate": 0.0006084817047934866, + "loss": 0.8907522, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.84375, + "step": 2326, + "time_per_iteration": 2.6333069801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170357, + "balance_loss_mlp": 1.08552742, + "epoch": 0.4476721816083109, + "flos": 456756609024.0, + "grad_norm": 0.03263470786125086, + "language_loss": 0.9605242, + "learning_rate": 0.0006081775624648066, + "loss": 0.97222769, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.84912109, + "step": 2327, + "time_per_iteration": 2.506568431854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171196, + "balance_loss_mlp": 1.08660555, + "epoch": 0.44786456329357444, + "flos": 482500882944.0, + "grad_norm": 0.030530219610100114, + "language_loss": 0.89424241, + "learning_rate": 0.0006078733781378721, + "loss": 0.90595436, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.84667969, + "step": 2328, + "time_per_iteration": 2.5324759483337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174464, + "balance_loss_mlp": 1.09006357, + "epoch": 0.448056944978838, + "flos": 553236374016.0, + "grad_norm": 0.028423200188041658, + "language_loss": 0.87742424, + "learning_rate": 0.0006075691519307781, + "loss": 0.88916886, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.84472656, + "step": 2329, + "time_per_iteration": 2.8329951763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169966, + "balance_loss_mlp": 1.08580375, + "epoch": 0.44824932666410156, + "flos": 551916350976.0, + "grad_norm": 0.030957218182316032, + "language_loss": 0.88990253, + "learning_rate": 0.0006072648839616356, + "loss": 0.90160215, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.84228516, + "step": 2330, + "time_per_iteration": 2.6367061138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169901, + "balance_loss_mlp": 1.08612072, + "epoch": 0.44844170834936514, + "flos": 990271953408.0, + "grad_norm": 0.02484019388371453, + "language_loss": 0.87772298, + "learning_rate": 0.0006069605743485718, + "loss": 0.88942194, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.83837891, + "step": 2331, + "time_per_iteration": 3.3425865173339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177153, + "balance_loss_mlp": 1.09356356, + "epoch": 0.44863409003462873, + "flos": 592450670592.0, + "grad_norm": 0.02816420707323987, + "language_loss": 0.89319122, + "learning_rate": 0.0006066562232097303, + "loss": 0.90496272, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.83642578, + "step": 2332, + "time_per_iteration": 2.7754669189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.09473884, + "epoch": 0.44882647171989226, + "flos": 725984776704.0, + "grad_norm": 0.02840681089712515, + "language_loss": 0.91798162, + "learning_rate": 0.0006063518306632708, + "loss": 0.92976487, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.83642578, + "step": 2333, + "time_per_iteration": 2.9270272254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174339, + "balance_loss_mlp": 1.09065437, + "epoch": 0.44901885340515585, + "flos": 535990932480.0, + "grad_norm": 0.029373675588589353, + "language_loss": 0.88265771, + "learning_rate": 0.0006060473968273688, + "loss": 0.89440107, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.83740234, + "step": 2334, + "time_per_iteration": 2.6593613624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199905, + "balance_loss_mlp": 1.11693573, + "epoch": 0.4492112350904194, + "flos": 1558690593792.0, + "grad_norm": 0.016875691883268894, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79079443, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.83007812, + "step": 2335, + "time_per_iteration": 4.868390321731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182762, + "balance_loss_mlp": 1.10017395, + "epoch": 0.44940361677568297, + "flos": 1526700768768.0, + "grad_norm": 0.009982769528938305, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82187974, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.82617188, + "step": 2336, + "time_per_iteration": 4.8639936447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176453, + "balance_loss_mlp": 1.09286392, + "epoch": 0.4495959984609465, + "flos": 383320673280.0, + "grad_norm": 0.04017386378382665, + "language_loss": 0.95653474, + "learning_rate": 0.0006051338487650047, + "loss": 0.96829921, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.83642578, + "step": 2337, + "time_per_iteration": 2.451195240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177188, + "balance_loss_mlp": 1.09364605, + "epoch": 0.4497883801462101, + "flos": 498882196992.0, + "grad_norm": 0.03424215683733749, + "language_loss": 0.88682485, + "learning_rate": 0.0006048292509534095, + "loss": 0.89859676, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.8359375, + "step": 2338, + "time_per_iteration": 2.5799245834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174646, + "balance_loss_mlp": 1.09139061, + "epoch": 0.4499807618314736, + "flos": 615589827072.0, + "grad_norm": 0.03300851417215051, + "language_loss": 0.85045063, + "learning_rate": 0.0006045246124434895, + "loss": 0.86219716, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.83300781, + "step": 2339, + "time_per_iteration": 2.732715368270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170546, + "balance_loss_mlp": 1.08738542, + "epoch": 0.4501731435167372, + "flos": 1007067503616.0, + "grad_norm": 0.0319502465029259, + "language_loss": 0.92538428, + "learning_rate": 0.0006042199333535162, + "loss": 0.9370898, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.83203125, + "step": 2340, + "time_per_iteration": 3.3100435733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170678, + "balance_loss_mlp": 1.08742249, + "epoch": 0.4503655252020008, + "flos": 822327555072.0, + "grad_norm": 0.024782286149646622, + "language_loss": 0.88794839, + "learning_rate": 0.0006039152138017763, + "loss": 0.89965516, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.83300781, + "step": 2341, + "time_per_iteration": 3.0845420360565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117382, + "balance_loss_mlp": 1.09027839, + "epoch": 0.4505579068872643, + "flos": 487413676032.0, + "grad_norm": 0.028274686754151398, + "language_loss": 0.8912791, + "learning_rate": 0.0006036104539065726, + "loss": 0.90301728, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.8359375, + "step": 2342, + "time_per_iteration": 2.704869270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170482, + "balance_loss_mlp": 1.08679724, + "epoch": 0.4507502885725279, + "flos": 886335403008.0, + "grad_norm": 0.02767032513042878, + "language_loss": 0.89237905, + "learning_rate": 0.000603305653786223, + "loss": 0.90408385, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.83740234, + "step": 2343, + "time_per_iteration": 3.143308162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169453, + "balance_loss_mlp": 1.08576834, + "epoch": 0.45094267025779144, + "flos": 579421080576.0, + "grad_norm": 0.028420960086658186, + "language_loss": 0.90634954, + "learning_rate": 0.0006030008135590622, + "loss": 0.91804409, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.83740234, + "step": 2344, + "time_per_iteration": 2.7383973598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177198, + "balance_loss_mlp": 1.09332275, + "epoch": 0.45113505194305503, + "flos": 526441320960.0, + "grad_norm": 0.025225422820390885, + "language_loss": 0.85642457, + "learning_rate": 0.0006026959333434387, + "loss": 0.86819655, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.83935547, + "step": 2345, + "time_per_iteration": 2.7594330310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177326, + "balance_loss_mlp": 1.09316456, + "epoch": 0.45132743362831856, + "flos": 503115512832.0, + "grad_norm": 0.026356266791679354, + "language_loss": 0.83258432, + "learning_rate": 0.0006023910132577181, + "loss": 0.84435755, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.84228516, + "step": 2346, + "time_per_iteration": 2.6426072120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174296, + "balance_loss_mlp": 1.09051549, + "epoch": 0.45151981531358215, + "flos": 432835917312.0, + "grad_norm": 0.03747446326611767, + "language_loss": 0.91464496, + "learning_rate": 0.0006020860534202806, + "loss": 0.92638797, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.83837891, + "step": 2347, + "time_per_iteration": 2.5375916957855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08799899, + "epoch": 0.4517121969988457, + "flos": 713493674496.0, + "grad_norm": 0.026159040948808, + "language_loss": 0.86486131, + "learning_rate": 0.0006017810539495224, + "loss": 0.87658435, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.84375, + "step": 2348, + "time_per_iteration": 2.935776472091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172944, + "balance_loss_mlp": 1.0886873, + "epoch": 0.45190457868410927, + "flos": 580556453376.0, + "grad_norm": 0.02859512200307389, + "language_loss": 0.8919422, + "learning_rate": 0.0006014760149638547, + "loss": 0.90367162, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.84326172, + "step": 2349, + "time_per_iteration": 4.1359429359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117423, + "balance_loss_mlp": 1.08982956, + "epoch": 0.45209696036937286, + "flos": 483627523584.0, + "grad_norm": 0.04225699722465749, + "language_loss": 0.94155228, + "learning_rate": 0.000601170936581704, + "loss": 0.95329458, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.84472656, + "step": 2350, + "time_per_iteration": 2.551886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171893, + "balance_loss_mlp": 1.08739793, + "epoch": 0.4522893420546364, + "flos": 541259564544.0, + "grad_norm": 0.03047412078786442, + "language_loss": 0.90869355, + "learning_rate": 0.0006008658189215121, + "loss": 0.92041242, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.84570312, + "step": 2351, + "time_per_iteration": 2.6196951866149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176582, + "balance_loss_mlp": 1.09175217, + "epoch": 0.4524817237399, + "flos": 497690428416.0, + "grad_norm": 0.03573709607194862, + "language_loss": 0.8682127, + "learning_rate": 0.0006005606621017366, + "loss": 0.87997848, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.84912109, + "step": 2352, + "time_per_iteration": 2.5675714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174694, + "balance_loss_mlp": 1.09024608, + "epoch": 0.4526741054251635, + "flos": 653840666112.0, + "grad_norm": 0.027536817578414453, + "language_loss": 0.86718237, + "learning_rate": 0.0006002554662408496, + "loss": 0.87892926, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.84521484, + "step": 2353, + "time_per_iteration": 2.887061595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182285, + "balance_loss_mlp": 1.09774196, + "epoch": 0.4528664871104271, + "flos": 572003226624.0, + "grad_norm": 0.03098083736113463, + "language_loss": 0.96988797, + "learning_rate": 0.0005999502314573388, + "loss": 0.98171079, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.84619141, + "step": 2354, + "time_per_iteration": 2.6700878143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184968, + "balance_loss_mlp": 1.1005199, + "epoch": 0.45305886879569063, + "flos": 459678633984.0, + "grad_norm": 0.034884925425697356, + "language_loss": 0.93055832, + "learning_rate": 0.0005996449578697066, + "loss": 0.94240803, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.84521484, + "step": 2355, + "time_per_iteration": 2.6873598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180832, + "balance_loss_mlp": 1.09647942, + "epoch": 0.4532512504809542, + "flos": 506206725120.0, + "grad_norm": 0.028006133853455534, + "language_loss": 0.87364781, + "learning_rate": 0.0005993396455964709, + "loss": 0.88545609, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.84423828, + "step": 2356, + "time_per_iteration": 2.672428607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179518, + "balance_loss_mlp": 1.09545124, + "epoch": 0.4534436321662178, + "flos": 583311292416.0, + "grad_norm": 0.033764708533666976, + "language_loss": 0.88888013, + "learning_rate": 0.0005990342947561647, + "loss": 0.90067536, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.84130859, + "step": 2357, + "time_per_iteration": 2.7101337909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179265, + "balance_loss_mlp": 1.09529436, + "epoch": 0.45363601385148133, + "flos": 550772246016.0, + "grad_norm": 0.03168807299418994, + "language_loss": 0.84871709, + "learning_rate": 0.0005987289054673351, + "loss": 0.86050975, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.84033203, + "step": 2358, + "time_per_iteration": 2.6033973693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122184, + "balance_loss_mlp": 1.14096832, + "epoch": 0.4538283955367449, + "flos": 1477791141888.0, + "grad_norm": 0.02971290012878958, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.7779758, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.80859375, + "step": 2359, + "time_per_iteration": 4.841644525527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172875, + "balance_loss_mlp": 1.0889039, + "epoch": 0.45402077722200845, + "flos": 585796887552.0, + "grad_norm": 0.03208897744410929, + "language_loss": 0.98243296, + "learning_rate": 0.0005981180120183722, + "loss": 0.99416173, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.84033203, + "step": 2360, + "time_per_iteration": 2.76943302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183781, + "balance_loss_mlp": 1.09957135, + "epoch": 0.45421315890727204, + "flos": 532888986624.0, + "grad_norm": 0.026822351719262807, + "language_loss": 0.89930874, + "learning_rate": 0.0005978125080954089, + "loss": 0.91114652, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.84277344, + "step": 2361, + "time_per_iteration": 2.822767972946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180091, + "balance_loss_mlp": 1.09597707, + "epoch": 0.4544055405925356, + "flos": 786551577600.0, + "grad_norm": 0.034773976616178995, + "language_loss": 0.84516251, + "learning_rate": 0.000597506966198262, + "loss": 0.85696352, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.84179688, + "step": 2362, + "time_per_iteration": 2.952383518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177, + "balance_loss_mlp": 1.09288561, + "epoch": 0.45459792227779916, + "flos": 519201386496.0, + "grad_norm": 0.03664720273497137, + "language_loss": 0.91360861, + "learning_rate": 0.0005972013864455536, + "loss": 0.92537856, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.84179688, + "step": 2363, + "time_per_iteration": 2.6317927837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178521, + "balance_loss_mlp": 1.09450209, + "epoch": 0.4547903039630627, + "flos": 538598051328.0, + "grad_norm": 0.028772208334572696, + "language_loss": 0.91273308, + "learning_rate": 0.0005968957689559203, + "loss": 0.92451829, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.84082031, + "step": 2364, + "time_per_iteration": 2.6589906215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173596, + "balance_loss_mlp": 1.0895294, + "epoch": 0.4549826856483263, + "flos": 529690987008.0, + "grad_norm": 0.029727340486193105, + "language_loss": 0.95477283, + "learning_rate": 0.0005965901138480131, + "loss": 0.96650875, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.84130859, + "step": 2365, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171355, + "balance_loss_mlp": 1.08700228, + "epoch": 0.45517506733358987, + "flos": 521982422016.0, + "grad_norm": 0.030829958952989886, + "language_loss": 0.94295681, + "learning_rate": 0.0005962844212404982, + "loss": 0.95467031, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.84423828, + "step": 2366, + "time_per_iteration": 2.662235736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177016, + "balance_loss_mlp": 1.09271073, + "epoch": 0.4553674490188534, + "flos": 452009000448.0, + "grad_norm": 0.02436634770305822, + "language_loss": 0.92783928, + "learning_rate": 0.0005959786912520558, + "loss": 0.93960941, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.84375, + "step": 2367, + "time_per_iteration": 2.573124408721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117318, + "balance_loss_mlp": 1.08906567, + "epoch": 0.455559830704117, + "flos": 547744160256.0, + "grad_norm": 0.037205613753220755, + "language_loss": 0.90209919, + "learning_rate": 0.0005956729240013806, + "loss": 0.913831, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.84179688, + "step": 2368, + "time_per_iteration": 2.772557020187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173597, + "balance_loss_mlp": 1.08943486, + "epoch": 0.4557522123893805, + "flos": 584865630720.0, + "grad_norm": 0.026144628796570656, + "language_loss": 0.97770655, + "learning_rate": 0.0005953671196071824, + "loss": 0.98944247, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.84228516, + "step": 2369, + "time_per_iteration": 2.7082910537719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.08819652, + "epoch": 0.4559445940746441, + "flos": 527483367936.0, + "grad_norm": 0.0309922218143565, + "language_loss": 0.8751142, + "learning_rate": 0.0005950612781881846, + "loss": 0.8868373, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.84179688, + "step": 2370, + "time_per_iteration": 2.7258613109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172868, + "balance_loss_mlp": 1.08913577, + "epoch": 0.45613697575990764, + "flos": 653367306240.0, + "grad_norm": 0.03125586624235708, + "language_loss": 0.84058654, + "learning_rate": 0.0005947553998631259, + "loss": 0.85231519, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.83789062, + "step": 2371, + "time_per_iteration": 2.8463094234466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169614, + "balance_loss_mlp": 1.08626282, + "epoch": 0.4563293574451712, + "flos": 868623332352.0, + "grad_norm": 0.025158843177806284, + "language_loss": 0.84537494, + "learning_rate": 0.000594449484750758, + "loss": 0.85707104, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.83398438, + "step": 2372, + "time_per_iteration": 3.1793160438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165382, + "balance_loss_mlp": 1.08193552, + "epoch": 0.45652173913043476, + "flos": 499131975168.0, + "grad_norm": 0.03016735007152292, + "language_loss": 0.8953886, + "learning_rate": 0.0005941435329698484, + "loss": 0.90704238, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.83496094, + "step": 2373, + "time_per_iteration": 2.6885011196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168274, + "balance_loss_mlp": 1.08458936, + "epoch": 0.45671412081569834, + "flos": 561958788096.0, + "grad_norm": 0.029049495784182693, + "language_loss": 0.89830238, + "learning_rate": 0.0005938375446391778, + "loss": 0.90998513, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.83740234, + "step": 2374, + "time_per_iteration": 2.7694103717803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169785, + "balance_loss_mlp": 1.08605206, + "epoch": 0.45690650250096193, + "flos": 504122631168.0, + "grad_norm": 0.032895841438659715, + "language_loss": 0.95283711, + "learning_rate": 0.0005935315198775415, + "loss": 0.96453488, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.83789062, + "step": 2375, + "time_per_iteration": 2.6797261238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117336, + "balance_loss_mlp": 1.08967507, + "epoch": 0.45709888418622546, + "flos": 431598486528.0, + "grad_norm": 0.029217874962507603, + "language_loss": 0.93084061, + "learning_rate": 0.0005932254588037486, + "loss": 0.94257426, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.83740234, + "step": 2376, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170171, + "balance_loss_mlp": 1.08634305, + "epoch": 0.45729126587148905, + "flos": 526693100544.0, + "grad_norm": 0.033600967739372, + "language_loss": 0.91914618, + "learning_rate": 0.000592919361536623, + "loss": 0.93084788, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.83886719, + "step": 2377, + "time_per_iteration": 2.627753734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.08861363, + "epoch": 0.4574836475567526, + "flos": 639147949056.0, + "grad_norm": 0.02676395696709272, + "language_loss": 0.95213675, + "learning_rate": 0.0005926132281950017, + "loss": 0.9638592, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.83691406, + "step": 2378, + "time_per_iteration": 2.7404637336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171278, + "balance_loss_mlp": 1.08754539, + "epoch": 0.45767602924201617, + "flos": 650790386688.0, + "grad_norm": 0.03076010987013328, + "language_loss": 0.92175043, + "learning_rate": 0.0005923070588977367, + "loss": 0.93346316, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.83789062, + "step": 2379, + "time_per_iteration": 2.7948412895202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173225, + "balance_loss_mlp": 1.08944476, + "epoch": 0.4578684109272797, + "flos": 747962363904.0, + "grad_norm": 0.027484014603145524, + "language_loss": 0.92339164, + "learning_rate": 0.0005920008537636931, + "loss": 0.93512392, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.83837891, + "step": 2380, + "time_per_iteration": 2.903837203979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.08972311, + "epoch": 0.4580607926125433, + "flos": 642727984128.0, + "grad_norm": 0.029077527756171735, + "language_loss": 0.92490625, + "learning_rate": 0.0005916946129117504, + "loss": 0.93664026, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.83740234, + "step": 2381, + "time_per_iteration": 2.902449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169328, + "balance_loss_mlp": 1.08569121, + "epoch": 0.4582531742978069, + "flos": 803239065600.0, + "grad_norm": 0.02842187637415346, + "language_loss": 0.86509985, + "learning_rate": 0.0005913883364608017, + "loss": 0.87679315, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.83691406, + "step": 2382, + "time_per_iteration": 3.0474140644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171424, + "balance_loss_mlp": 1.0876435, + "epoch": 0.4584455559830704, + "flos": 685517586432.0, + "grad_norm": 0.02678099894990505, + "language_loss": 0.94194049, + "learning_rate": 0.0005910820245297542, + "loss": 0.95365477, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.83837891, + "step": 2383, + "time_per_iteration": 2.879652261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171015, + "balance_loss_mlp": 1.08718669, + "epoch": 0.458637937668334, + "flos": 519281977344.0, + "grad_norm": 0.03033035418174317, + "language_loss": 0.87193358, + "learning_rate": 0.000590775677237529, + "loss": 0.88364375, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.83886719, + "step": 2384, + "time_per_iteration": 2.718327045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116823, + "balance_loss_mlp": 1.08478332, + "epoch": 0.4588303193535975, + "flos": 506532364800.0, + "grad_norm": 0.028303891516217768, + "language_loss": 0.87188554, + "learning_rate": 0.0005904692947030601, + "loss": 0.88356787, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.83496094, + "step": 2385, + "time_per_iteration": 2.5850000381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166672, + "balance_loss_mlp": 1.08303475, + "epoch": 0.4590227010388611, + "flos": 496908893184.0, + "grad_norm": 0.031451346934425, + "language_loss": 0.9665041, + "learning_rate": 0.0005901628770452963, + "loss": 0.97817081, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.83691406, + "step": 2386, + "time_per_iteration": 2.5478482246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172991, + "balance_loss_mlp": 1.08964002, + "epoch": 0.45921508272412465, + "flos": 494601217536.0, + "grad_norm": 0.030858044337890404, + "language_loss": 0.93199378, + "learning_rate": 0.000589856424383199, + "loss": 0.94372368, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.83398438, + "step": 2387, + "time_per_iteration": 2.6889121532440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170845, + "balance_loss_mlp": 1.08744633, + "epoch": 0.45940746440938823, + "flos": 692592336384.0, + "grad_norm": 0.02985924743030105, + "language_loss": 0.89320701, + "learning_rate": 0.000589549936835744, + "loss": 0.90491545, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.83447266, + "step": 2388, + "time_per_iteration": 2.929584264755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167353, + "balance_loss_mlp": 1.08390617, + "epoch": 0.45959984609465176, + "flos": 504736980480.0, + "grad_norm": 0.026272627268038303, + "language_loss": 0.85652947, + "learning_rate": 0.0005892434145219202, + "loss": 0.86820304, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.83496094, + "step": 2389, + "time_per_iteration": 2.6049258708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.08593321, + "epoch": 0.45979222777991535, + "flos": 677839220736.0, + "grad_norm": 0.032142260667283734, + "language_loss": 0.89047158, + "learning_rate": 0.0005889368575607303, + "loss": 0.90216345, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.83300781, + "step": 2390, + "time_per_iteration": 2.8630926609039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170087, + "balance_loss_mlp": 1.08673584, + "epoch": 0.45998460946517894, + "flos": 779038396416.0, + "grad_norm": 0.02948026619685868, + "language_loss": 0.84149277, + "learning_rate": 0.00058863026607119, + "loss": 0.85319364, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.83398438, + "step": 2391, + "time_per_iteration": 3.0889787673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.08709574, + "epoch": 0.46017699115044247, + "flos": 853021552128.0, + "grad_norm": 0.028406278062058678, + "language_loss": 0.85429174, + "learning_rate": 0.0005883236401723287, + "loss": 0.8659972, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.83496094, + "step": 2392, + "time_per_iteration": 3.1613874435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167478, + "balance_loss_mlp": 1.08403194, + "epoch": 0.46036937283570606, + "flos": 576963683328.0, + "grad_norm": 0.029157836827012555, + "language_loss": 0.90157199, + "learning_rate": 0.0005880169799831893, + "loss": 0.91324675, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.83496094, + "step": 2393, + "time_per_iteration": 2.6974027156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117291, + "balance_loss_mlp": 1.08955884, + "epoch": 0.4605617545209696, + "flos": 613119694848.0, + "grad_norm": 0.028584885066092792, + "language_loss": 0.87511885, + "learning_rate": 0.0005877102856228278, + "loss": 0.88684797, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.83398438, + "step": 2394, + "time_per_iteration": 2.862462043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169553, + "balance_loss_mlp": 1.08591628, + "epoch": 0.4607541362062332, + "flos": 534158618112.0, + "grad_norm": 0.03156913659667245, + "language_loss": 0.91444194, + "learning_rate": 0.0005874035572103133, + "loss": 0.92613751, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.83691406, + "step": 2395, + "time_per_iteration": 2.66796612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171842, + "balance_loss_mlp": 1.08830035, + "epoch": 0.4609465178914967, + "flos": 648473978880.0, + "grad_norm": 0.039315545211924735, + "language_loss": 0.89278555, + "learning_rate": 0.0005870967948647288, + "loss": 0.90450394, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.8359375, + "step": 2396, + "time_per_iteration": 2.7669596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209076, + "balance_loss_mlp": 1.12553406, + "epoch": 0.4611388995767603, + "flos": 1469498426880.0, + "grad_norm": 0.015424486797259693, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.7551738, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.8359375, + "step": 2397, + "time_per_iteration": 5.5382936000823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.09377611, + "epoch": 0.46133128126202383, + "flos": 724476100608.0, + "grad_norm": 0.029375695907885992, + "language_loss": 0.91919947, + "learning_rate": 0.0005864831688507443, + "loss": 0.93097073, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.83398438, + "step": 2398, + "time_per_iteration": 2.95526123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171581, + "balance_loss_mlp": 1.08846855, + "epoch": 0.4615236629472874, + "flos": 549113848320.0, + "grad_norm": 0.030696537047505416, + "language_loss": 0.82409662, + "learning_rate": 0.0005861763054205754, + "loss": 0.83581245, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.83154297, + "step": 2399, + "time_per_iteration": 2.767615795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172709, + "balance_loss_mlp": 1.08973968, + "epoch": 0.461716044632551, + "flos": 603459293184.0, + "grad_norm": 0.02737063612292851, + "language_loss": 0.84976828, + "learning_rate": 0.0005858694085337976, + "loss": 0.86149538, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.83007812, + "step": 2400, + "time_per_iteration": 2.7964670658111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011724, + "balance_loss_mlp": 1.08966899, + "epoch": 0.46190842631781454, + "flos": 475436866560.0, + "grad_norm": 0.03229000781534058, + "language_loss": 0.9094255, + "learning_rate": 0.0005855624783095589, + "loss": 0.92114949, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.82763672, + "step": 2401, + "time_per_iteration": 2.534349203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170734, + "balance_loss_mlp": 1.08814597, + "epoch": 0.4621008080030781, + "flos": 438401991168.0, + "grad_norm": 0.027555285929390542, + "language_loss": 0.90607065, + "learning_rate": 0.00058525551486702, + "loss": 0.91777802, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.82617188, + "step": 2402, + "time_per_iteration": 2.5021228790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172463, + "balance_loss_mlp": 1.08987451, + "epoch": 0.46229318968834165, + "flos": 526497716736.0, + "grad_norm": 0.03262891309156314, + "language_loss": 0.88400978, + "learning_rate": 0.0005849485183253548, + "loss": 0.89573443, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.82617188, + "step": 2403, + "time_per_iteration": 2.6212213039398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165506, + "balance_loss_mlp": 1.08291745, + "epoch": 0.46248557137360524, + "flos": 440533748736.0, + "grad_norm": 0.02845192827842058, + "language_loss": 0.92361593, + "learning_rate": 0.0005846414888037501, + "loss": 0.93527102, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.82617188, + "step": 2404, + "time_per_iteration": 2.482285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166688, + "balance_loss_mlp": 1.08409953, + "epoch": 0.4626779530588688, + "flos": 618772363776.0, + "grad_norm": 0.03074329225106782, + "language_loss": 0.881423, + "learning_rate": 0.0005843344264214049, + "loss": 0.89308989, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.82617188, + "step": 2405, + "time_per_iteration": 2.746795415878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170811, + "balance_loss_mlp": 1.08803225, + "epoch": 0.46287033474413236, + "flos": 671359354368.0, + "grad_norm": 0.02816556419491645, + "language_loss": 0.904742, + "learning_rate": 0.0005840273312975317, + "loss": 0.91645014, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.828125, + "step": 2406, + "time_per_iteration": 2.866894483566284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168834, + "balance_loss_mlp": 1.08572149, + "epoch": 0.46306271642939595, + "flos": 481198324224.0, + "grad_norm": 0.027370741977369897, + "language_loss": 0.96141434, + "learning_rate": 0.0005837202035513555, + "loss": 0.97310269, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.83154297, + "step": 2407, + "time_per_iteration": 2.589233636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168547, + "balance_loss_mlp": 1.08562469, + "epoch": 0.4632550981146595, + "flos": 581857010688.0, + "grad_norm": 0.028787881065009197, + "language_loss": 0.87249482, + "learning_rate": 0.0005834130433021136, + "loss": 0.88418025, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.82958984, + "step": 2408, + "time_per_iteration": 2.77109432220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176276, + "balance_loss_mlp": 1.09311593, + "epoch": 0.46344747979992307, + "flos": 525017238528.0, + "grad_norm": 0.03139748973768327, + "language_loss": 0.79860151, + "learning_rate": 0.0005831058506690563, + "loss": 0.81036425, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.83203125, + "step": 2409, + "time_per_iteration": 2.6422629356384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175968, + "balance_loss_mlp": 1.0931412, + "epoch": 0.4636398614851866, + "flos": 747812642304.0, + "grad_norm": 0.02712568041794283, + "language_loss": 0.9122293, + "learning_rate": 0.0005827986257714464, + "loss": 0.92398894, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.82861328, + "step": 2410, + "time_per_iteration": 2.915513515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175895, + "balance_loss_mlp": 1.09254348, + "epoch": 0.4638322431704502, + "flos": 597645442560.0, + "grad_norm": 0.03337742182336422, + "language_loss": 0.94969916, + "learning_rate": 0.0005824913687285591, + "loss": 0.96145809, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.83398438, + "step": 2411, + "time_per_iteration": 2.7729153633117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174985, + "balance_loss_mlp": 1.09168148, + "epoch": 0.4640246248557137, + "flos": 540532423680.0, + "grad_norm": 0.028926449520475586, + "language_loss": 0.87762833, + "learning_rate": 0.0005821840796596821, + "loss": 0.88937813, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.83349609, + "step": 2412, + "time_per_iteration": 2.7454707622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174854, + "balance_loss_mlp": 1.09155095, + "epoch": 0.4642170065409773, + "flos": 563808566784.0, + "grad_norm": 0.027243427778446835, + "language_loss": 0.85983133, + "learning_rate": 0.0005818767586841158, + "loss": 0.87157989, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.83349609, + "step": 2413, + "time_per_iteration": 2.7634999752044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174726, + "balance_loss_mlp": 1.09161353, + "epoch": 0.46440938822624084, + "flos": 532061789184.0, + "grad_norm": 0.026139841130999073, + "language_loss": 0.91185576, + "learning_rate": 0.0005815694059211726, + "loss": 0.923603, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.83154297, + "step": 2414, + "time_per_iteration": 2.6814608573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193306, + "balance_loss_mlp": 1.11109924, + "epoch": 0.4646017699115044, + "flos": 1529624795136.0, + "grad_norm": 0.015412108289742382, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82066941, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.82226562, + "step": 2415, + "time_per_iteration": 4.867271184921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183273, + "balance_loss_mlp": 1.10163879, + "epoch": 0.464794151596768, + "flos": 1544171793408.0, + "grad_norm": 0.012751682226462524, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78128332, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.81640625, + "step": 2416, + "time_per_iteration": 5.0150392055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166548, + "balance_loss_mlp": 1.08391249, + "epoch": 0.46498653328203154, + "flos": 502538093568.0, + "grad_norm": 0.028765151082888876, + "language_loss": 0.92239797, + "learning_rate": 0.0005806471581013931, + "loss": 0.93406343, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.82666016, + "step": 2417, + "time_per_iteration": 2.6913554668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165917, + "balance_loss_mlp": 1.08332872, + "epoch": 0.46517891496729513, + "flos": 677300732928.0, + "grad_norm": 0.03431254801555697, + "language_loss": 0.85110676, + "learning_rate": 0.0005803396793823146, + "loss": 0.86276597, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.82617188, + "step": 2418, + "time_per_iteration": 2.8245232105255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169421, + "balance_loss_mlp": 1.08702314, + "epoch": 0.46537129665255866, + "flos": 586511293440.0, + "grad_norm": 0.03532488466841911, + "language_loss": 0.93255758, + "learning_rate": 0.0005800321694726065, + "loss": 0.94425178, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.82421875, + "step": 2419, + "time_per_iteration": 2.74255108833313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117097, + "balance_loss_mlp": 1.08866799, + "epoch": 0.46556367833782225, + "flos": 588820970496.0, + "grad_norm": 0.031254530654890866, + "language_loss": 0.92505676, + "learning_rate": 0.0005797246284916545, + "loss": 0.93676651, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.82324219, + "step": 2420, + "time_per_iteration": 2.6942667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182114, + "balance_loss_mlp": 1.10238647, + "epoch": 0.4657560600230858, + "flos": 1488582187008.0, + "grad_norm": 0.01896402624903705, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78687304, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.796875, + "step": 2421, + "time_per_iteration": 4.965069532394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179806, + "balance_loss_mlp": 1.09740829, + "epoch": 0.46594844170834937, + "flos": 581392382976.0, + "grad_norm": 0.035008146137172264, + "language_loss": 0.92618293, + "learning_rate": 0.0005791094537936233, + "loss": 0.93798101, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.82421875, + "step": 2422, + "time_per_iteration": 2.7509443759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116805, + "balance_loss_mlp": 1.08555722, + "epoch": 0.4661408233936129, + "flos": 513570184704.0, + "grad_norm": 0.03182837491947037, + "language_loss": 0.88539767, + "learning_rate": 0.0005788018203153762, + "loss": 0.89707822, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.82519531, + "step": 2423, + "time_per_iteration": 2.6291344165802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163454, + "balance_loss_mlp": 1.08038855, + "epoch": 0.4663332050788765, + "flos": 492033030144.0, + "grad_norm": 0.03147692461991822, + "language_loss": 0.92034245, + "learning_rate": 0.000578494156243549, + "loss": 0.93197691, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.83105469, + "step": 2424, + "time_per_iteration": 2.5616393089294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167158, + "balance_loss_mlp": 1.08390224, + "epoch": 0.4665255867641401, + "flos": 513707171328.0, + "grad_norm": 0.028174773974589257, + "language_loss": 0.94988501, + "learning_rate": 0.0005781864616975878, + "loss": 0.96155655, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.83300781, + "step": 2425, + "time_per_iteration": 2.67893648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178777, + "balance_loss_mlp": 1.09552157, + "epoch": 0.4667179684494036, + "flos": 425706772992.0, + "grad_norm": 0.03381525890081808, + "language_loss": 0.91298926, + "learning_rate": 0.0005778787367969502, + "loss": 0.92477703, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.83300781, + "step": 2426, + "time_per_iteration": 2.5708863735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180968, + "balance_loss_mlp": 1.09790349, + "epoch": 0.4669103501346672, + "flos": 709223428608.0, + "grad_norm": 0.031023375068471706, + "language_loss": 0.86979687, + "learning_rate": 0.0005775709816611053, + "loss": 0.88160658, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.83105469, + "step": 2427, + "time_per_iteration": 2.9488039016723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178737, + "balance_loss_mlp": 1.09543312, + "epoch": 0.4671027318199307, + "flos": 555945550848.0, + "grad_norm": 0.0268683026146142, + "language_loss": 0.8862977, + "learning_rate": 0.0005772631964095346, + "loss": 0.89808506, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.83349609, + "step": 2428, + "time_per_iteration": 2.6830828189849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176571, + "balance_loss_mlp": 1.09321952, + "epoch": 0.4672951135051943, + "flos": 568195607040.0, + "grad_norm": 0.029193722689313813, + "language_loss": 0.92024446, + "learning_rate": 0.000576955381161731, + "loss": 0.93201017, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.83398438, + "step": 2429, + "time_per_iteration": 2.7286531925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172919, + "balance_loss_mlp": 1.08956802, + "epoch": 0.46748749519045785, + "flos": 425418063360.0, + "grad_norm": 0.030194965591673555, + "language_loss": 0.93541706, + "learning_rate": 0.0005766475360371985, + "loss": 0.94714624, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.83398438, + "step": 2430, + "time_per_iteration": 2.5866243839263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171537, + "balance_loss_mlp": 1.08809078, + "epoch": 0.46767987687572143, + "flos": 539370854400.0, + "grad_norm": 0.031323302876694416, + "language_loss": 0.91645998, + "learning_rate": 0.0005763396611554536, + "loss": 0.92817533, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.83496094, + "step": 2431, + "time_per_iteration": 2.644538402557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169389, + "balance_loss_mlp": 1.08622885, + "epoch": 0.467872258560985, + "flos": 825075663360.0, + "grad_norm": 0.035112660876247544, + "language_loss": 0.8720994, + "learning_rate": 0.0005760317566360237, + "loss": 0.88379329, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.83203125, + "step": 2432, + "time_per_iteration": 2.9847497940063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169145, + "balance_loss_mlp": 1.08598459, + "epoch": 0.46806464024624855, + "flos": 662853791232.0, + "grad_norm": 0.03130586605287321, + "language_loss": 0.92657965, + "learning_rate": 0.000575723822598448, + "loss": 0.93827116, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.83203125, + "step": 2433, + "time_per_iteration": 2.7757930755615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166037, + "balance_loss_mlp": 1.08325768, + "epoch": 0.46825702193151214, + "flos": 757054078464.0, + "grad_norm": 0.025972857143736858, + "language_loss": 0.87588978, + "learning_rate": 0.0005754158591622773, + "loss": 0.88755012, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.828125, + "step": 2434, + "time_per_iteration": 2.9586892127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167751, + "balance_loss_mlp": 1.08482957, + "epoch": 0.4684494036167757, + "flos": 440310167040.0, + "grad_norm": 0.03095385887839679, + "language_loss": 0.89792037, + "learning_rate": 0.0005751078664470732, + "loss": 0.90959787, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.82958984, + "step": 2435, + "time_per_iteration": 2.5508580207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167106, + "balance_loss_mlp": 1.08446991, + "epoch": 0.46864178530203926, + "flos": 533748384768.0, + "grad_norm": 0.02784458934890301, + "language_loss": 0.91441107, + "learning_rate": 0.0005747998445724094, + "loss": 0.92608213, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.82666016, + "step": 2436, + "time_per_iteration": 2.6264078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166893, + "balance_loss_mlp": 1.08435297, + "epoch": 0.4688341669873028, + "flos": 577825809408.0, + "grad_norm": 0.028098929039846225, + "language_loss": 0.94501269, + "learning_rate": 0.0005744917936578707, + "loss": 0.95668173, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.82568359, + "step": 2437, + "time_per_iteration": 2.7923285961151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163054, + "balance_loss_mlp": 1.0805608, + "epoch": 0.4690265486725664, + "flos": 540717073920.0, + "grad_norm": 0.02510139841230761, + "language_loss": 0.88352144, + "learning_rate": 0.0005741837138230526, + "loss": 0.89515197, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.82519531, + "step": 2438, + "time_per_iteration": 2.720592737197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117104, + "balance_loss_mlp": 1.08849919, + "epoch": 0.4692189303578299, + "flos": 771881054208.0, + "grad_norm": 0.031043213179005578, + "language_loss": 0.91746414, + "learning_rate": 0.0005738756051875627, + "loss": 0.92917454, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.82568359, + "step": 2439, + "time_per_iteration": 3.0688676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179368, + "balance_loss_mlp": 1.09697056, + "epoch": 0.4694113120430935, + "flos": 572513516544.0, + "grad_norm": 0.031224617656339514, + "language_loss": 0.8895998, + "learning_rate": 0.0005735674678710192, + "loss": 0.90139341, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.82421875, + "step": 2440, + "time_per_iteration": 2.6647889614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180255, + "balance_loss_mlp": 1.09814322, + "epoch": 0.4696036937283571, + "flos": 750094121472.0, + "grad_norm": 0.03673041295896698, + "language_loss": 0.88509989, + "learning_rate": 0.0005732593019930517, + "loss": 0.89690244, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.82128906, + "step": 2441, + "time_per_iteration": 2.9219651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177833, + "balance_loss_mlp": 1.09553087, + "epoch": 0.4697960754136206, + "flos": 494442763776.0, + "grad_norm": 0.03186685029176949, + "language_loss": 0.93415046, + "learning_rate": 0.0005729511076733008, + "loss": 0.94592881, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.82324219, + "step": 2442, + "time_per_iteration": 2.6268982887268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163524, + "balance_loss_mlp": 1.08088803, + "epoch": 0.4699884570988842, + "flos": 726360081408.0, + "grad_norm": 0.03313850577325225, + "language_loss": 0.91418898, + "learning_rate": 0.000572642885031418, + "loss": 0.92582428, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.82666016, + "step": 2443, + "time_per_iteration": 2.847228527069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165965, + "balance_loss_mlp": 1.08337641, + "epoch": 0.47018083878414774, + "flos": 556577364480.0, + "grad_norm": 0.031620033102277616, + "language_loss": 0.86240256, + "learning_rate": 0.0005723346341870662, + "loss": 0.87406218, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.82617188, + "step": 2444, + "time_per_iteration": 2.7060024738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171889, + "balance_loss_mlp": 1.08944428, + "epoch": 0.4703732204694113, + "flos": 424962167808.0, + "grad_norm": 0.03469194433982127, + "language_loss": 0.92819834, + "learning_rate": 0.0005720263552599188, + "loss": 0.93991721, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.82470703, + "step": 2445, + "time_per_iteration": 2.486546754837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175277, + "balance_loss_mlp": 1.09307039, + "epoch": 0.47056560215467486, + "flos": 704755797504.0, + "grad_norm": 0.03273224664010927, + "language_loss": 0.86175644, + "learning_rate": 0.0005717180483696604, + "loss": 0.87350929, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.82226562, + "step": 2446, + "time_per_iteration": 2.8490843772888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173534, + "balance_loss_mlp": 1.09123182, + "epoch": 0.47075798383993844, + "flos": 556012680192.0, + "grad_norm": 0.030967943008195494, + "language_loss": 0.88733399, + "learning_rate": 0.0005714097136359862, + "loss": 0.89906937, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.82324219, + "step": 2447, + "time_per_iteration": 2.6790409088134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172662, + "balance_loss_mlp": 1.09035945, + "epoch": 0.470950365525202, + "flos": 565493160960.0, + "grad_norm": 0.028459673893144737, + "language_loss": 0.91199988, + "learning_rate": 0.0005711013511786027, + "loss": 0.92372644, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.82324219, + "step": 2448, + "time_per_iteration": 2.871711492538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169516, + "balance_loss_mlp": 1.08745217, + "epoch": 0.47114274721046556, + "flos": 535498106880.0, + "grad_norm": 0.02665313173872239, + "language_loss": 0.88226557, + "learning_rate": 0.0005707929611172263, + "loss": 0.89396071, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.82080078, + "step": 2449, + "time_per_iteration": 2.69319748878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166092, + "balance_loss_mlp": 1.08402824, + "epoch": 0.47133512889572915, + "flos": 474077912064.0, + "grad_norm": 0.0332447507442279, + "language_loss": 0.90459168, + "learning_rate": 0.000570484543571585, + "loss": 0.91625261, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.82080078, + "step": 2450, + "time_per_iteration": 2.5612680912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164305, + "balance_loss_mlp": 1.08228934, + "epoch": 0.4715275105809927, + "flos": 459967343616.0, + "grad_norm": 0.03392229050190778, + "language_loss": 0.90577096, + "learning_rate": 0.0005701760986614171, + "loss": 0.91741407, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.8203125, + "step": 2451, + "time_per_iteration": 2.5571579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166326, + "balance_loss_mlp": 1.08435798, + "epoch": 0.47171989226625627, + "flos": 422886806016.0, + "grad_norm": 0.028518751420243762, + "language_loss": 0.93793362, + "learning_rate": 0.0005698676265064714, + "loss": 0.94959688, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.81982422, + "step": 2452, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169062, + "balance_loss_mlp": 1.08680761, + "epoch": 0.4719122739515198, + "flos": 458376075264.0, + "grad_norm": 0.03301356479716476, + "language_loss": 0.95592558, + "learning_rate": 0.0005695591272265074, + "loss": 0.9676162, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.82275391, + "step": 2453, + "time_per_iteration": 2.512503147125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169417, + "balance_loss_mlp": 1.08730555, + "epoch": 0.4721046556367834, + "flos": 516016848384.0, + "grad_norm": 0.02961212180136774, + "language_loss": 0.87225032, + "learning_rate": 0.0005692506009412954, + "loss": 0.88394439, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.82128906, + "step": 2454, + "time_per_iteration": 2.673123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187157, + "balance_loss_mlp": 1.10609436, + "epoch": 0.4722970373220469, + "flos": 1575703721472.0, + "grad_norm": 0.017157731663316397, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78738415, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.81054688, + "step": 2455, + "time_per_iteration": 4.97356915473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164137, + "balance_loss_mlp": 1.08216834, + "epoch": 0.4724894190073105, + "flos": 587394886656.0, + "grad_norm": 0.02627427755104431, + "language_loss": 0.95142597, + "learning_rate": 0.0005686334678342593, + "loss": 0.96306741, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.81982422, + "step": 2456, + "time_per_iteration": 2.867849588394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165061, + "balance_loss_mlp": 1.08304489, + "epoch": 0.4726818006925741, + "flos": 869072497152.0, + "grad_norm": 0.03086214810478132, + "language_loss": 0.87917793, + "learning_rate": 0.0005683248612520274, + "loss": 0.89082849, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.8203125, + "step": 2457, + "time_per_iteration": 3.078068733215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08206928, + "epoch": 0.4728741823778376, + "flos": 754227380736.0, + "grad_norm": 0.03352301766800045, + "language_loss": 0.88896751, + "learning_rate": 0.0005680162281437321, + "loss": 0.90060842, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.8203125, + "step": 2458, + "time_per_iteration": 2.9237887859344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116424, + "balance_loss_mlp": 1.08260512, + "epoch": 0.4730665640631012, + "flos": 539657562624.0, + "grad_norm": 0.027635752733509208, + "language_loss": 0.89953935, + "learning_rate": 0.000567707568629195, + "loss": 0.91118181, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.81640625, + "step": 2459, + "time_per_iteration": 2.719519853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166645, + "balance_loss_mlp": 1.08505821, + "epoch": 0.47325894574836475, + "flos": 492682308096.0, + "grad_norm": 0.027667404433321316, + "language_loss": 0.88089126, + "learning_rate": 0.0005673988828282486, + "loss": 0.89255774, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.81591797, + "step": 2460, + "time_per_iteration": 2.71736216545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165583, + "balance_loss_mlp": 1.0839963, + "epoch": 0.47345132743362833, + "flos": 765830886912.0, + "grad_norm": 0.028127891455978875, + "language_loss": 0.87479305, + "learning_rate": 0.0005670901708607352, + "loss": 0.88644892, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.81591797, + "step": 2461, + "time_per_iteration": 2.9727017879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165349, + "balance_loss_mlp": 1.08371425, + "epoch": 0.47364370911889186, + "flos": 541168240128.0, + "grad_norm": 0.03987357596495419, + "language_loss": 0.90376979, + "learning_rate": 0.0005667814328465076, + "loss": 0.91542327, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.81640625, + "step": 2462, + "time_per_iteration": 2.632636547088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163463, + "balance_loss_mlp": 1.0815897, + "epoch": 0.47383609080415545, + "flos": 407091643392.0, + "grad_norm": 0.03654753942721471, + "language_loss": 0.88796914, + "learning_rate": 0.0005664726689054285, + "loss": 0.89960378, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.81884766, + "step": 2463, + "time_per_iteration": 2.466054916381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170123, + "balance_loss_mlp": 1.08867884, + "epoch": 0.474028472489419, + "flos": 454438199808.0, + "grad_norm": 0.03923165930345575, + "language_loss": 0.8627066, + "learning_rate": 0.0005661638791573704, + "loss": 0.87440789, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.81445312, + "step": 2464, + "time_per_iteration": 2.7042744159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166183, + "balance_loss_mlp": 1.08450055, + "epoch": 0.47422085417468257, + "flos": 493194599424.0, + "grad_norm": 0.026684931914484025, + "language_loss": 0.92592585, + "learning_rate": 0.0005658550637222164, + "loss": 0.93758774, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.81689453, + "step": 2465, + "time_per_iteration": 2.6058290004730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168127, + "balance_loss_mlp": 1.08611059, + "epoch": 0.47441323585994616, + "flos": 740125544448.0, + "grad_norm": 0.026202374072225774, + "language_loss": 0.87139833, + "learning_rate": 0.0005655462227198592, + "loss": 0.88307959, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.8203125, + "step": 2466, + "time_per_iteration": 2.8945796489715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167919, + "balance_loss_mlp": 1.08590269, + "epoch": 0.4746056175452097, + "flos": 485674687488.0, + "grad_norm": 0.02746668082221095, + "language_loss": 0.89712787, + "learning_rate": 0.0005652373562702016, + "loss": 0.90880704, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.8203125, + "step": 2467, + "time_per_iteration": 2.576364278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166795, + "balance_loss_mlp": 1.08463609, + "epoch": 0.4747979992304733, + "flos": 462005775360.0, + "grad_norm": 0.03040478239716322, + "language_loss": 0.95003092, + "learning_rate": 0.000564928464493156, + "loss": 0.96169889, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.82177734, + "step": 2468, + "time_per_iteration": 2.5468242168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168306, + "balance_loss_mlp": 1.08624196, + "epoch": 0.4749903809157368, + "flos": 865879226880.0, + "grad_norm": 0.029413898751956376, + "language_loss": 0.88262731, + "learning_rate": 0.000564619547508645, + "loss": 0.89431041, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.82080078, + "step": 2469, + "time_per_iteration": 3.042994260787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116966, + "balance_loss_mlp": 1.08764374, + "epoch": 0.4751827626010004, + "flos": 506551830528.0, + "grad_norm": 0.035426943126194606, + "language_loss": 0.90271819, + "learning_rate": 0.0005643106054366008, + "loss": 0.91441476, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.8203125, + "step": 2470, + "time_per_iteration": 2.5660367012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168432, + "balance_loss_mlp": 1.0863688, + "epoch": 0.47537514428626393, + "flos": 560452113408.0, + "grad_norm": 0.029652672624791387, + "language_loss": 0.85815179, + "learning_rate": 0.000564001638396965, + "loss": 0.86983615, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.82080078, + "step": 2471, + "time_per_iteration": 2.7345728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167677, + "balance_loss_mlp": 1.08566117, + "epoch": 0.4755675259715275, + "flos": 835676054016.0, + "grad_norm": 0.029111814859825738, + "language_loss": 0.87706691, + "learning_rate": 0.0005636926465096897, + "loss": 0.8887437, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.8203125, + "step": 2472, + "time_per_iteration": 3.0570740699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166306, + "balance_loss_mlp": 1.08424211, + "epoch": 0.47575990765679105, + "flos": 509232809472.0, + "grad_norm": 0.030849533450069865, + "language_loss": 0.93407679, + "learning_rate": 0.0005633836298947363, + "loss": 0.94573981, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.82080078, + "step": 2473, + "time_per_iteration": 2.6804757118225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167624, + "balance_loss_mlp": 1.08570302, + "epoch": 0.47595228934205464, + "flos": 592962961920.0, + "grad_norm": 0.0319092637225127, + "language_loss": 0.77122205, + "learning_rate": 0.000563074588672075, + "loss": 0.78289831, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.81933594, + "step": 2474, + "time_per_iteration": 2.7190651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166922, + "balance_loss_mlp": 1.08500123, + "epoch": 0.4761446710273182, + "flos": 581683094016.0, + "grad_norm": 0.028375010801601097, + "language_loss": 0.91505527, + "learning_rate": 0.0005627655229616868, + "loss": 0.92672449, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.81933594, + "step": 2475, + "time_per_iteration": 2.689652919769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164128, + "balance_loss_mlp": 1.08235061, + "epoch": 0.47633705271258175, + "flos": 674079264768.0, + "grad_norm": 0.024988633596495675, + "language_loss": 0.94898891, + "learning_rate": 0.0005624564328835616, + "loss": 0.96063018, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.81787109, + "step": 2476, + "time_per_iteration": 2.8038489818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169163, + "balance_loss_mlp": 1.08728969, + "epoch": 0.47652943439784534, + "flos": 542970355200.0, + "grad_norm": 0.0285977430554916, + "language_loss": 0.89680123, + "learning_rate": 0.0005621473185576986, + "loss": 0.90849286, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.81884766, + "step": 2477, + "time_per_iteration": 2.7568743228912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165779, + "balance_loss_mlp": 1.08433557, + "epoch": 0.4767218160831089, + "flos": 525846437376.0, + "grad_norm": 0.0316668482667046, + "language_loss": 0.93167424, + "learning_rate": 0.0005618381801041068, + "loss": 0.94333208, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.81445312, + "step": 2478, + "time_per_iteration": 2.612211227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167228, + "balance_loss_mlp": 1.08545041, + "epoch": 0.47691419776837246, + "flos": 569126863872.0, + "grad_norm": 0.03238452738028376, + "language_loss": 0.88936818, + "learning_rate": 0.0005615290176428044, + "loss": 0.90104043, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.81787109, + "step": 2479, + "time_per_iteration": 2.649019241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168128, + "balance_loss_mlp": 1.08668435, + "epoch": 0.477106579453636, + "flos": 532024859136.0, + "grad_norm": 0.027888492093205767, + "language_loss": 0.91917288, + "learning_rate": 0.0005612198312938187, + "loss": 0.93085408, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.81445312, + "step": 2480, + "time_per_iteration": 2.739767551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08839524, + "epoch": 0.4772989611388996, + "flos": 595500950016.0, + "grad_norm": 0.027931665483744535, + "language_loss": 0.84935582, + "learning_rate": 0.0005609106211771868, + "loss": 0.86105514, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.81542969, + "step": 2481, + "time_per_iteration": 2.850339651107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169841, + "balance_loss_mlp": 1.08835006, + "epoch": 0.4774913428241631, + "flos": 545707729920.0, + "grad_norm": 0.027660076347337716, + "language_loss": 0.94426548, + "learning_rate": 0.0005606013874129543, + "loss": 0.95596385, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.81494141, + "step": 2482, + "time_per_iteration": 2.7403533458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08829987, + "epoch": 0.4776837245094267, + "flos": 541129308672.0, + "grad_norm": 0.02810737401227857, + "language_loss": 0.86136961, + "learning_rate": 0.0005602921301211768, + "loss": 0.87306893, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.81640625, + "step": 2483, + "time_per_iteration": 2.6941261291503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171891, + "balance_loss_mlp": 1.09016109, + "epoch": 0.4778761061946903, + "flos": 472755887616.0, + "grad_norm": 0.029011275825861695, + "language_loss": 0.8832168, + "learning_rate": 0.0005599828494219185, + "loss": 0.89493567, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.81738281, + "step": 2484, + "time_per_iteration": 2.5801451206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116562, + "balance_loss_mlp": 1.08355606, + "epoch": 0.4780684878799538, + "flos": 727337000448.0, + "grad_norm": 0.03126301150284597, + "language_loss": 0.95766234, + "learning_rate": 0.0005596735454352527, + "loss": 0.96931851, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.82080078, + "step": 2485, + "time_per_iteration": 2.866809368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165282, + "balance_loss_mlp": 1.0832181, + "epoch": 0.4782608695652174, + "flos": 549953780736.0, + "grad_norm": 0.032811891631208345, + "language_loss": 0.91780031, + "learning_rate": 0.0005593642182812619, + "loss": 0.92945307, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.82080078, + "step": 2486, + "time_per_iteration": 2.6762824058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166087, + "balance_loss_mlp": 1.08388078, + "epoch": 0.47845325125048094, + "flos": 831401805312.0, + "grad_norm": 0.03291122574992765, + "language_loss": 0.91604954, + "learning_rate": 0.0005590548680800378, + "loss": 0.92771041, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.82226562, + "step": 2487, + "time_per_iteration": 3.1848442554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159859, + "balance_loss_mlp": 1.07765198, + "epoch": 0.4786456329357445, + "flos": 515270241792.0, + "grad_norm": 0.02977291399963519, + "language_loss": 0.8241533, + "learning_rate": 0.0005587454949516804, + "loss": 0.83575195, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.82226562, + "step": 2488, + "time_per_iteration": 2.728825330734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163121, + "balance_loss_mlp": 1.08077133, + "epoch": 0.47883801462100806, + "flos": 565729477632.0, + "grad_norm": 0.034122039627151275, + "language_loss": 0.9412536, + "learning_rate": 0.0005584360990162993, + "loss": 0.95288485, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.82373047, + "step": 2489, + "time_per_iteration": 2.65055251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162976, + "balance_loss_mlp": 1.08076906, + "epoch": 0.47903039630627164, + "flos": 580704173568.0, + "grad_norm": 0.025976014522421025, + "language_loss": 0.89770818, + "learning_rate": 0.0005581266803940124, + "loss": 0.90933788, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.82226562, + "step": 2490, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164709, + "balance_loss_mlp": 1.08250248, + "epoch": 0.47922277799153523, + "flos": 620085656064.0, + "grad_norm": 0.030357385002024635, + "language_loss": 0.93398184, + "learning_rate": 0.0005578172392049471, + "loss": 0.94562888, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.82226562, + "step": 2491, + "time_per_iteration": 2.7492756843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.08214724, + "epoch": 0.47941515967679876, + "flos": 640858739712.0, + "grad_norm": 0.03220406636162171, + "language_loss": 0.9124878, + "learning_rate": 0.0005575077755692386, + "loss": 0.92413139, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.82226562, + "step": 2492, + "time_per_iteration": 2.8061015605926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166388, + "balance_loss_mlp": 1.08437181, + "epoch": 0.47960754136206235, + "flos": 520875247104.0, + "grad_norm": 0.02527329704122564, + "language_loss": 0.91187584, + "learning_rate": 0.0005571982896070316, + "loss": 0.92353964, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.8203125, + "step": 2493, + "time_per_iteration": 4.094395160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116615, + "balance_loss_mlp": 1.08399141, + "epoch": 0.4797999230473259, + "flos": 476031750144.0, + "grad_norm": 0.03303640593992076, + "language_loss": 0.95932508, + "learning_rate": 0.0005568887814384792, + "loss": 0.97098666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.82177734, + "step": 2494, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011645, + "balance_loss_mlp": 1.08229315, + "epoch": 0.47999230473258947, + "flos": 533068907520.0, + "grad_norm": 0.028664161711311382, + "language_loss": 0.92573094, + "learning_rate": 0.000556579251183743, + "loss": 0.93737602, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.82226562, + "step": 2495, + "time_per_iteration": 2.6538801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162424, + "balance_loss_mlp": 1.08036053, + "epoch": 0.480184686417853, + "flos": 602605899264.0, + "grad_norm": 0.03331899292815792, + "language_loss": 0.86056805, + "learning_rate": 0.0005562696989629936, + "loss": 0.87219226, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.82080078, + "step": 2496, + "time_per_iteration": 2.687903881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162213, + "balance_loss_mlp": 1.08019686, + "epoch": 0.4803770681031166, + "flos": 529261287936.0, + "grad_norm": 0.02923998603568501, + "language_loss": 0.88484073, + "learning_rate": 0.0005559601248964095, + "loss": 0.89646292, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.8203125, + "step": 2497, + "time_per_iteration": 2.6282827854156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161296, + "balance_loss_mlp": 1.07918417, + "epoch": 0.4805694497883801, + "flos": 512228694528.0, + "grad_norm": 0.02922528152793709, + "language_loss": 0.91127884, + "learning_rate": 0.0005556505291041783, + "loss": 0.92289186, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.82128906, + "step": 2498, + "time_per_iteration": 2.662783622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161007, + "balance_loss_mlp": 1.07899094, + "epoch": 0.4807618314736437, + "flos": 601605511680.0, + "grad_norm": 0.02724196548061384, + "language_loss": 0.8966158, + "learning_rate": 0.0005553409117064954, + "loss": 0.90822583, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.8203125, + "step": 2499, + "time_per_iteration": 2.898850917816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164849, + "balance_loss_mlp": 1.08245122, + "epoch": 0.4809542131589073, + "flos": 570029922816.0, + "grad_norm": 0.028349491645904, + "language_loss": 0.91357303, + "learning_rate": 0.0005550312728235654, + "loss": 0.92522144, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.82421875, + "step": 2500, + "time_per_iteration": 2.754187822341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164619, + "balance_loss_mlp": 1.08217347, + "epoch": 0.4811465948441708, + "flos": 577165797888.0, + "grad_norm": 0.034664680835738745, + "language_loss": 0.91214681, + "learning_rate": 0.0005547216125756003, + "loss": 0.92379302, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.82470703, + "step": 2501, + "time_per_iteration": 2.778639078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164023, + "balance_loss_mlp": 1.08143485, + "epoch": 0.4813389765294344, + "flos": 825297243648.0, + "grad_norm": 0.028167486861350455, + "language_loss": 0.87736559, + "learning_rate": 0.0005544119310828211, + "loss": 0.88900584, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.82617188, + "step": 2502, + "time_per_iteration": 3.0756351947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164093, + "balance_loss_mlp": 1.08174348, + "epoch": 0.48153135821469795, + "flos": 636699283968.0, + "grad_norm": 0.030410217991048386, + "language_loss": 0.91046345, + "learning_rate": 0.0005541022284654568, + "loss": 0.92210436, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.82373047, + "step": 2503, + "time_per_iteration": 2.892679214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163382, + "balance_loss_mlp": 1.08103192, + "epoch": 0.48172373989996153, + "flos": 504708782592.0, + "grad_norm": 0.02826951852510112, + "language_loss": 0.89667141, + "learning_rate": 0.0005537925048437446, + "loss": 0.90830529, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.82373047, + "step": 2504, + "time_per_iteration": 2.5750081539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179108, + "balance_loss_mlp": 1.09918976, + "epoch": 0.48191612158522507, + "flos": 1535566173696.0, + "grad_norm": 0.017261305400491866, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76930583, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.79882812, + "step": 2505, + "time_per_iteration": 4.912463426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162522, + "balance_loss_mlp": 1.07988608, + "epoch": 0.48210850327048865, + "flos": 703811805696.0, + "grad_norm": 0.027104005826713556, + "language_loss": 0.93955028, + "learning_rate": 0.0005531729950682664, + "loss": 0.95117545, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.82666016, + "step": 2506, + "time_per_iteration": 3.000925064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162538, + "balance_loss_mlp": 1.07999802, + "epoch": 0.4823008849557522, + "flos": 440700934656.0, + "grad_norm": 0.03451729562062639, + "language_loss": 0.91777337, + "learning_rate": 0.000552863209155015, + "loss": 0.92939872, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.82568359, + "step": 2507, + "time_per_iteration": 2.478809118270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159773, + "balance_loss_mlp": 1.07737529, + "epoch": 0.48249326664101577, + "flos": 472812283392.0, + "grad_norm": 0.02691149649688828, + "language_loss": 0.87363136, + "learning_rate": 0.0005525534027184461, + "loss": 0.88522899, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.82421875, + "step": 2508, + "time_per_iteration": 2.54645037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161526, + "balance_loss_mlp": 1.07951045, + "epoch": 0.48268564832627936, + "flos": 564314127360.0, + "grad_norm": 0.023137570540037285, + "language_loss": 0.88137501, + "learning_rate": 0.0005522435758788365, + "loss": 0.89299035, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.8203125, + "step": 2509, + "time_per_iteration": 2.700540542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160536, + "balance_loss_mlp": 1.07842445, + "epoch": 0.4828780300115429, + "flos": 630842499072.0, + "grad_norm": 0.03372990027790351, + "language_loss": 0.86188895, + "learning_rate": 0.0005519337287564721, + "loss": 0.87349427, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.82128906, + "step": 2510, + "time_per_iteration": 2.8127758502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161519, + "balance_loss_mlp": 1.07945526, + "epoch": 0.4830704116968065, + "flos": 633004455936.0, + "grad_norm": 0.029001937113396697, + "language_loss": 0.88535267, + "learning_rate": 0.000551623861471646, + "loss": 0.89696789, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.82080078, + "step": 2511, + "time_per_iteration": 2.7925469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166962, + "balance_loss_mlp": 1.08647156, + "epoch": 0.48326279338207, + "flos": 1572616512000.0, + "grad_norm": 0.009161484988790693, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79985785, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.8046875, + "step": 2512, + "time_per_iteration": 4.850747108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159851, + "balance_loss_mlp": 1.07783449, + "epoch": 0.4834551750673336, + "flos": 510237926400.0, + "grad_norm": 0.028933780257729795, + "language_loss": 0.92768925, + "learning_rate": 0.0005510040668958211, + "loss": 0.93928778, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.8203125, + "step": 2513, + "time_per_iteration": 2.56387996673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165955, + "balance_loss_mlp": 1.08546448, + "epoch": 0.48364755675259713, + "flos": 1531825683456.0, + "grad_norm": 0.007133010503999018, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78926539, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.8046875, + "step": 2514, + "time_per_iteration": 4.836379289627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160977, + "balance_loss_mlp": 1.07938981, + "epoch": 0.4838399384378607, + "flos": 566046385152.0, + "grad_norm": 0.029153045334521625, + "language_loss": 0.89274001, + "learning_rate": 0.0005503841931138645, + "loss": 0.9043498, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.81591797, + "step": 2515, + "time_per_iteration": 2.6633048057556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160148, + "balance_loss_mlp": 1.07846582, + "epoch": 0.4840323201231243, + "flos": 388541641728.0, + "grad_norm": 0.03187042626689644, + "language_loss": 0.88861662, + "learning_rate": 0.0005500742268214025, + "loss": 0.90021807, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.81689453, + "step": 2516, + "time_per_iteration": 2.4762659072875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160045, + "balance_loss_mlp": 1.07845843, + "epoch": 0.48422470180838784, + "flos": 632175257088.0, + "grad_norm": 0.026732605532440536, + "language_loss": 0.9007901, + "learning_rate": 0.0005497642410884014, + "loss": 0.91239059, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.81591797, + "step": 2517, + "time_per_iteration": 2.7693819999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164478, + "balance_loss_mlp": 1.08246255, + "epoch": 0.4844170834936514, + "flos": 500313010176.0, + "grad_norm": 0.028128961210665323, + "language_loss": 0.90248644, + "learning_rate": 0.0005494542360352085, + "loss": 0.91413122, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.8203125, + "step": 2518, + "time_per_iteration": 2.6704978942871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163589, + "balance_loss_mlp": 1.08152497, + "epoch": 0.48460946517891496, + "flos": 552194327040.0, + "grad_norm": 0.02893400906180164, + "language_loss": 0.92442286, + "learning_rate": 0.0005491442117821783, + "loss": 0.93605876, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.82080078, + "step": 2519, + "time_per_iteration": 2.691898822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167118, + "balance_loss_mlp": 1.08491123, + "epoch": 0.48480184686417854, + "flos": 530461788672.0, + "grad_norm": 0.03488173137086134, + "language_loss": 0.937814, + "learning_rate": 0.0005488341684496732, + "loss": 0.94948518, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.82226562, + "step": 2520, + "time_per_iteration": 2.6527535915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165597, + "balance_loss_mlp": 1.08343804, + "epoch": 0.4849942285494421, + "flos": 533047440384.0, + "grad_norm": 0.028537304261499467, + "language_loss": 0.97065389, + "learning_rate": 0.0005485241061580624, + "loss": 0.98230994, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.82177734, + "step": 2521, + "time_per_iteration": 2.7213969230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166792, + "balance_loss_mlp": 1.08463287, + "epoch": 0.48518661023470566, + "flos": 723972541440.0, + "grad_norm": 0.02938300657957885, + "language_loss": 0.90224278, + "learning_rate": 0.0005482140250277228, + "loss": 0.91391075, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.82177734, + "step": 2522, + "time_per_iteration": 2.9924206733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08592129, + "epoch": 0.4853789919199692, + "flos": 507155446272.0, + "grad_norm": 0.030604201389603965, + "language_loss": 0.93692237, + "learning_rate": 0.0005479039251790387, + "loss": 0.94860315, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.82177734, + "step": 2523, + "time_per_iteration": 2.7099061012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167569, + "balance_loss_mlp": 1.08541012, + "epoch": 0.4855713736052328, + "flos": 661698952704.0, + "grad_norm": 0.03222198223164457, + "language_loss": 0.90574634, + "learning_rate": 0.0005475938067324014, + "loss": 0.917422, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.82177734, + "step": 2524, + "time_per_iteration": 2.8379342555999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117016, + "balance_loss_mlp": 1.08823884, + "epoch": 0.48576375529049637, + "flos": 437889699840.0, + "grad_norm": 0.03297241328571355, + "language_loss": 0.89402866, + "learning_rate": 0.0005472836698082098, + "loss": 0.90573025, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.81933594, + "step": 2525, + "time_per_iteration": 2.5135462284088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165117, + "balance_loss_mlp": 1.08300531, + "epoch": 0.4859561369757599, + "flos": 582844663296.0, + "grad_norm": 0.028434138704400515, + "language_loss": 0.88848263, + "learning_rate": 0.0005469735145268694, + "loss": 0.90013373, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.82128906, + "step": 2526, + "time_per_iteration": 2.7137279510498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162635, + "balance_loss_mlp": 1.08066678, + "epoch": 0.4861485186610235, + "flos": 488933085696.0, + "grad_norm": 0.028544121185286958, + "language_loss": 0.86922419, + "learning_rate": 0.0005466633410087933, + "loss": 0.88085049, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.81982422, + "step": 2527, + "time_per_iteration": 2.7106595039367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116584, + "balance_loss_mlp": 1.08554077, + "epoch": 0.486340900346287, + "flos": 1561111060992.0, + "grad_norm": 0.005447093154513016, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78426665, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.80273438, + "step": 2528, + "time_per_iteration": 4.841828346252441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162423, + "balance_loss_mlp": 1.08069348, + "epoch": 0.4865332820315506, + "flos": 483990093312.0, + "grad_norm": 0.026581719305211308, + "language_loss": 0.93869209, + "learning_rate": 0.0005460429397441214, + "loss": 0.95031631, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.81738281, + "step": 2529, + "time_per_iteration": 2.553438425064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.08296263, + "epoch": 0.48672566371681414, + "flos": 536857061376.0, + "grad_norm": 0.02943507577689114, + "language_loss": 0.92893845, + "learning_rate": 0.0005457327122383866, + "loss": 0.94058347, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.81542969, + "step": 2530, + "time_per_iteration": 2.628859043121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167305, + "balance_loss_mlp": 1.08795929, + "epoch": 0.4869180454020777, + "flos": 1415830457856.0, + "grad_norm": 0.01207374103656724, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75803792, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.79296875, + "step": 2531, + "time_per_iteration": 4.798464775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163612, + "balance_loss_mlp": 1.08212042, + "epoch": 0.48711042708734126, + "flos": 574226308608.0, + "grad_norm": 0.027593185975689192, + "language_loss": 0.81384307, + "learning_rate": 0.0005451122040823244, + "loss": 0.82547921, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.81494141, + "step": 2532, + "time_per_iteration": 2.7749013900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116272, + "balance_loss_mlp": 1.08118057, + "epoch": 0.48730280877260485, + "flos": 627816414720.0, + "grad_norm": 0.02591805781842408, + "language_loss": 0.82129884, + "learning_rate": 0.0005448019236728997, + "loss": 0.83292603, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.81542969, + "step": 2533, + "time_per_iteration": 2.865239381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164787, + "balance_loss_mlp": 1.08315206, + "epoch": 0.48749519045786843, + "flos": 513468126720.0, + "grad_norm": 0.03027053938911928, + "language_loss": 0.91336226, + "learning_rate": 0.0005444916258698255, + "loss": 0.92501009, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.81640625, + "step": 2534, + "time_per_iteration": 2.5986597537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08259368, + "epoch": 0.48768757214313196, + "flos": 526478251008.0, + "grad_norm": 0.02699578070604874, + "language_loss": 0.90958095, + "learning_rate": 0.0005441813107935704, + "loss": 0.92122173, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.81494141, + "step": 2535, + "time_per_iteration": 2.685478925704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162911, + "balance_loss_mlp": 1.08137167, + "epoch": 0.48787995382839555, + "flos": 506030807040.0, + "grad_norm": 0.02902824988643181, + "language_loss": 0.91504169, + "learning_rate": 0.0005438709785646091, + "loss": 0.92667079, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.81542969, + "step": 2536, + "time_per_iteration": 2.563302755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164758, + "balance_loss_mlp": 1.08302808, + "epoch": 0.4880723355136591, + "flos": 576247276032.0, + "grad_norm": 0.028837521239882914, + "language_loss": 0.92468232, + "learning_rate": 0.0005435606293034234, + "loss": 0.93632984, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.81738281, + "step": 2537, + "time_per_iteration": 2.6447930335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117327, + "balance_loss_mlp": 1.09163582, + "epoch": 0.48826471719892267, + "flos": 562536207360.0, + "grad_norm": 0.0312247117460979, + "language_loss": 0.90714639, + "learning_rate": 0.0005432502631305016, + "loss": 0.91887903, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.81640625, + "step": 2538, + "time_per_iteration": 2.6652588844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173314, + "balance_loss_mlp": 1.09163225, + "epoch": 0.4884570988841862, + "flos": 727547847168.0, + "grad_norm": 0.027646073497336384, + "language_loss": 0.88003767, + "learning_rate": 0.0005429398801663386, + "loss": 0.89177084, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.81689453, + "step": 2539, + "time_per_iteration": 2.9378042221069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163435, + "balance_loss_mlp": 1.08180094, + "epoch": 0.4886494805694498, + "flos": 431924126208.0, + "grad_norm": 0.03488087397138866, + "language_loss": 0.90234458, + "learning_rate": 0.0005426294805314355, + "loss": 0.91397893, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.81640625, + "step": 2540, + "time_per_iteration": 2.538275718688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161042, + "balance_loss_mlp": 1.07935977, + "epoch": 0.4888418622547134, + "flos": 674344505856.0, + "grad_norm": 0.02710942555690322, + "language_loss": 0.8497895, + "learning_rate": 0.0005423190643463003, + "loss": 0.86139989, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.81689453, + "step": 2541, + "time_per_iteration": 2.9786784648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163064, + "balance_loss_mlp": 1.08133411, + "epoch": 0.4890342439399769, + "flos": 542935426560.0, + "grad_norm": 0.02908053911836938, + "language_loss": 0.88889569, + "learning_rate": 0.0005420086317314473, + "loss": 0.90052634, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.81738281, + "step": 2542, + "time_per_iteration": 2.650505781173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163957, + "balance_loss_mlp": 1.08198881, + "epoch": 0.4892266256252405, + "flos": 591862517760.0, + "grad_norm": 0.032456825889771945, + "language_loss": 0.86421382, + "learning_rate": 0.0005416981828073971, + "loss": 0.87585342, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.81982422, + "step": 2543, + "time_per_iteration": 2.756906032562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167862, + "balance_loss_mlp": 1.08718109, + "epoch": 0.48941900731050403, + "flos": 1519654216704.0, + "grad_norm": 0.009398242691954228, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78282875, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.80664062, + "step": 2544, + "time_per_iteration": 4.826622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163984, + "balance_loss_mlp": 1.08225381, + "epoch": 0.4896113889957676, + "flos": 471518456832.0, + "grad_norm": 0.03564931489131084, + "language_loss": 0.92759442, + "learning_rate": 0.000541077236513819, + "loss": 0.93923426, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.81738281, + "step": 2545, + "time_per_iteration": 2.5047078132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169885, + "balance_loss_mlp": 1.08848882, + "epoch": 0.48980377068103115, + "flos": 497551440384.0, + "grad_norm": 0.02644804149278648, + "language_loss": 0.87771875, + "learning_rate": 0.0005407667393853638, + "loss": 0.88941759, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.81396484, + "step": 2546, + "time_per_iteration": 2.615182876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172802, + "balance_loss_mlp": 1.09116721, + "epoch": 0.48999615236629473, + "flos": 694107743232.0, + "grad_norm": 0.032384144791382644, + "language_loss": 0.89844877, + "learning_rate": 0.0005404562264298569, + "loss": 0.91017681, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.81640625, + "step": 2547, + "time_per_iteration": 2.8694136142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164836, + "balance_loss_mlp": 1.08310628, + "epoch": 0.49018853405155827, + "flos": 542748774912.0, + "grad_norm": 0.02932030725962162, + "language_loss": 0.90206313, + "learning_rate": 0.0005401456977678498, + "loss": 0.91371155, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.81738281, + "step": 2548, + "time_per_iteration": 2.644604444503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158708, + "balance_loss_mlp": 1.07702553, + "epoch": 0.49038091573682185, + "flos": 697108357632.0, + "grad_norm": 0.0348486432591887, + "language_loss": 0.83939159, + "learning_rate": 0.0005398351535199008, + "loss": 0.85097861, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.81689453, + "step": 2549, + "time_per_iteration": 3.064962863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158664, + "balance_loss_mlp": 1.07693398, + "epoch": 0.49057329742208544, + "flos": 598062406656.0, + "grad_norm": 0.028343941430048352, + "language_loss": 0.89488542, + "learning_rate": 0.0005395245938065735, + "loss": 0.90647209, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.81738281, + "step": 2550, + "time_per_iteration": 2.8023993968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162826, + "balance_loss_mlp": 1.08119094, + "epoch": 0.490765679107349, + "flos": 514416847872.0, + "grad_norm": 0.036438353865587, + "language_loss": 0.8920716, + "learning_rate": 0.0005392140187484379, + "loss": 0.90369982, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.81640625, + "step": 2551, + "time_per_iteration": 2.5544004440307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160822, + "balance_loss_mlp": 1.07928288, + "epoch": 0.49095806079261256, + "flos": 630842499072.0, + "grad_norm": 0.02833803159801528, + "language_loss": 0.95730108, + "learning_rate": 0.0005389034284660701, + "loss": 0.96890926, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.81542969, + "step": 2552, + "time_per_iteration": 2.787997245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156735, + "balance_loss_mlp": 1.07524312, + "epoch": 0.4911504424778761, + "flos": 916792356864.0, + "grad_norm": 0.03441290589053542, + "language_loss": 0.8892417, + "learning_rate": 0.000538592823080052, + "loss": 0.90080899, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.81494141, + "step": 2553, + "time_per_iteration": 3.1353423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159599, + "balance_loss_mlp": 1.07858455, + "epoch": 0.4913428241631397, + "flos": 439854271488.0, + "grad_norm": 0.03215354145178159, + "language_loss": 0.91146123, + "learning_rate": 0.000538282202710971, + "loss": 0.9230572, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.81005859, + "step": 2554, + "time_per_iteration": 2.524106025695801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158745, + "balance_loss_mlp": 1.0776825, + "epoch": 0.4915352058484032, + "flos": 637239773184.0, + "grad_norm": 0.03412299335020121, + "language_loss": 0.8861627, + "learning_rate": 0.000537971567479421, + "loss": 0.8977502, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.81054688, + "step": 2555, + "time_per_iteration": 2.750051736831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162188, + "balance_loss_mlp": 1.08107841, + "epoch": 0.4917275875336668, + "flos": 505509783552.0, + "grad_norm": 0.03289434989172404, + "language_loss": 0.93214262, + "learning_rate": 0.0005376609175060011, + "loss": 0.94376451, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.81103516, + "step": 2556, + "time_per_iteration": 2.588437557220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160453, + "balance_loss_mlp": 1.07924759, + "epoch": 0.49191996921893033, + "flos": 655733379072.0, + "grad_norm": 0.02731850736189593, + "language_loss": 0.86463559, + "learning_rate": 0.0005373502529113162, + "loss": 0.87624013, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.81201172, + "step": 2557, + "time_per_iteration": 2.775529146194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160279, + "balance_loss_mlp": 1.07897866, + "epoch": 0.4921123509041939, + "flos": 493398715392.0, + "grad_norm": 0.02896728411720768, + "language_loss": 0.88084292, + "learning_rate": 0.0005370395738159773, + "loss": 0.8924458, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.81298828, + "step": 2558, + "time_per_iteration": 2.638489007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162432, + "balance_loss_mlp": 1.08084488, + "epoch": 0.4923047325894575, + "flos": 547207673856.0, + "grad_norm": 0.030679841284503157, + "language_loss": 0.90182674, + "learning_rate": 0.0005367288803406003, + "loss": 0.91345102, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.81591797, + "step": 2559, + "time_per_iteration": 2.655319929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166456, + "balance_loss_mlp": 1.08477354, + "epoch": 0.49249711427472104, + "flos": 597589046784.0, + "grad_norm": 0.03258957792314928, + "language_loss": 0.88157088, + "learning_rate": 0.0005364181726058073, + "loss": 0.89323545, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.81689453, + "step": 2560, + "time_per_iteration": 2.7416017055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116275, + "balance_loss_mlp": 1.08111596, + "epoch": 0.4926894959599846, + "flos": 498808336896.0, + "grad_norm": 0.03132101057916933, + "language_loss": 0.88768357, + "learning_rate": 0.0005361074507322261, + "loss": 0.89931107, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.81640625, + "step": 2561, + "time_per_iteration": 2.6130712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165857, + "balance_loss_mlp": 1.08446133, + "epoch": 0.49288187764524816, + "flos": 537182701056.0, + "grad_norm": 0.03057631912079697, + "language_loss": 0.88031554, + "learning_rate": 0.000535796714840489, + "loss": 0.89197409, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.81396484, + "step": 2562, + "time_per_iteration": 2.6463782787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167526, + "balance_loss_mlp": 1.08584368, + "epoch": 0.49307425933051174, + "flos": 642712521216.0, + "grad_norm": 0.037191189532270505, + "language_loss": 0.90339726, + "learning_rate": 0.0005354859650512348, + "loss": 0.91507256, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.81689453, + "step": 2563, + "time_per_iteration": 2.807185649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08831811, + "epoch": 0.4932666410157753, + "flos": 517265012736.0, + "grad_norm": 0.033499096438589164, + "language_loss": 0.92994809, + "learning_rate": 0.0005351752014851074, + "loss": 0.94164765, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.81640625, + "step": 2564, + "time_per_iteration": 2.574969530105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164544, + "balance_loss_mlp": 1.08310056, + "epoch": 0.49345902270103886, + "flos": 602651561472.0, + "grad_norm": 0.03279756121209128, + "language_loss": 0.89816988, + "learning_rate": 0.0005348644242627553, + "loss": 0.90981531, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.81445312, + "step": 2565, + "time_per_iteration": 2.718763828277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170387, + "balance_loss_mlp": 1.0912323, + "epoch": 0.49365140438630245, + "flos": 1496981689344.0, + "grad_norm": 0.010263800536892794, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76457012, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.79101562, + "step": 2566, + "time_per_iteration": 4.933185815811157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116695, + "balance_loss_mlp": 1.08588743, + "epoch": 0.493843786071566, + "flos": 630788104704.0, + "grad_norm": 0.030129730382445888, + "language_loss": 0.87054515, + "learning_rate": 0.0005342428293320013, + "loss": 0.88221461, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.81054688, + "step": 2567, + "time_per_iteration": 2.7435762882232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167603, + "balance_loss_mlp": 1.08635032, + "epoch": 0.49403616775682957, + "flos": 618689771520.0, + "grad_norm": 0.03756496493147188, + "language_loss": 0.89032316, + "learning_rate": 0.0005339320118649238, + "loss": 0.90199912, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.8125, + "step": 2568, + "time_per_iteration": 2.732135057449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162688, + "balance_loss_mlp": 1.08148313, + "epoch": 0.4942285494420931, + "flos": 578813462016.0, + "grad_norm": 0.027001968550623295, + "language_loss": 0.91260755, + "learning_rate": 0.000533621181224271, + "loss": 0.92423451, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.81201172, + "step": 2569, + "time_per_iteration": 2.79868483543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164198, + "balance_loss_mlp": 1.08304083, + "epoch": 0.4944209311273567, + "flos": 631465580544.0, + "grad_norm": 0.0320565630919746, + "language_loss": 0.86978823, + "learning_rate": 0.0005333103375307182, + "loss": 0.88143021, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.81152344, + "step": 2570, + "time_per_iteration": 2.850125551223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159265, + "balance_loss_mlp": 1.07825053, + "epoch": 0.4946133128126202, + "flos": 588718912512.0, + "grad_norm": 0.030887982554767154, + "language_loss": 0.91666126, + "learning_rate": 0.0005329994809049451, + "loss": 0.92825389, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.81005859, + "step": 2571, + "time_per_iteration": 2.716823101043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115557, + "balance_loss_mlp": 1.07460296, + "epoch": 0.4948056944978838, + "flos": 584846164992.0, + "grad_norm": 0.031743542415023744, + "language_loss": 0.93336749, + "learning_rate": 0.0005326886114676375, + "loss": 0.94492316, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.80957031, + "step": 2572, + "time_per_iteration": 2.7895162105560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160915, + "balance_loss_mlp": 1.08004355, + "epoch": 0.49499807618314734, + "flos": 482780860416.0, + "grad_norm": 0.03097072525481985, + "language_loss": 0.93359911, + "learning_rate": 0.0005323777293394854, + "loss": 0.94520825, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.80859375, + "step": 2573, + "time_per_iteration": 2.5428624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161628, + "balance_loss_mlp": 1.08089912, + "epoch": 0.4951904578684109, + "flos": 520037316096.0, + "grad_norm": 0.029847836155631635, + "language_loss": 0.87235224, + "learning_rate": 0.000532066834641184, + "loss": 0.88396853, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.80712891, + "step": 2574, + "time_per_iteration": 2.666405439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116292, + "balance_loss_mlp": 1.08195353, + "epoch": 0.4953828395536745, + "flos": 536577083904.0, + "grad_norm": 0.029607666498307577, + "language_loss": 0.91085738, + "learning_rate": 0.0005317559274934334, + "loss": 0.92248654, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.80957031, + "step": 2575, + "time_per_iteration": 2.694953441619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161488, + "balance_loss_mlp": 1.08056831, + "epoch": 0.49557522123893805, + "flos": 529606393344.0, + "grad_norm": 0.03416750639658743, + "language_loss": 0.87365144, + "learning_rate": 0.0005314450080169382, + "loss": 0.8852663, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.80908203, + "step": 2576, + "time_per_iteration": 2.6648805141448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160753, + "balance_loss_mlp": 1.07973826, + "epoch": 0.49576760292420163, + "flos": 428917507584.0, + "grad_norm": 0.028909192983869472, + "language_loss": 0.86833698, + "learning_rate": 0.0005311340763324083, + "loss": 0.87994456, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.81005859, + "step": 2577, + "time_per_iteration": 2.563143014907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.07945204, + "epoch": 0.49595998460946517, + "flos": 566315629056.0, + "grad_norm": 0.02703431344264104, + "language_loss": 0.87897325, + "learning_rate": 0.0005308231325605578, + "loss": 0.8905803, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.8125, + "step": 2578, + "time_per_iteration": 2.690247058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159003, + "balance_loss_mlp": 1.07746387, + "epoch": 0.49615236629472875, + "flos": 703813807104.0, + "grad_norm": 0.02447176932933424, + "language_loss": 0.81124884, + "learning_rate": 0.0005305121768221061, + "loss": 0.8228389, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.81542969, + "step": 2579, + "time_per_iteration": 3.1026089191436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011698, + "balance_loss_mlp": 1.08969116, + "epoch": 0.4963447479799923, + "flos": 1444752539136.0, + "grad_norm": 0.010536082657862093, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76208121, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.80078125, + "step": 2580, + "time_per_iteration": 4.814293146133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160566, + "balance_loss_mlp": 1.07912242, + "epoch": 0.49653712966525587, + "flos": 538663179264.0, + "grad_norm": 0.027995208065503225, + "language_loss": 0.97084171, + "learning_rate": 0.0005298902299282984, + "loss": 0.98244739, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.81445312, + "step": 2581, + "time_per_iteration": 2.6197092533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115749, + "balance_loss_mlp": 1.07609439, + "epoch": 0.4967295113505194, + "flos": 608395554816.0, + "grad_norm": 0.029727926282221828, + "language_loss": 0.90264994, + "learning_rate": 0.0005295792390144033, + "loss": 0.91422486, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.81396484, + "step": 2582, + "time_per_iteration": 2.6830005645751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156586, + "balance_loss_mlp": 1.07528532, + "epoch": 0.496921893035783, + "flos": 475530192384.0, + "grad_norm": 0.034235181262718475, + "language_loss": 0.90576661, + "learning_rate": 0.0005292682366168294, + "loss": 0.91733253, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.81298828, + "step": 2583, + "time_per_iteration": 2.5291895866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158052, + "balance_loss_mlp": 1.07694244, + "epoch": 0.4971142747210466, + "flos": 598602895872.0, + "grad_norm": 0.029240794220739816, + "language_loss": 0.86485231, + "learning_rate": 0.0005289572228563181, + "loss": 0.8764329, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.81103516, + "step": 2584, + "time_per_iteration": 2.777571678161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159994, + "balance_loss_mlp": 1.0788368, + "epoch": 0.4973066564063101, + "flos": 600734653440.0, + "grad_norm": 0.030481884249605188, + "language_loss": 0.889974, + "learning_rate": 0.000528646197853616, + "loss": 0.90157396, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.81152344, + "step": 2585, + "time_per_iteration": 2.767935276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.08162796, + "epoch": 0.4974990380915737, + "flos": 650768919552.0, + "grad_norm": 0.027212373173769577, + "language_loss": 0.90572929, + "learning_rate": 0.0005283351617294735, + "loss": 0.91735625, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.81054688, + "step": 2586, + "time_per_iteration": 2.890571117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167969, + "balance_loss_mlp": 1.08862305, + "epoch": 0.49769141977683723, + "flos": 1532440032768.0, + "grad_norm": 0.00993779830792852, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77804637, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.79296875, + "step": 2587, + "time_per_iteration": 4.995927095413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116898, + "balance_loss_mlp": 1.08791721, + "epoch": 0.4978838014621008, + "flos": 537397550592.0, + "grad_norm": 0.03215658272946184, + "language_loss": 0.92911154, + "learning_rate": 0.0005277130565998916, + "loss": 0.94080132, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.81054688, + "step": 2588, + "time_per_iteration": 2.717165946960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162431, + "balance_loss_mlp": 1.08122599, + "epoch": 0.49807618314736435, + "flos": 540745271808.0, + "grad_norm": 0.02720148099542, + "language_loss": 0.86777204, + "learning_rate": 0.0005274019878359748, + "loss": 0.87939632, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.81201172, + "step": 2589, + "time_per_iteration": 2.71560001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162135, + "balance_loss_mlp": 1.08088183, + "epoch": 0.49826856483262794, + "flos": 543521577984.0, + "grad_norm": 0.03624054616449923, + "language_loss": 0.92995536, + "learning_rate": 0.0005270909084336628, + "loss": 0.94157672, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.8125, + "step": 2590, + "time_per_iteration": 2.6439368724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165371, + "balance_loss_mlp": 1.08435619, + "epoch": 0.4984609465178915, + "flos": 523360842240.0, + "grad_norm": 0.02994333023587166, + "language_loss": 0.94466031, + "learning_rate": 0.0005267798185137276, + "loss": 0.95631397, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.81005859, + "step": 2591, + "time_per_iteration": 2.6229867935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159677, + "balance_loss_mlp": 1.07851899, + "epoch": 0.49865332820315506, + "flos": 575704785408.0, + "grad_norm": 0.030323117469882623, + "language_loss": 0.94773531, + "learning_rate": 0.0005264687181969444, + "loss": 0.95933211, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.81152344, + "step": 2592, + "time_per_iteration": 2.7226686477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164286, + "balance_loss_mlp": 1.08303344, + "epoch": 0.49884570988841864, + "flos": 1015210497024.0, + "grad_norm": 0.0376584975450282, + "language_loss": 0.82159829, + "learning_rate": 0.0005261576076040937, + "loss": 0.83324111, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.8125, + "step": 2593, + "time_per_iteration": 3.2477946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169307, + "balance_loss_mlp": 1.08843529, + "epoch": 0.4990380915736822, + "flos": 560647497216.0, + "grad_norm": 0.03227625840551658, + "language_loss": 0.90092522, + "learning_rate": 0.0005258464868559591, + "loss": 0.91261828, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.80859375, + "step": 2594, + "time_per_iteration": 2.650367259979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167043, + "balance_loss_mlp": 1.08588493, + "epoch": 0.49923047325894576, + "flos": 499943709696.0, + "grad_norm": 0.030210069947970843, + "language_loss": 0.94528484, + "learning_rate": 0.0005255353560733284, + "loss": 0.95695531, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.81152344, + "step": 2595, + "time_per_iteration": 2.6242079734802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174149, + "balance_loss_mlp": 1.09518433, + "epoch": 0.4994228549442093, + "flos": 1499788194816.0, + "grad_norm": 0.015118012466641684, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76752794, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.7890625, + "step": 2596, + "time_per_iteration": 4.820875883102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116628, + "balance_loss_mlp": 1.08521724, + "epoch": 0.4996152366294729, + "flos": 558513738240.0, + "grad_norm": 0.031441861478263874, + "language_loss": 0.89123356, + "learning_rate": 0.0005249130648877492, + "loss": 0.9028964, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.81054688, + "step": 2597, + "time_per_iteration": 2.71932053565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158102, + "balance_loss_mlp": 1.07699203, + "epoch": 0.4998076183147364, + "flos": 416482801152.0, + "grad_norm": 0.03314289919132309, + "language_loss": 0.90550959, + "learning_rate": 0.0005246019047263953, + "loss": 0.91709059, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.81103516, + "step": 2598, + "time_per_iteration": 2.4899134635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158453, + "balance_loss_mlp": 1.07739091, + "epoch": 0.5, + "flos": 468325186560.0, + "grad_norm": 0.03341299307449988, + "language_loss": 0.88387024, + "learning_rate": 0.0005242907350137353, + "loss": 0.89545476, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.81054688, + "step": 2599, + "time_per_iteration": 2.553997039794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164809, + "balance_loss_mlp": 1.08369899, + "epoch": 0.5001923816852636, + "flos": 483755778048.0, + "grad_norm": 0.03321709561705903, + "language_loss": 0.85543942, + "learning_rate": 0.0005239795558705754, + "loss": 0.86708754, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.81103516, + "step": 2600, + "time_per_iteration": 2.6166868209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164506, + "balance_loss_mlp": 1.08339632, + "epoch": 0.5003847633705272, + "flos": 534855559680.0, + "grad_norm": 0.030012173683065246, + "language_loss": 0.95093107, + "learning_rate": 0.0005236683674177264, + "loss": 0.96257615, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.81103516, + "step": 2601, + "time_per_iteration": 2.6404433250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162684, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5005771450557907, + "flos": 739055299584.0, + "grad_norm": 0.032030290781944436, + "language_loss": 0.88311857, + "learning_rate": 0.0005233571697760021, + "loss": 0.89474535, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.81103516, + "step": 2602, + "time_per_iteration": 2.8534095287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160577, + "balance_loss_mlp": 1.07937133, + "epoch": 0.5007695267410542, + "flos": 780306026496.0, + "grad_norm": 0.036141348793487994, + "language_loss": 0.90016913, + "learning_rate": 0.0005230459630662203, + "loss": 0.91177493, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.81201172, + "step": 2603, + "time_per_iteration": 2.952563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162299, + "balance_loss_mlp": 1.0812366, + "epoch": 0.5009619084263178, + "flos": 624618415104.0, + "grad_norm": 0.03600647163377571, + "language_loss": 0.88813984, + "learning_rate": 0.0005227347474092022, + "loss": 0.89976281, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.81054688, + "step": 2604, + "time_per_iteration": 2.70975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166549, + "balance_loss_mlp": 1.08543897, + "epoch": 0.5011542901115814, + "flos": 532192045056.0, + "grad_norm": 0.023202845192485378, + "language_loss": 0.88172328, + "learning_rate": 0.0005224235229257724, + "loss": 0.89338881, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.81103516, + "step": 2605, + "time_per_iteration": 2.6811788082122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165049, + "balance_loss_mlp": 1.08393872, + "epoch": 0.5013466717968449, + "flos": 528627472896.0, + "grad_norm": 0.02710312658737552, + "language_loss": 0.91735983, + "learning_rate": 0.0005221122897367589, + "loss": 0.92901027, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.81103516, + "step": 2606, + "time_per_iteration": 2.7866344451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115755, + "balance_loss_mlp": 1.07644022, + "epoch": 0.5015390534821085, + "flos": 567088432128.0, + "grad_norm": 0.035852557706828735, + "language_loss": 0.88253903, + "learning_rate": 0.0005218010479629932, + "loss": 0.89411449, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.81103516, + "step": 2607, + "time_per_iteration": 2.7290749549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.07594728, + "epoch": 0.5017314351673721, + "flos": 567767909376.0, + "grad_norm": 0.03266328125205783, + "language_loss": 0.88539654, + "learning_rate": 0.0005214897977253102, + "loss": 0.89696807, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.81201172, + "step": 2608, + "time_per_iteration": 2.695686101913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158723, + "balance_loss_mlp": 1.07751739, + "epoch": 0.5019238168526357, + "flos": 523387038720.0, + "grad_norm": 0.02584859781626205, + "language_loss": 0.88962579, + "learning_rate": 0.0005211785391445473, + "loss": 0.90121305, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.81201172, + "step": 2609, + "time_per_iteration": 2.7320780754089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.07674336, + "epoch": 0.5021161985378992, + "flos": 642636659712.0, + "grad_norm": 0.03213074952610081, + "language_loss": 0.85809815, + "learning_rate": 0.0005208672723415467, + "loss": 0.86967611, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.81054688, + "step": 2610, + "time_per_iteration": 2.8137152194976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115836, + "balance_loss_mlp": 1.07729781, + "epoch": 0.5023085802231627, + "flos": 592422472704.0, + "grad_norm": 0.03276582898634011, + "language_loss": 0.85898113, + "learning_rate": 0.0005205559974371525, + "loss": 0.8705647, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.81054688, + "step": 2611, + "time_per_iteration": 2.7611584663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158271, + "balance_loss_mlp": 1.07720828, + "epoch": 0.5025009619084263, + "flos": 473333306880.0, + "grad_norm": 0.02842666355233711, + "language_loss": 0.86990851, + "learning_rate": 0.0005202447145522123, + "loss": 0.88149118, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.81054688, + "step": 2612, + "time_per_iteration": 2.6646487712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161741, + "balance_loss_mlp": 1.08067882, + "epoch": 0.5026933435936899, + "flos": 456077131776.0, + "grad_norm": 0.031223796902704184, + "language_loss": 0.84174728, + "learning_rate": 0.0005199334238075769, + "loss": 0.85336471, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.81054688, + "step": 2613, + "time_per_iteration": 2.567990779876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163025, + "balance_loss_mlp": 1.08229649, + "epoch": 0.5028857252789535, + "flos": 492721239552.0, + "grad_norm": 0.02841040015147714, + "language_loss": 0.97840261, + "learning_rate": 0.0005196221253241, + "loss": 0.99003285, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.80712891, + "step": 2614, + "time_per_iteration": 2.5584659576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160421, + "balance_loss_mlp": 1.07988286, + "epoch": 0.503078106964217, + "flos": 626730706944.0, + "grad_norm": 0.03241817920698289, + "language_loss": 0.88891315, + "learning_rate": 0.0005193108192226383, + "loss": 0.90051734, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.80517578, + "step": 2615, + "time_per_iteration": 2.7840871810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164078, + "balance_loss_mlp": 1.0830152, + "epoch": 0.5032704886494805, + "flos": 580137487872.0, + "grad_norm": 0.02867464613296787, + "language_loss": 0.91759968, + "learning_rate": 0.000518999505624052, + "loss": 0.92924047, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.81054688, + "step": 2616, + "time_per_iteration": 2.6807193756103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161331, + "balance_loss_mlp": 1.08017337, + "epoch": 0.5034628703347441, + "flos": 472845210624.0, + "grad_norm": 0.027070743385767714, + "language_loss": 0.8816672, + "learning_rate": 0.000518688184649203, + "loss": 0.89328051, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.81152344, + "step": 2617, + "time_per_iteration": 2.7943994998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159886, + "balance_loss_mlp": 1.07877576, + "epoch": 0.5036552520200077, + "flos": 490813063680.0, + "grad_norm": 0.03074056287258418, + "language_loss": 0.88926733, + "learning_rate": 0.0005183768564189577, + "loss": 0.90086615, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.81103516, + "step": 2618, + "time_per_iteration": 2.549255609512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.07860434, + "epoch": 0.5038476337052713, + "flos": 495215566848.0, + "grad_norm": 0.030783318052010424, + "language_loss": 0.87459326, + "learning_rate": 0.0005180655210541838, + "loss": 0.88619089, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.81152344, + "step": 2619, + "time_per_iteration": 2.5555741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157127, + "balance_loss_mlp": 1.0759213, + "epoch": 0.5040400153905348, + "flos": 601739770368.0, + "grad_norm": 0.036447475930772646, + "language_loss": 0.89893603, + "learning_rate": 0.0005177541786757527, + "loss": 0.91050732, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.81201172, + "step": 2620, + "time_per_iteration": 2.75068998336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157658, + "balance_loss_mlp": 1.07621455, + "epoch": 0.5042323970757984, + "flos": 812918932992.0, + "grad_norm": 0.03476449221513998, + "language_loss": 0.90274507, + "learning_rate": 0.000517442829404538, + "loss": 0.91432166, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.81445312, + "step": 2621, + "time_per_iteration": 2.981661558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.07854116, + "epoch": 0.504424778761062, + "flos": 628606682112.0, + "grad_norm": 0.030074963346690586, + "language_loss": 0.92839754, + "learning_rate": 0.0005171314733614166, + "loss": 0.93999791, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.81494141, + "step": 2622, + "time_per_iteration": 2.942354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160933, + "balance_loss_mlp": 1.07934618, + "epoch": 0.5046171604463255, + "flos": 516956837376.0, + "grad_norm": 0.029806335990833818, + "language_loss": 0.84097135, + "learning_rate": 0.0005168201106672671, + "loss": 0.85258067, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.81591797, + "step": 2623, + "time_per_iteration": 2.7703733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160048, + "balance_loss_mlp": 1.07841325, + "epoch": 0.504809542131589, + "flos": 528853056000.0, + "grad_norm": 0.03248441490058616, + "language_loss": 0.91679412, + "learning_rate": 0.0005165087414429717, + "loss": 0.92839456, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.81640625, + "step": 2624, + "time_per_iteration": 2.620872974395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116106, + "balance_loss_mlp": 1.07937741, + "epoch": 0.5050019238168526, + "flos": 555174749184.0, + "grad_norm": 0.03119977790816051, + "language_loss": 0.88980711, + "learning_rate": 0.0005161973658094144, + "loss": 0.90141767, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.81689453, + "step": 2625, + "time_per_iteration": 2.640408754348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161049, + "balance_loss_mlp": 1.07955778, + "epoch": 0.5051943055021162, + "flos": 575928367104.0, + "grad_norm": 0.024986408688213266, + "language_loss": 0.88551366, + "learning_rate": 0.000515885983887482, + "loss": 0.89712417, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.81494141, + "step": 2626, + "time_per_iteration": 2.7737276554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161913, + "balance_loss_mlp": 1.08066046, + "epoch": 0.5053866871873798, + "flos": 497681696256.0, + "grad_norm": 0.03126501141119064, + "language_loss": 0.91551393, + "learning_rate": 0.0005155745957980636, + "loss": 0.92713308, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.8125, + "step": 2627, + "time_per_iteration": 2.5588245391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159267, + "balance_loss_mlp": 1.07801354, + "epoch": 0.5055790688726434, + "flos": 503219572224.0, + "grad_norm": 0.028407663328603422, + "language_loss": 0.94095421, + "learning_rate": 0.000515263201662051, + "loss": 0.95254695, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.8125, + "step": 2628, + "time_per_iteration": 2.6333348751068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115977, + "balance_loss_mlp": 1.07851708, + "epoch": 0.5057714505579068, + "flos": 846767268864.0, + "grad_norm": 0.025627158908879104, + "language_loss": 0.8802768, + "learning_rate": 0.0005149518016003378, + "loss": 0.89187449, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.8125, + "step": 2629, + "time_per_iteration": 3.159515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115843, + "balance_loss_mlp": 1.07722509, + "epoch": 0.5059638322431704, + "flos": 498808336896.0, + "grad_norm": 0.032654832965012745, + "language_loss": 0.88445461, + "learning_rate": 0.0005146403957338206, + "loss": 0.89603889, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.81201172, + "step": 2630, + "time_per_iteration": 2.569671154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166774, + "balance_loss_mlp": 1.08571208, + "epoch": 0.506156213928434, + "flos": 619113466368.0, + "grad_norm": 0.027165343024338446, + "language_loss": 0.86742038, + "learning_rate": 0.0005143289841833975, + "loss": 0.8790881, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.81054688, + "step": 2631, + "time_per_iteration": 2.8505327701568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169752, + "balance_loss_mlp": 1.08911932, + "epoch": 0.5063485956136976, + "flos": 425789365248.0, + "grad_norm": 0.03495904047465476, + "language_loss": 0.89354646, + "learning_rate": 0.0005140175670699696, + "loss": 0.90524399, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.80615234, + "step": 2632, + "time_per_iteration": 2.5920779705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174002, + "balance_loss_mlp": 1.09341669, + "epoch": 0.5065409772989612, + "flos": 571069968384.0, + "grad_norm": 0.02494402323857881, + "language_loss": 0.86924809, + "learning_rate": 0.0005137061445144395, + "loss": 0.88098812, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.80566406, + "step": 2633, + "time_per_iteration": 2.8890433311462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172992, + "balance_loss_mlp": 1.09250152, + "epoch": 0.5067333589842247, + "flos": 629969639424.0, + "grad_norm": 0.03395805639170181, + "language_loss": 0.93242514, + "learning_rate": 0.000513394716637712, + "loss": 0.94415504, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.8046875, + "step": 2634, + "time_per_iteration": 2.7772305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171906, + "balance_loss_mlp": 1.09217834, + "epoch": 0.5069257406694883, + "flos": 1451096145408.0, + "grad_norm": 0.011960900894201355, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80363613, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.796875, + "step": 2635, + "time_per_iteration": 4.93586802482605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116392, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5071181223547518, + "flos": 640057738752.0, + "grad_norm": 0.03273720191955115, + "language_loss": 0.86367166, + "learning_rate": 0.0005127718454042958, + "loss": 0.87531078, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.80664062, + "step": 2636, + "time_per_iteration": 2.8407700061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115771, + "balance_loss_mlp": 1.07683849, + "epoch": 0.5073105040400154, + "flos": 714872094720.0, + "grad_norm": 0.03167408399625075, + "language_loss": 0.89809334, + "learning_rate": 0.0005124604022894269, + "loss": 0.90967047, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.80859375, + "step": 2637, + "time_per_iteration": 2.9438648223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.08575439, + "epoch": 0.5075028857252789, + "flos": 1439612161536.0, + "grad_norm": 0.009234713476178756, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78353328, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.79296875, + "step": 2638, + "time_per_iteration": 4.855467319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170042, + "balance_loss_mlp": 1.08950412, + "epoch": 0.5076952674105425, + "flos": 572307399168.0, + "grad_norm": 0.033371281415520225, + "language_loss": 0.89923447, + "learning_rate": 0.0005118375016679325, + "loss": 0.91093493, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.80517578, + "step": 2639, + "time_per_iteration": 2.7761123180389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168126, + "balance_loss_mlp": 1.08735013, + "epoch": 0.5078876490958061, + "flos": 517712176128.0, + "grad_norm": 0.04218063889538898, + "language_loss": 0.87796986, + "learning_rate": 0.0005115260444031382, + "loss": 0.88965112, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.80761719, + "step": 2640, + "time_per_iteration": 2.5914742946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164741, + "balance_loss_mlp": 1.08596802, + "epoch": 0.5080800307810697, + "flos": 1587619405824.0, + "grad_norm": 0.012463066852979446, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79896557, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.78710938, + "step": 2641, + "time_per_iteration": 4.9428391456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164884, + "balance_loss_mlp": 1.08420289, + "epoch": 0.5082724124663333, + "flos": 486186978816.0, + "grad_norm": 0.039006057605032056, + "language_loss": 0.93060952, + "learning_rate": 0.0005109031165700483, + "loss": 0.94225836, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.80664062, + "step": 2642, + "time_per_iteration": 2.5630409717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164249, + "balance_loss_mlp": 1.08318675, + "epoch": 0.5084647941515967, + "flos": 683442224640.0, + "grad_norm": 0.03324563219825503, + "language_loss": 0.88873887, + "learning_rate": 0.0005105916462435945, + "loss": 0.90038145, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.81054688, + "step": 2643, + "time_per_iteration": 2.8135592937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165445, + "balance_loss_mlp": 1.08438289, + "epoch": 0.5086571758368603, + "flos": 549812791296.0, + "grad_norm": 0.031221131167697595, + "language_loss": 0.92092431, + "learning_rate": 0.0005102801718050989, + "loss": 0.93257874, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.81054688, + "step": 2644, + "time_per_iteration": 2.684957981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.08413339, + "epoch": 0.5088495575221239, + "flos": 565078198272.0, + "grad_norm": 0.032204925975490975, + "language_loss": 0.95189679, + "learning_rate": 0.0005099686933754867, + "loss": 0.96354735, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.80908203, + "step": 2645, + "time_per_iteration": 2.6721112728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167263, + "balance_loss_mlp": 1.08620095, + "epoch": 0.5090419392073875, + "flos": 552511234560.0, + "grad_norm": 0.03332524240735616, + "language_loss": 0.90223062, + "learning_rate": 0.0005096572110756845, + "loss": 0.9139033, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.81054688, + "step": 2646, + "time_per_iteration": 2.6559739112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.08686149, + "epoch": 0.509234320892651, + "flos": 568883816448.0, + "grad_norm": 0.029529111031728714, + "language_loss": 0.90596855, + "learning_rate": 0.0005093457250266205, + "loss": 0.91764688, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.80957031, + "step": 2647, + "time_per_iteration": 2.7653987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167889, + "balance_loss_mlp": 1.08673143, + "epoch": 0.5094267025779146, + "flos": 583693327872.0, + "grad_norm": 0.03457257756125772, + "language_loss": 0.89727396, + "learning_rate": 0.000509034235349224, + "loss": 0.90895277, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.81152344, + "step": 2648, + "time_per_iteration": 2.690363645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159753, + "balance_loss_mlp": 1.07854819, + "epoch": 0.5096190842631781, + "flos": 593138880000.0, + "grad_norm": 0.0341546457293008, + "language_loss": 0.88255095, + "learning_rate": 0.0005087227421644266, + "loss": 0.89414853, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.81201172, + "step": 2649, + "time_per_iteration": 2.6982481479644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.07891166, + "epoch": 0.5098114659484417, + "flos": 514584033792.0, + "grad_norm": 0.030485361797949893, + "language_loss": 0.92298341, + "learning_rate": 0.0005084112455931602, + "loss": 0.93458325, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.81054688, + "step": 2650, + "time_per_iteration": 2.5739448070526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162291, + "balance_loss_mlp": 1.08170521, + "epoch": 0.5100038476337053, + "flos": 485600827392.0, + "grad_norm": 0.03052985498468287, + "language_loss": 0.91529775, + "learning_rate": 0.0005080997457563586, + "loss": 0.92692065, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.80566406, + "step": 2651, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165638, + "balance_loss_mlp": 1.08514845, + "epoch": 0.5101962293189688, + "flos": 462554996736.0, + "grad_norm": 0.037278277228963375, + "language_loss": 0.86181092, + "learning_rate": 0.0005077882427749569, + "loss": 0.87346727, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.8046875, + "step": 2652, + "time_per_iteration": 2.490943670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158092, + "balance_loss_mlp": 1.07745898, + "epoch": 0.5103886110042324, + "flos": 588132761088.0, + "grad_norm": 0.03182463194953253, + "language_loss": 0.91334021, + "learning_rate": 0.0005074767367698913, + "loss": 0.9249211, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.80615234, + "step": 2653, + "time_per_iteration": 2.6900839805603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.07847476, + "epoch": 0.510580992689496, + "flos": 846677945856.0, + "grad_norm": 0.027057922805634398, + "language_loss": 0.89024949, + "learning_rate": 0.0005071652278620988, + "loss": 0.90184009, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.80566406, + "step": 2654, + "time_per_iteration": 3.044296979904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115973, + "balance_loss_mlp": 1.07919204, + "epoch": 0.5107733743747596, + "flos": 659810242560.0, + "grad_norm": 0.0315385737613105, + "language_loss": 0.89305294, + "learning_rate": 0.0005068537161725186, + "loss": 0.90465021, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.80517578, + "step": 2655, + "time_per_iteration": 2.770669937133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160641, + "balance_loss_mlp": 1.08000755, + "epoch": 0.510965756060023, + "flos": 702960413184.0, + "grad_norm": 0.03531630249392906, + "language_loss": 0.91070223, + "learning_rate": 0.0005065422018220893, + "loss": 0.92230862, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.80615234, + "step": 2656, + "time_per_iteration": 2.833031177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165495, + "balance_loss_mlp": 1.08490956, + "epoch": 0.5111581377452866, + "flos": 560940936192.0, + "grad_norm": 0.03615724120857576, + "language_loss": 0.85921729, + "learning_rate": 0.0005062306849317521, + "loss": 0.87087226, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.80566406, + "step": 2657, + "time_per_iteration": 2.800971031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.0873909, + "epoch": 0.5113505194305502, + "flos": 610145276928.0, + "grad_norm": 0.029932060678028026, + "language_loss": 0.88435352, + "learning_rate": 0.0005059191656224487, + "loss": 0.89603281, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.80517578, + "step": 2658, + "time_per_iteration": 2.7075443267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159414, + "balance_loss_mlp": 1.07882822, + "epoch": 0.5115429011158138, + "flos": 535535036928.0, + "grad_norm": 0.028231439832000826, + "language_loss": 0.94975483, + "learning_rate": 0.0005056076440151212, + "loss": 0.96134901, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.80566406, + "step": 2659, + "time_per_iteration": 2.6906392574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162323, + "balance_loss_mlp": 1.0835495, + "epoch": 0.5117352828010774, + "flos": 1365273166848.0, + "grad_norm": 0.00971890017277948, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77450442, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.78515625, + "step": 2660, + "time_per_iteration": 4.880187273025513 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160968, + "balance_loss_mlp": 1.07990551, + "epoch": 0.5119276644863409, + "flos": 634930096128.0, + "grad_norm": 0.027317751888226913, + "language_loss": 0.91815728, + "learning_rate": 0.0005049845943901691, + "loss": 0.92976695, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.81054688, + "step": 2661, + "time_per_iteration": 2.8184986114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160969, + "balance_loss_mlp": 1.08004987, + "epoch": 0.5121200461716044, + "flos": 586780537344.0, + "grad_norm": 0.02944382500923868, + "language_loss": 0.91654462, + "learning_rate": 0.0005046730666144338, + "loss": 0.92815423, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.80908203, + "step": 2662, + "time_per_iteration": 2.755974769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.0798161, + "epoch": 0.512312427856868, + "flos": 1034223124992.0, + "grad_norm": 0.029507171441845153, + "language_loss": 0.93013144, + "learning_rate": 0.0005043615370244532, + "loss": 0.94174021, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.81054688, + "step": 2663, + "time_per_iteration": 3.3488211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177849, + "balance_loss_mlp": 1.09907532, + "epoch": 0.5125048095421316, + "flos": 1540899207168.0, + "grad_norm": 0.013662934984579522, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79422235, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.78710938, + "step": 2664, + "time_per_iteration": 4.6237993240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162223, + "balance_loss_mlp": 1.08130419, + "epoch": 0.5126971912273951, + "flos": 592327145472.0, + "grad_norm": 0.024418914459260154, + "language_loss": 0.89686567, + "learning_rate": 0.0005037384728855425, + "loss": 0.90848792, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.80908203, + "step": 2665, + "time_per_iteration": 2.8003761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163774, + "balance_loss_mlp": 1.08299828, + "epoch": 0.5128895729126587, + "flos": 552717351936.0, + "grad_norm": 0.03867267783646357, + "language_loss": 0.9114759, + "learning_rate": 0.0005034269385785075, + "loss": 0.9231137, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.80761719, + "step": 2666, + "time_per_iteration": 2.664607286453247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161159, + "balance_loss_mlp": 1.08047831, + "epoch": 0.5130819545979223, + "flos": 482231639040.0, + "grad_norm": 0.037339426134761385, + "language_loss": 0.92204285, + "learning_rate": 0.0005031154029410168, + "loss": 0.93365449, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.80664062, + "step": 2667, + "time_per_iteration": 2.5419206619262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157708, + "balance_loss_mlp": 1.0769316, + "epoch": 0.5132743362831859, + "flos": 476767623168.0, + "grad_norm": 0.03576788906651519, + "language_loss": 0.93073893, + "learning_rate": 0.0005028038660940197, + "loss": 0.942316, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.80761719, + "step": 2668, + "time_per_iteration": 2.5499191284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166102, + "balance_loss_mlp": 1.08542132, + "epoch": 0.5134667179684494, + "flos": 504902164992.0, + "grad_norm": 0.02981054719592371, + "language_loss": 0.89144588, + "learning_rate": 0.0005024923281584648, + "loss": 0.90310693, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.80664062, + "step": 2669, + "time_per_iteration": 2.6367011070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165888, + "balance_loss_mlp": 1.08496881, + "epoch": 0.5136590996537129, + "flos": 505004222976.0, + "grad_norm": 0.029270286325536108, + "language_loss": 0.87695622, + "learning_rate": 0.0005021807892553026, + "loss": 0.88861501, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.80908203, + "step": 2670, + "time_per_iteration": 2.697326421737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165807, + "balance_loss_mlp": 1.08522201, + "epoch": 0.5138514813389765, + "flos": 625799450112.0, + "grad_norm": 0.029434336289691197, + "language_loss": 0.8977018, + "learning_rate": 0.0005018692495054828, + "loss": 0.90935987, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.80566406, + "step": 2671, + "time_per_iteration": 2.848576784133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154912, + "balance_loss_mlp": 1.07394516, + "epoch": 0.5140438630242401, + "flos": 584633316864.0, + "grad_norm": 0.027486728027613972, + "language_loss": 0.85466325, + "learning_rate": 0.0005015577090299561, + "loss": 0.86621237, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.80957031, + "step": 2672, + "time_per_iteration": 2.698976993560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155424, + "balance_loss_mlp": 1.0744096, + "epoch": 0.5142362447095037, + "flos": 488904887808.0, + "grad_norm": 0.030629892529963922, + "language_loss": 0.92615306, + "learning_rate": 0.0005012461679496729, + "loss": 0.9377073, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.81005859, + "step": 2673, + "time_per_iteration": 2.5998294353485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115671, + "balance_loss_mlp": 1.07564759, + "epoch": 0.5144286263947672, + "flos": 527884869120.0, + "grad_norm": 0.029257555563523763, + "language_loss": 0.93652987, + "learning_rate": 0.0005009346263855848, + "loss": 0.94809699, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.81054688, + "step": 2674, + "time_per_iteration": 2.702364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156546, + "balance_loss_mlp": 1.07548332, + "epoch": 0.5146210080800308, + "flos": 487589594112.0, + "grad_norm": 0.025826040346785265, + "language_loss": 0.88576883, + "learning_rate": 0.0005006230844586422, + "loss": 0.89733428, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.81054688, + "step": 2675, + "time_per_iteration": 2.7889058589935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159564, + "balance_loss_mlp": 1.07845449, + "epoch": 0.5148133897652943, + "flos": 516974301696.0, + "grad_norm": 0.025127862595781116, + "language_loss": 0.83195055, + "learning_rate": 0.0005003115422897968, + "loss": 0.84354615, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.81103516, + "step": 2676, + "time_per_iteration": 2.7474374771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165139, + "balance_loss_mlp": 1.08436286, + "epoch": 0.5150057714505579, + "flos": 512211230208.0, + "grad_norm": 0.02805317572608274, + "language_loss": 0.92311704, + "learning_rate": 0.0005, + "loss": 0.93476844, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.80761719, + "step": 2677, + "time_per_iteration": 2.635801076889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167536, + "balance_loss_mlp": 1.08652139, + "epoch": 0.5151981531358215, + "flos": 912389853696.0, + "grad_norm": 0.03671017270530106, + "language_loss": 0.86270726, + "learning_rate": 0.0004996884577102033, + "loss": 0.87438262, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.81005859, + "step": 2678, + "time_per_iteration": 3.1016898155212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116356, + "balance_loss_mlp": 1.08264065, + "epoch": 0.515390534821085, + "flos": 472929804288.0, + "grad_norm": 0.02746999857609634, + "language_loss": 0.90178144, + "learning_rate": 0.000499376915541358, + "loss": 0.91341698, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.80908203, + "step": 2679, + "time_per_iteration": 2.7041540145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163327, + "balance_loss_mlp": 1.0826937, + "epoch": 0.5155829165063486, + "flos": 651357072384.0, + "grad_norm": 0.02786171231522906, + "language_loss": 0.85589147, + "learning_rate": 0.0004990653736144155, + "loss": 0.86752468, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.80615234, + "step": 2680, + "time_per_iteration": 2.883392572402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163916, + "balance_loss_mlp": 1.08280623, + "epoch": 0.5157752981916122, + "flos": 415160776704.0, + "grad_norm": 0.030701546031170052, + "language_loss": 0.92331398, + "learning_rate": 0.0004987538320503271, + "loss": 0.93495315, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.81103516, + "step": 2681, + "time_per_iteration": 2.4719676971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169918, + "balance_loss_mlp": 1.0890938, + "epoch": 0.5159676798768758, + "flos": 554931701760.0, + "grad_norm": 0.03041903817165714, + "language_loss": 0.89793313, + "learning_rate": 0.0004984422909700442, + "loss": 0.90963233, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.80810547, + "step": 2682, + "time_per_iteration": 2.7486019134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168893, + "balance_loss_mlp": 1.08816493, + "epoch": 0.5161600615621393, + "flos": 587620469760.0, + "grad_norm": 0.02833679783776788, + "language_loss": 0.89197505, + "learning_rate": 0.0004981307504945173, + "loss": 0.90366399, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.80712891, + "step": 2683, + "time_per_iteration": 2.6918153762817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161385, + "balance_loss_mlp": 1.08060837, + "epoch": 0.5163524432474028, + "flos": 589947611136.0, + "grad_norm": 0.03153559446680845, + "language_loss": 0.9527353, + "learning_rate": 0.0004978192107446976, + "loss": 0.96434915, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.80761719, + "step": 2684, + "time_per_iteration": 2.7622218132019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.07906806, + "epoch": 0.5165448249326664, + "flos": 504904166400.0, + "grad_norm": 0.029863924033148703, + "language_loss": 0.92634213, + "learning_rate": 0.0004975076718415353, + "loss": 0.93793911, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.80615234, + "step": 2685, + "time_per_iteration": 2.644228219985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172411, + "balance_loss_mlp": 1.09220684, + "epoch": 0.51673720661793, + "flos": 417646371840.0, + "grad_norm": 0.031084732221220036, + "language_loss": 0.95470178, + "learning_rate": 0.0004971961339059806, + "loss": 0.96642584, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.80175781, + "step": 2686, + "time_per_iteration": 2.469081401824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160009, + "balance_loss_mlp": 1.0795666, + "epoch": 0.5169295883031936, + "flos": 600074641920.0, + "grad_norm": 0.03147701291149863, + "language_loss": 0.89665824, + "learning_rate": 0.0004968845970589832, + "loss": 0.90825832, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.80419922, + "step": 2687, + "time_per_iteration": 2.7054736614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159105, + "balance_loss_mlp": 1.07847178, + "epoch": 0.517121969988457, + "flos": 557910122496.0, + "grad_norm": 0.03772331123991374, + "language_loss": 0.90882772, + "learning_rate": 0.0004965730614214926, + "loss": 0.92041886, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.80615234, + "step": 2688, + "time_per_iteration": 2.6433985233306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159068, + "balance_loss_mlp": 1.0787214, + "epoch": 0.5173143516737206, + "flos": 470374351872.0, + "grad_norm": 0.031353493154565384, + "language_loss": 0.9113276, + "learning_rate": 0.0004962615271144576, + "loss": 0.92291832, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.80322266, + "step": 2689, + "time_per_iteration": 2.5081796646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159566, + "balance_loss_mlp": 1.07912409, + "epoch": 0.5175067333589842, + "flos": 721378157568.0, + "grad_norm": 0.03531118205346665, + "language_loss": 0.88785195, + "learning_rate": 0.0004959499942588264, + "loss": 0.89944768, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.80419922, + "step": 2690, + "time_per_iteration": 2.8977034091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165977, + "balance_loss_mlp": 1.08682251, + "epoch": 0.5176991150442478, + "flos": 1469341974528.0, + "grad_norm": 0.00940812354228104, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79365999, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.79101562, + "step": 2691, + "time_per_iteration": 4.744166851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162816, + "balance_loss_mlp": 1.08227849, + "epoch": 0.5178914967295114, + "flos": 613783709184.0, + "grad_norm": 0.0285194405600695, + "language_loss": 0.91181535, + "learning_rate": 0.0004953269333855661, + "loss": 0.92344356, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.80517578, + "step": 2692, + "time_per_iteration": 2.7305634021759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164372, + "balance_loss_mlp": 1.0839293, + "epoch": 0.5180838784147749, + "flos": 501980140032.0, + "grad_norm": 0.03457473418848995, + "language_loss": 0.89626956, + "learning_rate": 0.0004950154056098309, + "loss": 0.90791321, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.80419922, + "step": 2693, + "time_per_iteration": 2.7358009815216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162458, + "balance_loss_mlp": 1.08215868, + "epoch": 0.5182762601000385, + "flos": 690041613312.0, + "grad_norm": 0.03333155233389222, + "language_loss": 0.90543425, + "learning_rate": 0.0004947038797692867, + "loss": 0.91705889, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.80273438, + "step": 2694, + "time_per_iteration": 2.8636367321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178055, + "balance_loss_mlp": 1.09775615, + "epoch": 0.518468641785302, + "flos": 666800398848.0, + "grad_norm": 0.03410817354988479, + "language_loss": 0.8335048, + "learning_rate": 0.0004943923559848789, + "loss": 0.84528536, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.80273438, + "step": 2695, + "time_per_iteration": 2.797072172164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117824, + "balance_loss_mlp": 1.09794104, + "epoch": 0.5186610234705656, + "flos": 567813571584.0, + "grad_norm": 0.02729227458516312, + "language_loss": 0.95474803, + "learning_rate": 0.0004940808343775515, + "loss": 0.96653044, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.80273438, + "step": 2696, + "time_per_iteration": 2.6839044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162534, + "balance_loss_mlp": 1.08204436, + "epoch": 0.5188534051558291, + "flos": 429792368640.0, + "grad_norm": 0.03355790964159957, + "language_loss": 0.87542081, + "learning_rate": 0.0004937693150682479, + "loss": 0.88704622, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.8046875, + "step": 2697, + "time_per_iteration": 2.5123825073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08045113, + "epoch": 0.5190457868410927, + "flos": 547411789824.0, + "grad_norm": 0.031455242836056954, + "language_loss": 0.81813598, + "learning_rate": 0.0004934577981779107, + "loss": 0.82974923, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.80859375, + "step": 2698, + "time_per_iteration": 2.662545919418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117247, + "balance_loss_mlp": 1.09159839, + "epoch": 0.5192381685263563, + "flos": 549745661952.0, + "grad_norm": 0.02804159255629041, + "language_loss": 0.86178321, + "learning_rate": 0.0004931462838274817, + "loss": 0.87350786, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.80859375, + "step": 2699, + "time_per_iteration": 2.877682685852051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172435, + "balance_loss_mlp": 1.09156311, + "epoch": 0.5194305502116199, + "flos": 576349334016.0, + "grad_norm": 0.03885998177020277, + "language_loss": 0.90400088, + "learning_rate": 0.0004928347721379011, + "loss": 0.91572523, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.80859375, + "step": 2700, + "time_per_iteration": 2.671849489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169932, + "balance_loss_mlp": 1.08906007, + "epoch": 0.5196229318968835, + "flos": 435217453056.0, + "grad_norm": 0.030583901836551724, + "language_loss": 0.87633044, + "learning_rate": 0.0004925232632301089, + "loss": 0.88802975, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.80859375, + "step": 2701, + "time_per_iteration": 2.57857608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166407, + "balance_loss_mlp": 1.08558309, + "epoch": 0.5198153135821469, + "flos": 559985484288.0, + "grad_norm": 0.03187287566803064, + "language_loss": 0.85556304, + "learning_rate": 0.0004922117572250431, + "loss": 0.86722708, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.80810547, + "step": 2702, + "time_per_iteration": 2.7037737369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166773, + "balance_loss_mlp": 1.08618808, + "epoch": 0.5200076952674105, + "flos": 566834651136.0, + "grad_norm": 0.03219739559056917, + "language_loss": 0.8641057, + "learning_rate": 0.0004919002542436414, + "loss": 0.87577343, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.80566406, + "step": 2703, + "time_per_iteration": 2.8919363021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08965361, + "epoch": 0.5202000769526741, + "flos": 572272470528.0, + "grad_norm": 0.0327510509858114, + "language_loss": 0.87948251, + "learning_rate": 0.0004915887544068399, + "loss": 0.89118207, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.80273438, + "step": 2704, + "time_per_iteration": 2.6497535705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169089, + "balance_loss_mlp": 1.08869386, + "epoch": 0.5203924586379377, + "flos": 695466697728.0, + "grad_norm": 0.02924473313894461, + "language_loss": 0.83824521, + "learning_rate": 0.0004912772578355736, + "loss": 0.84993607, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.80371094, + "step": 2705, + "time_per_iteration": 2.8862009048461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163429, + "balance_loss_mlp": 1.08274853, + "epoch": 0.5205848403232012, + "flos": 567690046464.0, + "grad_norm": 0.031189936278329552, + "language_loss": 0.88606453, + "learning_rate": 0.000490965764650776, + "loss": 0.89769882, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.80664062, + "step": 2706, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163571, + "balance_loss_mlp": 1.08308065, + "epoch": 0.5207772220084648, + "flos": 1216204231680.0, + "grad_norm": 0.03053180986383906, + "language_loss": 0.8816222, + "learning_rate": 0.0004906542749733798, + "loss": 0.89325786, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.8046875, + "step": 2707, + "time_per_iteration": 3.6396875381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162365, + "balance_loss_mlp": 1.08197033, + "epoch": 0.5209696036937284, + "flos": 594031205376.0, + "grad_norm": 0.027334962594272247, + "language_loss": 0.90568572, + "learning_rate": 0.0004903427889243156, + "loss": 0.91730928, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.80371094, + "step": 2708, + "time_per_iteration": 2.853013753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116169, + "balance_loss_mlp": 1.08129489, + "epoch": 0.5211619853789919, + "flos": 523955725824.0, + "grad_norm": 0.032301377197285666, + "language_loss": 0.91200471, + "learning_rate": 0.0004900313066245134, + "loss": 0.92362165, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.80371094, + "step": 2709, + "time_per_iteration": 2.706407070159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161577, + "balance_loss_mlp": 1.08146846, + "epoch": 0.5213543670642555, + "flos": 503860118016.0, + "grad_norm": 0.02918491733204221, + "language_loss": 0.86683327, + "learning_rate": 0.0004897198281949012, + "loss": 0.87844902, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.80078125, + "step": 2710, + "time_per_iteration": 2.6603598594665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.07654023, + "epoch": 0.521546748749519, + "flos": 587071248384.0, + "grad_norm": 0.0328837537508598, + "language_loss": 0.84538651, + "learning_rate": 0.0004894083537564057, + "loss": 0.85695493, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.80273438, + "step": 2711, + "time_per_iteration": 2.740659236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159708, + "balance_loss_mlp": 1.07955158, + "epoch": 0.5217391304347826, + "flos": 571265352192.0, + "grad_norm": 0.028894041826031003, + "language_loss": 0.85799223, + "learning_rate": 0.0004890968834299519, + "loss": 0.86958933, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.80126953, + "step": 2712, + "time_per_iteration": 2.7206225395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157432, + "balance_loss_mlp": 1.077371, + "epoch": 0.5219315121200462, + "flos": 543919076352.0, + "grad_norm": 0.029763432747936528, + "language_loss": 0.83741677, + "learning_rate": 0.0004887854173364633, + "loss": 0.84899104, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.80029297, + "step": 2713, + "time_per_iteration": 2.737755060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160512, + "balance_loss_mlp": 1.08097565, + "epoch": 0.5221238938053098, + "flos": 551530312704.0, + "grad_norm": 0.028214516718367867, + "language_loss": 0.86704654, + "learning_rate": 0.0004884739555968617, + "loss": 0.87865162, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.79492188, + "step": 2714, + "time_per_iteration": 2.872819185256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168823, + "balance_loss_mlp": 1.09100342, + "epoch": 0.5223162754905732, + "flos": 1358389797888.0, + "grad_norm": 0.012476009787944744, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80145878, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.77539062, + "step": 2715, + "time_per_iteration": 4.96741795539856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170398, + "balance_loss_mlp": 1.09028971, + "epoch": 0.5225086571758368, + "flos": 568973139456.0, + "grad_norm": 0.03267804467904664, + "language_loss": 0.92675197, + "learning_rate": 0.0004878510456629992, + "loss": 0.93845594, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.80078125, + "step": 2716, + "time_per_iteration": 2.9626121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160651, + "balance_loss_mlp": 1.08054268, + "epoch": 0.5227010388611004, + "flos": 501135478272.0, + "grad_norm": 0.033781088666230946, + "language_loss": 0.9089278, + "learning_rate": 0.00048753959771057314, + "loss": 0.92053425, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.80078125, + "step": 2717, + "time_per_iteration": 2.611691951751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157135, + "balance_loss_mlp": 1.07702601, + "epoch": 0.522893420546364, + "flos": 598798279680.0, + "grad_norm": 0.032963356718883376, + "language_loss": 0.88626194, + "learning_rate": 0.0004872281545957044, + "loss": 0.89783323, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.80078125, + "step": 2718, + "time_per_iteration": 2.7218518257141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116303, + "balance_loss_mlp": 1.08287394, + "epoch": 0.5230858022316276, + "flos": 665921534976.0, + "grad_norm": 0.02884991307967795, + "language_loss": 0.91186881, + "learning_rate": 0.0004869167164393055, + "loss": 0.92349917, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.80126953, + "step": 2719, + "time_per_iteration": 2.932335376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164195, + "balance_loss_mlp": 1.08403885, + "epoch": 0.5232781839168911, + "flos": 605033097216.0, + "grad_norm": 0.02708280335676697, + "language_loss": 0.94493294, + "learning_rate": 0.00048660528336228793, + "loss": 0.95657486, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.80126953, + "step": 2720, + "time_per_iteration": 2.8030405044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158564, + "balance_loss_mlp": 1.07840788, + "epoch": 0.5234705656021547, + "flos": 551840489472.0, + "grad_norm": 0.028885887647779437, + "language_loss": 0.95077229, + "learning_rate": 0.0004862938554855606, + "loss": 0.96235794, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.80126953, + "step": 2721, + "time_per_iteration": 2.797297716140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.0790993, + "epoch": 0.5236629472874182, + "flos": 505294934016.0, + "grad_norm": 0.03214550067861962, + "language_loss": 0.91548902, + "learning_rate": 0.0004859824329300304, + "loss": 0.92708111, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.80078125, + "step": 2722, + "time_per_iteration": 2.589529037475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.08444893, + "epoch": 0.5238553289726818, + "flos": 548696884224.0, + "grad_norm": 0.029959051591606282, + "language_loss": 0.88512689, + "learning_rate": 0.00048567101581660244, + "loss": 0.89677346, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.80175781, + "step": 2723, + "time_per_iteration": 2.6637237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160999, + "balance_loss_mlp": 1.08065164, + "epoch": 0.5240477106579453, + "flos": 533003779584.0, + "grad_norm": 0.031636293719806106, + "language_loss": 0.92529982, + "learning_rate": 0.00048535960426617956, + "loss": 0.93690991, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.80322266, + "step": 2724, + "time_per_iteration": 2.6061489582061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156405, + "balance_loss_mlp": 1.07620108, + "epoch": 0.5242400923432089, + "flos": 619089271296.0, + "grad_norm": 0.028230181756235023, + "language_loss": 0.87247139, + "learning_rate": 0.0004850481983996621, + "loss": 0.88403541, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.80175781, + "step": 2725, + "time_per_iteration": 2.7699060440063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.07787168, + "epoch": 0.5244324740284725, + "flos": 417589976064.0, + "grad_norm": 0.03201067328997522, + "language_loss": 0.93398654, + "learning_rate": 0.0004847367983379492, + "loss": 0.94556582, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.80029297, + "step": 2726, + "time_per_iteration": 2.521516799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07599604, + "epoch": 0.5246248557137361, + "flos": 627731821056.0, + "grad_norm": 0.028083517097400017, + "language_loss": 0.83866012, + "learning_rate": 0.00048442540420193643, + "loss": 0.8502202, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.79980469, + "step": 2727, + "time_per_iteration": 2.8968660831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155443, + "balance_loss_mlp": 1.07547724, + "epoch": 0.5248172373989997, + "flos": 1250401675776.0, + "grad_norm": 0.032601939018394276, + "language_loss": 0.85122609, + "learning_rate": 0.0004841140161125182, + "loss": 0.86278045, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.79931641, + "step": 2728, + "time_per_iteration": 3.585556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156303, + "balance_loss_mlp": 1.0764327, + "epoch": 0.5250096190842631, + "flos": 507882587136.0, + "grad_norm": 0.02942710549962748, + "language_loss": 0.90605354, + "learning_rate": 0.0004838026341905857, + "loss": 0.91761655, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.79833984, + "step": 2729, + "time_per_iteration": 2.7116506099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157734, + "balance_loss_mlp": 1.07781577, + "epoch": 0.5252020007695267, + "flos": 612507346944.0, + "grad_norm": 0.029260311632026755, + "language_loss": 0.9089191, + "learning_rate": 0.00048349125855702844, + "loss": 0.92049646, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.79882812, + "step": 2730, + "time_per_iteration": 2.772508144378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157575, + "balance_loss_mlp": 1.07780039, + "epoch": 0.5253943824547903, + "flos": 540291377664.0, + "grad_norm": 0.027039643287400304, + "language_loss": 0.86249292, + "learning_rate": 0.00048317988933273287, + "loss": 0.87406862, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.79736328, + "step": 2731, + "time_per_iteration": 2.7501025199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159148, + "balance_loss_mlp": 1.07918203, + "epoch": 0.5255867641400539, + "flos": 699337443840.0, + "grad_norm": 0.030025626211663315, + "language_loss": 0.87967253, + "learning_rate": 0.00048286852663858367, + "loss": 0.89126396, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.79931641, + "step": 2732, + "time_per_iteration": 2.9441256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156016, + "balance_loss_mlp": 1.07604992, + "epoch": 0.5257791458253175, + "flos": 668548119552.0, + "grad_norm": 0.03127119397180798, + "language_loss": 0.89405584, + "learning_rate": 0.000482557170595462, + "loss": 0.90561604, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.79931641, + "step": 2733, + "time_per_iteration": 2.875559091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158055, + "balance_loss_mlp": 1.07813704, + "epoch": 0.525971527510581, + "flos": 484604442624.0, + "grad_norm": 0.02914442262172993, + "language_loss": 0.93156296, + "learning_rate": 0.0004822458213242475, + "loss": 0.94314349, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.79882812, + "step": 2734, + "time_per_iteration": 2.5386509895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157288, + "balance_loss_mlp": 1.07737029, + "epoch": 0.5261639091958445, + "flos": 831347410944.0, + "grad_norm": 0.025020932409653307, + "language_loss": 0.90545583, + "learning_rate": 0.00048193447894581627, + "loss": 0.91702867, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.79882812, + "step": 2735, + "time_per_iteration": 3.087679862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07853508, + "epoch": 0.5263562908811081, + "flos": 521732643840.0, + "grad_norm": 0.03948252554958876, + "language_loss": 0.93270254, + "learning_rate": 0.00048162314358104243, + "loss": 0.94428849, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.80029297, + "step": 2736, + "time_per_iteration": 2.601278305053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156345, + "balance_loss_mlp": 1.07633209, + "epoch": 0.5265486725663717, + "flos": 576097554432.0, + "grad_norm": 0.032044906976615765, + "language_loss": 0.89525604, + "learning_rate": 0.0004813118153507969, + "loss": 0.90681952, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.79980469, + "step": 2737, + "time_per_iteration": 2.7360177040100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160461, + "balance_loss_mlp": 1.0820694, + "epoch": 0.5267410542516352, + "flos": 1550558333952.0, + "grad_norm": 0.008730383218555248, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.8360759, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.78320312, + "step": 2738, + "time_per_iteration": 4.80830717086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160505, + "balance_loss_mlp": 1.08039653, + "epoch": 0.5269334359368988, + "flos": 931460878848.0, + "grad_norm": 0.03056162512939441, + "language_loss": 0.89627469, + "learning_rate": 0.00048068918077736163, + "loss": 0.90787971, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.80078125, + "step": 2739, + "time_per_iteration": 3.228745222091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160328, + "balance_loss_mlp": 1.08021903, + "epoch": 0.5271258176221624, + "flos": 656634436608.0, + "grad_norm": 0.03221347808604687, + "language_loss": 0.87126762, + "learning_rate": 0.0004803778746759001, + "loss": 0.88287091, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.80078125, + "step": 2740, + "time_per_iteration": 2.888040542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161897, + "balance_loss_mlp": 1.08217001, + "epoch": 0.527318199307426, + "flos": 544062067200.0, + "grad_norm": 0.03125376981830108, + "language_loss": 0.87138033, + "learning_rate": 0.00048006657619242317, + "loss": 0.8829993, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.796875, + "step": 2741, + "time_per_iteration": 2.6788547039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156491, + "balance_loss_mlp": 1.07662046, + "epoch": 0.5275105809926895, + "flos": 448898322432.0, + "grad_norm": 0.035204553781932095, + "language_loss": 0.84527659, + "learning_rate": 0.00047975528544778775, + "loss": 0.8568415, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.79833984, + "step": 2742, + "time_per_iteration": 2.5953187942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156742, + "balance_loss_mlp": 1.07677603, + "epoch": 0.527702962677953, + "flos": 580052894208.0, + "grad_norm": 0.031790657619887884, + "language_loss": 0.9544906, + "learning_rate": 0.00047944400256284754, + "loss": 0.96605802, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.79931641, + "step": 2743, + "time_per_iteration": 2.6874876022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158128, + "balance_loss_mlp": 1.07821035, + "epoch": 0.5278953443632166, + "flos": 654009853440.0, + "grad_norm": 0.028533864641999515, + "language_loss": 0.84914398, + "learning_rate": 0.0004791327276584532, + "loss": 0.86072528, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.79882812, + "step": 2744, + "time_per_iteration": 2.851484537124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159082, + "balance_loss_mlp": 1.07902145, + "epoch": 0.5280877260484802, + "flos": 515048661504.0, + "grad_norm": 0.02936794285447426, + "language_loss": 0.85631824, + "learning_rate": 0.00047882146085545264, + "loss": 0.86790907, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.80029297, + "step": 2745, + "time_per_iteration": 2.6376991271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159996, + "balance_loss_mlp": 1.081604, + "epoch": 0.5282801077337438, + "flos": 1448712608256.0, + "grad_norm": 0.005116949586401208, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76562381, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.78125, + "step": 2746, + "time_per_iteration": 4.958376169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158639, + "balance_loss_mlp": 1.0789119, + "epoch": 0.5284724894190073, + "flos": 605966355456.0, + "grad_norm": 0.03386849685542916, + "language_loss": 0.85558748, + "learning_rate": 0.00047819895203700684, + "loss": 0.86717391, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.796875, + "step": 2747, + "time_per_iteration": 2.7103474140167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.08326721, + "epoch": 0.5286648711042709, + "flos": 1498103600640.0, + "grad_norm": 0.005524480658063938, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76674092, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.78125, + "step": 2748, + "time_per_iteration": 4.636225938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.077088, + "epoch": 0.5288572527895344, + "flos": 598833208320.0, + "grad_norm": 0.030227845431380972, + "language_loss": 0.94071984, + "learning_rate": 0.0004775764770742277, + "loss": 0.95228899, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.79785156, + "step": 2749, + "time_per_iteration": 2.7894628047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154981, + "balance_loss_mlp": 1.07496762, + "epoch": 0.529049634474798, + "flos": 558439878144.0, + "grad_norm": 0.038921610012438906, + "language_loss": 0.92515904, + "learning_rate": 0.00047726525259079777, + "loss": 0.93670887, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.79980469, + "step": 2750, + "time_per_iteration": 2.8399362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156308, + "balance_loss_mlp": 1.07643819, + "epoch": 0.5292420161600616, + "flos": 582434429952.0, + "grad_norm": 0.03493339209419754, + "language_loss": 0.94807124, + "learning_rate": 0.0004769540369337798, + "loss": 0.9596343, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.79833984, + "step": 2751, + "time_per_iteration": 2.7520663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171177, + "balance_loss_mlp": 1.09097254, + "epoch": 0.5294343978453251, + "flos": 609563854848.0, + "grad_norm": 0.029200425139457874, + "language_loss": 0.90377945, + "learning_rate": 0.00047664283022399794, + "loss": 0.91549122, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.80175781, + "step": 2752, + "time_per_iteration": 2.827075719833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.08904481, + "epoch": 0.5296267795305887, + "flos": 647709907968.0, + "grad_norm": 0.03322281077035965, + "language_loss": 0.85670567, + "learning_rate": 0.00047633163258227376, + "loss": 0.86839902, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.80273438, + "step": 2753, + "time_per_iteration": 2.8684630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168156, + "balance_loss_mlp": 1.08790445, + "epoch": 0.5298191612158523, + "flos": 560805950976.0, + "grad_norm": 0.0355054677596956, + "language_loss": 0.92337191, + "learning_rate": 0.0004760204441294247, + "loss": 0.93505347, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.80224609, + "step": 2754, + "time_per_iteration": 2.6347973346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162052, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5300115429011159, + "flos": 515131253760.0, + "grad_norm": 0.03178410473183971, + "language_loss": 0.90992713, + "learning_rate": 0.00047570926498626486, + "loss": 0.92154765, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.80078125, + "step": 2755, + "time_per_iteration": 2.6713931560516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165293, + "balance_loss_mlp": 1.08513677, + "epoch": 0.5302039245863793, + "flos": 674049065472.0, + "grad_norm": 0.025883205751119107, + "language_loss": 0.86624229, + "learning_rate": 0.00047539809527360474, + "loss": 0.87789524, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.80126953, + "step": 2756, + "time_per_iteration": 2.855339765548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163284, + "balance_loss_mlp": 1.08312809, + "epoch": 0.5303963062716429, + "flos": 732156467712.0, + "grad_norm": 0.025616439830169112, + "language_loss": 0.86757731, + "learning_rate": 0.0004750869351122511, + "loss": 0.87921017, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.80126953, + "step": 2757, + "time_per_iteration": 2.9861788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157792, + "balance_loss_mlp": 1.07773066, + "epoch": 0.5305886879569065, + "flos": 574551948288.0, + "grad_norm": 0.030995691560080724, + "language_loss": 0.87564695, + "learning_rate": 0.00047477578462300685, + "loss": 0.88722491, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.80029297, + "step": 2758, + "time_per_iteration": 2.711434841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158236, + "balance_loss_mlp": 1.07817531, + "epoch": 0.5307810696421701, + "flos": 696728323584.0, + "grad_norm": 0.030944173565867344, + "language_loss": 0.85500729, + "learning_rate": 0.0004744646439266718, + "loss": 0.86658955, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.80029297, + "step": 2759, + "time_per_iteration": 3.012730121612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159665, + "balance_loss_mlp": 1.07965159, + "epoch": 0.5309734513274337, + "flos": 650202233856.0, + "grad_norm": 0.02922555436454367, + "language_loss": 0.9794637, + "learning_rate": 0.000474153513144041, + "loss": 0.99106038, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.79980469, + "step": 2760, + "time_per_iteration": 2.9069197177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158721, + "balance_loss_mlp": 1.07866037, + "epoch": 0.5311658330126972, + "flos": 606055678464.0, + "grad_norm": 0.0324154212137011, + "language_loss": 0.92613202, + "learning_rate": 0.00047384239239590633, + "loss": 0.93771923, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.80029297, + "step": 2761, + "time_per_iteration": 2.8556571006774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159506, + "balance_loss_mlp": 1.07949257, + "epoch": 0.5313582146979607, + "flos": 559316740608.0, + "grad_norm": 0.03061440617121834, + "language_loss": 0.94290936, + "learning_rate": 0.0004735312818030556, + "loss": 0.95450437, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.79980469, + "step": 2762, + "time_per_iteration": 2.6934847831726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157514, + "balance_loss_mlp": 1.07764399, + "epoch": 0.5315505963832243, + "flos": 509445657600.0, + "grad_norm": 0.029953313176207894, + "language_loss": 0.88601178, + "learning_rate": 0.0004732201814862727, + "loss": 0.89758694, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.79833984, + "step": 2763, + "time_per_iteration": 2.7555651664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156859, + "balance_loss_mlp": 1.0773226, + "epoch": 0.5317429780684879, + "flos": 627668694528.0, + "grad_norm": 0.030098925618691368, + "language_loss": 0.87074947, + "learning_rate": 0.0004729090915663373, + "loss": 0.88231808, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.79492188, + "step": 2764, + "time_per_iteration": 2.83986496925354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157289, + "balance_loss_mlp": 1.07751369, + "epoch": 0.5319353597537514, + "flos": 477698880000.0, + "grad_norm": 0.035256009305486516, + "language_loss": 0.9145658, + "learning_rate": 0.00047259801216402534, + "loss": 0.92613864, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.79736328, + "step": 2765, + "time_per_iteration": 2.49153208732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158926, + "balance_loss_mlp": 1.07934201, + "epoch": 0.532127741439015, + "flos": 502633420800.0, + "grad_norm": 0.031216360034414494, + "language_loss": 0.91137969, + "learning_rate": 0.00047228694340010845, + "loss": 0.92296898, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.79541016, + "step": 2766, + "time_per_iteration": 2.5491669178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08385968, + "epoch": 0.5323201231242786, + "flos": 1166482870272.0, + "grad_norm": 0.028947902109049614, + "language_loss": 0.91277415, + "learning_rate": 0.0004719758853953544, + "loss": 0.92440999, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.796875, + "step": 2767, + "time_per_iteration": 3.576573610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167049, + "balance_loss_mlp": 1.08694029, + "epoch": 0.5325125048095422, + "flos": 379541251584.0, + "grad_norm": 0.04259356627609034, + "language_loss": 0.91498351, + "learning_rate": 0.00047166483827052645, + "loss": 0.92665404, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.80078125, + "step": 2768, + "time_per_iteration": 2.3893725872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172249, + "balance_loss_mlp": 1.09423828, + "epoch": 0.5327048864948057, + "flos": 1544747211264.0, + "grad_norm": 0.007240897484727242, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78250736, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.77929688, + "step": 2769, + "time_per_iteration": 4.972010374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167053, + "balance_loss_mlp": 1.08737326, + "epoch": 0.5328972681800692, + "flos": 912861212160.0, + "grad_norm": 0.03027786850862354, + "language_loss": 0.8989411, + "learning_rate": 0.000471042777143682, + "loss": 0.91061163, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.79638672, + "step": 2770, + "time_per_iteration": 3.1992523670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116085, + "balance_loss_mlp": 1.08126593, + "epoch": 0.5330896498653328, + "flos": 474850715136.0, + "grad_norm": 0.032478463467180745, + "language_loss": 0.85492694, + "learning_rate": 0.0004707317633831707, + "loss": 0.86653543, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.79541016, + "step": 2771, + "time_per_iteration": 2.636418342590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159417, + "balance_loss_mlp": 1.07983315, + "epoch": 0.5332820315505964, + "flos": 502633420800.0, + "grad_norm": 0.034509360784450445, + "language_loss": 0.84931278, + "learning_rate": 0.00047042076098559673, + "loss": 0.86090696, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.79541016, + "step": 2772, + "time_per_iteration": 2.587954521179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155982, + "balance_loss_mlp": 1.07615912, + "epoch": 0.53347441323586, + "flos": 926031791616.0, + "grad_norm": 0.036007721663536225, + "language_loss": 0.8042109, + "learning_rate": 0.00047010977007170174, + "loss": 0.81577075, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.79785156, + "step": 2773, + "time_per_iteration": 3.207517623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154797, + "balance_loss_mlp": 1.07497442, + "epoch": 0.5336667949211235, + "flos": 575539600896.0, + "grad_norm": 0.032460813123339774, + "language_loss": 0.88737571, + "learning_rate": 0.00046979879076222334, + "loss": 0.89892364, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.79785156, + "step": 2774, + "time_per_iteration": 2.711036443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154367, + "balance_loss_mlp": 1.07459235, + "epoch": 0.533859176606387, + "flos": 1066390869504.0, + "grad_norm": 0.02757600625184913, + "language_loss": 0.88843602, + "learning_rate": 0.0004694878231778939, + "loss": 0.89997971, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.79736328, + "step": 2775, + "time_per_iteration": 3.3735690116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154523, + "balance_loss_mlp": 1.07512975, + "epoch": 0.5340515582916506, + "flos": 747905968128.0, + "grad_norm": 0.025749810309272533, + "language_loss": 0.89188796, + "learning_rate": 0.0004691768674394423, + "loss": 0.9034332, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.79345703, + "step": 2776, + "time_per_iteration": 2.9947128295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171234, + "balance_loss_mlp": 1.09341431, + "epoch": 0.5342439399769142, + "flos": 1448818669056.0, + "grad_norm": 0.018487467205991936, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85655242, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.77734375, + "step": 2777, + "time_per_iteration": 4.765547275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166931, + "balance_loss_mlp": 1.08872986, + "epoch": 0.5344363216621778, + "flos": 1430696365056.0, + "grad_norm": 0.01490962088780182, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77820462, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.77929688, + "step": 2778, + "time_per_iteration": 4.979669570922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156329, + "balance_loss_mlp": 1.07636368, + "epoch": 0.5346287033474413, + "flos": 528675136512.0, + "grad_norm": 0.028255812601682327, + "language_loss": 0.84707999, + "learning_rate": 0.00046824407250656676, + "loss": 0.85864329, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.79931641, + "step": 2779, + "time_per_iteration": 2.6169135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161852, + "balance_loss_mlp": 1.08183897, + "epoch": 0.5348210850327049, + "flos": 511755334656.0, + "grad_norm": 0.02960487915529887, + "language_loss": 0.89552319, + "learning_rate": 0.0004679331653588161, + "loss": 0.90714169, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.79980469, + "step": 2780, + "time_per_iteration": 2.651503562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165567, + "balance_loss_mlp": 1.08536327, + "epoch": 0.5350134667179685, + "flos": 463625241600.0, + "grad_norm": 0.0331551624405392, + "language_loss": 0.91242051, + "learning_rate": 0.0004676222706605147, + "loss": 0.9240762, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.80175781, + "step": 2781, + "time_per_iteration": 2.609180450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171695, + "balance_loss_mlp": 1.09149086, + "epoch": 0.535205848403232, + "flos": 710117755392.0, + "grad_norm": 0.03114563748345981, + "language_loss": 0.9013232, + "learning_rate": 0.0004673113885323626, + "loss": 0.91304016, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.80175781, + "step": 2782, + "time_per_iteration": 2.889096736907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167285, + "balance_loss_mlp": 1.08708084, + "epoch": 0.5353982300884956, + "flos": 895791688704.0, + "grad_norm": 0.029628425021764316, + "language_loss": 0.840244, + "learning_rate": 0.00046700051909505494, + "loss": 0.85191679, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.80175781, + "step": 2783, + "time_per_iteration": 3.1921920776367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161558, + "balance_loss_mlp": 1.08130586, + "epoch": 0.5355906117737591, + "flos": 537024247296.0, + "grad_norm": 0.03383499561986932, + "language_loss": 0.89968938, + "learning_rate": 0.000466689662469282, + "loss": 0.91130495, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.80224609, + "step": 2784, + "time_per_iteration": 2.644693613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08009481, + "epoch": 0.5357829934590227, + "flos": 870327392256.0, + "grad_norm": 0.02956685166305249, + "language_loss": 0.89793074, + "learning_rate": 0.00046637881877572917, + "loss": 0.90953422, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.80224609, + "step": 2785, + "time_per_iteration": 3.134896755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.0797224, + "epoch": 0.5359753751442863, + "flos": 554445606912.0, + "grad_norm": 0.027747995864539122, + "language_loss": 0.88820761, + "learning_rate": 0.0004660679881350764, + "loss": 0.89980739, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.80224609, + "step": 2786, + "time_per_iteration": 2.7258269786834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186935, + "balance_loss_mlp": 1.10682678, + "epoch": 0.5361677568295499, + "flos": 1483756715520.0, + "grad_norm": 0.018012162763561924, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76795077, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.80078125, + "step": 2787, + "time_per_iteration": 5.011500835418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163662, + "balance_loss_mlp": 1.08345807, + "epoch": 0.5363601385148133, + "flos": 807641568768.0, + "grad_norm": 0.03200093229385197, + "language_loss": 0.83718783, + "learning_rate": 0.0004654463664951667, + "loss": 0.84882444, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.80175781, + "step": 2788, + "time_per_iteration": 3.0044353008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162852, + "balance_loss_mlp": 1.08274364, + "epoch": 0.5365525202000769, + "flos": 508878971904.0, + "grad_norm": 0.03055357919616021, + "language_loss": 0.89048028, + "learning_rate": 0.0004651355757372447, + "loss": 0.90210879, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.80078125, + "step": 2789, + "time_per_iteration": 2.6024739742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011626, + "balance_loss_mlp": 1.08277702, + "epoch": 0.5367449018853405, + "flos": 530014625280.0, + "grad_norm": 0.03243837084279447, + "language_loss": 0.90724301, + "learning_rate": 0.00046482479851489274, + "loss": 0.91886902, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.79785156, + "step": 2790, + "time_per_iteration": 2.7023818492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168089, + "balance_loss_mlp": 1.08840978, + "epoch": 0.5369372835706041, + "flos": 651216082944.0, + "grad_norm": 0.035661652748611536, + "language_loss": 0.83603406, + "learning_rate": 0.00046451403494876525, + "loss": 0.84771496, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.79443359, + "step": 2791, + "time_per_iteration": 2.9009790420532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169917, + "balance_loss_mlp": 1.09033263, + "epoch": 0.5371296652558677, + "flos": 585627700224.0, + "grad_norm": 0.03267915449635738, + "language_loss": 0.90313196, + "learning_rate": 0.0004642032851595111, + "loss": 0.91483116, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.79345703, + "step": 2792, + "time_per_iteration": 2.743093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.09196496, + "epoch": 0.5373220469411312, + "flos": 597083486208.0, + "grad_norm": 0.03226534649155799, + "language_loss": 0.89917493, + "learning_rate": 0.00046389254926777404, + "loss": 0.91089034, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.79345703, + "step": 2793, + "time_per_iteration": 2.816979169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162732, + "balance_loss_mlp": 1.08319557, + "epoch": 0.5375144286263948, + "flos": 1116277415424.0, + "grad_norm": 0.030732828924726157, + "language_loss": 0.83480382, + "learning_rate": 0.0004635818273941926, + "loss": 0.84643114, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.79443359, + "step": 2794, + "time_per_iteration": 3.538351058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156539, + "balance_loss_mlp": 1.07704997, + "epoch": 0.5377068103116583, + "flos": 596768580096.0, + "grad_norm": 0.03686105726392354, + "language_loss": 0.88212651, + "learning_rate": 0.0004632711196593997, + "loss": 0.8936919, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.79443359, + "step": 2795, + "time_per_iteration": 2.7304327487945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153271, + "balance_loss_mlp": 1.07383037, + "epoch": 0.5378991919969219, + "flos": 885649195008.0, + "grad_norm": 0.031821277780470766, + "language_loss": 0.90781128, + "learning_rate": 0.00046296042618402297, + "loss": 0.91934395, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.79394531, + "step": 2796, + "time_per_iteration": 3.117605447769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154822, + "balance_loss_mlp": 1.07523799, + "epoch": 0.5380915736821854, + "flos": 711950069760.0, + "grad_norm": 0.03181223121167454, + "language_loss": 0.84282267, + "learning_rate": 0.0004626497470886839, + "loss": 0.85437095, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.79541016, + "step": 2797, + "time_per_iteration": 2.943110704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154308, + "balance_loss_mlp": 1.07439017, + "epoch": 0.538283955367449, + "flos": 558114238464.0, + "grad_norm": 0.03131439333064892, + "language_loss": 0.87165904, + "learning_rate": 0.00046233908249399897, + "loss": 0.88320208, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.79882812, + "step": 2798, + "time_per_iteration": 2.753664970397949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156214, + "balance_loss_mlp": 1.0763911, + "epoch": 0.5384763370527126, + "flos": 514481975808.0, + "grad_norm": 0.02763164557850803, + "language_loss": 0.84223002, + "learning_rate": 0.00046202843252057905, + "loss": 0.85379213, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.79785156, + "step": 2799, + "time_per_iteration": 2.5850727558135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157398, + "balance_loss_mlp": 1.07767105, + "epoch": 0.5386687187379762, + "flos": 490719737856.0, + "grad_norm": 0.033199019667933, + "language_loss": 0.8910532, + "learning_rate": 0.00046171779728902896, + "loss": 0.90262723, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.796875, + "step": 2800, + "time_per_iteration": 2.54720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157318, + "balance_loss_mlp": 1.07730448, + "epoch": 0.5388611004232398, + "flos": 483627523584.0, + "grad_norm": 0.041719681603307614, + "language_loss": 0.92617553, + "learning_rate": 0.000461407176919948, + "loss": 0.93774867, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.79980469, + "step": 2801, + "time_per_iteration": 2.5201830863952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158799, + "balance_loss_mlp": 1.07868993, + "epoch": 0.5390534821085032, + "flos": 562089043968.0, + "grad_norm": 0.03196091571695152, + "language_loss": 0.90337479, + "learning_rate": 0.00046109657153392997, + "loss": 0.91496283, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.80078125, + "step": 2802, + "time_per_iteration": 2.694173574447632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160257, + "balance_loss_mlp": 1.08014798, + "epoch": 0.5392458637937668, + "flos": 489360783360.0, + "grad_norm": 0.039860159596143786, + "language_loss": 0.89760619, + "learning_rate": 0.0004607859812515622, + "loss": 0.90920877, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.80078125, + "step": 2803, + "time_per_iteration": 2.585549831390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164203, + "balance_loss_mlp": 1.08404684, + "epoch": 0.5394382454790304, + "flos": 513049161216.0, + "grad_norm": 0.03534563174473093, + "language_loss": 0.94152969, + "learning_rate": 0.00046047540619342667, + "loss": 0.95317167, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.80126953, + "step": 2804, + "time_per_iteration": 2.589845895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08835244, + "epoch": 0.539630627164294, + "flos": 568688432640.0, + "grad_norm": 0.02864783436473809, + "language_loss": 0.85705817, + "learning_rate": 0.00046016484648009933, + "loss": 0.86873901, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.796875, + "step": 2805, + "time_per_iteration": 2.687539577484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162339, + "balance_loss_mlp": 1.08246911, + "epoch": 0.5398230088495575, + "flos": 527502833664.0, + "grad_norm": 0.03312242512211549, + "language_loss": 0.8782742, + "learning_rate": 0.0004598543022321501, + "loss": 0.88989753, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.79833984, + "step": 2806, + "time_per_iteration": 2.6111719608306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159262, + "balance_loss_mlp": 1.07910562, + "epoch": 0.5400153905348211, + "flos": 539852946432.0, + "grad_norm": 0.03059923694994547, + "language_loss": 0.85068846, + "learning_rate": 0.0004595437735701433, + "loss": 0.86228108, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.80126953, + "step": 2807, + "time_per_iteration": 2.668133020401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158376, + "balance_loss_mlp": 1.07826769, + "epoch": 0.5402077722200846, + "flos": 514664624640.0, + "grad_norm": 0.03937747929323063, + "language_loss": 0.88849455, + "learning_rate": 0.00045923326061463623, + "loss": 0.90007836, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.80078125, + "step": 2808, + "time_per_iteration": 2.76680588722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152678, + "balance_loss_mlp": 1.07261717, + "epoch": 0.5404001539053482, + "flos": 677565974016.0, + "grad_norm": 0.030976456011377742, + "language_loss": 0.87454319, + "learning_rate": 0.00045892276348618113, + "loss": 0.88606995, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.80029297, + "step": 2809, + "time_per_iteration": 2.9939539432525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173447, + "balance_loss_mlp": 1.09410095, + "epoch": 0.5405925355906118, + "flos": 1558189036032.0, + "grad_norm": 0.015961767794208704, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79434276, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.79296875, + "step": 2810, + "time_per_iteration": 4.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157913, + "balance_loss_mlp": 1.07818568, + "epoch": 0.5407849172758753, + "flos": 648537105408.0, + "grad_norm": 0.02696900388574031, + "language_loss": 0.85372365, + "learning_rate": 0.000458301817192603, + "loss": 0.8653028, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.796875, + "step": 2811, + "time_per_iteration": 2.8575778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118454, + "balance_loss_mlp": 1.1057663, + "epoch": 0.5409772989611389, + "flos": 1410481234944.0, + "grad_norm": 0.012734794042181983, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.82026327, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.78710938, + "step": 2812, + "time_per_iteration": 4.809651613235474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163077, + "balance_loss_mlp": 1.0835402, + "epoch": 0.5411696806464025, + "flos": 555544049664.0, + "grad_norm": 0.031759632467193835, + "language_loss": 0.91974443, + "learning_rate": 0.00045768093565369983, + "loss": 0.93137515, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.79492188, + "step": 2813, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164131, + "balance_loss_mlp": 1.0847373, + "epoch": 0.5413620623316661, + "flos": 529204892160.0, + "grad_norm": 0.03127565438509195, + "language_loss": 0.8788538, + "learning_rate": 0.0004573705194685646, + "loss": 0.89049512, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.79199219, + "step": 2814, + "time_per_iteration": 2.645961284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164839, + "balance_loss_mlp": 1.08544588, + "epoch": 0.5415544440169295, + "flos": 599851060224.0, + "grad_norm": 0.03485280634812332, + "language_loss": 0.91058564, + "learning_rate": 0.00045706011983366157, + "loss": 0.92223406, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.79199219, + "step": 2815, + "time_per_iteration": 2.6676552295684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161812, + "balance_loss_mlp": 1.08237088, + "epoch": 0.5417468257021931, + "flos": 471713840640.0, + "grad_norm": 0.03625185410953689, + "language_loss": 0.88930029, + "learning_rate": 0.00045674973686949847, + "loss": 0.90091836, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.79199219, + "step": 2816, + "time_per_iteration": 2.51118540763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116016, + "balance_loss_mlp": 1.08076715, + "epoch": 0.5419392073874567, + "flos": 682190057472.0, + "grad_norm": 0.02856526912727588, + "language_loss": 0.90316737, + "learning_rate": 0.0004564393706965766, + "loss": 0.91476899, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.79199219, + "step": 2817, + "time_per_iteration": 2.9563546180725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160311, + "balance_loss_mlp": 1.0809654, + "epoch": 0.5421315890727203, + "flos": 463336531968.0, + "grad_norm": 0.032507832188727104, + "language_loss": 0.87249088, + "learning_rate": 0.00045612902143539116, + "loss": 0.884094, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.79199219, + "step": 2818, + "time_per_iteration": 2.5383646488189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162479, + "balance_loss_mlp": 1.08294284, + "epoch": 0.5423239707579839, + "flos": 437889699840.0, + "grad_norm": 0.03622660962153638, + "language_loss": 0.8863132, + "learning_rate": 0.00045581868920642986, + "loss": 0.89793801, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.79296875, + "step": 2819, + "time_per_iteration": 2.4692800045013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163903, + "balance_loss_mlp": 1.08441401, + "epoch": 0.5425163524432474, + "flos": 459305330688.0, + "grad_norm": 0.036307438946012835, + "language_loss": 0.86308074, + "learning_rate": 0.00045550837413017457, + "loss": 0.8747198, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.79296875, + "step": 2820, + "time_per_iteration": 2.59252667427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160476, + "balance_loss_mlp": 1.08089161, + "epoch": 0.542708734128511, + "flos": 420409943040.0, + "grad_norm": 0.028561818537522772, + "language_loss": 0.89964175, + "learning_rate": 0.0004551980763271005, + "loss": 0.91124654, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.79394531, + "step": 2821, + "time_per_iteration": 2.64975643157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158342, + "balance_loss_mlp": 1.07880592, + "epoch": 0.5429011158137745, + "flos": 679708465152.0, + "grad_norm": 0.03014006642218495, + "language_loss": 0.89564693, + "learning_rate": 0.0004548877959176756, + "loss": 0.90723038, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.79345703, + "step": 2822, + "time_per_iteration": 2.881334066390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166233, + "balance_loss_mlp": 1.08693492, + "epoch": 0.5430934974990381, + "flos": 541967239680.0, + "grad_norm": 0.03201888254331298, + "language_loss": 0.91779578, + "learning_rate": 0.00045457753302236166, + "loss": 0.92945808, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.79150391, + "step": 2823, + "time_per_iteration": 2.615506887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160293, + "balance_loss_mlp": 1.08075619, + "epoch": 0.5432858791843016, + "flos": 659643056640.0, + "grad_norm": 0.03397006228821556, + "language_loss": 0.93680996, + "learning_rate": 0.00045426728776161353, + "loss": 0.94841284, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.79443359, + "step": 2824, + "time_per_iteration": 2.815668821334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160478, + "balance_loss_mlp": 1.08084619, + "epoch": 0.5434782608695652, + "flos": 532966849536.0, + "grad_norm": 0.030340926449950675, + "language_loss": 0.86484039, + "learning_rate": 0.00045395706025587863, + "loss": 0.87644517, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.79589844, + "step": 2825, + "time_per_iteration": 2.677969455718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159818, + "balance_loss_mlp": 1.0802815, + "epoch": 0.5436706425548288, + "flos": 609632985600.0, + "grad_norm": 0.032758454025991736, + "language_loss": 0.88250875, + "learning_rate": 0.00045364685062559843, + "loss": 0.89410686, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.79492188, + "step": 2826, + "time_per_iteration": 2.7975664138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160655, + "balance_loss_mlp": 1.08111823, + "epoch": 0.5438630242400924, + "flos": 706772762112.0, + "grad_norm": 0.047560346967580276, + "language_loss": 0.96112239, + "learning_rate": 0.0004533366589912067, + "loss": 0.97272885, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.79492188, + "step": 2827, + "time_per_iteration": 2.9455690383911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.08232152, + "epoch": 0.544055405925356, + "flos": 857838291456.0, + "grad_norm": 0.035082604549872, + "language_loss": 0.84527165, + "learning_rate": 0.0004530264854731306, + "loss": 0.8568902, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.79492188, + "step": 2828, + "time_per_iteration": 3.0149006843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161186, + "balance_loss_mlp": 1.08160186, + "epoch": 0.5442477876106194, + "flos": 572967410688.0, + "grad_norm": 0.029506216108961765, + "language_loss": 0.89973861, + "learning_rate": 0.00045271633019179034, + "loss": 0.91135049, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.79541016, + "step": 2829, + "time_per_iteration": 2.7735414505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162764, + "balance_loss_mlp": 1.08313203, + "epoch": 0.544440169295883, + "flos": 626802565632.0, + "grad_norm": 0.028700635940731967, + "language_loss": 0.92908496, + "learning_rate": 0.0004524061932675986, + "loss": 0.94071257, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.79589844, + "step": 2830, + "time_per_iteration": 2.828461170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116197, + "balance_loss_mlp": 1.08224237, + "epoch": 0.5446325509811466, + "flos": 837640625664.0, + "grad_norm": 0.03503891147687097, + "language_loss": 0.92219722, + "learning_rate": 0.00045209607482096125, + "loss": 0.93381691, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.79541016, + "step": 2831, + "time_per_iteration": 3.0058434009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162561, + "balance_loss_mlp": 1.08292878, + "epoch": 0.5448249326664102, + "flos": 484389593088.0, + "grad_norm": 0.03287703969217422, + "language_loss": 0.89665288, + "learning_rate": 0.0004517859749722772, + "loss": 0.90827847, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.79443359, + "step": 2832, + "time_per_iteration": 2.6527607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116426, + "balance_loss_mlp": 1.08453321, + "epoch": 0.5450173143516738, + "flos": 562345552896.0, + "grad_norm": 0.03300449363670703, + "language_loss": 0.84396762, + "learning_rate": 0.0004514758938419376, + "loss": 0.85561025, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.79541016, + "step": 2833, + "time_per_iteration": 2.799923896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176773, + "balance_loss_mlp": 1.09971619, + "epoch": 0.5452096960369373, + "flos": 1473586023936.0, + "grad_norm": 0.016868588983801922, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78097355, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.76953125, + "step": 2834, + "time_per_iteration": 4.904434442520142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116397, + "balance_loss_mlp": 1.08414805, + "epoch": 0.5454020777222008, + "flos": 466017510912.0, + "grad_norm": 0.028290923396431526, + "language_loss": 0.88719809, + "learning_rate": 0.00045085578821782175, + "loss": 0.8988378, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.79589844, + "step": 2835, + "time_per_iteration": 2.5375516414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116069, + "balance_loss_mlp": 1.08325195, + "epoch": 0.5455944594074644, + "flos": 1472615109120.0, + "grad_norm": 0.00840245760684232, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77295429, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.7734375, + "step": 2836, + "time_per_iteration": 4.908621549606323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161179, + "balance_loss_mlp": 1.08121371, + "epoch": 0.545786841092728, + "flos": 534304336896.0, + "grad_norm": 0.026675001792915147, + "language_loss": 0.85451794, + "learning_rate": 0.00045023575891159866, + "loss": 0.86612976, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.79931641, + "step": 2837, + "time_per_iteration": 2.77382230758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167343, + "balance_loss_mlp": 1.08952332, + "epoch": 0.5459792227779915, + "flos": 1355426113536.0, + "grad_norm": 0.010026273514264956, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75931144, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.77734375, + "step": 2838, + "time_per_iteration": 4.8985395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163141, + "balance_loss_mlp": 1.08322346, + "epoch": 0.5461716044632551, + "flos": 639072087552.0, + "grad_norm": 0.03170534586871267, + "language_loss": 0.83100337, + "learning_rate": 0.0004496158068861354, + "loss": 0.8426348, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.79833984, + "step": 2839, + "time_per_iteration": 2.8032078742980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163887, + "balance_loss_mlp": 1.08396888, + "epoch": 0.5463639861485187, + "flos": 603925922304.0, + "grad_norm": 0.031486344316249366, + "language_loss": 0.85257053, + "learning_rate": 0.00044930586015455207, + "loss": 0.86420941, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.79833984, + "step": 2840, + "time_per_iteration": 2.780024290084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168265, + "balance_loss_mlp": 1.08834755, + "epoch": 0.5465563678337823, + "flos": 643752566784.0, + "grad_norm": 0.02832807598538896, + "language_loss": 0.93569458, + "learning_rate": 0.000448995933104179, + "loss": 0.9473772, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.79736328, + "step": 2841, + "time_per_iteration": 2.848741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168336, + "balance_loss_mlp": 1.08841801, + "epoch": 0.5467487495190458, + "flos": 615364243968.0, + "grad_norm": 0.03451251764660495, + "language_loss": 0.86641318, + "learning_rate": 0.00044868602585534077, + "loss": 0.87809658, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.796875, + "step": 2842, + "time_per_iteration": 2.8590362071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166404, + "balance_loss_mlp": 1.08677208, + "epoch": 0.5469411312043093, + "flos": 462127299072.0, + "grad_norm": 0.03329693034046033, + "language_loss": 0.9437651, + "learning_rate": 0.0004483761385283541, + "loss": 0.95542908, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.79443359, + "step": 2843, + "time_per_iteration": 2.523390769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116664, + "balance_loss_mlp": 1.08691323, + "epoch": 0.5471335128895729, + "flos": 562266963456.0, + "grad_norm": 0.03201679454384124, + "language_loss": 0.87509483, + "learning_rate": 0.0004480662712435281, + "loss": 0.88676119, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.79492188, + "step": 2844, + "time_per_iteration": 2.7186124324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162399, + "balance_loss_mlp": 1.08286297, + "epoch": 0.5473258945748365, + "flos": 519685479936.0, + "grad_norm": 0.032165214678065886, + "language_loss": 0.93768156, + "learning_rate": 0.0004477564241211635, + "loss": 0.94930553, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.79345703, + "step": 2845, + "time_per_iteration": 2.5637102127075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159503, + "balance_loss_mlp": 1.08034766, + "epoch": 0.5475182762601001, + "flos": 434744093184.0, + "grad_norm": 0.03138398317411523, + "language_loss": 0.92521811, + "learning_rate": 0.0004474465972815541, + "loss": 0.93681312, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.79101562, + "step": 2846, + "time_per_iteration": 2.470494508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162403, + "balance_loss_mlp": 1.08348668, + "epoch": 0.5477106579453636, + "flos": 512573799936.0, + "grad_norm": 0.02767233380819538, + "language_loss": 0.92665255, + "learning_rate": 0.000447136790844985, + "loss": 0.93827659, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.78759766, + "step": 2847, + "time_per_iteration": 2.7123520374298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164922, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5479030396306271, + "flos": 677140277760.0, + "grad_norm": 0.030326073882101023, + "language_loss": 0.85917926, + "learning_rate": 0.00044682700493173385, + "loss": 0.87082845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.78710938, + "step": 2848, + "time_per_iteration": 2.826556921005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166552, + "balance_loss_mlp": 1.08787405, + "epoch": 0.5480954213158907, + "flos": 877578060288.0, + "grad_norm": 0.033676298977630685, + "language_loss": 0.86673969, + "learning_rate": 0.00044651723966207004, + "loss": 0.87840521, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.78564453, + "step": 2849, + "time_per_iteration": 3.192443370819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164243, + "balance_loss_mlp": 1.08556521, + "epoch": 0.5482878030011543, + "flos": 623174866944.0, + "grad_norm": 0.03042847520175512, + "language_loss": 0.83109522, + "learning_rate": 0.00044620749515625536, + "loss": 0.84273762, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.78564453, + "step": 2850, + "time_per_iteration": 2.7753841876983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164392, + "balance_loss_mlp": 1.08528447, + "epoch": 0.5484801846864179, + "flos": 498257114112.0, + "grad_norm": 0.03264010932273605, + "language_loss": 0.90008557, + "learning_rate": 0.00044589777153454334, + "loss": 0.91172945, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.78857422, + "step": 2851, + "time_per_iteration": 2.7295939922332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162977, + "balance_loss_mlp": 1.08391714, + "epoch": 0.5486725663716814, + "flos": 443353715712.0, + "grad_norm": 0.029420479903708215, + "language_loss": 0.88820338, + "learning_rate": 0.00044558806891717895, + "loss": 0.8998332, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.78808594, + "step": 2852, + "time_per_iteration": 2.4784035682678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164311, + "balance_loss_mlp": 1.08548951, + "epoch": 0.548864948056945, + "flos": 656347728384.0, + "grad_norm": 0.02822438724303185, + "language_loss": 0.84744209, + "learning_rate": 0.0004452783874243998, + "loss": 0.8590852, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.78759766, + "step": 2853, + "time_per_iteration": 2.821592092514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159975, + "balance_loss_mlp": 1.08105898, + "epoch": 0.5490573297422086, + "flos": 547140544512.0, + "grad_norm": 0.03150495246723179, + "language_loss": 0.90787637, + "learning_rate": 0.00044496872717643475, + "loss": 0.91947615, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.78710938, + "step": 2854, + "time_per_iteration": 2.6908938884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011614, + "balance_loss_mlp": 1.08415222, + "epoch": 0.5492497114274721, + "flos": 1593760897536.0, + "grad_norm": 0.006862097523809848, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78250694, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.77148438, + "step": 2855, + "time_per_iteration": 4.92158579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159374, + "balance_loss_mlp": 1.08036256, + "epoch": 0.5494420931127356, + "flos": 752269539840.0, + "grad_norm": 0.030842116299214104, + "language_loss": 0.87009478, + "learning_rate": 0.0004443494708958217, + "loss": 0.88168848, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.78759766, + "step": 2856, + "time_per_iteration": 2.952693223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155384, + "balance_loss_mlp": 1.07627714, + "epoch": 0.5496344747979992, + "flos": 627304123392.0, + "grad_norm": 0.026887140123268247, + "language_loss": 0.85396117, + "learning_rate": 0.0004440398751035906, + "loss": 0.86551499, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.79052734, + "step": 2857, + "time_per_iteration": 2.8657121658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156313, + "balance_loss_mlp": 1.07691979, + "epoch": 0.5498268564832628, + "flos": 524124913152.0, + "grad_norm": 0.03681476772579859, + "language_loss": 0.90347362, + "learning_rate": 0.00044373030103700645, + "loss": 0.9150368, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.79248047, + "step": 2858, + "time_per_iteration": 2.6372759342193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161253, + "balance_loss_mlp": 1.08185947, + "epoch": 0.5500192381685264, + "flos": 605777702400.0, + "grad_norm": 0.027579474955625485, + "language_loss": 0.8405782, + "learning_rate": 0.000443420748816257, + "loss": 0.85219079, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.79248047, + "step": 2859, + "time_per_iteration": 2.832864999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08395553, + "epoch": 0.55021161985379, + "flos": 521654780928.0, + "grad_norm": 0.03409053016014856, + "language_loss": 0.84214079, + "learning_rate": 0.0004431112185615208, + "loss": 0.85377669, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.79443359, + "step": 2860, + "time_per_iteration": 2.7533481121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165675, + "balance_loss_mlp": 1.0862813, + "epoch": 0.5504040015390534, + "flos": 490654609920.0, + "grad_norm": 0.028251427239966796, + "language_loss": 0.84584463, + "learning_rate": 0.00044280171039296845, + "loss": 0.85750139, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.79296875, + "step": 2861, + "time_per_iteration": 2.6798369884490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116251, + "balance_loss_mlp": 1.08306909, + "epoch": 0.550596383224317, + "flos": 576861625344.0, + "grad_norm": 0.030462386563617952, + "language_loss": 0.93688512, + "learning_rate": 0.0004424922244307616, + "loss": 0.94851023, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.79296875, + "step": 2862, + "time_per_iteration": 2.7042698860168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164147, + "balance_loss_mlp": 1.08461094, + "epoch": 0.5507887649095806, + "flos": 643633044480.0, + "grad_norm": 0.03244616812289036, + "language_loss": 0.87943101, + "learning_rate": 0.00044218276079505315, + "loss": 0.89107251, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.79296875, + "step": 2863, + "time_per_iteration": 2.869657278060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116435, + "balance_loss_mlp": 1.08490932, + "epoch": 0.5509811465948442, + "flos": 532864791552.0, + "grad_norm": 0.03309127401700594, + "language_loss": 0.80069649, + "learning_rate": 0.0004418733196059876, + "loss": 0.81234002, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.79248047, + "step": 2864, + "time_per_iteration": 2.694439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164051, + "balance_loss_mlp": 1.08489633, + "epoch": 0.5511735282801077, + "flos": 655983157248.0, + "grad_norm": 0.031218908498787497, + "language_loss": 0.85167533, + "learning_rate": 0.0004415639009837008, + "loss": 0.86331582, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.79101562, + "step": 2865, + "time_per_iteration": 2.8214035034179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160959, + "balance_loss_mlp": 1.08175683, + "epoch": 0.5513659099653713, + "flos": 530609508864.0, + "grad_norm": 0.029306479659861318, + "language_loss": 0.87106019, + "learning_rate": 0.00044125450504831955, + "loss": 0.88266975, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.79150391, + "step": 2866, + "time_per_iteration": 2.7755370140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157699, + "balance_loss_mlp": 1.0782584, + "epoch": 0.5515582916506349, + "flos": 555973748736.0, + "grad_norm": 0.03358668454464356, + "language_loss": 0.88577026, + "learning_rate": 0.0004409451319199622, + "loss": 0.89734721, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.79248047, + "step": 2867, + "time_per_iteration": 2.700601577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160497, + "balance_loss_mlp": 1.08105552, + "epoch": 0.5517506733358984, + "flos": 736771819008.0, + "grad_norm": 0.033780629576782226, + "language_loss": 0.90037191, + "learning_rate": 0.0004406357817187381, + "loss": 0.91197693, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.79248047, + "step": 2868, + "time_per_iteration": 2.9809505939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160757, + "balance_loss_mlp": 1.0816493, + "epoch": 0.551943055021162, + "flos": 1117189206528.0, + "grad_norm": 0.02667902344135768, + "language_loss": 0.86254233, + "learning_rate": 0.0004403264545647474, + "loss": 0.87414992, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.79052734, + "step": 2869, + "time_per_iteration": 3.5932819843292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156378, + "balance_loss_mlp": 1.07727027, + "epoch": 0.5521354367064255, + "flos": 545501612544.0, + "grad_norm": 0.024843999573841903, + "language_loss": 0.89363241, + "learning_rate": 0.00044001715057808154, + "loss": 0.90519619, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.79052734, + "step": 2870, + "time_per_iteration": 2.7333626747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159059, + "balance_loss_mlp": 1.07999909, + "epoch": 0.5523278183916891, + "flos": 937871614464.0, + "grad_norm": 0.027996488517333572, + "language_loss": 0.86652702, + "learning_rate": 0.0004397078698788232, + "loss": 0.87811756, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.79003906, + "step": 2871, + "time_per_iteration": 3.199366807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168602, + "balance_loss_mlp": 1.0909729, + "epoch": 0.5525202000769527, + "flos": 1469098927104.0, + "grad_norm": 0.009568898658781464, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81610966, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.77539062, + "step": 2872, + "time_per_iteration": 4.912739515304565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163231, + "balance_loss_mlp": 1.08426642, + "epoch": 0.5527125817622163, + "flos": 490784865792.0, + "grad_norm": 0.03313805620558485, + "language_loss": 0.83656394, + "learning_rate": 0.00043908937882281343, + "loss": 0.84819627, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.78808594, + "step": 2873, + "time_per_iteration": 2.6517224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163059, + "balance_loss_mlp": 1.08409429, + "epoch": 0.5529049634474797, + "flos": 636148061184.0, + "grad_norm": 0.033554896267230024, + "language_loss": 0.87775517, + "learning_rate": 0.0004387801687061814, + "loss": 0.88938576, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.78710938, + "step": 2874, + "time_per_iteration": 2.8159070014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159743, + "balance_loss_mlp": 1.08073115, + "epoch": 0.5530973451327433, + "flos": 582434429952.0, + "grad_norm": 0.02986403100144585, + "language_loss": 0.86760765, + "learning_rate": 0.0004384709823571958, + "loss": 0.87920505, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.78857422, + "step": 2875, + "time_per_iteration": 2.755831480026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158961, + "balance_loss_mlp": 1.08004439, + "epoch": 0.5532897268180069, + "flos": 1124329084416.0, + "grad_norm": 0.02992932493519035, + "language_loss": 0.88625169, + "learning_rate": 0.0004381618198958932, + "loss": 0.89784127, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.78662109, + "step": 2876, + "time_per_iteration": 3.504112720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157815, + "balance_loss_mlp": 1.0788027, + "epoch": 0.5534821085032705, + "flos": 638512132608.0, + "grad_norm": 0.032170459842753865, + "language_loss": 0.89321101, + "learning_rate": 0.00043785268144230137, + "loss": 0.90478921, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.78808594, + "step": 2877, + "time_per_iteration": 2.889683961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158076, + "balance_loss_mlp": 1.07911134, + "epoch": 0.5536744901885341, + "flos": 572216074752.0, + "grad_norm": 0.0339903958733494, + "language_loss": 0.87417912, + "learning_rate": 0.00043754356711643837, + "loss": 0.88575995, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.78759766, + "step": 2878, + "time_per_iteration": 2.6604373455047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115856, + "balance_loss_mlp": 1.07950056, + "epoch": 0.5538668718737976, + "flos": 596916300288.0, + "grad_norm": 0.029580626213001865, + "language_loss": 0.88473797, + "learning_rate": 0.0004372344770383132, + "loss": 0.89632356, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.78808594, + "step": 2879, + "time_per_iteration": 2.7906830310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011565, + "balance_loss_mlp": 1.07753599, + "epoch": 0.5540592535590612, + "flos": 533718185472.0, + "grad_norm": 0.030293675767491222, + "language_loss": 0.88174736, + "learning_rate": 0.00043692541132792507, + "loss": 0.89331234, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.78710938, + "step": 2880, + "time_per_iteration": 2.7152342796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156751, + "balance_loss_mlp": 1.07764363, + "epoch": 0.5542516352443247, + "flos": 413504380416.0, + "grad_norm": 0.03343546183057337, + "language_loss": 0.89203489, + "learning_rate": 0.00043661637010526384, + "loss": 0.90360242, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.78857422, + "step": 2881, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156889, + "balance_loss_mlp": 1.07792521, + "epoch": 0.5544440169295883, + "flos": 548677418496.0, + "grad_norm": 0.03944129006740139, + "language_loss": 0.89678496, + "learning_rate": 0.00043630735349031025, + "loss": 0.90835381, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.78759766, + "step": 2882, + "time_per_iteration": 2.6376428604125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157119, + "balance_loss_mlp": 1.07815528, + "epoch": 0.5546363986148518, + "flos": 623033877504.0, + "grad_norm": 0.025659357486645176, + "language_loss": 0.85712773, + "learning_rate": 0.00043599836160303495, + "loss": 0.86869895, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.78710938, + "step": 2883, + "time_per_iteration": 2.861966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155488, + "balance_loss_mlp": 1.07633352, + "epoch": 0.5548287803001154, + "flos": 706579379712.0, + "grad_norm": 0.03141972013571756, + "language_loss": 0.82934201, + "learning_rate": 0.0004356893945633995, + "loss": 0.8408969, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.7890625, + "step": 2884, + "time_per_iteration": 2.9471499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.07534921, + "epoch": 0.555021161985379, + "flos": 505184143872.0, + "grad_norm": 0.031430850490502316, + "language_loss": 0.85807753, + "learning_rate": 0.0004353804524913551, + "loss": 0.86962205, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.78857422, + "step": 2885, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154918, + "balance_loss_mlp": 1.07576323, + "epoch": 0.5552135436706426, + "flos": 617209293312.0, + "grad_norm": 0.033803824808406595, + "language_loss": 0.88278472, + "learning_rate": 0.0004350715355068441, + "loss": 0.89433384, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.7890625, + "step": 2886, + "time_per_iteration": 2.815993547439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154719, + "balance_loss_mlp": 1.07556415, + "epoch": 0.5554059253559062, + "flos": 464817010176.0, + "grad_norm": 0.03994579560883884, + "language_loss": 0.85848737, + "learning_rate": 0.00043476264372979847, + "loss": 0.87003452, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.7890625, + "step": 2887, + "time_per_iteration": 2.5898871421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154914, + "balance_loss_mlp": 1.07618785, + "epoch": 0.5555983070411696, + "flos": 1564874841600.0, + "grad_norm": 0.03588081892536478, + "language_loss": 0.85341823, + "learning_rate": 0.0004344537772801408, + "loss": 0.86496735, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.78613281, + "step": 2888, + "time_per_iteration": 3.880375385284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158798, + "balance_loss_mlp": 1.0821228, + "epoch": 0.5557906887264332, + "flos": 1471226681856.0, + "grad_norm": 0.005822600355857551, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74581254, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.76757812, + "step": 2889, + "time_per_iteration": 4.9117255210876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155617, + "balance_loss_mlp": 1.07670069, + "epoch": 0.5559830704116968, + "flos": 530863289856.0, + "grad_norm": 0.03666523888945824, + "language_loss": 0.89283395, + "learning_rate": 0.0004338361208426298, + "loss": 0.90439016, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.78710938, + "step": 2890, + "time_per_iteration": 2.6093485355377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155534, + "balance_loss_mlp": 1.07671309, + "epoch": 0.5561754520969604, + "flos": 652518641664.0, + "grad_norm": 0.027207956668339604, + "language_loss": 0.85981715, + "learning_rate": 0.00043352733109457164, + "loss": 0.87137252, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.78710938, + "step": 2891, + "time_per_iteration": 2.929133892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155522, + "balance_loss_mlp": 1.07670057, + "epoch": 0.556367833782224, + "flos": 735618981888.0, + "grad_norm": 0.028477777137297752, + "language_loss": 0.89055073, + "learning_rate": 0.00043321856715349244, + "loss": 0.90210593, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.78662109, + "step": 2892, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154528, + "balance_loss_mlp": 1.0758971, + "epoch": 0.5565602154674875, + "flos": 673640833536.0, + "grad_norm": 0.028305708839331062, + "language_loss": 0.85380936, + "learning_rate": 0.00043290982913926466, + "loss": 0.8653546, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.78564453, + "step": 2893, + "time_per_iteration": 2.797816038131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.07449973, + "epoch": 0.556752597152751, + "flos": 587503675392.0, + "grad_norm": 0.03108865563447884, + "language_loss": 0.90100253, + "learning_rate": 0.0004326011171717514, + "loss": 0.91253483, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.78613281, + "step": 2894, + "time_per_iteration": 2.885183334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153367, + "balance_loss_mlp": 1.07426023, + "epoch": 0.5569449788380146, + "flos": 438690700800.0, + "grad_norm": 0.03571349027789826, + "language_loss": 0.87187707, + "learning_rate": 0.0004322924313708051, + "loss": 0.88341075, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.78857422, + "step": 2895, + "time_per_iteration": 2.505321502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115508, + "balance_loss_mlp": 1.07635403, + "epoch": 0.5571373605232782, + "flos": 503247770112.0, + "grad_norm": 0.03410983593663488, + "language_loss": 0.90630054, + "learning_rate": 0.0004319837718562681, + "loss": 0.91785133, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.78613281, + "step": 2896, + "time_per_iteration": 2.6243269443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154122, + "balance_loss_mlp": 1.07530081, + "epoch": 0.5573297422085417, + "flos": 578589880320.0, + "grad_norm": 0.033933273128928194, + "language_loss": 0.88206899, + "learning_rate": 0.0004316751387479726, + "loss": 0.89361024, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.78662109, + "step": 2897, + "time_per_iteration": 2.7566635608673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.074579, + "epoch": 0.5575221238938053, + "flos": 1346047512576.0, + "grad_norm": 0.03456307454544867, + "language_loss": 0.88955474, + "learning_rate": 0.0004313665321657409, + "loss": 0.90108681, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.78564453, + "step": 2898, + "time_per_iteration": 3.766465187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155323, + "balance_loss_mlp": 1.07616794, + "epoch": 0.5577145055790689, + "flos": 603098724864.0, + "grad_norm": 0.03371138021934881, + "language_loss": 0.86232543, + "learning_rate": 0.00043105795222938436, + "loss": 0.8738786, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.7890625, + "step": 2899, + "time_per_iteration": 2.7334022521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155497, + "balance_loss_mlp": 1.07658088, + "epoch": 0.5579068872643325, + "flos": 563691045888.0, + "grad_norm": 0.045182395108838744, + "language_loss": 0.86075807, + "learning_rate": 0.00043074939905870467, + "loss": 0.87231296, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.78759766, + "step": 2900, + "time_per_iteration": 2.696669340133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155611, + "balance_loss_mlp": 1.0766468, + "epoch": 0.558099268949596, + "flos": 545588207616.0, + "grad_norm": 0.03640236345196184, + "language_loss": 0.86178941, + "learning_rate": 0.0004304408727734927, + "loss": 0.87334555, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.78759766, + "step": 2901, + "time_per_iteration": 2.62982439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115605, + "balance_loss_mlp": 1.07727695, + "epoch": 0.5582916506348595, + "flos": 553852724736.0, + "grad_norm": 0.027303392187282394, + "language_loss": 0.9274894, + "learning_rate": 0.0004301323734935288, + "loss": 0.93904984, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.78613281, + "step": 2902, + "time_per_iteration": 2.705291986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164959, + "balance_loss_mlp": 1.08632815, + "epoch": 0.5584840323201231, + "flos": 544424636928.0, + "grad_norm": 0.032065850930778406, + "language_loss": 0.92794406, + "learning_rate": 0.000429823901338583, + "loss": 0.93959367, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.78564453, + "step": 2903, + "time_per_iteration": 2.620115041732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162843, + "balance_loss_mlp": 1.08421218, + "epoch": 0.5586764140053867, + "flos": 817021992960.0, + "grad_norm": 0.03266293414683286, + "language_loss": 0.92888266, + "learning_rate": 0.00042951545642841513, + "loss": 0.94051105, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.78564453, + "step": 2904, + "time_per_iteration": 3.066140651702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160381, + "balance_loss_mlp": 1.08165538, + "epoch": 0.5588687956906503, + "flos": 487415677440.0, + "grad_norm": 0.02932995016233391, + "language_loss": 0.91419339, + "learning_rate": 0.0004292070388827737, + "loss": 0.92579722, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.78613281, + "step": 2905, + "time_per_iteration": 2.5493688583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153401, + "balance_loss_mlp": 1.07453251, + "epoch": 0.5590611773759138, + "flos": 453068511744.0, + "grad_norm": 0.02745082882239035, + "language_loss": 0.85835731, + "learning_rate": 0.00042889864882139753, + "loss": 0.86989129, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.78710938, + "step": 2906, + "time_per_iteration": 2.572270631790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115253, + "balance_loss_mlp": 1.07347012, + "epoch": 0.5592535590611774, + "flos": 521956225536.0, + "grad_norm": 0.03525028250709423, + "language_loss": 0.87143886, + "learning_rate": 0.0004285902863640139, + "loss": 0.88296419, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.78857422, + "step": 2907, + "time_per_iteration": 2.657799482345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.07448292, + "epoch": 0.5594459407464409, + "flos": 553600945152.0, + "grad_norm": 0.02873947635122419, + "language_loss": 0.90871602, + "learning_rate": 0.00042828195163033966, + "loss": 0.92024809, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.78613281, + "step": 2908, + "time_per_iteration": 2.6421632766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152251, + "balance_loss_mlp": 1.07323921, + "epoch": 0.5596383224317045, + "flos": 485787479040.0, + "grad_norm": 0.030747286656696786, + "language_loss": 0.84394485, + "learning_rate": 0.0004279736447400812, + "loss": 0.85546738, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.78808594, + "step": 2909, + "time_per_iteration": 2.571681022644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152122, + "balance_loss_mlp": 1.07344413, + "epoch": 0.5598307041169681, + "flos": 612379092480.0, + "grad_norm": 0.030942423142950287, + "language_loss": 0.83957374, + "learning_rate": 0.00042766536581293385, + "loss": 0.85109496, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.78613281, + "step": 2910, + "time_per_iteration": 2.7282116413116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155729, + "balance_loss_mlp": 1.07662177, + "epoch": 0.5600230858022316, + "flos": 489916735488.0, + "grad_norm": 0.03226747500803281, + "language_loss": 0.85277241, + "learning_rate": 0.0004273571149685819, + "loss": 0.86432964, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.78857422, + "step": 2911, + "time_per_iteration": 2.787032127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154593, + "balance_loss_mlp": 1.0759151, + "epoch": 0.5602154674874952, + "flos": 599981316096.0, + "grad_norm": 0.03215276166374932, + "language_loss": 0.88704693, + "learning_rate": 0.00042704889232669937, + "loss": 0.89859283, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.78613281, + "step": 2912, + "time_per_iteration": 2.686586856842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.07611275, + "epoch": 0.5604078491727588, + "flos": 587062516224.0, + "grad_norm": 0.032254540051477425, + "language_loss": 0.9111523, + "learning_rate": 0.0004267406980069484, + "loss": 0.92269969, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.78466797, + "step": 2913, + "time_per_iteration": 2.6899847984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154041, + "balance_loss_mlp": 1.07545817, + "epoch": 0.5606002308580224, + "flos": 542327808000.0, + "grad_norm": 0.028324891167666608, + "language_loss": 0.8452785, + "learning_rate": 0.0004264325321289808, + "loss": 0.85681891, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.78515625, + "step": 2914, + "time_per_iteration": 2.770299196243286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151899, + "balance_loss_mlp": 1.07331622, + "epoch": 0.5607926125432858, + "flos": 585078478848.0, + "grad_norm": 0.03365993170310601, + "language_loss": 0.91764051, + "learning_rate": 0.00042612439481243736, + "loss": 0.92915952, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.78515625, + "step": 2915, + "time_per_iteration": 2.7451834678649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162837, + "balance_loss_mlp": 1.08406377, + "epoch": 0.5609849942285494, + "flos": 628630150656.0, + "grad_norm": 0.03395322139017605, + "language_loss": 0.95402431, + "learning_rate": 0.00042581628617694735, + "loss": 0.96565264, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.78613281, + "step": 2916, + "time_per_iteration": 2.7379772663116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157541, + "balance_loss_mlp": 1.07871938, + "epoch": 0.561177375913813, + "flos": 589454785536.0, + "grad_norm": 0.03197816551531196, + "language_loss": 0.86920869, + "learning_rate": 0.0004255082063421296, + "loss": 0.88078409, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.78759766, + "step": 2917, + "time_per_iteration": 2.7153422832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161631, + "balance_loss_mlp": 1.08285797, + "epoch": 0.5613697575990766, + "flos": 528143379456.0, + "grad_norm": 0.03128753614155992, + "language_loss": 0.89917612, + "learning_rate": 0.00042520015542759065, + "loss": 0.91079247, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.78710938, + "step": 2918, + "time_per_iteration": 2.8688042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165136, + "balance_loss_mlp": 1.08636212, + "epoch": 0.5615621392843402, + "flos": 643874090496.0, + "grad_norm": 0.03249260096588731, + "language_loss": 0.93211949, + "learning_rate": 0.00042489213355292687, + "loss": 0.94377089, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.78613281, + "step": 2919, + "time_per_iteration": 2.8982832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167734, + "balance_loss_mlp": 1.08900821, + "epoch": 0.5617545209696037, + "flos": 428656995840.0, + "grad_norm": 0.034334958581954525, + "language_loss": 0.87036526, + "learning_rate": 0.00042458414083772276, + "loss": 0.88204259, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.78466797, + "step": 2920, + "time_per_iteration": 2.5067636966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164187, + "balance_loss_mlp": 1.08536625, + "epoch": 0.5619469026548672, + "flos": 569589490176.0, + "grad_norm": 0.025989129211014445, + "language_loss": 0.89547098, + "learning_rate": 0.000424276177401552, + "loss": 0.90711284, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.78710938, + "step": 2921, + "time_per_iteration": 2.810723304748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.07975173, + "epoch": 0.5621392843401308, + "flos": 506243655168.0, + "grad_norm": 0.03554030610259364, + "language_loss": 0.91916943, + "learning_rate": 0.0004239682433639763, + "loss": 0.93075705, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.7890625, + "step": 2922, + "time_per_iteration": 2.6607391834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159452, + "balance_loss_mlp": 1.08034527, + "epoch": 0.5623316660253944, + "flos": 518009617920.0, + "grad_norm": 0.03283867999662062, + "language_loss": 0.91225737, + "learning_rate": 0.0004236603388445467, + "loss": 0.92385185, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.78955078, + "step": 2923, + "time_per_iteration": 2.586524248123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159206, + "balance_loss_mlp": 1.08043242, + "epoch": 0.5625240477106579, + "flos": 607138658304.0, + "grad_norm": 0.07898356089021562, + "language_loss": 0.87176222, + "learning_rate": 0.00042335246396280166, + "loss": 0.88335431, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.78710938, + "step": 2924, + "time_per_iteration": 2.7597639560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.08004844, + "epoch": 0.5627164293959215, + "flos": 451340256768.0, + "grad_norm": 0.0302800933285396, + "language_loss": 0.96241242, + "learning_rate": 0.0004230446188382693, + "loss": 0.97400308, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.7890625, + "step": 2925, + "time_per_iteration": 2.573899030685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07977474, + "epoch": 0.5629088110811851, + "flos": 743436335616.0, + "grad_norm": 0.03229142562201564, + "language_loss": 0.85888505, + "learning_rate": 0.0004227368035904654, + "loss": 0.87047106, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.78759766, + "step": 2926, + "time_per_iteration": 2.9811575412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161727, + "balance_loss_mlp": 1.08295333, + "epoch": 0.5631011927664487, + "flos": 497979138048.0, + "grad_norm": 0.030188812186764755, + "language_loss": 0.88692701, + "learning_rate": 0.00042242901833889474, + "loss": 0.89854425, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.78710938, + "step": 2927, + "time_per_iteration": 2.6326565742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160764, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5632935744517122, + "flos": 887594300928.0, + "grad_norm": 0.033144673445412554, + "language_loss": 0.91819888, + "learning_rate": 0.0004221212632030501, + "loss": 0.92980659, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.78759766, + "step": 2928, + "time_per_iteration": 3.0669453144073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115887, + "balance_loss_mlp": 1.08014381, + "epoch": 0.5634859561369757, + "flos": 605901227520.0, + "grad_norm": 0.03167965641147859, + "language_loss": 0.85548306, + "learning_rate": 0.0004218135383024124, + "loss": 0.86707169, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.78662109, + "step": 2929, + "time_per_iteration": 2.704127788543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154152, + "balance_loss_mlp": 1.07542574, + "epoch": 0.5636783378222393, + "flos": 454902827520.0, + "grad_norm": 0.0331862396137692, + "language_loss": 0.91072655, + "learning_rate": 0.0004215058437564511, + "loss": 0.92226809, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.78662109, + "step": 2930, + "time_per_iteration": 2.5648486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07496285, + "epoch": 0.5638707195075029, + "flos": 519461898240.0, + "grad_norm": 0.030026295980520465, + "language_loss": 0.87243164, + "learning_rate": 0.00042119817968462397, + "loss": 0.88396895, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.78613281, + "step": 2931, + "time_per_iteration": 2.596165895462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.07572603, + "epoch": 0.5640631011927665, + "flos": 565844270592.0, + "grad_norm": 0.035813464167598875, + "language_loss": 0.92307299, + "learning_rate": 0.0004208905462063766, + "loss": 0.934618, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.78564453, + "step": 2932, + "time_per_iteration": 2.6596782207489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161524, + "balance_loss_mlp": 1.0827024, + "epoch": 0.56425548287803, + "flos": 518037815808.0, + "grad_norm": 0.03163601566095553, + "language_loss": 0.90576756, + "learning_rate": 0.00042058294344114315, + "loss": 0.91738278, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.78564453, + "step": 2933, + "time_per_iteration": 2.6681416034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5644478645632935, + "flos": 855669603840.0, + "grad_norm": 0.031443670044009366, + "language_loss": 0.83703303, + "learning_rate": 0.0004202753715083456, + "loss": 0.84863651, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.78515625, + "step": 2934, + "time_per_iteration": 3.1047325134277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159543, + "balance_loss_mlp": 1.08081746, + "epoch": 0.5646402462485571, + "flos": 554495271936.0, + "grad_norm": 0.034946601892201584, + "language_loss": 0.87802339, + "learning_rate": 0.0004199678305273936, + "loss": 0.88961881, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.78613281, + "step": 2935, + "time_per_iteration": 2.649768352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159598, + "balance_loss_mlp": 1.08092046, + "epoch": 0.5648326279338207, + "flos": 687310969344.0, + "grad_norm": 0.04027660967531297, + "language_loss": 0.86366433, + "learning_rate": 0.0004196603206176854, + "loss": 0.87526035, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.78613281, + "step": 2936, + "time_per_iteration": 2.916745662689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.08003819, + "epoch": 0.5650250096190843, + "flos": 804682613760.0, + "grad_norm": 0.03045212290633188, + "language_loss": 0.89034498, + "learning_rate": 0.000419352841898607, + "loss": 0.9019326, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.78662109, + "step": 2937, + "time_per_iteration": 3.019742250442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154573, + "balance_loss_mlp": 1.07541847, + "epoch": 0.5652173913043478, + "flos": 583144106496.0, + "grad_norm": 0.0352415717236192, + "language_loss": 0.82975399, + "learning_rate": 0.000419045394489532, + "loss": 0.84129971, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.79003906, + "step": 2938, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155775, + "balance_loss_mlp": 1.07661998, + "epoch": 0.5654097729896114, + "flos": 822167099904.0, + "grad_norm": 0.030545896529673648, + "language_loss": 0.81679785, + "learning_rate": 0.0004187379785098224, + "loss": 0.82835561, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.7890625, + "step": 2939, + "time_per_iteration": 3.125208854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155934, + "balance_loss_mlp": 1.07682657, + "epoch": 0.565602154674875, + "flos": 785481332736.0, + "grad_norm": 0.038076573598017076, + "language_loss": 0.89879513, + "learning_rate": 0.00041843059407882744, + "loss": 0.9103545, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.78857422, + "step": 2940, + "time_per_iteration": 2.9577417373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.07814884, + "epoch": 0.5657945363601385, + "flos": 550744048128.0, + "grad_norm": 0.03292975836505615, + "language_loss": 0.88439214, + "learning_rate": 0.0004181232413158842, + "loss": 0.89596379, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.78759766, + "step": 2941, + "time_per_iteration": 2.636016845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156819, + "balance_loss_mlp": 1.07771146, + "epoch": 0.5659869180454021, + "flos": 669331656192.0, + "grad_norm": 0.0384606105275957, + "language_loss": 0.88344961, + "learning_rate": 0.0004178159203403179, + "loss": 0.89501786, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.78857422, + "step": 2942, + "time_per_iteration": 2.873724937438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157408, + "balance_loss_mlp": 1.07839596, + "epoch": 0.5661792997306656, + "flos": 500948826624.0, + "grad_norm": 0.031907837289758996, + "language_loss": 0.86677325, + "learning_rate": 0.0004175086312714409, + "loss": 0.8783474, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.78808594, + "step": 2943, + "time_per_iteration": 2.553450107574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.08138418, + "epoch": 0.5663716814159292, + "flos": 602362851840.0, + "grad_norm": 0.02897032807353051, + "language_loss": 0.8872959, + "learning_rate": 0.00041720137422855366, + "loss": 0.89889991, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.78759766, + "step": 2944, + "time_per_iteration": 2.7116591930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159876, + "balance_loss_mlp": 1.08095932, + "epoch": 0.5665640631011928, + "flos": 542032367616.0, + "grad_norm": 0.031139658556859174, + "language_loss": 0.83964241, + "learning_rate": 0.00041689414933094383, + "loss": 0.85124123, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.78710938, + "step": 2945, + "time_per_iteration": 2.638216495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158364, + "balance_loss_mlp": 1.07968628, + "epoch": 0.5667564447864564, + "flos": 603061794816.0, + "grad_norm": 0.037847476611961306, + "language_loss": 0.8757143, + "learning_rate": 0.00041658695669788653, + "loss": 0.88729787, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.78613281, + "step": 2946, + "time_per_iteration": 2.736724615097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159515, + "balance_loss_mlp": 1.08074152, + "epoch": 0.5669488264717198, + "flos": 660722033664.0, + "grad_norm": 0.03809672024086723, + "language_loss": 0.87564874, + "learning_rate": 0.00041627979644864453, + "loss": 0.88724387, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.78662109, + "step": 2947, + "time_per_iteration": 2.787102460861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160652, + "balance_loss_mlp": 1.08192623, + "epoch": 0.5671412081569834, + "flos": 486382362624.0, + "grad_norm": 0.028726289994514737, + "language_loss": 0.86769605, + "learning_rate": 0.0004159726687024683, + "loss": 0.87930262, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.78662109, + "step": 2948, + "time_per_iteration": 2.627268075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157953, + "balance_loss_mlp": 1.07917941, + "epoch": 0.567333589842247, + "flos": 731060026368.0, + "grad_norm": 0.031224685517340662, + "language_loss": 0.85094821, + "learning_rate": 0.00041566557357859506, + "loss": 0.86252779, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.78710938, + "step": 2949, + "time_per_iteration": 2.903480052947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115639, + "balance_loss_mlp": 1.07737851, + "epoch": 0.5675259715275106, + "flos": 970558381056.0, + "grad_norm": 0.02889906202993953, + "language_loss": 0.84761345, + "learning_rate": 0.0004153585111962502, + "loss": 0.85917735, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.78857422, + "step": 2950, + "time_per_iteration": 3.327157497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155395, + "balance_loss_mlp": 1.07638264, + "epoch": 0.5677183532127742, + "flos": 566213571072.0, + "grad_norm": 0.036221800053715905, + "language_loss": 0.90357536, + "learning_rate": 0.0004150514816746453, + "loss": 0.9151293, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.78857422, + "step": 2951, + "time_per_iteration": 2.664881467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155178, + "balance_loss_mlp": 1.07640433, + "epoch": 0.5679107348980377, + "flos": 552745549824.0, + "grad_norm": 0.032718571293428464, + "language_loss": 0.90599716, + "learning_rate": 0.0004147444851329802, + "loss": 0.91754901, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.78710938, + "step": 2952, + "time_per_iteration": 2.659607410430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156293, + "balance_loss_mlp": 1.07752001, + "epoch": 0.5681031165833013, + "flos": 820840346112.0, + "grad_norm": 0.029462667986489877, + "language_loss": 0.91018391, + "learning_rate": 0.00041443752169044126, + "loss": 0.92174685, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.78710938, + "step": 2953, + "time_per_iteration": 3.0214719772338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115648, + "balance_loss_mlp": 1.07775402, + "epoch": 0.5682954982685648, + "flos": 619145667072.0, + "grad_norm": 0.03021657930021912, + "language_loss": 0.89565808, + "learning_rate": 0.0004141305914662025, + "loss": 0.90722287, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.78662109, + "step": 2954, + "time_per_iteration": 2.7215545177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154854, + "balance_loss_mlp": 1.07608008, + "epoch": 0.5684878799538284, + "flos": 649251511296.0, + "grad_norm": 0.03170231797387521, + "language_loss": 0.85884857, + "learning_rate": 0.0004138236945794246, + "loss": 0.87039715, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.78613281, + "step": 2955, + "time_per_iteration": 2.896960496902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154548, + "balance_loss_mlp": 1.07587004, + "epoch": 0.5686802616390919, + "flos": 807352859136.0, + "grad_norm": 0.03477888356704498, + "language_loss": 0.88849628, + "learning_rate": 0.00041351683114925576, + "loss": 0.90004176, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.78564453, + "step": 2956, + "time_per_iteration": 3.056138753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155475, + "balance_loss_mlp": 1.07698798, + "epoch": 0.5688726433243555, + "flos": 548175860736.0, + "grad_norm": 0.02988071875067647, + "language_loss": 0.91774637, + "learning_rate": 0.0004132100012948308, + "loss": 0.92930108, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.78320312, + "step": 2957, + "time_per_iteration": 2.620039701461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153148, + "balance_loss_mlp": 1.07475579, + "epoch": 0.5690650250096191, + "flos": 487545933312.0, + "grad_norm": 0.03388139796228596, + "language_loss": 0.90210378, + "learning_rate": 0.00041290320513527145, + "loss": 0.91363525, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.78222656, + "step": 2958, + "time_per_iteration": 2.5424137115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158065, + "balance_loss_mlp": 1.07953036, + "epoch": 0.5692574066948827, + "flos": 578554951680.0, + "grad_norm": 0.03065337308060062, + "language_loss": 0.9014492, + "learning_rate": 0.0004125964427896867, + "loss": 0.91302985, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.78369141, + "step": 2959, + "time_per_iteration": 2.6540746688842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157828, + "balance_loss_mlp": 1.07924569, + "epoch": 0.5694497883801463, + "flos": 455219735040.0, + "grad_norm": 0.03288997710459115, + "language_loss": 0.8486557, + "learning_rate": 0.0004122897143771723, + "loss": 0.86023396, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.78515625, + "step": 2960, + "time_per_iteration": 2.5677952766418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157581, + "balance_loss_mlp": 1.07899833, + "epoch": 0.5696421700654097, + "flos": 560582369280.0, + "grad_norm": 0.029260680521972587, + "language_loss": 0.86686659, + "learning_rate": 0.0004119830200168109, + "loss": 0.87844241, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.78515625, + "step": 2961, + "time_per_iteration": 2.661398410797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116102, + "balance_loss_mlp": 1.08243668, + "epoch": 0.5698345517506733, + "flos": 466501604352.0, + "grad_norm": 0.06131137217333051, + "language_loss": 0.93434393, + "learning_rate": 0.0004116763598276714, + "loss": 0.94595408, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.78515625, + "step": 2962, + "time_per_iteration": 2.5421509742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161307, + "balance_loss_mlp": 1.08267653, + "epoch": 0.5700269334359369, + "flos": 607191051264.0, + "grad_norm": 0.033090735660708526, + "language_loss": 0.8645342, + "learning_rate": 0.00041136973392881017, + "loss": 0.87614727, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.78515625, + "step": 2963, + "time_per_iteration": 2.826312303543091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116111, + "balance_loss_mlp": 1.08233654, + "epoch": 0.5702193151212005, + "flos": 563856230400.0, + "grad_norm": 0.029371137494056676, + "language_loss": 0.87366056, + "learning_rate": 0.00041106314243926983, + "loss": 0.88527167, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.78613281, + "step": 2964, + "time_per_iteration": 2.729848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163001, + "balance_loss_mlp": 1.08432257, + "epoch": 0.570411696806464, + "flos": 524309563392.0, + "grad_norm": 0.030081020285570834, + "language_loss": 0.91922152, + "learning_rate": 0.0004107565854780798, + "loss": 0.93085158, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.78564453, + "step": 2965, + "time_per_iteration": 2.6243247985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162398, + "balance_loss_mlp": 1.08348167, + "epoch": 0.5706040784917276, + "flos": 719471983104.0, + "grad_norm": 0.03134673766290682, + "language_loss": 0.86833286, + "learning_rate": 0.000410450063164256, + "loss": 0.87995684, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.78710938, + "step": 2966, + "time_per_iteration": 2.8488268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160157, + "balance_loss_mlp": 1.08109784, + "epoch": 0.5707964601769911, + "flos": 477670682112.0, + "grad_norm": 0.03469711129941245, + "language_loss": 0.88420385, + "learning_rate": 0.00041014357561680115, + "loss": 0.89580548, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.78808594, + "step": 2967, + "time_per_iteration": 2.531399965286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158843, + "balance_loss_mlp": 1.07997382, + "epoch": 0.5709888418622547, + "flos": 581216464896.0, + "grad_norm": 0.0299141756983156, + "language_loss": 0.91230297, + "learning_rate": 0.0004098371229547039, + "loss": 0.92389137, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.78662109, + "step": 2968, + "time_per_iteration": 2.7010715007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166153, + "balance_loss_mlp": 1.08947754, + "epoch": 0.5711812235475183, + "flos": 1583192707584.0, + "grad_norm": 0.007250174551889785, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8117696, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.76757812, + "step": 2969, + "time_per_iteration": 4.720959663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158975, + "balance_loss_mlp": 1.08001077, + "epoch": 0.5713736052327818, + "flos": 469497489408.0, + "grad_norm": 0.030927251593918268, + "language_loss": 0.85219097, + "learning_rate": 0.00040922432276247107, + "loss": 0.86378068, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.78710938, + "step": 2970, + "time_per_iteration": 2.5976855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155112, + "balance_loss_mlp": 1.07610035, + "epoch": 0.5715659869180454, + "flos": 538754503680.0, + "grad_norm": 0.02782082883725602, + "language_loss": 0.88734138, + "learning_rate": 0.0004089179754702457, + "loss": 0.89889252, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.78759766, + "step": 2971, + "time_per_iteration": 2.735511064529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155002, + "balance_loss_mlp": 1.07608509, + "epoch": 0.571758368603309, + "flos": 657250787328.0, + "grad_norm": 0.03021364085019089, + "language_loss": 0.86246514, + "learning_rate": 0.00040861166353919843, + "loss": 0.87401509, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.78710938, + "step": 2972, + "time_per_iteration": 2.784243583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156758, + "balance_loss_mlp": 1.07808018, + "epoch": 0.5719507502885726, + "flos": 669099342336.0, + "grad_norm": 0.04093131787913085, + "language_loss": 0.87037605, + "learning_rate": 0.00040830538708824983, + "loss": 0.8819437, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.78564453, + "step": 2973, + "time_per_iteration": 2.847334861755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156641, + "balance_loss_mlp": 1.07815385, + "epoch": 0.572143131973836, + "flos": 477279914496.0, + "grad_norm": 0.029260532033913305, + "language_loss": 0.87478364, + "learning_rate": 0.000407999146236307, + "loss": 0.88635004, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.78417969, + "step": 2974, + "time_per_iteration": 2.5809874534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156886, + "balance_loss_mlp": 1.07849395, + "epoch": 0.5723355136590996, + "flos": 540534425088.0, + "grad_norm": 0.03484414683288605, + "language_loss": 0.89636898, + "learning_rate": 0.0004076929411022634, + "loss": 0.90793782, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.78320312, + "step": 2975, + "time_per_iteration": 2.631016969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156314, + "balance_loss_mlp": 1.07782686, + "epoch": 0.5725278953443632, + "flos": 825649079808.0, + "grad_norm": 0.03393435544828211, + "language_loss": 0.84972572, + "learning_rate": 0.0004073867718049982, + "loss": 0.86128891, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.78369141, + "step": 2976, + "time_per_iteration": 3.09523606300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158881, + "balance_loss_mlp": 1.08044088, + "epoch": 0.5727202770296268, + "flos": 588569190912.0, + "grad_norm": 0.031011693938846972, + "language_loss": 0.87586653, + "learning_rate": 0.00040708063846337704, + "loss": 0.88745534, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.78222656, + "step": 2977, + "time_per_iteration": 2.7148561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159545, + "balance_loss_mlp": 1.08100963, + "epoch": 0.5729126587148904, + "flos": 447940869120.0, + "grad_norm": 0.0318916011479424, + "language_loss": 0.87124234, + "learning_rate": 0.00040677454119625143, + "loss": 0.88283777, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.78320312, + "step": 2978, + "time_per_iteration": 2.6003363132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.0804776, + "epoch": 0.5731050404001539, + "flos": 520467015168.0, + "grad_norm": 0.03318988951179658, + "language_loss": 0.88396186, + "learning_rate": 0.0004064684801224587, + "loss": 0.89555109, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.78173828, + "step": 2979, + "time_per_iteration": 2.6103272438049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160698, + "balance_loss_mlp": 1.08225846, + "epoch": 0.5732974220854175, + "flos": 505770295296.0, + "grad_norm": 0.029710652762807207, + "language_loss": 0.85663891, + "learning_rate": 0.00040616245536082224, + "loss": 0.86824596, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.78222656, + "step": 2980, + "time_per_iteration": 2.5594868659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.08078313, + "epoch": 0.573489803770681, + "flos": 593677367808.0, + "grad_norm": 0.027966372317681742, + "language_loss": 0.86258745, + "learning_rate": 0.00040585646703015165, + "loss": 0.87417924, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.78320312, + "step": 2981, + "time_per_iteration": 2.789937734603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.07878125, + "epoch": 0.5736821854559446, + "flos": 490869459456.0, + "grad_norm": 0.031111464824263694, + "language_loss": 0.83780992, + "learning_rate": 0.0004055505152492419, + "loss": 0.84938312, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.78466797, + "step": 2982, + "time_per_iteration": 2.6471428871154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158265, + "balance_loss_mlp": 1.07963431, + "epoch": 0.5738745671412081, + "flos": 459201271296.0, + "grad_norm": 0.03311000411840089, + "language_loss": 0.79528159, + "learning_rate": 0.00040524460013687425, + "loss": 0.80686426, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.78564453, + "step": 2983, + "time_per_iteration": 2.708540678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155372, + "balance_loss_mlp": 1.07650268, + "epoch": 0.5740669488264717, + "flos": 581620694016.0, + "grad_norm": 0.028109694322635652, + "language_loss": 0.86855406, + "learning_rate": 0.0004049387218118155, + "loss": 0.88010776, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.78759766, + "step": 2984, + "time_per_iteration": 2.926750421524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155283, + "balance_loss_mlp": 1.07622325, + "epoch": 0.5742593305117353, + "flos": 525573190656.0, + "grad_norm": 0.03395381439898354, + "language_loss": 0.91635472, + "learning_rate": 0.00040463288039281777, + "loss": 0.92790747, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.78857422, + "step": 2985, + "time_per_iteration": 2.704287528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162964, + "balance_loss_mlp": 1.08666992, + "epoch": 0.5744517121969989, + "flos": 1557266511360.0, + "grad_norm": 0.007878379047691413, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.79039383, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.76367188, + "step": 2986, + "time_per_iteration": 4.989194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155742, + "balance_loss_mlp": 1.07677734, + "epoch": 0.5746440938822625, + "flos": 753202798080.0, + "grad_norm": 0.03402997808137808, + "language_loss": 0.87620312, + "learning_rate": 0.0004040213087479444, + "loss": 0.88776052, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.78759766, + "step": 2987, + "time_per_iteration": 2.9275078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163311, + "balance_loss_mlp": 1.08453715, + "epoch": 0.5748364755675259, + "flos": 502857002496.0, + "grad_norm": 0.03361733343242669, + "language_loss": 0.90824878, + "learning_rate": 0.0004037155787595018, + "loss": 0.91988194, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.78710938, + "step": 2988, + "time_per_iteration": 2.576448440551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160011, + "balance_loss_mlp": 1.08109498, + "epoch": 0.5750288572527895, + "flos": 505197605376.0, + "grad_norm": 0.02880586923954642, + "language_loss": 0.85724807, + "learning_rate": 0.000403409886151987, + "loss": 0.86884815, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.78759766, + "step": 2989, + "time_per_iteration": 2.916322946548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157013, + "balance_loss_mlp": 1.08033752, + "epoch": 0.5752212389380531, + "flos": 1544675352576.0, + "grad_norm": 0.005932241765552608, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83156121, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.765625, + "step": 2990, + "time_per_iteration": 4.758445978164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.08269501, + "epoch": 0.5754136206233167, + "flos": 1570671406080.0, + "grad_norm": 0.005822498768858246, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.7935797, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.765625, + "step": 2991, + "time_per_iteration": 4.785308122634888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163075, + "balance_loss_mlp": 1.08420658, + "epoch": 0.5756060023085803, + "flos": 799561701888.0, + "grad_norm": 0.0320241684810352, + "language_loss": 0.81581879, + "learning_rate": 0.00040249303380173807, + "loss": 0.82744956, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.78808594, + "step": 2992, + "time_per_iteration": 3.060910940170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160943, + "balance_loss_mlp": 1.08202648, + "epoch": 0.5757983839938438, + "flos": 589033818624.0, + "grad_norm": 0.033230938583522406, + "language_loss": 0.85061818, + "learning_rate": 0.00040218749190459126, + "loss": 0.86222756, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.78857422, + "step": 2993, + "time_per_iteration": 2.722538948059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159067, + "balance_loss_mlp": 1.08029306, + "epoch": 0.5759907656791073, + "flos": 517851164160.0, + "grad_norm": 0.036503805232005304, + "language_loss": 0.88598883, + "learning_rate": 0.00040188198798162775, + "loss": 0.89757949, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.78662109, + "step": 2994, + "time_per_iteration": 2.626763105392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157444, + "balance_loss_mlp": 1.078861, + "epoch": 0.5761831473643709, + "flos": 588289213440.0, + "grad_norm": 0.030677551313055676, + "language_loss": 0.90523088, + "learning_rate": 0.000401576522151455, + "loss": 0.91680533, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.78466797, + "step": 2995, + "time_per_iteration": 2.8290417194366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156979, + "balance_loss_mlp": 1.07839644, + "epoch": 0.5763755290496345, + "flos": 545008786944.0, + "grad_norm": 0.030026851509959627, + "language_loss": 0.87201327, + "learning_rate": 0.0004012710945326651, + "loss": 0.88358307, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.78515625, + "step": 2996, + "time_per_iteration": 2.78725004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156215, + "balance_loss_mlp": 1.07767999, + "epoch": 0.576567910734898, + "flos": 627427648512.0, + "grad_norm": 0.03065527687354923, + "language_loss": 0.86651611, + "learning_rate": 0.0004009657052438355, + "loss": 0.87807822, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.78271484, + "step": 2997, + "time_per_iteration": 2.8221359252929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156096, + "balance_loss_mlp": 1.07756102, + "epoch": 0.5767602924201616, + "flos": 539277528576.0, + "grad_norm": 0.032463443859892846, + "language_loss": 0.9117527, + "learning_rate": 0.00040066035440352904, + "loss": 0.92331362, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.78271484, + "step": 2998, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169762, + "balance_loss_mlp": 1.09403992, + "epoch": 0.5769526741054252, + "flos": 1563023239680.0, + "grad_norm": 0.012552051598097233, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80462897, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.7578125, + "step": 2999, + "time_per_iteration": 4.9131574630737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.0844425, + "epoch": 0.5771450557906888, + "flos": 469171849728.0, + "grad_norm": 0.03695219944655869, + "language_loss": 0.82297212, + "learning_rate": 0.00040004976854266145, + "loss": 0.83459759, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.78027344, + "step": 3000, + "time_per_iteration": 2.599562406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08321714, + "epoch": 0.5773374374759523, + "flos": 575632926720.0, + "grad_norm": 0.03253250172707863, + "language_loss": 0.86701882, + "learning_rate": 0.0003997445337591505, + "loss": 0.87863207, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.78027344, + "step": 3001, + "time_per_iteration": 2.651052951812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161912, + "balance_loss_mlp": 1.08380568, + "epoch": 0.5775298191612158, + "flos": 529504335360.0, + "grad_norm": 0.030455172240490772, + "language_loss": 0.78589356, + "learning_rate": 0.0003994393378982635, + "loss": 0.79751271, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.78027344, + "step": 3002, + "time_per_iteration": 2.6081488132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162445, + "balance_loss_mlp": 1.08576965, + "epoch": 0.5777222008464794, + "flos": 1306896520704.0, + "grad_norm": 0.00976162227486582, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80700445, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.765625, + "step": 3003, + "time_per_iteration": 4.794616460800171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154088, + "balance_loss_mlp": 1.07550502, + "epoch": 0.577914582531743, + "flos": 604792051200.0, + "grad_norm": 0.035927509548420514, + "language_loss": 0.93844306, + "learning_rate": 0.0003988290634182961, + "loss": 0.94998395, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.78417969, + "step": 3004, + "time_per_iteration": 2.7580206394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152956, + "balance_loss_mlp": 1.07465923, + "epoch": 0.5781069642170066, + "flos": 487832641536.0, + "grad_norm": 0.03166140659951907, + "language_loss": 0.85788441, + "learning_rate": 0.0003985239850361453, + "loss": 0.86941397, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.78173828, + "step": 3005, + "time_per_iteration": 2.5811102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148402, + "balance_loss_mlp": 1.0700103, + "epoch": 0.5782993459022701, + "flos": 507413956608.0, + "grad_norm": 0.03361154868402879, + "language_loss": 0.90845788, + "learning_rate": 0.0003982189460504777, + "loss": 0.9199419, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.78271484, + "step": 3006, + "time_per_iteration": 2.701486349105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.07208133, + "epoch": 0.5784917275875336, + "flos": 603294108672.0, + "grad_norm": 0.03266847587020217, + "language_loss": 0.84488243, + "learning_rate": 0.00039791394657971935, + "loss": 0.85638666, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.78222656, + "step": 3007, + "time_per_iteration": 2.7029902935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114812, + "balance_loss_mlp": 1.06953716, + "epoch": 0.5786841092727972, + "flos": 522588039168.0, + "grad_norm": 0.03327041662205967, + "language_loss": 0.89717233, + "learning_rate": 0.00039760898674228205, + "loss": 0.90865356, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.78466797, + "step": 3008, + "time_per_iteration": 2.6650431156158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163782, + "balance_loss_mlp": 1.08510339, + "epoch": 0.5788764909580608, + "flos": 768835504128.0, + "grad_norm": 0.02880825356575122, + "language_loss": 0.85863519, + "learning_rate": 0.0003973040666565613, + "loss": 0.87027305, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.78515625, + "step": 3009, + "time_per_iteration": 3.0480079650878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165461, + "balance_loss_mlp": 1.08668745, + "epoch": 0.5790688726433244, + "flos": 600331150848.0, + "grad_norm": 0.03153140111016463, + "language_loss": 0.87491179, + "learning_rate": 0.000396999186440938, + "loss": 0.8865664, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.78515625, + "step": 3010, + "time_per_iteration": 2.866971254348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.08517945, + "epoch": 0.5792612543285879, + "flos": 524105447424.0, + "grad_norm": 0.03493307290908607, + "language_loss": 0.90569246, + "learning_rate": 0.000396694346213777, + "loss": 0.91733146, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.78564453, + "step": 3011, + "time_per_iteration": 2.6576690673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160972, + "balance_loss_mlp": 1.08234167, + "epoch": 0.5794536360138515, + "flos": 878079618048.0, + "grad_norm": 0.028681737588389107, + "language_loss": 0.88734698, + "learning_rate": 0.0003963895460934276, + "loss": 0.89895672, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.78369141, + "step": 3012, + "time_per_iteration": 3.1439104080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159333, + "balance_loss_mlp": 1.08065438, + "epoch": 0.5796460176991151, + "flos": 402298372608.0, + "grad_norm": 0.038884721414284784, + "language_loss": 0.92029333, + "learning_rate": 0.00039608478619822376, + "loss": 0.93188667, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.78613281, + "step": 3013, + "time_per_iteration": 2.4331459999084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115895, + "balance_loss_mlp": 1.08032, + "epoch": 0.5798383993843786, + "flos": 619675422720.0, + "grad_norm": 0.029275699876953817, + "language_loss": 0.87518513, + "learning_rate": 0.00039578006664648394, + "loss": 0.88677466, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.78417969, + "step": 3014, + "time_per_iteration": 2.770930290222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157872, + "balance_loss_mlp": 1.07928884, + "epoch": 0.5800307810696421, + "flos": 845792351232.0, + "grad_norm": 0.03304881172222658, + "language_loss": 0.8676393, + "learning_rate": 0.0003954753875565105, + "loss": 0.87921804, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.78320312, + "step": 3015, + "time_per_iteration": 3.08627986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155618, + "balance_loss_mlp": 1.0769875, + "epoch": 0.5802231627549057, + "flos": 570364294656.0, + "grad_norm": 0.02949140039649942, + "language_loss": 0.86755216, + "learning_rate": 0.00039517074904659057, + "loss": 0.87910825, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.78369141, + "step": 3016, + "time_per_iteration": 2.685842990875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155954, + "balance_loss_mlp": 1.07732403, + "epoch": 0.5804155444401693, + "flos": 661662022656.0, + "grad_norm": 0.030068480846806175, + "language_loss": 0.90490985, + "learning_rate": 0.00039486615123499535, + "loss": 0.91646945, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.78369141, + "step": 3017, + "time_per_iteration": 2.8422367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158277, + "balance_loss_mlp": 1.07950318, + "epoch": 0.5806079261254329, + "flos": 515057393664.0, + "grad_norm": 0.0339975061302382, + "language_loss": 0.90716887, + "learning_rate": 0.00039456159423997996, + "loss": 0.91875166, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.78515625, + "step": 3018, + "time_per_iteration": 2.6301286220550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159604, + "balance_loss_mlp": 1.08116388, + "epoch": 0.5808003078106965, + "flos": 529717183488.0, + "grad_norm": 0.035522237622510534, + "language_loss": 0.94178265, + "learning_rate": 0.00039425707817978406, + "loss": 0.95337874, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.78320312, + "step": 3019, + "time_per_iteration": 2.6516103744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159065, + "balance_loss_mlp": 1.08033943, + "epoch": 0.58099268949596, + "flos": 477996321792.0, + "grad_norm": 0.033660479575399194, + "language_loss": 0.88736534, + "learning_rate": 0.00039395260317263124, + "loss": 0.89895594, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.78466797, + "step": 3020, + "time_per_iteration": 2.5736000537872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158964, + "balance_loss_mlp": 1.08033383, + "epoch": 0.5811850711812235, + "flos": 518687093760.0, + "grad_norm": 0.032372571582398105, + "language_loss": 0.90171605, + "learning_rate": 0.0003936481693367291, + "loss": 0.9133057, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.78417969, + "step": 3021, + "time_per_iteration": 2.655585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152938, + "balance_loss_mlp": 1.07416463, + "epoch": 0.5813774528664871, + "flos": 617626257408.0, + "grad_norm": 0.037353178472421755, + "language_loss": 0.94038713, + "learning_rate": 0.0003933437767902697, + "loss": 0.95191658, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.78564453, + "step": 3022, + "time_per_iteration": 2.7785356044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155749, + "balance_loss_mlp": 1.07707083, + "epoch": 0.5815698345517507, + "flos": 568603838976.0, + "grad_norm": 0.03237494754713459, + "language_loss": 0.83540273, + "learning_rate": 0.00039303942565142825, + "loss": 0.84696019, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.78466797, + "step": 3023, + "time_per_iteration": 2.8082921504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115966, + "balance_loss_mlp": 1.08122075, + "epoch": 0.5817622162370142, + "flos": 564303393792.0, + "grad_norm": 0.030406133972166762, + "language_loss": 0.81602162, + "learning_rate": 0.0003927351160383644, + "loss": 0.82761824, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.78369141, + "step": 3024, + "time_per_iteration": 2.8258216381073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115841, + "balance_loss_mlp": 1.07992303, + "epoch": 0.5819545979222778, + "flos": 460153995264.0, + "grad_norm": 0.0330231934286986, + "language_loss": 0.82985759, + "learning_rate": 0.000392430848069222, + "loss": 0.84144175, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.78369141, + "step": 3025, + "time_per_iteration": 2.552351713180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155737, + "balance_loss_mlp": 1.0769639, + "epoch": 0.5821469796075414, + "flos": 542516461056.0, + "grad_norm": 0.03445814315346002, + "language_loss": 0.88443869, + "learning_rate": 0.00039212662186212795, + "loss": 0.89599597, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.78515625, + "step": 3026, + "time_per_iteration": 2.6369402408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157395, + "balance_loss_mlp": 1.07890785, + "epoch": 0.582339361292805, + "flos": 553340433408.0, + "grad_norm": 0.029462079730168216, + "language_loss": 0.82325065, + "learning_rate": 0.0003918224375351934, + "loss": 0.83482456, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.78369141, + "step": 3027, + "time_per_iteration": 2.698915958404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116249, + "balance_loss_mlp": 1.08386004, + "epoch": 0.5825317429780685, + "flos": 497447380992.0, + "grad_norm": 0.03190253080273137, + "language_loss": 0.83360291, + "learning_rate": 0.0003915182952065135, + "loss": 0.84522784, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.78417969, + "step": 3028, + "time_per_iteration": 2.6572346687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160994, + "balance_loss_mlp": 1.08265007, + "epoch": 0.582724124663332, + "flos": 565254116352.0, + "grad_norm": 0.030478660984130428, + "language_loss": 0.92836106, + "learning_rate": 0.0003912141949941664, + "loss": 0.93997103, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.78271484, + "step": 3029, + "time_per_iteration": 2.683072090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153282, + "balance_loss_mlp": 1.07484198, + "epoch": 0.5829165063485956, + "flos": 493112007168.0, + "grad_norm": 0.03294557051603365, + "language_loss": 0.89173961, + "learning_rate": 0.0003909101370162143, + "loss": 0.90327239, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.78369141, + "step": 3030, + "time_per_iteration": 2.575670003890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160767, + "balance_loss_mlp": 1.08370972, + "epoch": 0.5831088880338592, + "flos": 1531877349888.0, + "grad_norm": 0.012849020092446796, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7359466, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.76953125, + "step": 3031, + "time_per_iteration": 4.9284889698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.07370639, + "epoch": 0.5833012697191228, + "flos": 619208793600.0, + "grad_norm": 0.02929875839371022, + "language_loss": 0.87939668, + "learning_rate": 0.0003903021482356622, + "loss": 0.89092004, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.78466797, + "step": 3032, + "time_per_iteration": 2.8254482746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152205, + "balance_loss_mlp": 1.07362223, + "epoch": 0.5834936514043862, + "flos": 769293401088.0, + "grad_norm": 0.02695668391828596, + "language_loss": 0.87565535, + "learning_rate": 0.00038999821766910465, + "loss": 0.88717741, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.78417969, + "step": 3033, + "time_per_iteration": 3.006687641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156362, + "balance_loss_mlp": 1.07796979, + "epoch": 0.5836860330896498, + "flos": 459316064256.0, + "grad_norm": 0.030677066462792797, + "language_loss": 0.91205192, + "learning_rate": 0.00038969432980902606, + "loss": 0.92361552, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.78320312, + "step": 3034, + "time_per_iteration": 2.550684690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011586, + "balance_loss_mlp": 1.08192444, + "epoch": 0.5838784147749134, + "flos": 1364196191232.0, + "grad_norm": 0.008170267563240248, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80943102, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.765625, + "step": 3035, + "time_per_iteration": 4.859564304351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154223, + "balance_loss_mlp": 1.07592607, + "epoch": 0.584070796460177, + "flos": 568288932864.0, + "grad_norm": 0.030253680936045732, + "language_loss": 0.87217242, + "learning_rate": 0.00038908668268020953, + "loss": 0.88371468, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.78222656, + "step": 3036, + "time_per_iteration": 2.7140538692474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154776, + "balance_loss_mlp": 1.07624114, + "epoch": 0.5842631781454406, + "flos": 612665800704.0, + "grad_norm": 0.02904438680956131, + "language_loss": 0.90014827, + "learning_rate": 0.00038878292364738097, + "loss": 0.91169608, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.78271484, + "step": 3037, + "time_per_iteration": 2.787289619445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157819, + "balance_loss_mlp": 1.07923615, + "epoch": 0.5844555598307041, + "flos": 464332916736.0, + "grad_norm": 0.03338514659593435, + "language_loss": 0.93144816, + "learning_rate": 0.0003884792077928508, + "loss": 0.94302636, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.78320312, + "step": 3038, + "time_per_iteration": 2.513655185699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155243, + "balance_loss_mlp": 1.07666051, + "epoch": 0.5846479415159677, + "flos": 411057716736.0, + "grad_norm": 0.039769663121131886, + "language_loss": 0.82121253, + "learning_rate": 0.0003881755352345322, + "loss": 0.83276498, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.78320312, + "step": 3039, + "time_per_iteration": 2.5270330905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154802, + "balance_loss_mlp": 1.07641041, + "epoch": 0.5848403232012312, + "flos": 492265344000.0, + "grad_norm": 0.02801571871014385, + "language_loss": 0.90901846, + "learning_rate": 0.0003878719060903207, + "loss": 0.9205665, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.78222656, + "step": 3040, + "time_per_iteration": 2.5588507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.07644928, + "epoch": 0.5850327048864948, + "flos": 585508177920.0, + "grad_norm": 0.037771067006053156, + "language_loss": 0.89005375, + "learning_rate": 0.0003875683204780961, + "loss": 0.90160316, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.78271484, + "step": 3041, + "time_per_iteration": 2.668827533721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152572, + "balance_loss_mlp": 1.07408428, + "epoch": 0.5852250865717584, + "flos": 652718028288.0, + "grad_norm": 0.037622145269810676, + "language_loss": 0.92115968, + "learning_rate": 0.00038726477851572043, + "loss": 0.93268543, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.78271484, + "step": 3042, + "time_per_iteration": 2.813145160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152742, + "balance_loss_mlp": 1.07434952, + "epoch": 0.5854174682570219, + "flos": 535619630592.0, + "grad_norm": 0.034632487357399135, + "language_loss": 0.85911977, + "learning_rate": 0.0003869612803210395, + "loss": 0.87064719, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.78222656, + "step": 3043, + "time_per_iteration": 2.6411526203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150207, + "balance_loss_mlp": 1.07176721, + "epoch": 0.5856098499422855, + "flos": 510758949888.0, + "grad_norm": 0.03364322076393535, + "language_loss": 0.8838582, + "learning_rate": 0.0003866578260118817, + "loss": 0.89536023, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.78271484, + "step": 3044, + "time_per_iteration": 2.59216570854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160339, + "balance_loss_mlp": 1.08228123, + "epoch": 0.5858022316275491, + "flos": 594992661504.0, + "grad_norm": 0.03592243508466687, + "language_loss": 0.87963545, + "learning_rate": 0.0003863544157060581, + "loss": 0.89123881, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.77978516, + "step": 3045, + "time_per_iteration": 2.6693618297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159373, + "balance_loss_mlp": 1.08131468, + "epoch": 0.5859946133128127, + "flos": 560317854720.0, + "grad_norm": 0.029657376615259006, + "language_loss": 0.86909235, + "learning_rate": 0.0003860510495213634, + "loss": 0.88068604, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.77978516, + "step": 3046, + "time_per_iteration": 2.799967050552368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159061, + "balance_loss_mlp": 1.08085966, + "epoch": 0.5861869949980761, + "flos": 554755783680.0, + "grad_norm": 0.03663253930872626, + "language_loss": 0.84493214, + "learning_rate": 0.0003857477275755746, + "loss": 0.85652274, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.78125, + "step": 3047, + "time_per_iteration": 2.6989481449127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116382, + "balance_loss_mlp": 1.08566678, + "epoch": 0.5863793766833397, + "flos": 720054131712.0, + "grad_norm": 0.029238524404730352, + "language_loss": 0.89394152, + "learning_rate": 0.00038544444998645167, + "loss": 0.90557969, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.78076172, + "step": 3048, + "time_per_iteration": 3.0829827785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162492, + "balance_loss_mlp": 1.0843389, + "epoch": 0.5865717583686033, + "flos": 473285643264.0, + "grad_norm": 0.03316519352776713, + "language_loss": 0.8619799, + "learning_rate": 0.00038514121687173767, + "loss": 0.87360477, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.78076172, + "step": 3049, + "time_per_iteration": 2.575395107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157324, + "balance_loss_mlp": 1.07897997, + "epoch": 0.5867641400538669, + "flos": 814846574592.0, + "grad_norm": 0.0318856413902076, + "language_loss": 0.87874395, + "learning_rate": 0.00038483802834915807, + "loss": 0.8903172, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.78271484, + "step": 3050, + "time_per_iteration": 2.973144292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153006, + "balance_loss_mlp": 1.07461429, + "epoch": 0.5869565217391305, + "flos": 487517735424.0, + "grad_norm": 0.034960474960603255, + "language_loss": 0.8386789, + "learning_rate": 0.00038453488453642074, + "loss": 0.85020894, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.78320312, + "step": 3051, + "time_per_iteration": 2.7100586891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152299, + "balance_loss_mlp": 1.0736686, + "epoch": 0.587148903424394, + "flos": 570512014848.0, + "grad_norm": 0.03111841936731719, + "language_loss": 0.91899282, + "learning_rate": 0.00038423178555121697, + "loss": 0.93051583, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.78466797, + "step": 3052, + "time_per_iteration": 2.713294744491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151746, + "balance_loss_mlp": 1.07316351, + "epoch": 0.5873412851096576, + "flos": 748694234112.0, + "grad_norm": 0.039836143626506074, + "language_loss": 0.90698159, + "learning_rate": 0.00038392873151121994, + "loss": 0.91849899, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.78466797, + "step": 3053, + "time_per_iteration": 3.0334441661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151743, + "balance_loss_mlp": 1.07320774, + "epoch": 0.5875336667949211, + "flos": 529187427840.0, + "grad_norm": 0.03304313685691396, + "language_loss": 0.89048851, + "learning_rate": 0.0003836257225340859, + "loss": 0.90200597, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.78417969, + "step": 3054, + "time_per_iteration": 2.612002372741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152089, + "balance_loss_mlp": 1.07360125, + "epoch": 0.5877260484801847, + "flos": 825640347648.0, + "grad_norm": 0.04168388263761463, + "language_loss": 0.87033945, + "learning_rate": 0.00038332275873745336, + "loss": 0.88186038, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.78369141, + "step": 3055, + "time_per_iteration": 3.0469071865081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07472539, + "epoch": 0.5879184301654482, + "flos": 592693718016.0, + "grad_norm": 0.028534237237830384, + "language_loss": 0.87091875, + "learning_rate": 0.0003830198402389431, + "loss": 0.88245273, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.78466797, + "step": 3056, + "time_per_iteration": 2.7129743099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116227, + "balance_loss_mlp": 1.08635712, + "epoch": 0.5881108118507118, + "flos": 1549223574528.0, + "grad_norm": 0.013735077759529469, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78511202, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.75976562, + "step": 3057, + "time_per_iteration": 4.971419334411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155251, + "balance_loss_mlp": 1.0767163, + "epoch": 0.5883031935359754, + "flos": 490598214144.0, + "grad_norm": 0.03703880470659913, + "language_loss": 0.88891268, + "learning_rate": 0.0003824141396066855, + "loss": 0.90046519, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.78417969, + "step": 3058, + "time_per_iteration": 2.5657668113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153431, + "balance_loss_mlp": 1.0749433, + "epoch": 0.588495575221239, + "flos": 583980036096.0, + "grad_norm": 0.04132288833299083, + "language_loss": 0.89364433, + "learning_rate": 0.000382111357708092, + "loss": 0.90517867, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.78417969, + "step": 3059, + "time_per_iteration": 2.7690227031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152167, + "balance_loss_mlp": 1.07377541, + "epoch": 0.5886879569065026, + "flos": 662239441920.0, + "grad_norm": 0.03195995960407152, + "language_loss": 0.89352429, + "learning_rate": 0.00038180862157792864, + "loss": 0.90504599, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.78320312, + "step": 3060, + "time_per_iteration": 2.797255039215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149069, + "balance_loss_mlp": 1.07048619, + "epoch": 0.588880338591766, + "flos": 563719243776.0, + "grad_norm": 0.031223560866560994, + "language_loss": 0.86781317, + "learning_rate": 0.0003815059313337279, + "loss": 0.87930381, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.78369141, + "step": 3061, + "time_per_iteration": 2.6690454483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149002, + "balance_loss_mlp": 1.07056284, + "epoch": 0.5890727202770296, + "flos": 555852225024.0, + "grad_norm": 0.029451906852367885, + "language_loss": 0.83063936, + "learning_rate": 0.00038120328709300436, + "loss": 0.84212935, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.78271484, + "step": 3062, + "time_per_iteration": 2.902662515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149399, + "balance_loss_mlp": 1.07095897, + "epoch": 0.5892651019622932, + "flos": 656701565952.0, + "grad_norm": 0.028569643240873292, + "language_loss": 0.89099294, + "learning_rate": 0.0003809006889732549, + "loss": 0.90248692, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.78320312, + "step": 3063, + "time_per_iteration": 2.8155622482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150242, + "balance_loss_mlp": 1.07185006, + "epoch": 0.5894574836475568, + "flos": 454132025856.0, + "grad_norm": 0.03219128848339896, + "language_loss": 0.93056011, + "learning_rate": 0.0003805981370919589, + "loss": 0.9420625, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.78173828, + "step": 3064, + "time_per_iteration": 2.533978223800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156603, + "balance_loss_mlp": 1.07840204, + "epoch": 0.5896498653328203, + "flos": 520111176192.0, + "grad_norm": 0.0315116121131164, + "language_loss": 0.89031386, + "learning_rate": 0.0003802956315665771, + "loss": 0.90187985, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.78125, + "step": 3065, + "time_per_iteration": 2.6914567947387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151617, + "balance_loss_mlp": 1.07341576, + "epoch": 0.5898422470180839, + "flos": 550084036608.0, + "grad_norm": 0.037269486879405754, + "language_loss": 0.87739515, + "learning_rate": 0.0003799931725145529, + "loss": 0.88891131, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.78125, + "step": 3066, + "time_per_iteration": 2.6040141582489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151797, + "balance_loss_mlp": 1.07359576, + "epoch": 0.5900346287033474, + "flos": 525379808256.0, + "grad_norm": 0.03210441330274425, + "language_loss": 0.90831029, + "learning_rate": 0.00037969076005331083, + "loss": 0.9198283, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.78125, + "step": 3067, + "time_per_iteration": 2.773045301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151142, + "balance_loss_mlp": 1.07298875, + "epoch": 0.590227010388611, + "flos": 568215072768.0, + "grad_norm": 0.03944068050463326, + "language_loss": 0.93933421, + "learning_rate": 0.00037938839430025817, + "loss": 0.9508456, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.78076172, + "step": 3068, + "time_per_iteration": 2.6502816677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.07148039, + "epoch": 0.5904193920738746, + "flos": 584455397376.0, + "grad_norm": 0.029602074998044806, + "language_loss": 0.90136111, + "learning_rate": 0.0003790860753727835, + "loss": 0.91285884, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.78173828, + "step": 3069, + "time_per_iteration": 2.8173305988311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148373, + "balance_loss_mlp": 1.07007682, + "epoch": 0.5906117737591381, + "flos": 530796160512.0, + "grad_norm": 0.03761421694137887, + "language_loss": 0.88493633, + "learning_rate": 0.00037878380338825766, + "loss": 0.89642012, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.78173828, + "step": 3070, + "time_per_iteration": 2.6682841777801514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148557, + "balance_loss_mlp": 1.07059419, + "epoch": 0.5908041554444017, + "flos": 685515585024.0, + "grad_norm": 0.029847469423829834, + "language_loss": 0.85616612, + "learning_rate": 0.00037848157846403287, + "loss": 0.86765176, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.77880859, + "step": 3071, + "time_per_iteration": 2.942607879638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148313, + "balance_loss_mlp": 1.07015908, + "epoch": 0.5909965371296653, + "flos": 551132814336.0, + "grad_norm": 0.030659229377642858, + "language_loss": 0.88636756, + "learning_rate": 0.0003781794007174435, + "loss": 0.89785063, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.78076172, + "step": 3072, + "time_per_iteration": 2.7619588375091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159439, + "balance_loss_mlp": 1.08276367, + "epoch": 0.5911889188149289, + "flos": 1495642200576.0, + "grad_norm": 0.009662354088300913, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75233972, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.765625, + "step": 3073, + "time_per_iteration": 4.855187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115096, + "balance_loss_mlp": 1.07275867, + "epoch": 0.5913813005001923, + "flos": 488885422080.0, + "grad_norm": 0.030913240812320716, + "language_loss": 0.86239564, + "learning_rate": 0.0003775751872264152, + "loss": 0.87390518, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.78125, + "step": 3074, + "time_per_iteration": 2.7676284313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150595, + "balance_loss_mlp": 1.0724895, + "epoch": 0.5915736821854559, + "flos": 574521748992.0, + "grad_norm": 0.02774902568268271, + "language_loss": 0.91979122, + "learning_rate": 0.0003772731517165527, + "loss": 0.93129718, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.78027344, + "step": 3075, + "time_per_iteration": 2.7969858646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146907, + "balance_loss_mlp": 1.06884861, + "epoch": 0.5917660638707195, + "flos": 790860754944.0, + "grad_norm": 0.032083383212934545, + "language_loss": 0.88416231, + "learning_rate": 0.0003769711638534784, + "loss": 0.89563137, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.77978516, + "step": 3076, + "time_per_iteration": 2.966887950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147265, + "balance_loss_mlp": 1.06915915, + "epoch": 0.5919584455559831, + "flos": 529756114944.0, + "grad_norm": 0.039188776409307895, + "language_loss": 0.84855187, + "learning_rate": 0.00037666922375443446, + "loss": 0.86002445, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.78027344, + "step": 3077, + "time_per_iteration": 2.6466495990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146857, + "balance_loss_mlp": 1.06889355, + "epoch": 0.5921508272412467, + "flos": 561752670720.0, + "grad_norm": 0.03396925526876144, + "language_loss": 0.87058771, + "learning_rate": 0.00037636733153664396, + "loss": 0.88205624, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.77880859, + "step": 3078, + "time_per_iteration": 2.868244171142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147912, + "balance_loss_mlp": 1.06980658, + "epoch": 0.5923432089265102, + "flos": 564333593088.0, + "grad_norm": 0.03405949699736924, + "language_loss": 0.86518288, + "learning_rate": 0.0003760654873173124, + "loss": 0.87666202, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.78027344, + "step": 3079, + "time_per_iteration": 2.665978193283081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148871, + "balance_loss_mlp": 1.07095611, + "epoch": 0.5925355906117737, + "flos": 496750439424.0, + "grad_norm": 0.031078530741144403, + "language_loss": 0.87091482, + "learning_rate": 0.00037576369121362566, + "loss": 0.88240349, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.77832031, + "step": 3080, + "time_per_iteration": 2.5879437923431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.07483089, + "epoch": 0.5927279722970373, + "flos": 567492661248.0, + "grad_norm": 0.029886004026783125, + "language_loss": 0.86116624, + "learning_rate": 0.0003754619433427516, + "loss": 0.87269318, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.77783203, + "step": 3081, + "time_per_iteration": 2.911530017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149482, + "balance_loss_mlp": 1.07156706, + "epoch": 0.5929203539823009, + "flos": 668159353344.0, + "grad_norm": 0.03611880785888225, + "language_loss": 0.84511012, + "learning_rate": 0.0003751602438218392, + "loss": 0.85660493, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.77832031, + "step": 3082, + "time_per_iteration": 2.767104148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148924, + "balance_loss_mlp": 1.07105672, + "epoch": 0.5931127356675644, + "flos": 556785483264.0, + "grad_norm": 0.03271098535749721, + "language_loss": 0.89783478, + "learning_rate": 0.0003748585927680186, + "loss": 0.90932405, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.77783203, + "step": 3083, + "time_per_iteration": 2.6630167961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148966, + "balance_loss_mlp": 1.07100332, + "epoch": 0.593305117352828, + "flos": 536242712064.0, + "grad_norm": 0.03028975884774044, + "language_loss": 0.88271487, + "learning_rate": 0.00037455699029840086, + "loss": 0.89420456, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.77880859, + "step": 3084, + "time_per_iteration": 2.647643566131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.07020473, + "epoch": 0.5934974990380916, + "flos": 595057789440.0, + "grad_norm": 0.028668930156423956, + "language_loss": 0.89615595, + "learning_rate": 0.0003742554365300787, + "loss": 0.9076376, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.77880859, + "step": 3085, + "time_per_iteration": 2.743479013442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148026, + "balance_loss_mlp": 1.07015836, + "epoch": 0.5936898807233552, + "flos": 714014697984.0, + "grad_norm": 0.030266517596009415, + "language_loss": 0.84002471, + "learning_rate": 0.0003739539315801255, + "loss": 0.85150492, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.77783203, + "step": 3086, + "time_per_iteration": 2.9327478408813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147359, + "balance_loss_mlp": 1.06944346, + "epoch": 0.5938822624086187, + "flos": 392748761088.0, + "grad_norm": 0.030603721844952317, + "language_loss": 0.96139234, + "learning_rate": 0.000373652475565596, + "loss": 0.97286594, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.77832031, + "step": 3087, + "time_per_iteration": 2.471726417541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146572, + "balance_loss_mlp": 1.06860876, + "epoch": 0.5940746440938822, + "flos": 481335310848.0, + "grad_norm": 0.033612762678092996, + "language_loss": 0.86454874, + "learning_rate": 0.00037335106860352587, + "loss": 0.87601447, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.77880859, + "step": 3088, + "time_per_iteration": 2.692692756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148512, + "balance_loss_mlp": 1.07045376, + "epoch": 0.5942670257791458, + "flos": 484307000832.0, + "grad_norm": 0.031191733120893732, + "language_loss": 0.87924445, + "learning_rate": 0.00037304971081093146, + "loss": 0.89072955, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.77978516, + "step": 3089, + "time_per_iteration": 2.568676710128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149298, + "balance_loss_mlp": 1.071383, + "epoch": 0.5944594074644094, + "flos": 549057452544.0, + "grad_norm": 0.027833968511861495, + "language_loss": 0.85559821, + "learning_rate": 0.00037274840230481024, + "loss": 0.86709118, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.77832031, + "step": 3090, + "time_per_iteration": 2.7224090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.07009256, + "epoch": 0.594651789149673, + "flos": 450129022464.0, + "grad_norm": 0.03399265003555819, + "language_loss": 0.85464221, + "learning_rate": 0.00037244714320214077, + "loss": 0.86612326, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.77929688, + "step": 3091, + "time_per_iteration": 2.545518398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07034016, + "epoch": 0.5948441708349365, + "flos": 597465521664.0, + "grad_norm": 0.029759995876706483, + "language_loss": 0.88336015, + "learning_rate": 0.000372145933619882, + "loss": 0.89484322, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.77880859, + "step": 3092, + "time_per_iteration": 2.8612496852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147536, + "balance_loss_mlp": 1.06952572, + "epoch": 0.5950365525202, + "flos": 549580477440.0, + "grad_norm": 0.03567164883764641, + "language_loss": 0.87935793, + "learning_rate": 0.000371844773674974, + "loss": 0.89083326, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.77929688, + "step": 3093, + "time_per_iteration": 2.6431939601898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147858, + "balance_loss_mlp": 1.06980002, + "epoch": 0.5952289342054636, + "flos": 655963691520.0, + "grad_norm": 0.03489323159702664, + "language_loss": 0.87669003, + "learning_rate": 0.0003715436634843375, + "loss": 0.88816857, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.77978516, + "step": 3094, + "time_per_iteration": 2.889326572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115074, + "balance_loss_mlp": 1.07268155, + "epoch": 0.5954213158907272, + "flos": 604603398144.0, + "grad_norm": 0.02937888511977547, + "language_loss": 0.85120195, + "learning_rate": 0.00037124260316487355, + "loss": 0.86270934, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.77978516, + "step": 3095, + "time_per_iteration": 2.8256890773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011487, + "balance_loss_mlp": 1.07064188, + "epoch": 0.5956136975759908, + "flos": 487267957248.0, + "grad_norm": 0.03289727477229571, + "language_loss": 0.94411993, + "learning_rate": 0.0003709415928334643, + "loss": 0.95560694, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.77978516, + "step": 3096, + "time_per_iteration": 2.587526559829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148362, + "balance_loss_mlp": 1.07025576, + "epoch": 0.5958060792612543, + "flos": 660040555008.0, + "grad_norm": 0.03760653483237211, + "language_loss": 0.8629458, + "learning_rate": 0.00037064063260697233, + "loss": 0.8744294, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.78027344, + "step": 3097, + "time_per_iteration": 2.8921737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149624, + "balance_loss_mlp": 1.07170904, + "epoch": 0.5959984609465179, + "flos": 724995122688.0, + "grad_norm": 0.02933465569925715, + "language_loss": 0.84228349, + "learning_rate": 0.0003703397226022407, + "loss": 0.85377973, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.77832031, + "step": 3098, + "time_per_iteration": 3.0898213386535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115181, + "balance_loss_mlp": 1.07627869, + "epoch": 0.5961908426317815, + "flos": 1523218788864.0, + "grad_norm": 0.004520881067607934, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7665168, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.75585938, + "step": 3099, + "time_per_iteration": 4.9205827713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148148, + "balance_loss_mlp": 1.07023323, + "epoch": 0.596383224317045, + "flos": 533646326784.0, + "grad_norm": 0.03064762726337019, + "language_loss": 0.87394881, + "learning_rate": 0.0003697380537253339, + "loss": 0.88543034, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.77832031, + "step": 3100, + "time_per_iteration": 2.6238889694213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07065213, + "epoch": 0.5965756060023086, + "flos": 592366076928.0, + "grad_norm": 0.03279417600266174, + "language_loss": 0.87095284, + "learning_rate": 0.0003694372950867471, + "loss": 0.88243759, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.77734375, + "step": 3101, + "time_per_iteration": 2.754004955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.0715313, + "epoch": 0.5967679876875721, + "flos": 863469493248.0, + "grad_norm": 0.096940863219985, + "language_loss": 0.82642257, + "learning_rate": 0.0003691365871370976, + "loss": 0.83791614, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.77734375, + "step": 3102, + "time_per_iteration": 3.027898073196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148471, + "balance_loss_mlp": 1.07065165, + "epoch": 0.5969603693728357, + "flos": 554877307392.0, + "grad_norm": 0.03194116769832037, + "language_loss": 0.90513253, + "learning_rate": 0.00036883592999313093, + "loss": 0.91661727, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.77734375, + "step": 3103, + "time_per_iteration": 2.6555323600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114931, + "balance_loss_mlp": 1.07158601, + "epoch": 0.5971527510580993, + "flos": 719936610816.0, + "grad_norm": 0.037867869271097296, + "language_loss": 0.85018742, + "learning_rate": 0.0003685353237715722, + "loss": 0.86168051, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.77636719, + "step": 3104, + "time_per_iteration": 2.88739013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115032, + "balance_loss_mlp": 1.07245219, + "epoch": 0.5973451327433629, + "flos": 648862745088.0, + "grad_norm": 0.032062315519195535, + "language_loss": 0.86408043, + "learning_rate": 0.0003682347685891274, + "loss": 0.87558353, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.77783203, + "step": 3105, + "time_per_iteration": 2.8420920372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.07162631, + "epoch": 0.5975375144286263, + "flos": 723088948224.0, + "grad_norm": 0.03318206210872103, + "language_loss": 0.86870039, + "learning_rate": 0.0003679342645624822, + "loss": 0.88019389, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.77636719, + "step": 3106, + "time_per_iteration": 2.995124578475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150114, + "balance_loss_mlp": 1.07248521, + "epoch": 0.5977298961138899, + "flos": 752343399936.0, + "grad_norm": 0.029134934835651077, + "language_loss": 0.86725187, + "learning_rate": 0.0003676338118083025, + "loss": 0.87875295, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.77539062, + "step": 3107, + "time_per_iteration": 2.972302198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150139, + "balance_loss_mlp": 1.07251036, + "epoch": 0.5979222777991535, + "flos": 531998662656.0, + "grad_norm": 0.035100601373903646, + "language_loss": 0.857481, + "learning_rate": 0.0003673334104432347, + "loss": 0.86898237, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.77539062, + "step": 3108, + "time_per_iteration": 2.6626758575439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149837, + "balance_loss_mlp": 1.07230318, + "epoch": 0.5981146594844171, + "flos": 622914355200.0, + "grad_norm": 0.0316193314504938, + "language_loss": 0.88024735, + "learning_rate": 0.0003670330605839048, + "loss": 0.89174569, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.77441406, + "step": 3109, + "time_per_iteration": 2.8445565700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149651, + "balance_loss_mlp": 1.07216513, + "epoch": 0.5983070411696807, + "flos": 604709458944.0, + "grad_norm": 0.030685816325192888, + "language_loss": 0.81470084, + "learning_rate": 0.0003667327623469191, + "loss": 0.82619739, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.77392578, + "step": 3110, + "time_per_iteration": 2.7507362365722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151646, + "balance_loss_mlp": 1.07406473, + "epoch": 0.5984994228549442, + "flos": 634669584384.0, + "grad_norm": 0.03251456811802211, + "language_loss": 0.83321273, + "learning_rate": 0.00036643251584886333, + "loss": 0.84472924, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.77490234, + "step": 3111, + "time_per_iteration": 2.816390037536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156112, + "balance_loss_mlp": 1.07848299, + "epoch": 0.5986918045402078, + "flos": 526293600768.0, + "grad_norm": 0.03439308421341756, + "language_loss": 0.88026524, + "learning_rate": 0.00036613232120630393, + "loss": 0.89182639, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.77539062, + "step": 3112, + "time_per_iteration": 2.610931396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151048, + "balance_loss_mlp": 1.07332325, + "epoch": 0.5988841862254713, + "flos": 484139814912.0, + "grad_norm": 0.040537518995664656, + "language_loss": 0.85835981, + "learning_rate": 0.00036583217853578643, + "loss": 0.86987036, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.77636719, + "step": 3113, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.07369328, + "epoch": 0.5990765679107349, + "flos": 1142121745920.0, + "grad_norm": 0.03045218931470109, + "language_loss": 0.82758361, + "learning_rate": 0.000365532087953837, + "loss": 0.83909732, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.77587891, + "step": 3114, + "time_per_iteration": 3.635089159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150692, + "balance_loss_mlp": 1.07282436, + "epoch": 0.5992689495959984, + "flos": 518018350080.0, + "grad_norm": 0.03475345450765353, + "language_loss": 0.94564217, + "learning_rate": 0.00036523204957696065, + "loss": 0.95714909, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.77783203, + "step": 3115, + "time_per_iteration": 2.6130504608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150235, + "balance_loss_mlp": 1.07231951, + "epoch": 0.599461331281262, + "flos": 745941396480.0, + "grad_norm": 0.03954805443520273, + "language_loss": 0.86356986, + "learning_rate": 0.00036493206352164324, + "loss": 0.87507224, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.77832031, + "step": 3116, + "time_per_iteration": 2.902606964111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115079, + "balance_loss_mlp": 1.07282686, + "epoch": 0.5996537129665256, + "flos": 593483985408.0, + "grad_norm": 0.030263025154964335, + "language_loss": 0.90265405, + "learning_rate": 0.000364632129904349, + "loss": 0.91416192, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.77880859, + "step": 3117, + "time_per_iteration": 2.728739023208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148018, + "balance_loss_mlp": 1.0701983, + "epoch": 0.5998460946517892, + "flos": 560115740160.0, + "grad_norm": 0.03726043771871862, + "language_loss": 0.8256759, + "learning_rate": 0.00036433224884152283, + "loss": 0.83715606, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.77734375, + "step": 3118, + "time_per_iteration": 2.7763798236846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146842, + "balance_loss_mlp": 1.06897449, + "epoch": 0.6000384763370528, + "flos": 485535699456.0, + "grad_norm": 0.03789921911219481, + "language_loss": 0.83006287, + "learning_rate": 0.00036403242044958875, + "loss": 0.84153128, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.77783203, + "step": 3119, + "time_per_iteration": 2.549102783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156248, + "balance_loss_mlp": 1.07842839, + "epoch": 0.6002308580223162, + "flos": 597877756416.0, + "grad_norm": 0.03490542571663494, + "language_loss": 0.96794367, + "learning_rate": 0.0003637326448449507, + "loss": 0.97950613, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.77734375, + "step": 3120, + "time_per_iteration": 2.7004034519195557 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 260120304, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7092500920008704.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/training_args.bin b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dec1b7e0db130318069c72434f32c2789119b732 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c077e5103b778b39b648e3a5a2e73e36256d052f444290e14e15f87c36156cb +size 7992 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/zero_to_fp32.py b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-3120/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/added_tokens.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/config.json new file mode 100644 index 0000000000000000000000000000000000000000..987150c78c9255ac53c0408588036e10466fc436 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_perturbed", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/generation_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b3e4d3137a453b5c405ebf7b1adfc1f37fa9184 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558b8f7b614b6269cde620ab78b45c808c11acc2d59247bc2a43418c97d2e098 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dc4535d1c6ecd7ba389cf8d66ccc7597721f38e --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09861e5007d76e2063f941559ef45fff30f76bcb38af6a2c7691739c030db96 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24e354dbeebc644db12d39f40d32b37556f5a220 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa469a46076cca963a77f2190c7a2373d16d52854b2cf032dea2b0d6485b252 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69c851659f12eb0d31b96fbbdd078246215cc767 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b0f4cf0c6a5f2de2fc06fe01e83f3699a67125b35e01151052961fe8ec708c +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..386bf229b5dad01846b532eda66b2af513c955a4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaacf27286b244b39bda94a3420a6f9d8065c6079265c3eeb15cfd24854bc537 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40ace36638dc7be4c7346e9ef2adb7448f9bd6ed --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79bb71535f6b84de57bbe742c1982aa5b3f3c1a2cf01c00699fab4e63357aae9 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4b6122c4425abb8d3608182440196bdbd14d505 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5828fad5b37d5899322ff8eab1477d8586909cad20d18637eded5c651d317302 +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cddfd2572786360a73650a4f6e1d0345c526ee9 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce83b8e7774aed996186fee1c912eb431eaaf38c8024150488f8b56db7d065bd +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/latest b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/latest new file mode 100644 index 0000000000000000000000000000000000000000..ae01dfd535e9ee314b565695c1d61230ecf4c494 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/latest @@ -0,0 +1 @@ +global_step4160 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e78c0450cc342b8feed20dcac8c17c16844c551a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53cb0889450005cbaa676f855c1a173567d9e861be70ff50b3fd5a18e3ea8b02 +size 3759043888 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model.safetensors.index.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..01fe755c95da02467d97df3e39228dbbb26b065f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/model.safetensors.index.json @@ -0,0 +1,674 @@ +{ + "metadata": { + "total_size": 8731443232 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_0.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_1.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_2.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_3.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/special_tokens_map.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/tokenizer.model b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/tokenizer_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/trainer_state.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74e433fc4efb732e2daa0dca51ade21f73cbaa7c --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/trainer_state.json @@ -0,0 +1,62433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003078106964217, + "eval_steps": 500, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02574398, + "balance_loss_mlp": 1.85189414, + "epoch": 0.00019238168526356292, + "flos": 471022176768.0, + "grad_norm": 12.86455737221305, + "language_loss": 2.79777646, + "learning_rate": 0.0, + "loss": 1.8614465, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 7.2109375, + "step": 1, + "time_per_iteration": 21.83068585395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02254613, + "balance_loss_mlp": 1.76785779, + "epoch": 0.00038476337052712584, + "flos": 505537981440.0, + "grad_norm": 51.581369656319104, + "language_loss": 12.34714699, + "learning_rate": 0.00013726078121135892, + "loss": 12.3696928, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 4.875, + "step": 2, + "time_per_iteration": 2.6192572116851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235864, + "balance_loss_mlp": 1.75177932, + "epoch": 0.0005771450557906887, + "flos": 600333152256.0, + "grad_norm": 53.41660983156924, + "language_loss": 12.32898235, + "learning_rate": 0.00021755319103969496, + "loss": 12.35134125, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 4.84765625, + "step": 3, + "time_per_iteration": 2.887979030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02281771, + "balance_loss_mlp": 1.79577887, + "epoch": 0.0007695267410542517, + "flos": 581496442368.0, + "grad_norm": 15.812083363335244, + "language_loss": 9.24414825, + "learning_rate": 0.00027452156242271784, + "loss": 9.26696682, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 4.8671875, + "step": 4, + "time_per_iteration": 2.6792547702789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02454864, + "balance_loss_mlp": 1.95551991, + "epoch": 0.0009619084263178145, + "flos": 487153164288.0, + "grad_norm": 10.3691594005885, + "language_loss": 9.1886158, + "learning_rate": 0.0003187096642208417, + "loss": 9.21316433, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 4.98828125, + "step": 5, + "time_per_iteration": 2.627883195877075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0247156, + "balance_loss_mlp": 1.97450531, + "epoch": 0.0011542901115813775, + "flos": 561166519296.0, + "grad_norm": 9.061082825397735, + "language_loss": 9.31672573, + "learning_rate": 0.0003548139722510539, + "loss": 9.34144115, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 4.96875, + "step": 6, + "time_per_iteration": 2.697327136993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02496704, + "balance_loss_mlp": 1.9977417, + "epoch": 0.0013466717968449403, + "flos": 534950886912.0, + "grad_norm": 5.1401213461899875, + "language_loss": 8.45638084, + "learning_rate": 0.00038533972973918044, + "loss": 8.48134804, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 4.984375, + "step": 7, + "time_per_iteration": 2.6605119705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02367166, + "balance_loss_mlp": 1.8800292, + "epoch": 0.0015390534821085034, + "flos": 493333587456.0, + "grad_norm": 4.765795170053606, + "language_loss": 7.86978722, + "learning_rate": 0.0004117823436340768, + "loss": 7.89345884, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 4.87890625, + "step": 8, + "time_per_iteration": 2.60813570022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02377529, + "balance_loss_mlp": 1.89153647, + "epoch": 0.0017314351673720662, + "flos": 565775139840.0, + "grad_norm": 2.6394105736579268, + "language_loss": 7.60834789, + "learning_rate": 0.00043510638207938993, + "loss": 7.63212299, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 4.8671875, + "step": 9, + "time_per_iteration": 2.871943712234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0239868, + "balance_loss_mlp": 1.91802776, + "epoch": 0.001923816852635629, + "flos": 594508568064.0, + "grad_norm": 2.7082435786924752, + "language_loss": 7.06748104, + "learning_rate": 0.00045597044543220066, + "loss": 7.09146786, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 4.8125, + "step": 10, + "time_per_iteration": 2.671294689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02381293, + "balance_loss_mlp": 1.90254807, + "epoch": 0.002116198537899192, + "flos": 610894611456.0, + "grad_norm": 2.113301815517677, + "language_loss": 6.83692646, + "learning_rate": 0.00047484428652143135, + "loss": 6.86073971, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.79296875, + "step": 11, + "time_per_iteration": 2.885416269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02427226, + "balance_loss_mlp": 1.95687437, + "epoch": 0.002308580223162755, + "flos": 546174359040.0, + "grad_norm": 1.7416212933802626, + "language_loss": 6.4295001, + "learning_rate": 0.0004920747534624128, + "loss": 6.45377207, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.70703125, + "step": 12, + "time_per_iteration": 2.6201112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503769, + "balance_loss_mlp": 2.03265429, + "epoch": 0.002500961908426318, + "flos": 645923255808.0, + "grad_norm": 2.43618245016211, + "language_loss": 6.0048914, + "learning_rate": 0.0005079252465375872, + "loss": 6.02992916, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.71484375, + "step": 13, + "time_per_iteration": 2.852263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02634854, + "balance_loss_mlp": 2.15916157, + "epoch": 0.0026933435936898806, + "flos": 488848492032.0, + "grad_norm": 4.143842376760835, + "language_loss": 5.42230844, + "learning_rate": 0.0005226005109505393, + "loss": 5.44865704, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 4.76171875, + "step": 14, + "time_per_iteration": 2.5524611473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02844198, + "balance_loss_mlp": 2.3646903, + "epoch": 0.0028857252789534437, + "flos": 435525628416.0, + "grad_norm": 5.672862092220106, + "language_loss": 4.15845776, + "learning_rate": 0.0005362628552605367, + "loss": 4.18689966, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 4.80078125, + "step": 15, + "time_per_iteration": 2.7353649139404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03208902, + "balance_loss_mlp": 2.72252893, + "epoch": 0.0030781069642170067, + "flos": 597840826368.0, + "grad_norm": 3.947061509829782, + "language_loss": 2.26971245, + "learning_rate": 0.0005490431248454357, + "loss": 2.30180168, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 4.87109375, + "step": 16, + "time_per_iteration": 2.676703929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03601284, + "balance_loss_mlp": 3.10232162, + "epoch": 0.0032704886494805694, + "flos": 1541510280192.0, + "grad_norm": 0.6213816402988768, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.793064, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 5.0, + "step": 17, + "time_per_iteration": 6.1610119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334326, + "balance_loss_mlp": 2.85841203, + "epoch": 0.0034628703347441324, + "flos": 474970237440.0, + "grad_norm": 2.8341915883282045, + "language_loss": 1.71282685, + "learning_rate": 0.0005723671632907488, + "loss": 1.74625945, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 4.85546875, + "step": 18, + "time_per_iteration": 2.638371467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02881518, + "balance_loss_mlp": 2.39934015, + "epoch": 0.0036552520200076955, + "flos": 449477743104.0, + "grad_norm": 2.8867361132515086, + "language_loss": 1.68530536, + "learning_rate": 0.0005830738490244919, + "loss": 1.71412063, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.828125, + "step": 19, + "time_per_iteration": 2.56374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02402526, + "balance_loss_mlp": 1.92301893, + "epoch": 0.003847633705271258, + "flos": 637350563328.0, + "grad_norm": 0.6925173808128176, + "language_loss": 1.38203168, + "learning_rate": 0.0005932312266435596, + "loss": 1.406057, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.80078125, + "step": 20, + "time_per_iteration": 2.763998508453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02421171, + "balance_loss_mlp": 1.94814897, + "epoch": 0.004040015390534821, + "flos": 590590158336.0, + "grad_norm": 1.6265477944222306, + "language_loss": 1.40919662, + "learning_rate": 0.0006028929207788754, + "loss": 1.43340826, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.734375, + "step": 21, + "time_per_iteration": 2.746016502380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575294, + "balance_loss_mlp": 2.10036469, + "epoch": 0.004232397075798384, + "flos": 757865812992.0, + "grad_norm": 1.576079326940489, + "language_loss": 1.40810275, + "learning_rate": 0.0006121050677327902, + "loss": 1.43385565, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.75390625, + "step": 22, + "time_per_iteration": 2.9607386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550906, + "balance_loss_mlp": 2.07025433, + "epoch": 0.004424778761061947, + "flos": 527726415360.0, + "grad_norm": 0.6323448080178445, + "language_loss": 1.22419024, + "learning_rate": 0.0006209076479463684, + "loss": 1.24969923, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.8125, + "step": 23, + "time_per_iteration": 2.5966527462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02511897, + "balance_loss_mlp": 2.02285314, + "epoch": 0.00461716044632551, + "flos": 549217907712.0, + "grad_norm": 0.22573529074246063, + "language_loss": 1.26396596, + "learning_rate": 0.0006293355346737718, + "loss": 1.28908491, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.8984375, + "step": 24, + "time_per_iteration": 2.672264575958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02557217, + "balance_loss_mlp": 2.05978036, + "epoch": 0.004809542131589073, + "flos": 568751559168.0, + "grad_norm": 0.10471299124135865, + "language_loss": 1.20974565, + "learning_rate": 0.0006374193284416834, + "loss": 1.23531783, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.96875, + "step": 25, + "time_per_iteration": 2.7392375469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02658191, + "balance_loss_mlp": 2.15503263, + "epoch": 0.005001923816852636, + "flos": 471583584768.0, + "grad_norm": 0.16888144752152706, + "language_loss": 1.20314312, + "learning_rate": 0.0006451860277489461, + "loss": 1.22972512, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.02734375, + "step": 26, + "time_per_iteration": 2.6047253608703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02722422, + "balance_loss_mlp": 2.21582985, + "epoch": 0.005194305502116198, + "flos": 416380743168.0, + "grad_norm": 0.22424567034217777, + "language_loss": 1.28844571, + "learning_rate": 0.0006526595731190848, + "loss": 1.31566989, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.0625, + "step": 27, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02743244, + "balance_loss_mlp": 2.2351265, + "epoch": 0.005386687187379761, + "flos": 629995835904.0, + "grad_norm": 0.15642653525507078, + "language_loss": 1.18914986, + "learning_rate": 0.0006598612921618983, + "loss": 1.2165823, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.078125, + "step": 28, + "time_per_iteration": 2.8519153594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02748247, + "balance_loss_mlp": 2.24051118, + "epoch": 0.005579068872643324, + "flos": 888019997184.0, + "grad_norm": 0.1209301216257677, + "language_loss": 1.12191987, + "learning_rate": 0.0006668102665011454, + "loss": 1.14940238, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.07421875, + "step": 29, + "time_per_iteration": 3.2244889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02691091, + "balance_loss_mlp": 2.18411779, + "epoch": 0.005771450557906887, + "flos": 548657952768.0, + "grad_norm": 0.1098895199150706, + "language_loss": 1.21368051, + "learning_rate": 0.0006735236364718957, + "loss": 1.24059153, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.06640625, + "step": 30, + "time_per_iteration": 2.642730474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02653145, + "balance_loss_mlp": 2.14769816, + "epoch": 0.00596383224317045, + "flos": 533068907520.0, + "grad_norm": 0.11046596793449442, + "language_loss": 1.1970098, + "learning_rate": 0.0006800168558381346, + "loss": 1.22354114, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.05078125, + "step": 31, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0257592, + "balance_loss_mlp": 2.07123542, + "epoch": 0.0061562139284340135, + "flos": 590162460672.0, + "grad_norm": 0.10949645130098669, + "language_loss": 1.22987807, + "learning_rate": 0.0006863039060567947, + "loss": 1.25563729, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.04296875, + "step": 32, + "time_per_iteration": 2.733224868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505923, + "balance_loss_mlp": 2.00390816, + "epoch": 0.006348595613697576, + "flos": 619441107456.0, + "grad_norm": 0.0835016489973258, + "language_loss": 1.14437437, + "learning_rate": 0.0006923974775611263, + "loss": 1.16943359, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.015625, + "step": 33, + "time_per_iteration": 2.820788621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02482464, + "balance_loss_mlp": 1.98159432, + "epoch": 0.006540977298961139, + "flos": 779298908160.0, + "grad_norm": 0.08776573315434787, + "language_loss": 1.10869515, + "learning_rate": 0.0006983091239737814, + "loss": 1.13351965, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.00390625, + "step": 34, + "time_per_iteration": 2.9917590618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02373805, + "balance_loss_mlp": 1.87636864, + "epoch": 0.006733358984224702, + "flos": 668372201472.0, + "grad_norm": 0.0744368555221442, + "language_loss": 1.09626412, + "learning_rate": 0.0007040493939600222, + "loss": 1.12000227, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 4.96875, + "step": 35, + "time_per_iteration": 2.813040256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308046, + "balance_loss_mlp": 1.81175399, + "epoch": 0.006925740669488265, + "flos": 565495162368.0, + "grad_norm": 0.06560236116646054, + "language_loss": 1.0974791, + "learning_rate": 0.0007096279445021078, + "loss": 1.12055957, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 4.95703125, + "step": 36, + "time_per_iteration": 2.715013027191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02240602, + "balance_loss_mlp": 1.74888754, + "epoch": 0.007118122354751828, + "flos": 551111347200.0, + "grad_norm": 0.05581405617561486, + "language_loss": 1.16120386, + "learning_rate": 0.0007150536386503726, + "loss": 1.18360972, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.91015625, + "step": 37, + "time_per_iteration": 2.8262643814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218804, + "balance_loss_mlp": 1.7293781, + "epoch": 0.007310504040015391, + "flos": 703813807104.0, + "grad_norm": 0.06412720029508237, + "language_loss": 1.08394384, + "learning_rate": 0.0007203346302358509, + "loss": 1.10613179, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.890625, + "step": 38, + "time_per_iteration": 2.9149320125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0220325, + "balance_loss_mlp": 1.71954608, + "epoch": 0.007502885725278953, + "flos": 600500338176.0, + "grad_norm": 0.08018675586540955, + "language_loss": 1.13587177, + "learning_rate": 0.000725478437577282, + "loss": 1.15790427, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.84375, + "step": 39, + "time_per_iteration": 2.7649383544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194939, + "balance_loss_mlp": 1.71237946, + "epoch": 0.007695267410542516, + "flos": 561427031040.0, + "grad_norm": 0.11080304178085185, + "language_loss": 1.08546591, + "learning_rate": 0.0007304920078549186, + "loss": 1.10741532, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.83203125, + "step": 40, + "time_per_iteration": 2.7245187759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02164234, + "balance_loss_mlp": 1.68548942, + "epoch": 0.007887649095806078, + "flos": 509230808064.0, + "grad_norm": 0.12864951336881933, + "language_loss": 1.10053396, + "learning_rate": 0.0007353817735343603, + "loss": 1.12217629, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.79296875, + "step": 41, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02109951, + "balance_loss_mlp": 1.63425827, + "epoch": 0.008080030781069641, + "flos": 504904166400.0, + "grad_norm": 0.0888118324595499, + "language_loss": 1.05816543, + "learning_rate": 0.0007401537019902344, + "loss": 1.07926488, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.76171875, + "step": 42, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065976, + "balance_loss_mlp": 1.59219027, + "epoch": 0.008272412466333205, + "flos": 519106059264.0, + "grad_norm": 0.08974821197730459, + "language_loss": 1.0785954, + "learning_rate": 0.0007448133392900729, + "loss": 1.09925508, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.7421875, + "step": 43, + "time_per_iteration": 2.677175998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955434, + "balance_loss_mlp": 1.4839375, + "epoch": 0.008464794151596768, + "flos": 609183820800.0, + "grad_norm": 0.06237767914218564, + "language_loss": 1.03785229, + "learning_rate": 0.0007493658489441491, + "loss": 1.05740666, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.71875, + "step": 44, + "time_per_iteration": 2.8553237915039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01864539, + "balance_loss_mlp": 1.39800107, + "epoch": 0.00865717583686033, + "flos": 539006283264.0, + "grad_norm": 0.049849947719683325, + "language_loss": 1.08088911, + "learning_rate": 0.0007538160463002316, + "loss": 1.09953451, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.66796875, + "step": 45, + "time_per_iteration": 2.637796640396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01780353, + "balance_loss_mlp": 1.31572247, + "epoch": 0.008849557522123894, + "flos": 509009227776.0, + "grad_norm": 0.046919324832442044, + "language_loss": 1.11748755, + "learning_rate": 0.0007581684291577274, + "loss": 1.1352911, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.6484375, + "step": 46, + "time_per_iteration": 2.5655901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764453, + "balance_loss_mlp": 1.30211222, + "epoch": 0.009041939207387457, + "flos": 626507125248.0, + "grad_norm": 0.05937298040562763, + "language_loss": 1.13580585, + "learning_rate": 0.0007624272050891776, + "loss": 1.15345049, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.625, + "step": 47, + "time_per_iteration": 2.804643392562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776852, + "balance_loss_mlp": 1.31908798, + "epoch": 0.00923432089265102, + "flos": 550609789440.0, + "grad_norm": 0.07500714899038924, + "language_loss": 1.03489327, + "learning_rate": 0.0007665963158851307, + "loss": 1.05266178, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.578125, + "step": 48, + "time_per_iteration": 2.781435489654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01771411, + "balance_loss_mlp": 1.3170805, + "epoch": 0.009426702577914583, + "flos": 563678310912.0, + "grad_norm": 0.07921486390615404, + "language_loss": 1.12758589, + "learning_rate": 0.0007706794594783609, + "loss": 1.14529991, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.54296875, + "step": 49, + "time_per_iteration": 2.739976644515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017484, + "balance_loss_mlp": 1.29483247, + "epoch": 0.009619084263178146, + "flos": 617925700608.0, + "grad_norm": 0.05671895540127436, + "language_loss": 1.10915053, + "learning_rate": 0.0007746801096530423, + "loss": 1.12663448, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.53515625, + "step": 50, + "time_per_iteration": 2.7333760261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01715641, + "balance_loss_mlp": 1.2616924, + "epoch": 0.009811465948441709, + "flos": 542488263168.0, + "grad_norm": 0.04785443300923319, + "language_loss": 1.16231108, + "learning_rate": 0.0007786015338021173, + "loss": 1.17946756, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.5390625, + "step": 51, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01700387, + "balance_loss_mlp": 1.24720073, + "epoch": 0.010003847633705272, + "flos": 536976583680.0, + "grad_norm": 0.04536583817216675, + "language_loss": 1.08076, + "learning_rate": 0.0007824468089603051, + "loss": 1.0977639, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.53125, + "step": 52, + "time_per_iteration": 2.6839513778686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01675834, + "balance_loss_mlp": 1.2218852, + "epoch": 0.010196229318968833, + "flos": 910805316096.0, + "grad_norm": 0.04374839581732082, + "language_loss": 1.0833261, + "learning_rate": 0.0007862188363098669, + "loss": 1.10008454, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.5390625, + "step": 53, + "time_per_iteration": 3.1748838424682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01650634, + "balance_loss_mlp": 1.19477725, + "epoch": 0.010388611004232396, + "flos": 586969190400.0, + "grad_norm": 0.045477377455174536, + "language_loss": 1.08262885, + "learning_rate": 0.0007899203543304438, + "loss": 1.09913516, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.55859375, + "step": 54, + "time_per_iteration": 2.7011117935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01588572, + "balance_loss_mlp": 1.13195276, + "epoch": 0.01058099268949596, + "flos": 503471351808.0, + "grad_norm": 0.05216939031034974, + "language_loss": 1.22650576, + "learning_rate": 0.0007935539507422731, + "loss": 1.24239147, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.56640625, + "step": 55, + "time_per_iteration": 2.6142656803131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553155, + "balance_loss_mlp": 1.09462798, + "epoch": 0.010773374374759523, + "flos": 545558008320.0, + "grad_norm": 0.04278176221573414, + "language_loss": 1.12836909, + "learning_rate": 0.0007971220733732573, + "loss": 1.14390063, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.5859375, + "step": 56, + "time_per_iteration": 2.718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586959, + "balance_loss_mlp": 1.1318655, + "epoch": 0.010965756060023086, + "flos": 527285982720.0, + "grad_norm": 0.06958617519474361, + "language_loss": 1.08844507, + "learning_rate": 0.0008006270400641869, + "loss": 1.10431468, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.55078125, + "step": 57, + "time_per_iteration": 2.702324628829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01576177, + "balance_loss_mlp": 1.12375367, + "epoch": 0.011158137745286649, + "flos": 578097054720.0, + "grad_norm": 0.08376433329063605, + "language_loss": 1.09231043, + "learning_rate": 0.0008040710477125043, + "loss": 1.10807228, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.5234375, + "step": 58, + "time_per_iteration": 2.733733892440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587306, + "balance_loss_mlp": 1.13793492, + "epoch": 0.011350519430550212, + "flos": 530314068480.0, + "grad_norm": 0.056261163559927586, + "language_loss": 1.098104, + "learning_rate": 0.0008074561805429771, + "loss": 1.11397719, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.4921875, + "step": 59, + "time_per_iteration": 2.604173183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_mlp": 1.0886867, + "epoch": 0.011542901115813775, + "flos": 556970133504.0, + "grad_norm": 0.07546157909609297, + "language_loss": 1.07214928, + "learning_rate": 0.0008107844176832545, + "loss": 1.08748412, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.45703125, + "step": 60, + "time_per_iteration": 2.670180082321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01515203, + "balance_loss_mlp": 1.07155395, + "epoch": 0.011735282801077338, + "flos": 573175529472.0, + "grad_norm": 0.06932920743779293, + "language_loss": 1.09267807, + "learning_rate": 0.0008140576401132568, + "loss": 1.10783005, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.44921875, + "step": 61, + "time_per_iteration": 2.635917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01537914, + "balance_loss_mlp": 1.0965538, + "epoch": 0.0119276644863409, + "flos": 616716467712.0, + "grad_norm": 0.056166475672555005, + "language_loss": 1.10548615, + "learning_rate": 0.0008172776370494935, + "loss": 1.12086535, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.42578125, + "step": 62, + "time_per_iteration": 2.709764242172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.015397, + "balance_loss_mlp": 1.10024714, + "epoch": 0.012120046171604464, + "flos": 502084199424.0, + "grad_norm": 0.046962065793300374, + "language_loss": 1.17909575, + "learning_rate": 0.0008204461118185703, + "loss": 1.19449282, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.40625, + "step": 63, + "time_per_iteration": 2.5971004962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545078, + "balance_loss_mlp": 1.10943925, + "epoch": 0.012312427856868027, + "flos": 474301493760.0, + "grad_norm": 0.04671162143151921, + "language_loss": 1.07277906, + "learning_rate": 0.0008235646872681536, + "loss": 1.08822989, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 4.3671875, + "step": 64, + "time_per_iteration": 2.567622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01534227, + "balance_loss_mlp": 1.10240316, + "epoch": 0.012504809542131588, + "flos": 539470910976.0, + "grad_norm": 0.04435006978162803, + "language_loss": 1.0673492, + "learning_rate": 0.0008266349107584288, + "loss": 1.08269131, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 4.328125, + "step": 65, + "time_per_iteration": 2.6833384037017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149994, + "balance_loss_mlp": 1.07345641, + "epoch": 0.012697191227395151, + "flos": 609856567296.0, + "grad_norm": 0.04524096047594039, + "language_loss": 1.09403265, + "learning_rate": 0.0008296582587724851, + "loss": 1.10903215, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 4.2734375, + "step": 66, + "time_per_iteration": 2.692337989807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01482262, + "balance_loss_mlp": 1.05806744, + "epoch": 0.012889572912658714, + "flos": 769397460480.0, + "grad_norm": 0.04198159389490698, + "language_loss": 1.06809163, + "learning_rate": 0.0008326361411800136, + "loss": 1.08291411, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 4.25, + "step": 67, + "time_per_iteration": 2.923720598220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474655, + "balance_loss_mlp": 1.05503809, + "epoch": 0.013081954597922277, + "flos": 535020744192.0, + "grad_norm": 0.041919130945389606, + "language_loss": 1.07100165, + "learning_rate": 0.0008355699051851403, + "loss": 1.0857482, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 4.203125, + "step": 68, + "time_per_iteration": 2.7417044639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462817, + "balance_loss_mlp": 1.04701531, + "epoch": 0.01327433628318584, + "flos": 574180646400.0, + "grad_norm": 0.041322055356332446, + "language_loss": 1.14468551, + "learning_rate": 0.0008384608389860635, + "loss": 1.15931368, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 4.1640625, + "step": 69, + "time_per_iteration": 2.6545376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450151, + "balance_loss_mlp": 1.03930819, + "epoch": 0.013466717968449404, + "flos": 498259115520.0, + "grad_norm": 0.039605765449237204, + "language_loss": 1.04742777, + "learning_rate": 0.000841310175171381, + "loss": 1.06192923, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 4.11328125, + "step": 70, + "time_per_iteration": 2.5687999725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441096, + "balance_loss_mlp": 1.03101599, + "epoch": 0.013659099653712967, + "flos": 566621803008.0, + "grad_norm": 0.03646297128801074, + "language_loss": 1.03104186, + "learning_rate": 0.000844119093875517, + "loss": 1.04545283, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 4.1015625, + "step": 71, + "time_per_iteration": 2.698259115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433469, + "balance_loss_mlp": 1.02720368, + "epoch": 0.01385148133897653, + "flos": 574942715904.0, + "grad_norm": 0.02854119406997066, + "language_loss": 1.07372236, + "learning_rate": 0.0008468887257134666, + "loss": 1.08805704, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 4.06445312, + "step": 72, + "time_per_iteration": 2.7074387073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422625, + "balance_loss_mlp": 1.01941192, + "epoch": 0.014043863024240093, + "flos": 577958066688.0, + "grad_norm": 0.03113282173853564, + "language_loss": 1.10314119, + "learning_rate": 0.0008496201545131264, + "loss": 1.11736751, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 4.03515625, + "step": 73, + "time_per_iteration": 2.725660562515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425762, + "balance_loss_mlp": 1.02655351, + "epoch": 0.014236244709503656, + "flos": 940263883776.0, + "grad_norm": 0.033199488198319166, + "language_loss": 1.07624495, + "learning_rate": 0.0008523144198617317, + "loss": 1.0905025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.99414062, + "step": 74, + "time_per_iteration": 3.2577481269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437934, + "balance_loss_mlp": 1.04139662, + "epoch": 0.014428626394767219, + "flos": 529495603200.0, + "grad_norm": 0.03119178099318558, + "language_loss": 1.07016373, + "learning_rate": 0.0008549725194813783, + "loss": 1.08454299, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.96679688, + "step": 75, + "time_per_iteration": 2.727982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437754, + "balance_loss_mlp": 1.0446496, + "epoch": 0.014621008080030782, + "flos": 805282226688.0, + "grad_norm": 0.02968258762679391, + "language_loss": 1.06415534, + "learning_rate": 0.0008575954114472099, + "loss": 1.07853293, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.93164062, + "step": 76, + "time_per_iteration": 3.172807455062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143975, + "balance_loss_mlp": 1.04950643, + "epoch": 0.014813389765294343, + "flos": 698356521984.0, + "grad_norm": 0.031905123056971844, + "language_loss": 1.03629625, + "learning_rate": 0.0008601840162606118, + "loss": 1.05069387, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.90234375, + "step": 77, + "time_per_iteration": 3.029114007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438585, + "balance_loss_mlp": 1.05158365, + "epoch": 0.015005771450557906, + "flos": 598164464640.0, + "grad_norm": 0.026994348673938514, + "language_loss": 1.09661531, + "learning_rate": 0.000862739218788641, + "loss": 1.11100101, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.86914062, + "step": 78, + "time_per_iteration": 2.795952320098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440626, + "balance_loss_mlp": 1.05705774, + "epoch": 0.01519815313582147, + "flos": 550492268544.0, + "grad_norm": 0.029495859587709627, + "language_loss": 1.07574832, + "learning_rate": 0.0008652618700799138, + "loss": 1.09015465, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.83789062, + "step": 79, + "time_per_iteration": 2.6552224159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430975, + "balance_loss_mlp": 1.05084014, + "epoch": 0.015390534821085032, + "flos": 431440032768.0, + "grad_norm": 0.037998818197719206, + "language_loss": 1.07206631, + "learning_rate": 0.0008677527890662774, + "loss": 1.08637595, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.80664062, + "step": 80, + "time_per_iteration": 2.530073881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424927, + "balance_loss_mlp": 1.04727161, + "epoch": 0.015582916506348595, + "flos": 525184424448.0, + "grad_norm": 0.03521308344632083, + "language_loss": 1.08168781, + "learning_rate": 0.0008702127641587799, + "loss": 1.09593713, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.78125, + "step": 81, + "time_per_iteration": 2.6248533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01426595, + "balance_loss_mlp": 1.05141926, + "epoch": 0.015775298191612157, + "flos": 576616576512.0, + "grad_norm": 0.026523126631237747, + "language_loss": 1.036394, + "learning_rate": 0.0008726425547457192, + "loss": 1.05065989, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.75585938, + "step": 82, + "time_per_iteration": 2.759159564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424967, + "balance_loss_mlp": 1.05303442, + "epoch": 0.01596767987687572, + "flos": 611439103488.0, + "grad_norm": 0.03656915183129864, + "language_loss": 1.03032446, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457414, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.72265625, + "step": 83, + "time_per_iteration": 2.739105224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431924, + "balance_loss_mlp": 1.06151688, + "epoch": 0.016160061562139283, + "flos": 568232537088.0, + "grad_norm": 0.03323001720600938, + "language_loss": 1.08511543, + "learning_rate": 0.0008774144832015932, + "loss": 1.09943461, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.70703125, + "step": 84, + "time_per_iteration": 2.7144806385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02085876, + "balance_loss_mlp": 1.68762207, + "epoch": 0.016352443247402846, + "flos": 1414499701248.0, + "grad_norm": 0.1388747380481991, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76860189, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.984375, + "step": 85, + "time_per_iteration": 4.569611072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450774, + "balance_loss_mlp": 1.08532572, + "epoch": 0.01654482493266641, + "flos": 731785165824.0, + "grad_norm": 0.04601998260491519, + "language_loss": 1.03772068, + "learning_rate": 0.0008820741205014318, + "loss": 1.05222845, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.65625, + "step": 86, + "time_per_iteration": 2.8604419231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014606, + "balance_loss_mlp": 1.09744096, + "epoch": 0.016737206617929972, + "flos": 537404281344.0, + "grad_norm": 0.03433335749497543, + "language_loss": 1.05140662, + "learning_rate": 0.0008843634575408404, + "loss": 1.06601262, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.62695312, + "step": 87, + "time_per_iteration": 2.677731513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145769, + "balance_loss_mlp": 1.09777355, + "epoch": 0.016929588303193535, + "flos": 538129420800.0, + "grad_norm": 0.05036212092144492, + "language_loss": 1.06815004, + "learning_rate": 0.0008866266301555082, + "loss": 1.08272696, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.59765625, + "step": 88, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145347, + "balance_loss_mlp": 1.09622347, + "epoch": 0.017121969988457098, + "flos": 527791543296.0, + "grad_norm": 0.030252065691096418, + "language_loss": 1.07441962, + "learning_rate": 0.0008888642296509615, + "loss": 1.08895445, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.56445312, + "step": 89, + "time_per_iteration": 2.590280771255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145473, + "balance_loss_mlp": 1.10034442, + "epoch": 0.01731435167372066, + "flos": 626767636992.0, + "grad_norm": 0.041554939890322294, + "language_loss": 1.12743318, + "learning_rate": 0.0008910768275115906, + "loss": 1.14198053, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.54101562, + "step": 90, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145373, + "balance_loss_mlp": 1.10220587, + "epoch": 0.017506733358984224, + "flos": 497384254464.0, + "grad_norm": 0.05646737130307679, + "language_loss": 1.07978606, + "learning_rate": 0.0008932649762767675, + "loss": 1.0943234, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.50976562, + "step": 91, + "time_per_iteration": 2.5964808464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01457202, + "balance_loss_mlp": 1.10911036, + "epoch": 0.017699115044247787, + "flos": 747217758720.0, + "grad_norm": 0.04050166442287704, + "language_loss": 1.1018101, + "learning_rate": 0.0008954292103690864, + "loss": 1.11638212, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.47851562, + "step": 92, + "time_per_iteration": 2.9288997650146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01459372, + "balance_loss_mlp": 1.11395121, + "epoch": 0.01789149672951135, + "flos": 516520407552.0, + "grad_norm": 0.054281950557984966, + "language_loss": 1.12496912, + "learning_rate": 0.0008975700468778296, + "loss": 1.13956285, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.45117188, + "step": 93, + "time_per_iteration": 2.5800487995147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462727, + "balance_loss_mlp": 1.11978543, + "epoch": 0.018083878414774913, + "flos": 587229702144.0, + "grad_norm": 0.04557553976021738, + "language_loss": 1.05795836, + "learning_rate": 0.0008996879863005366, + "loss": 1.07258558, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.42578125, + "step": 94, + "time_per_iteration": 2.6668198108673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146929, + "balance_loss_mlp": 1.12882805, + "epoch": 0.018276260100038477, + "flos": 498369905664.0, + "grad_norm": 0.055406629054909326, + "language_loss": 1.06168532, + "learning_rate": 0.0009017835132453337, + "loss": 1.07637823, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.40234375, + "step": 95, + "time_per_iteration": 2.588728904724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146889, + "balance_loss_mlp": 1.1312896, + "epoch": 0.01846864178530204, + "flos": 641232043008.0, + "grad_norm": 0.04012691806662063, + "language_loss": 1.05874133, + "learning_rate": 0.0009038570970964896, + "loss": 1.0734303, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.37890625, + "step": 96, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464817, + "balance_loss_mlp": 1.12912345, + "epoch": 0.018661023470565603, + "flos": 512667125760.0, + "grad_norm": 0.027884025705687265, + "language_loss": 1.03269148, + "learning_rate": 0.0009059091926454854, + "loss": 1.04733968, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.359375, + "step": 97, + "time_per_iteration": 2.6100950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470726, + "balance_loss_mlp": 1.13694024, + "epoch": 0.018853405155829166, + "flos": 932696308224.0, + "grad_norm": 0.03936003805775877, + "language_loss": 1.02435613, + "learning_rate": 0.0009079402406897198, + "loss": 1.03906357, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.33984375, + "step": 98, + "time_per_iteration": 3.2489542961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467854, + "balance_loss_mlp": 1.13616598, + "epoch": 0.01904578684109273, + "flos": 577586764800.0, + "grad_norm": 0.036005296184057074, + "language_loss": 1.04073858, + "learning_rate": 0.0009099506686008212, + "loss": 1.05541718, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.31835938, + "step": 99, + "time_per_iteration": 2.7905051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467812, + "balance_loss_mlp": 1.13822246, + "epoch": 0.019238168526356292, + "flos": 559520856576.0, + "grad_norm": 0.02696843746399884, + "language_loss": 1.07409596, + "learning_rate": 0.0009119408908644013, + "loss": 1.08877409, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.296875, + "step": 100, + "time_per_iteration": 2.7075607776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456893, + "balance_loss_mlp": 1.12882876, + "epoch": 0.019430550211619855, + "flos": 725103184896.0, + "grad_norm": 0.03304065923870771, + "language_loss": 1.12780023, + "learning_rate": 0.0009139113095929519, + "loss": 1.14236927, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.28125, + "step": 101, + "time_per_iteration": 2.86230731010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460167, + "balance_loss_mlp": 1.13439226, + "epoch": 0.019622931896883418, + "flos": 500456001024.0, + "grad_norm": 0.030619133870748612, + "language_loss": 1.06594038, + "learning_rate": 0.0009158623150134762, + "loss": 1.08054209, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 3.2578125, + "step": 102, + "time_per_iteration": 2.563690185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458611, + "balance_loss_mlp": 1.13569677, + "epoch": 0.01981531358214698, + "flos": 510281587200.0, + "grad_norm": 0.03276303076426602, + "language_loss": 1.06164801, + "learning_rate": 0.000917794285931332, + "loss": 1.0762341, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 3.22851562, + "step": 103, + "time_per_iteration": 2.6599903106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462945, + "balance_loss_mlp": 1.1421293, + "epoch": 0.020007695267410544, + "flos": 522392655360.0, + "grad_norm": 0.026505304013468463, + "language_loss": 0.98227251, + "learning_rate": 0.0009197075901716639, + "loss": 0.99690199, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 3.20703125, + "step": 104, + "time_per_iteration": 2.726245880126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469463, + "balance_loss_mlp": 1.14998221, + "epoch": 0.020200076952674107, + "flos": 534443324928.0, + "grad_norm": 0.029933884589862427, + "language_loss": 1.08736229, + "learning_rate": 0.0009216025849997171, + "loss": 1.10205698, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 3.19335938, + "step": 105, + "time_per_iteration": 2.8023486137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468836, + "balance_loss_mlp": 1.15221632, + "epoch": 0.020392458637937667, + "flos": 686082270720.0, + "grad_norm": 0.024520994280375335, + "language_loss": 1.03054178, + "learning_rate": 0.0009234796175212258, + "loss": 1.04523015, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 3.1640625, + "step": 106, + "time_per_iteration": 2.9396088123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469456, + "balance_loss_mlp": 1.15512502, + "epoch": 0.02058484032320123, + "flos": 703414307328.0, + "grad_norm": 0.02898567585615155, + "language_loss": 1.07201982, + "learning_rate": 0.000925339025064007, + "loss": 1.08671439, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 3.140625, + "step": 107, + "time_per_iteration": 2.9473297595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_mlp": 1.16001439, + "epoch": 0.020777222008464793, + "flos": 640326982656.0, + "grad_norm": 0.02770789473723963, + "language_loss": 0.99879742, + "learning_rate": 0.0009271811355418027, + "loss": 1.01352561, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 3.125, + "step": 108, + "time_per_iteration": 2.8551387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469504, + "balance_loss_mlp": 1.15803361, + "epoch": 0.020969603693728356, + "flos": 683320700928.0, + "grad_norm": 0.029161506766480293, + "language_loss": 1.06637371, + "learning_rate": 0.0009290062678013548, + "loss": 1.08106875, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 3.11132812, + "step": 109, + "time_per_iteration": 2.821951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468675, + "balance_loss_mlp": 1.15949392, + "epoch": 0.02116198537899192, + "flos": 534419129856.0, + "grad_norm": 0.03188637458086245, + "language_loss": 1.05070233, + "learning_rate": 0.0009308147319536321, + "loss": 1.06538928, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 3.08789062, + "step": 110, + "time_per_iteration": 2.6315042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469018, + "balance_loss_mlp": 1.16212535, + "epoch": 0.021354367064255482, + "flos": 718727377920.0, + "grad_norm": 0.030955966903197116, + "language_loss": 1.11490715, + "learning_rate": 0.0009326068296900676, + "loss": 1.12959719, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 3.06445312, + "step": 111, + "time_per_iteration": 2.8208162784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474326, + "balance_loss_mlp": 1.16934085, + "epoch": 0.021546748749519045, + "flos": 520623467520.0, + "grad_norm": 0.027870670355515197, + "language_loss": 1.02138007, + "learning_rate": 0.0009343828545846161, + "loss": 1.03612328, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 3.04492188, + "step": 112, + "time_per_iteration": 2.759277105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474098, + "balance_loss_mlp": 1.17063916, + "epoch": 0.021739130434782608, + "flos": 506161062912.0, + "grad_norm": 0.03372988233582904, + "language_loss": 1.06662297, + "learning_rate": 0.0009361430923823841, + "loss": 1.08136404, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 3.02929688, + "step": 113, + "time_per_iteration": 2.565107822418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471087, + "balance_loss_mlp": 1.1693449, + "epoch": 0.02193151212004617, + "flos": 464426242560.0, + "grad_norm": 0.03803370713592907, + "language_loss": 1.10115385, + "learning_rate": 0.0009378878212755459, + "loss": 1.11586463, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 3.01171875, + "step": 114, + "time_per_iteration": 2.491929292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471993, + "balance_loss_mlp": 1.17253923, + "epoch": 0.022123893805309734, + "flos": 553331701248.0, + "grad_norm": 0.029753755152528143, + "language_loss": 1.00006115, + "learning_rate": 0.0009396173121672103, + "loss": 1.014781, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.98828125, + "step": 115, + "time_per_iteration": 2.6869561672210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473585, + "balance_loss_mlp": 1.1754663, + "epoch": 0.022316275490573297, + "flos": 637378761216.0, + "grad_norm": 0.032022590728611564, + "language_loss": 1.0593642, + "learning_rate": 0.0009413318289238633, + "loss": 1.07410002, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.97460938, + "step": 116, + "time_per_iteration": 2.7639846801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474428, + "balance_loss_mlp": 1.17859828, + "epoch": 0.02250865717583686, + "flos": 800315039232.0, + "grad_norm": 0.032750944460810345, + "language_loss": 0.98115921, + "learning_rate": 0.0009430316286169771, + "loss": 0.99590349, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.95117188, + "step": 117, + "time_per_iteration": 3.020703077316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469481, + "balance_loss_mlp": 1.17536783, + "epoch": 0.022701038861100423, + "flos": 457062782976.0, + "grad_norm": 0.027209249322999743, + "language_loss": 1.0327785, + "learning_rate": 0.0009447169617543361, + "loss": 1.04747331, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.9375, + "step": 118, + "time_per_iteration": 2.5938501358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466386, + "balance_loss_mlp": 1.17437065, + "epoch": 0.022893420546363986, + "flos": 584186153472.0, + "grad_norm": 0.028075325054819567, + "language_loss": 1.10005641, + "learning_rate": 0.0009463880725016029, + "loss": 1.11472011, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.91992188, + "step": 119, + "time_per_iteration": 2.7082488536834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467196, + "balance_loss_mlp": 1.17861414, + "epoch": 0.02308580223162755, + "flos": 562477810176.0, + "grad_norm": 0.032360539397207934, + "language_loss": 1.05048943, + "learning_rate": 0.0009480451988946134, + "loss": 1.06516147, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.89257812, + "step": 120, + "time_per_iteration": 2.808687686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461098, + "balance_loss_mlp": 1.17423272, + "epoch": 0.023278183916891113, + "flos": 772645125120.0, + "grad_norm": 0.033180722862994706, + "language_loss": 1.06113267, + "learning_rate": 0.0009496885730428627, + "loss": 1.07574379, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.875, + "step": 121, + "time_per_iteration": 3.0043137073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466426, + "balance_loss_mlp": 1.18070555, + "epoch": 0.023470565602154676, + "flos": 554430144000.0, + "grad_norm": 0.030787275004595428, + "language_loss": 1.04567683, + "learning_rate": 0.0009513184213246156, + "loss": 1.06034112, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.86328125, + "step": 122, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462554, + "balance_loss_mlp": 1.17835939, + "epoch": 0.02366294728741824, + "flos": 561166519296.0, + "grad_norm": 0.030499039091632818, + "language_loss": 1.08099937, + "learning_rate": 0.0009529349645740552, + "loss": 1.09562504, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.84765625, + "step": 123, + "time_per_iteration": 2.69850492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460088, + "balance_loss_mlp": 1.17741883, + "epoch": 0.0238553289726818, + "flos": 469516955136.0, + "grad_norm": 0.026549221517309443, + "language_loss": 1.06623578, + "learning_rate": 0.0009545384182608524, + "loss": 1.08083653, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.83203125, + "step": 124, + "time_per_iteration": 2.5435874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462583, + "balance_loss_mlp": 1.18144011, + "epoch": 0.024047710657945365, + "flos": 561103392768.0, + "grad_norm": 0.03287811385355005, + "language_loss": 1.04055512, + "learning_rate": 0.0009561289926625252, + "loss": 1.05518079, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.81640625, + "step": 125, + "time_per_iteration": 2.6661720275878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464029, + "balance_loss_mlp": 1.18460226, + "epoch": 0.024240092343208928, + "flos": 505770295296.0, + "grad_norm": 0.030159442314643806, + "language_loss": 1.08985233, + "learning_rate": 0.0009577068930299292, + "loss": 1.10449266, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.79882812, + "step": 126, + "time_per_iteration": 2.596027135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456959, + "balance_loss_mlp": 1.17944014, + "epoch": 0.02443247402847249, + "flos": 436752325632.0, + "grad_norm": 0.03465787530540315, + "language_loss": 1.04454637, + "learning_rate": 0.0009592723197462087, + "loss": 1.05911589, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.77929688, + "step": 127, + "time_per_iteration": 2.6355836391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145855, + "balance_loss_mlp": 1.18236613, + "epoch": 0.024624855713736054, + "flos": 685068421632.0, + "grad_norm": 0.03103018628328697, + "language_loss": 1.00976562, + "learning_rate": 0.0009608254684795125, + "loss": 1.02435124, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.765625, + "step": 128, + "time_per_iteration": 2.956745147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01452077, + "balance_loss_mlp": 1.17741859, + "epoch": 0.024817237398999614, + "flos": 526113679872.0, + "grad_norm": 0.03378324138815482, + "language_loss": 1.03947771, + "learning_rate": 0.0009623665303297678, + "loss": 1.05399847, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.75, + "step": 129, + "time_per_iteration": 2.762612819671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145448, + "balance_loss_mlp": 1.18115723, + "epoch": 0.025009619084263177, + "flos": 656886216192.0, + "grad_norm": 0.03318348770393379, + "language_loss": 1.08023834, + "learning_rate": 0.0009638956919697878, + "loss": 1.09478307, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.73339844, + "step": 130, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453293, + "balance_loss_mlp": 1.18130565, + "epoch": 0.02520200076952674, + "flos": 455369456640.0, + "grad_norm": 0.028803226470227133, + "language_loss": 1.00211501, + "learning_rate": 0.0009654131357809714, + "loss": 1.01664793, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.71875, + "step": 131, + "time_per_iteration": 2.593409776687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454951, + "balance_loss_mlp": 1.18534708, + "epoch": 0.025394382454790303, + "flos": 841268324352.0, + "grad_norm": 0.035993676074610494, + "language_loss": 1.09494662, + "learning_rate": 0.0009669190399838441, + "loss": 1.10949612, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.69824219, + "step": 132, + "time_per_iteration": 3.1307294368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454062, + "balance_loss_mlp": 1.18588877, + "epoch": 0.025586764140053866, + "flos": 582228312576.0, + "grad_norm": 0.03305283337163912, + "language_loss": 1.02299893, + "learning_rate": 0.0009684135787636724, + "loss": 1.03753948, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.68359375, + "step": 133, + "time_per_iteration": 2.8118627071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454726, + "balance_loss_mlp": 1.18798327, + "epoch": 0.02577914582531743, + "flos": 791677218816.0, + "grad_norm": 0.03011124606519955, + "language_loss": 1.06380379, + "learning_rate": 0.0009698969223913726, + "loss": 1.07835102, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.66894531, + "step": 134, + "time_per_iteration": 3.0371806621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450237, + "balance_loss_mlp": 1.18454385, + "epoch": 0.025971527510580992, + "flos": 596062906368.0, + "grad_norm": 0.030569012833979448, + "language_loss": 1.08986592, + "learning_rate": 0.0009713692373399265, + "loss": 1.10436833, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.65820312, + "step": 135, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01684837, + "balance_loss_mlp": 1.39873505, + "epoch": 0.026163909195844555, + "flos": 1581074411520.0, + "grad_norm": 0.08870187959024729, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81141067, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.8671875, + "step": 136, + "time_per_iteration": 5.94019627571106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0161422, + "balance_loss_mlp": 1.33116913, + "epoch": 0.026356290881108118, + "flos": 1505160886272.0, + "grad_norm": 0.07212137850421584, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79425257, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.8359375, + "step": 137, + "time_per_iteration": 4.865153074264526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469938, + "balance_loss_mlp": 1.20901299, + "epoch": 0.02654867256637168, + "flos": 598340382720.0, + "grad_norm": 0.040535745966457745, + "language_loss": 1.01652551, + "learning_rate": 0.0009757216201974225, + "loss": 1.03122485, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.609375, + "step": 138, + "time_per_iteration": 2.8955435752868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487517, + "balance_loss_mlp": 1.22802222, + "epoch": 0.026741054251635244, + "flos": 546135427584.0, + "grad_norm": 0.04340470282065083, + "language_loss": 1.06732666, + "learning_rate": 0.0009771514130396581, + "loss": 1.08220184, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.59472656, + "step": 139, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01498511, + "balance_loss_mlp": 1.24044681, + "epoch": 0.026933435936898807, + "flos": 507845657088.0, + "grad_norm": 0.04879945782970011, + "language_loss": 1.07520163, + "learning_rate": 0.00097857095638274, + "loss": 1.09018672, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.58007812, + "step": 140, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01492411, + "balance_loss_mlp": 1.23558652, + "epoch": 0.02712581762216237, + "flos": 742253299200.0, + "grad_norm": 0.043929969627725114, + "language_loss": 0.98754954, + "learning_rate": 0.0009799803961288726, + "loss": 1.00247359, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.5703125, + "step": 141, + "time_per_iteration": 3.008998394012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470778, + "balance_loss_mlp": 1.21567059, + "epoch": 0.027318199307425933, + "flos": 849777890304.0, + "grad_norm": 0.03716164217421175, + "language_loss": 1.04960537, + "learning_rate": 0.000981379875086876, + "loss": 1.06431305, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.55371094, + "step": 142, + "time_per_iteration": 3.057098865509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469037, + "balance_loss_mlp": 1.21535933, + "epoch": 0.027510580992689496, + "flos": 576638043648.0, + "grad_norm": 0.03712962317624948, + "language_loss": 1.00046849, + "learning_rate": 0.0009827695330590185, + "loss": 1.01515889, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.5390625, + "step": 143, + "time_per_iteration": 2.638338327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450228, + "balance_loss_mlp": 1.19750416, + "epoch": 0.02770296267795306, + "flos": 773789230080.0, + "grad_norm": 0.030455330453953735, + "language_loss": 0.99027133, + "learning_rate": 0.0009841495069248256, + "loss": 1.00477362, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.52929688, + "step": 144, + "time_per_iteration": 2.981438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441391, + "balance_loss_mlp": 1.19009781, + "epoch": 0.027895344363216622, + "flos": 570448888320.0, + "grad_norm": 0.031624263879455494, + "language_loss": 0.98723662, + "learning_rate": 0.0009855199307219871, + "loss": 1.00165045, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.51464844, + "step": 145, + "time_per_iteration": 2.6923046112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440125, + "balance_loss_mlp": 1.1903578, + "epoch": 0.028087726048480186, + "flos": 548408174592.0, + "grad_norm": 0.029995844711875903, + "language_loss": 1.00586843, + "learning_rate": 0.0009868809357244854, + "loss": 1.02026975, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.49902344, + "step": 146, + "time_per_iteration": 2.6284868717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01436833, + "balance_loss_mlp": 1.18782902, + "epoch": 0.02828010773374375, + "flos": 525872633856.0, + "grad_norm": 0.03288909570778387, + "language_loss": 1.05042541, + "learning_rate": 0.0009882326505180556, + "loss": 1.06479371, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.49121094, + "step": 147, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425728, + "balance_loss_mlp": 1.1783452, + "epoch": 0.02847248941900731, + "flos": 773771765760.0, + "grad_norm": 0.031738987003727674, + "language_loss": 1.02499485, + "learning_rate": 0.0009895752010730906, + "loss": 1.03925204, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.47460938, + "step": 148, + "time_per_iteration": 2.9316182136535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424571, + "balance_loss_mlp": 1.17785549, + "epoch": 0.028664871104270875, + "flos": 535469908992.0, + "grad_norm": 0.028294299214345536, + "language_loss": 1.0900923, + "learning_rate": 0.0009909087108150867, + "loss": 1.10433793, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.46777344, + "step": 149, + "time_per_iteration": 2.697423219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014274, + "balance_loss_mlp": 1.18182933, + "epoch": 0.028857252789534438, + "flos": 368604487680.0, + "grad_norm": 0.03525963963400797, + "language_loss": 1.09753942, + "learning_rate": 0.0009922333006927371, + "loss": 1.11181331, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.45605469, + "step": 150, + "time_per_iteration": 2.483644723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433542, + "balance_loss_mlp": 1.18911529, + "epoch": 0.029049634474798, + "flos": 516483477504.0, + "grad_norm": 0.03341635886009217, + "language_loss": 1.03220332, + "learning_rate": 0.0009935490892437632, + "loss": 1.04653883, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.44433594, + "step": 151, + "time_per_iteration": 2.604599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438911, + "balance_loss_mlp": 1.19553363, + "epoch": 0.029242016160061564, + "flos": 589348724736.0, + "grad_norm": 0.030166761621646727, + "language_loss": 1.01782072, + "learning_rate": 0.0009948561926585687, + "loss": 1.03220987, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.43359375, + "step": 152, + "time_per_iteration": 2.7724709510803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445258, + "balance_loss_mlp": 1.20350146, + "epoch": 0.029434397845325123, + "flos": 553136317440.0, + "grad_norm": 0.030739210798008048, + "language_loss": 1.05873716, + "learning_rate": 0.0009961547248418122, + "loss": 1.07318974, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.41699219, + "step": 153, + "time_per_iteration": 2.6247737407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440878, + "balance_loss_mlp": 1.19988418, + "epoch": 0.029626779530588686, + "flos": 604607400960.0, + "grad_norm": 0.030186385343499288, + "language_loss": 1.02632022, + "learning_rate": 0.0009974447974719707, + "loss": 1.04072905, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.40917969, + "step": 154, + "time_per_iteration": 2.730053663253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431891, + "balance_loss_mlp": 1.19194651, + "epoch": 0.02981916121585225, + "flos": 622217413632.0, + "grad_norm": 0.02801027733601246, + "language_loss": 1.04305005, + "learning_rate": 0.0009987265200589763, + "loss": 1.05736899, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.3984375, + "step": 155, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423605, + "balance_loss_mlp": 1.18537688, + "epoch": 0.030011542901115813, + "flos": 662879987712.0, + "grad_norm": 0.0349007823819893, + "language_loss": 1.04218483, + "learning_rate": 0.001, + "loss": 1.05642092, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.38085938, + "step": 156, + "time_per_iteration": 2.8801028728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420835, + "balance_loss_mlp": 1.18289316, + "epoch": 0.030203924586379376, + "flos": 652818084864.0, + "grad_norm": 0.029403473562715665, + "language_loss": 1.01930022, + "learning_rate": 0.0009999999029413921, + "loss": 1.03350854, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.37792969, + "step": 157, + "time_per_iteration": 2.8549368381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415444, + "balance_loss_mlp": 1.17921925, + "epoch": 0.03039630627164294, + "flos": 532443824640.0, + "grad_norm": 0.03295212675068383, + "language_loss": 1.02716291, + "learning_rate": 0.0009999996117656068, + "loss": 1.04131734, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.36035156, + "step": 158, + "time_per_iteration": 2.6989729404449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_mlp": 1.17530584, + "epoch": 0.030588687956906502, + "flos": 587294830080.0, + "grad_norm": 0.0291076208082698, + "language_loss": 0.96305156, + "learning_rate": 0.0009999991264727564, + "loss": 0.97715545, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.34863281, + "step": 159, + "time_per_iteration": 2.7609338760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140999, + "balance_loss_mlp": 1.1752907, + "epoch": 0.030781069642170065, + "flos": 514286592000.0, + "grad_norm": 0.030494101007586163, + "language_loss": 1.0725081, + "learning_rate": 0.0009999984470630296, + "loss": 1.08660805, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.34472656, + "step": 160, + "time_per_iteration": 2.5805158615112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410287, + "balance_loss_mlp": 1.17711365, + "epoch": 0.030973451327433628, + "flos": 719559304704.0, + "grad_norm": 0.025032822394785544, + "language_loss": 0.95934659, + "learning_rate": 0.0009999975735366902, + "loss": 0.97344947, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.32910156, + "step": 161, + "time_per_iteration": 3.078343629837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409543, + "balance_loss_mlp": 1.17675149, + "epoch": 0.03116583301269719, + "flos": 1111614400512.0, + "grad_norm": 0.029903967107167622, + "language_loss": 0.98009437, + "learning_rate": 0.0009999965058940775, + "loss": 0.99418974, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.32519531, + "step": 162, + "time_per_iteration": 3.49137544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_mlp": 1.17689729, + "epoch": 0.031358214697960754, + "flos": 451833082368.0, + "grad_norm": 0.11336845133687022, + "language_loss": 1.0463953, + "learning_rate": 0.0009999952441356057, + "loss": 1.06047678, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.30957031, + "step": 163, + "time_per_iteration": 2.531280755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406979, + "balance_loss_mlp": 1.17676246, + "epoch": 0.031550596383224314, + "flos": 1257085658112.0, + "grad_norm": 0.03183858769064714, + "language_loss": 1.05248928, + "learning_rate": 0.000999993788261765, + "loss": 1.06655908, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.30078125, + "step": 164, + "time_per_iteration": 3.5714328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408503, + "balance_loss_mlp": 1.17943025, + "epoch": 0.03174297806848788, + "flos": 669322924032.0, + "grad_norm": 0.03191781964215587, + "language_loss": 1.06263065, + "learning_rate": 0.00099999213827312, + "loss": 1.07671571, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.29101562, + "step": 165, + "time_per_iteration": 2.7947938442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409995, + "balance_loss_mlp": 1.18101788, + "epoch": 0.03193535975375144, + "flos": 552363514368.0, + "grad_norm": 0.03891580789868065, + "language_loss": 1.01044345, + "learning_rate": 0.000999990294170312, + "loss": 1.0245434, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.29003906, + "step": 166, + "time_per_iteration": 2.6462574005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140342, + "balance_loss_mlp": 1.17577803, + "epoch": 0.032127741439015006, + "flos": 544739543040.0, + "grad_norm": 0.03757156138401865, + "language_loss": 1.05309296, + "learning_rate": 0.0009999882559540566, + "loss": 1.06712723, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.27636719, + "step": 167, + "time_per_iteration": 2.629549503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140941, + "balance_loss_mlp": 1.18234003, + "epoch": 0.032320123124278566, + "flos": 549513348096.0, + "grad_norm": 0.028659149555752484, + "language_loss": 1.01791751, + "learning_rate": 0.000999986023625145, + "loss": 1.03201175, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.27050781, + "step": 168, + "time_per_iteration": 2.7051401138305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01589355, + "balance_loss_mlp": 1.35360718, + "epoch": 0.03251250480954213, + "flos": 1308815430144.0, + "grad_norm": 0.08201951270186027, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80513763, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.35546875, + "step": 169, + "time_per_iteration": 4.9428627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407645, + "balance_loss_mlp": 1.18257797, + "epoch": 0.03270488649480569, + "flos": 562201835520.0, + "grad_norm": 0.03970113019311383, + "language_loss": 1.02863848, + "learning_rate": 0.0009999809766328958, + "loss": 1.04271495, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.25, + "step": 170, + "time_per_iteration": 2.675811529159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415662, + "balance_loss_mlp": 1.19193029, + "epoch": 0.03289726818006926, + "flos": 483338813952.0, + "grad_norm": 0.03325277263778645, + "language_loss": 1.04760146, + "learning_rate": 0.0009999781619715177, + "loss": 1.06175804, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.23632812, + "step": 171, + "time_per_iteration": 2.5431392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01419714, + "balance_loss_mlp": 1.1972214, + "epoch": 0.03308964986533282, + "flos": 675820254720.0, + "grad_norm": 0.02950894161591202, + "language_loss": 1.04164565, + "learning_rate": 0.000999975153201402, + "loss": 1.05584288, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.22363281, + "step": 172, + "time_per_iteration": 2.812837600708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422366, + "balance_loss_mlp": 1.20044637, + "epoch": 0.033282031550596385, + "flos": 610340660736.0, + "grad_norm": 0.03086814843966846, + "language_loss": 1.02532911, + "learning_rate": 0.0009999719503237174, + "loss": 1.03955269, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 2.21777344, + "step": 173, + "time_per_iteration": 2.755462646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416936, + "balance_loss_mlp": 1.1959697, + "epoch": 0.033474413235859944, + "flos": 468995931648.0, + "grad_norm": 0.048603642070708566, + "language_loss": 1.1131072, + "learning_rate": 0.0009999685533397073, + "loss": 1.12727666, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 2.20800781, + "step": 174, + "time_per_iteration": 2.566751003265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01414495, + "balance_loss_mlp": 1.19438744, + "epoch": 0.03366679492112351, + "flos": 580714907136.0, + "grad_norm": 0.03243683176756354, + "language_loss": 1.02908182, + "learning_rate": 0.00099996496225069, + "loss": 1.04322672, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 2.19921875, + "step": 175, + "time_per_iteration": 2.67861008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407523, + "balance_loss_mlp": 1.1883682, + "epoch": 0.03385917660638707, + "flos": 638885435904.0, + "grad_norm": 0.029120554083078395, + "language_loss": 1.05784094, + "learning_rate": 0.0009999611770580604, + "loss": 1.0719161, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 2.18945312, + "step": 176, + "time_per_iteration": 2.8410942554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401607, + "balance_loss_mlp": 1.18302441, + "epoch": 0.03405155829165064, + "flos": 442739366400.0, + "grad_norm": 0.031490867136515936, + "language_loss": 1.04703283, + "learning_rate": 0.0009999571977632876, + "loss": 1.06104875, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 2.18359375, + "step": 177, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399051, + "balance_loss_mlp": 1.1813277, + "epoch": 0.034243939976914196, + "flos": 467274407424.0, + "grad_norm": 0.029366691437037535, + "language_loss": 1.0724479, + "learning_rate": 0.0009999530243679166, + "loss": 1.08643842, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 2.17480469, + "step": 178, + "time_per_iteration": 2.5423247814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01392432, + "balance_loss_mlp": 1.17556691, + "epoch": 0.03443632166217776, + "flos": 780712257024.0, + "grad_norm": 0.02507202069561695, + "language_loss": 1.01653552, + "learning_rate": 0.0009999486568735675, + "loss": 1.03045988, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 2.16601562, + "step": 179, + "time_per_iteration": 3.111632823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381684, + "balance_loss_mlp": 1.16567647, + "epoch": 0.03462870334744132, + "flos": 1265758407168.0, + "grad_norm": 0.027829136834509844, + "language_loss": 1.02053452, + "learning_rate": 0.0009999440952819362, + "loss": 1.03435147, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 2.15722656, + "step": 180, + "time_per_iteration": 3.6354756355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375883, + "balance_loss_mlp": 1.16035271, + "epoch": 0.03482108503270489, + "flos": 608302228992.0, + "grad_norm": 0.033531921209289, + "language_loss": 1.02966988, + "learning_rate": 0.0009999393395947935, + "loss": 1.04342866, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 2.15234375, + "step": 181, + "time_per_iteration": 2.8509652614593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372611, + "balance_loss_mlp": 1.15774834, + "epoch": 0.03501346671796845, + "flos": 539314458624.0, + "grad_norm": 0.029990628161131794, + "language_loss": 1.05946589, + "learning_rate": 0.0009999343898139858, + "loss": 1.07319212, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 2.14550781, + "step": 182, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375908, + "balance_loss_mlp": 1.16161704, + "epoch": 0.035205848403232015, + "flos": 519498828288.0, + "grad_norm": 0.03419998284579487, + "language_loss": 1.04830694, + "learning_rate": 0.0009999292459414348, + "loss": 1.06206608, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 2.13964844, + "step": 183, + "time_per_iteration": 2.563997983932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386507, + "balance_loss_mlp": 1.17269289, + "epoch": 0.035398230088495575, + "flos": 473333306880.0, + "grad_norm": 0.03346089667402367, + "language_loss": 1.09292293, + "learning_rate": 0.0009999239079791374, + "loss": 1.10678792, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 2.13476562, + "step": 184, + "time_per_iteration": 2.5561137199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387981, + "balance_loss_mlp": 1.17512131, + "epoch": 0.03559061177375914, + "flos": 513094823424.0, + "grad_norm": 0.03551516541146116, + "language_loss": 1.01857162, + "learning_rate": 0.0009999183759291659, + "loss": 1.03245139, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 2.125, + "step": 185, + "time_per_iteration": 2.689763307571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383562, + "balance_loss_mlp": 1.17108345, + "epoch": 0.0357829934590227, + "flos": 478350159360.0, + "grad_norm": 0.03945465081959485, + "language_loss": 1.04534364, + "learning_rate": 0.0009999126497936682, + "loss": 1.05917931, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 2.12109375, + "step": 186, + "time_per_iteration": 2.5142176151275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375295, + "balance_loss_mlp": 1.16415167, + "epoch": 0.03597537514428627, + "flos": 645884324352.0, + "grad_norm": 0.029215470851159726, + "language_loss": 1.06864357, + "learning_rate": 0.0009999067295748676, + "loss": 1.08239663, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 2.10742188, + "step": 187, + "time_per_iteration": 2.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370561, + "balance_loss_mlp": 1.16056204, + "epoch": 0.03616775682954983, + "flos": 582269245440.0, + "grad_norm": 0.03159066859467708, + "language_loss": 1.0519886, + "learning_rate": 0.000999900615275062, + "loss": 1.06569433, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 2.09570312, + "step": 188, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01368603, + "balance_loss_mlp": 1.15898561, + "epoch": 0.03636013851481339, + "flos": 383264277504.0, + "grad_norm": 0.043734318168479426, + "language_loss": 1.10731864, + "learning_rate": 0.0009998943068966256, + "loss": 1.1210047, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 2.09179688, + "step": 189, + "time_per_iteration": 2.4394500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365543, + "balance_loss_mlp": 1.15668833, + "epoch": 0.03655252020007695, + "flos": 584307677184.0, + "grad_norm": 0.02577278402121573, + "language_loss": 1.05579162, + "learning_rate": 0.0009998878044420072, + "loss": 1.06944704, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 2.0859375, + "step": 190, + "time_per_iteration": 2.7022814750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365865, + "balance_loss_mlp": 1.15882242, + "epoch": 0.03674490188534051, + "flos": 472597433856.0, + "grad_norm": 0.03520388751206912, + "language_loss": 1.01277018, + "learning_rate": 0.0009998811079137318, + "loss": 1.02642882, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 2.07324219, + "step": 191, + "time_per_iteration": 2.5930585861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136447, + "balance_loss_mlp": 1.15742755, + "epoch": 0.03693728357060408, + "flos": 529411009536.0, + "grad_norm": 0.03125533686722731, + "language_loss": 1.02464271, + "learning_rate": 0.0009998742173143987, + "loss": 1.0382874, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 2.07324219, + "step": 192, + "time_per_iteration": 2.6235413551330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358793, + "balance_loss_mlp": 1.15222692, + "epoch": 0.03712966525586764, + "flos": 800345238528.0, + "grad_norm": 0.02848545485219292, + "language_loss": 1.02800548, + "learning_rate": 0.0009998671326466833, + "loss": 1.04159343, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 2.06835938, + "step": 193, + "time_per_iteration": 2.991110324859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351781, + "balance_loss_mlp": 1.1463598, + "epoch": 0.037322046941131205, + "flos": 831358144512.0, + "grad_norm": 0.03513998418582105, + "language_loss": 1.0392077, + "learning_rate": 0.0009998598539133362, + "loss": 1.05272543, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 2.05664062, + "step": 194, + "time_per_iteration": 3.0204203128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349328, + "balance_loss_mlp": 1.14371598, + "epoch": 0.037514428626394765, + "flos": 438588642816.0, + "grad_norm": 0.028816536284039847, + "language_loss": 1.04176903, + "learning_rate": 0.0009998523811171828, + "loss": 1.05526221, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 2.05859375, + "step": 195, + "time_per_iteration": 2.5615782737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345129, + "balance_loss_mlp": 1.14047015, + "epoch": 0.03770681031165833, + "flos": 512638927872.0, + "grad_norm": 0.030721230574493993, + "language_loss": 1.05052435, + "learning_rate": 0.0009998447142611248, + "loss": 1.06397557, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 2.04882812, + "step": 196, + "time_per_iteration": 2.6310269832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347072, + "balance_loss_mlp": 1.14289033, + "epoch": 0.03789919199692189, + "flos": 808842069504.0, + "grad_norm": 0.024329502455983587, + "language_loss": 0.97805226, + "learning_rate": 0.0009998368533481387, + "loss": 0.99152303, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 2.04394531, + "step": 197, + "time_per_iteration": 3.0467066764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344143, + "balance_loss_mlp": 1.14043784, + "epoch": 0.03809157368218546, + "flos": 691791335424.0, + "grad_norm": 0.028391473090668865, + "language_loss": 1.00891113, + "learning_rate": 0.0009998287983812762, + "loss": 1.0223527, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 2.0390625, + "step": 198, + "time_per_iteration": 2.8457672595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342798, + "balance_loss_mlp": 1.14023721, + "epoch": 0.03828395536744902, + "flos": 519004001280.0, + "grad_norm": 0.02890411668538335, + "language_loss": 1.07749867, + "learning_rate": 0.0009998205493636646, + "loss": 1.09092665, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 2.02734375, + "step": 199, + "time_per_iteration": 2.66135573387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336213, + "balance_loss_mlp": 1.13432038, + "epoch": 0.038476337052712584, + "flos": 582762071040.0, + "grad_norm": 0.025165239757241963, + "language_loss": 0.99723649, + "learning_rate": 0.0009998121062985063, + "loss": 1.01059866, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 2.02050781, + "step": 200, + "time_per_iteration": 2.70021915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340101, + "balance_loss_mlp": 1.13868463, + "epoch": 0.03866871873797614, + "flos": 578272972800.0, + "grad_norm": 0.025940014565947116, + "language_loss": 1.01401794, + "learning_rate": 0.0009998034691890794, + "loss": 1.02741897, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 2.015625, + "step": 201, + "time_per_iteration": 2.7596118450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134112, + "balance_loss_mlp": 1.14018106, + "epoch": 0.03886110042323971, + "flos": 541771855872.0, + "grad_norm": 0.03045868040347491, + "language_loss": 1.06763899, + "learning_rate": 0.0009997946380387369, + "loss": 1.08105016, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 2.01074219, + "step": 202, + "time_per_iteration": 2.6249613761901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341912, + "balance_loss_mlp": 1.14192665, + "epoch": 0.03905348210850327, + "flos": 719239669248.0, + "grad_norm": 0.02826530469295273, + "language_loss": 1.09111357, + "learning_rate": 0.0009997856128509076, + "loss": 1.1045326, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 2.00097656, + "step": 203, + "time_per_iteration": 2.8254761695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336015, + "balance_loss_mlp": 1.13660145, + "epoch": 0.039245863793766836, + "flos": 428396484096.0, + "grad_norm": 0.028264614074004907, + "language_loss": 1.0366801, + "learning_rate": 0.0009997763936290952, + "loss": 1.05004025, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.99511719, + "step": 204, + "time_per_iteration": 2.4907312393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334897, + "balance_loss_mlp": 1.13624632, + "epoch": 0.039438245479030395, + "flos": 664269141504.0, + "grad_norm": 0.0294297584821439, + "language_loss": 1.09143519, + "learning_rate": 0.0009997669803768789, + "loss": 1.10478401, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.98730469, + "step": 205, + "time_per_iteration": 2.787046194076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332958, + "balance_loss_mlp": 1.13497555, + "epoch": 0.03963062716429396, + "flos": 636495168000.0, + "grad_norm": 0.025164669035445293, + "language_loss": 1.04324186, + "learning_rate": 0.0009997573730979134, + "loss": 1.05657148, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.98242188, + "step": 206, + "time_per_iteration": 2.744339942932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388672, + "balance_loss_mlp": 1.18687439, + "epoch": 0.03982300884955752, + "flos": 1421587186176.0, + "grad_norm": 0.04225268457123109, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80581868, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 2.01953125, + "step": 207, + "time_per_iteration": 4.62822699546814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338974, + "balance_loss_mlp": 1.14251721, + "epoch": 0.04001539053482109, + "flos": 690519702528.0, + "grad_norm": 0.029734692172116686, + "language_loss": 1.02667236, + "learning_rate": 0.0009997375764747294, + "loss": 1.04006195, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.96875, + "step": 208, + "time_per_iteration": 3.0006470680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332342, + "balance_loss_mlp": 1.1360755, + "epoch": 0.04020777222008465, + "flos": 534751500288.0, + "grad_norm": 0.02521302149444487, + "language_loss": 1.00535607, + "learning_rate": 0.0009997273871381967, + "loss": 1.01867938, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.96679688, + "step": 209, + "time_per_iteration": 2.6790220737457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01335368, + "balance_loss_mlp": 1.14005554, + "epoch": 0.040400153905348214, + "flos": 568996608000.0, + "grad_norm": 0.04055154679799505, + "language_loss": 1.05331016, + "learning_rate": 0.0009997170037902862, + "loss": 1.06666374, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.95703125, + "step": 210, + "time_per_iteration": 2.748340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331596, + "balance_loss_mlp": 1.13647389, + "epoch": 0.040592535590611774, + "flos": 714678712320.0, + "grad_norm": 0.0276705792773584, + "language_loss": 1.07916689, + "learning_rate": 0.0009997064264350292, + "loss": 1.09248281, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.95507812, + "step": 211, + "time_per_iteration": 2.8284339904785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332545, + "balance_loss_mlp": 1.13761449, + "epoch": 0.04078491727587533, + "flos": 579206231040.0, + "grad_norm": 0.026753366885260317, + "language_loss": 1.01893198, + "learning_rate": 0.0009996956550765317, + "loss": 1.03225756, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.953125, + "step": 212, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_mlp": 1.13668597, + "epoch": 0.0409772989611389, + "flos": 553368631296.0, + "grad_norm": 0.03340351088011317, + "language_loss": 0.96620274, + "learning_rate": 0.0009996846897189762, + "loss": 0.97951126, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.9453125, + "step": 213, + "time_per_iteration": 2.62785005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327396, + "balance_loss_mlp": 1.13332307, + "epoch": 0.04116968064640246, + "flos": 556764016128.0, + "grad_norm": 0.026256493309422244, + "language_loss": 1.0283711, + "learning_rate": 0.0009996735303666193, + "loss": 1.04164505, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.94433594, + "step": 214, + "time_per_iteration": 2.745412588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324547, + "balance_loss_mlp": 1.13152313, + "epoch": 0.041362062331666026, + "flos": 579651393024.0, + "grad_norm": 0.025801807715809106, + "language_loss": 1.04973316, + "learning_rate": 0.0009996621770237937, + "loss": 1.06297863, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.93359375, + "step": 215, + "time_per_iteration": 2.7359023094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_mlp": 1.12657344, + "epoch": 0.041554444016929586, + "flos": 612700729344.0, + "grad_norm": 0.027594527286323677, + "language_loss": 1.00985026, + "learning_rate": 0.0009996506296949073, + "loss": 1.02304435, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.93164062, + "step": 216, + "time_per_iteration": 2.860781669616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320461, + "balance_loss_mlp": 1.12781918, + "epoch": 0.04174682570219315, + "flos": 529150497792.0, + "grad_norm": 0.030561981852332186, + "language_loss": 1.01172602, + "learning_rate": 0.0009996388883844428, + "loss": 1.02493072, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.9296875, + "step": 217, + "time_per_iteration": 2.614837169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315002, + "balance_loss_mlp": 1.12255037, + "epoch": 0.04193920738745671, + "flos": 512499939840.0, + "grad_norm": 0.024235201889365978, + "language_loss": 1.04092622, + "learning_rate": 0.0009996269530969588, + "loss": 1.05407631, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.92773438, + "step": 218, + "time_per_iteration": 2.5777087211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317845, + "balance_loss_mlp": 1.1255846, + "epoch": 0.04213158907272028, + "flos": 572552448000.0, + "grad_norm": 0.03618883866707401, + "language_loss": 1.04623246, + "learning_rate": 0.0009996148238370888, + "loss": 1.05941105, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.92578125, + "step": 219, + "time_per_iteration": 2.723344564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319419, + "balance_loss_mlp": 1.12830234, + "epoch": 0.04232397075798384, + "flos": 965904098304.0, + "grad_norm": 0.02808123492922437, + "language_loss": 0.99962145, + "learning_rate": 0.0009996025006095421, + "loss": 1.01281559, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.9140625, + "step": 220, + "time_per_iteration": 3.297567844390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355408, + "balance_loss_mlp": 1.16314697, + "epoch": 0.042516352443247404, + "flos": 1472730628608.0, + "grad_norm": 0.031119874656221472, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.79138547, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.92578125, + "step": 221, + "time_per_iteration": 5.484851837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132056, + "balance_loss_mlp": 1.13039756, + "epoch": 0.042708734128510964, + "flos": 655891832832.0, + "grad_norm": 0.027306518139410985, + "language_loss": 0.99887031, + "learning_rate": 0.0009995772722706307, + "loss": 1.0120759, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.90429688, + "step": 222, + "time_per_iteration": 2.801955461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324867, + "balance_loss_mlp": 1.13518083, + "epoch": 0.04290111581377453, + "flos": 432733859328.0, + "grad_norm": 0.025166076900031344, + "language_loss": 1.13987851, + "learning_rate": 0.0009995643671690604, + "loss": 1.15312719, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.89941406, + "step": 223, + "time_per_iteration": 2.4589195251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320058, + "balance_loss_mlp": 1.13142133, + "epoch": 0.04309349749903809, + "flos": 645866860032.0, + "grad_norm": 0.02470776233740571, + "language_loss": 1.01624262, + "learning_rate": 0.0009995512681194023, + "loss": 1.02944326, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.88867188, + "step": 224, + "time_per_iteration": 2.854653835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319788, + "balance_loss_mlp": 1.13124692, + "epoch": 0.04328587918430166, + "flos": 832895745024.0, + "grad_norm": 0.02898896961022835, + "language_loss": 0.98942387, + "learning_rate": 0.0009995379751267417, + "loss": 1.00262189, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.88769531, + "step": 225, + "time_per_iteration": 3.260105609893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317885, + "balance_loss_mlp": 1.12943935, + "epoch": 0.043478260869565216, + "flos": 526115681280.0, + "grad_norm": 0.02601835272599882, + "language_loss": 1.00718379, + "learning_rate": 0.0009995244881962398, + "loss": 1.02036262, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.88671875, + "step": 226, + "time_per_iteration": 2.631685495376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320396, + "balance_loss_mlp": 1.13204539, + "epoch": 0.04367064255482878, + "flos": 440412225024.0, + "grad_norm": 0.02740546356326938, + "language_loss": 1.02089393, + "learning_rate": 0.0009995108073331323, + "loss": 1.03409791, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.88574219, + "step": 227, + "time_per_iteration": 2.6414895057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308962, + "balance_loss_mlp": 1.12156498, + "epoch": 0.04386302424009234, + "flos": 508466737152.0, + "grad_norm": 0.023646446246452554, + "language_loss": 1.04017711, + "learning_rate": 0.0009994969325427309, + "loss": 1.05326676, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.87597656, + "step": 228, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130646, + "balance_loss_mlp": 1.11906338, + "epoch": 0.04405540592535591, + "flos": 541743657984.0, + "grad_norm": 0.02642836262436834, + "language_loss": 1.00691068, + "learning_rate": 0.0009994828638304218, + "loss": 1.0199753, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.87597656, + "step": 229, + "time_per_iteration": 2.604616165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305226, + "balance_loss_mlp": 1.11792421, + "epoch": 0.04424778761061947, + "flos": 447309055488.0, + "grad_norm": 0.039218098968292335, + "language_loss": 1.07079852, + "learning_rate": 0.0009994686012016675, + "loss": 1.08385086, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.875, + "step": 230, + "time_per_iteration": 2.568608045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130487, + "balance_loss_mlp": 1.1187129, + "epoch": 0.044440169295883035, + "flos": 701981492736.0, + "grad_norm": 0.02721662483758601, + "language_loss": 1.06240797, + "learning_rate": 0.000999454144662005, + "loss": 1.07545662, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.86328125, + "step": 231, + "time_per_iteration": 2.9104526042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295735, + "balance_loss_mlp": 1.10957813, + "epoch": 0.044632550981146595, + "flos": 589426587648.0, + "grad_norm": 0.02817980914561194, + "language_loss": 1.003865, + "learning_rate": 0.0009994394942170468, + "loss": 1.01682234, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.86328125, + "step": 232, + "time_per_iteration": 2.674896001815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302928, + "balance_loss_mlp": 1.11667526, + "epoch": 0.04482493266641016, + "flos": 555854226432.0, + "grad_norm": 0.029144066951330677, + "language_loss": 0.98161608, + "learning_rate": 0.0009994246498724808, + "loss": 0.99464536, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.86425781, + "step": 233, + "time_per_iteration": 2.674178123474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302597, + "balance_loss_mlp": 1.11682117, + "epoch": 0.04501731435167372, + "flos": 724069870080.0, + "grad_norm": 0.027038299766394356, + "language_loss": 1.00722432, + "learning_rate": 0.00099940961163407, + "loss": 1.02025032, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.859375, + "step": 234, + "time_per_iteration": 2.8427939414978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301098, + "balance_loss_mlp": 1.11608493, + "epoch": 0.04520969603693728, + "flos": 512797381632.0, + "grad_norm": 0.027022139799708383, + "language_loss": 1.02586675, + "learning_rate": 0.0009993943795076528, + "loss": 1.03887773, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.8515625, + "step": 235, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295671, + "balance_loss_mlp": 1.11094403, + "epoch": 0.04540207772220085, + "flos": 365877846528.0, + "grad_norm": 0.03212133053651388, + "language_loss": 1.0562067, + "learning_rate": 0.0009993789534991427, + "loss": 1.06916356, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.84863281, + "step": 236, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294151, + "balance_loss_mlp": 1.1095196, + "epoch": 0.045594459407464406, + "flos": 523723411968.0, + "grad_norm": 0.029471400038435007, + "language_loss": 1.00276268, + "learning_rate": 0.0009993633336145287, + "loss": 1.01570415, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.84765625, + "step": 237, + "time_per_iteration": 2.6279234886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_mlp": 1.11284053, + "epoch": 0.04578684109272797, + "flos": 673115807232.0, + "grad_norm": 0.032189822363292264, + "language_loss": 1.04537559, + "learning_rate": 0.0009993475198598752, + "loss": 1.05834174, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.83886719, + "step": 238, + "time_per_iteration": 2.98264741897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294154, + "balance_loss_mlp": 1.11047626, + "epoch": 0.04597922277799153, + "flos": 542620520448.0, + "grad_norm": 0.025834809881005002, + "language_loss": 1.01282692, + "learning_rate": 0.0009993315122413212, + "loss": 1.02576852, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.83789062, + "step": 239, + "time_per_iteration": 2.5969364643096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297016, + "balance_loss_mlp": 1.11333883, + "epoch": 0.0461716044632551, + "flos": 459993540096.0, + "grad_norm": 0.025301515003642434, + "language_loss": 1.01210213, + "learning_rate": 0.0009993153107650818, + "loss": 1.02507234, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.83789062, + "step": 240, + "time_per_iteration": 2.590198278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297188, + "balance_loss_mlp": 1.11360526, + "epoch": 0.04636398614851866, + "flos": 456170457600.0, + "grad_norm": 0.0338801607583888, + "language_loss": 1.01026332, + "learning_rate": 0.0009992989154374468, + "loss": 1.0232352, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.83691406, + "step": 241, + "time_per_iteration": 2.5699570178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012963, + "balance_loss_mlp": 1.11271763, + "epoch": 0.046556367833782225, + "flos": 557901390336.0, + "grad_norm": 0.02656657647638049, + "language_loss": 1.0757494, + "learning_rate": 0.0009992823262647817, + "loss": 1.08871233, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.83691406, + "step": 242, + "time_per_iteration": 2.6949496269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293965, + "balance_loss_mlp": 1.11047852, + "epoch": 0.046748749519045785, + "flos": 594087601152.0, + "grad_norm": 0.02772781005565529, + "language_loss": 1.02479577, + "learning_rate": 0.0009992655432535264, + "loss": 1.03773546, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.8359375, + "step": 243, + "time_per_iteration": 2.7783396244049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286985, + "balance_loss_mlp": 1.10454702, + "epoch": 0.04694113120430935, + "flos": 570941713920.0, + "grad_norm": 0.021337056529223342, + "language_loss": 1.01771712, + "learning_rate": 0.0009992485664101973, + "loss": 1.03058696, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.82519531, + "step": 244, + "time_per_iteration": 2.679227590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286082, + "balance_loss_mlp": 1.10364425, + "epoch": 0.04713351288957291, + "flos": 865245411840.0, + "grad_norm": 0.03170954338904746, + "language_loss": 1.04355013, + "learning_rate": 0.000999231395741385, + "loss": 1.05641103, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.82519531, + "step": 245, + "time_per_iteration": 3.0976788997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287082, + "balance_loss_mlp": 1.10473943, + "epoch": 0.04732589457483648, + "flos": 538235481600.0, + "grad_norm": 0.02353809889700427, + "language_loss": 1.02393425, + "learning_rate": 0.0009992140312537557, + "loss": 1.03680515, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.82421875, + "step": 246, + "time_per_iteration": 2.6005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_mlp": 1.1048938, + "epoch": 0.04751827626010004, + "flos": 763271431680.0, + "grad_norm": 0.021903859990429042, + "language_loss": 0.96665001, + "learning_rate": 0.000999196472954051, + "loss": 0.97951376, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.81542969, + "step": 247, + "time_per_iteration": 2.95379638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319153, + "balance_loss_mlp": 1.13833618, + "epoch": 0.0477106579453636, + "flos": 1583125578240.0, + "grad_norm": 0.034344144576267104, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80744004, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.80859375, + "step": 248, + "time_per_iteration": 6.070216655731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286412, + "balance_loss_mlp": 1.10521388, + "epoch": 0.04790303963062716, + "flos": 458692982784.0, + "grad_norm": 0.024476775577385278, + "language_loss": 1.04631317, + "learning_rate": 0.0009991607749457578, + "loss": 1.05917728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.8125, + "step": 249, + "time_per_iteration": 2.5741825103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128623, + "balance_loss_mlp": 1.10503209, + "epoch": 0.04809542131589073, + "flos": 783786004992.0, + "grad_norm": 0.021665977114244464, + "language_loss": 1.0235486, + "learning_rate": 0.0009991426352510286, + "loss": 1.03641105, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.81152344, + "step": 250, + "time_per_iteration": 3.004519462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_mlp": 1.10648286, + "epoch": 0.04828780300115429, + "flos": 560321857536.0, + "grad_norm": 0.028059326531900755, + "language_loss": 1.04456568, + "learning_rate": 0.0009991243017719422, + "loss": 1.05743682, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.8046875, + "step": 251, + "time_per_iteration": 2.666212320327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283793, + "balance_loss_mlp": 1.10364354, + "epoch": 0.048480184686417856, + "flos": 502922130432.0, + "grad_norm": 0.02282661348297379, + "language_loss": 0.985008, + "learning_rate": 0.0009991057745156165, + "loss": 0.99784589, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.80078125, + "step": 252, + "time_per_iteration": 2.6053824424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291534, + "balance_loss_mlp": 1.11186218, + "epoch": 0.048672566371681415, + "flos": 1539469120512.0, + "grad_norm": 0.022804524860740846, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83202517, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.796875, + "step": 253, + "time_per_iteration": 5.005317449569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285445, + "balance_loss_mlp": 1.10500991, + "epoch": 0.04886494805694498, + "flos": 538951888896.0, + "grad_norm": 0.028242285238858512, + "language_loss": 1.06865251, + "learning_rate": 0.0009990681387000943, + "loss": 1.08150697, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.80371094, + "step": 254, + "time_per_iteration": 2.743307590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283321, + "balance_loss_mlp": 1.10317183, + "epoch": 0.04905732974220854, + "flos": 681484383744.0, + "grad_norm": 0.028658365214850164, + "language_loss": 1.02065015, + "learning_rate": 0.0009990490301555093, + "loss": 1.03348327, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.80126953, + "step": 255, + "time_per_iteration": 2.989856719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291977, + "balance_loss_mlp": 1.1134491, + "epoch": 0.04924971142747211, + "flos": 1424274895872.0, + "grad_norm": 0.01325206916769545, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80507129, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.78515625, + "step": 256, + "time_per_iteration": 4.888273477554321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281082, + "balance_loss_mlp": 1.10255432, + "epoch": 0.04944209311273567, + "flos": 1561236587520.0, + "grad_norm": 0.00993410716153638, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80523825, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.78515625, + "step": 257, + "time_per_iteration": 4.983605623245239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_mlp": 1.10786438, + "epoch": 0.04963447479799923, + "flos": 1574170850304.0, + "grad_norm": 0.014798835308040135, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71261322, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.77539062, + "step": 258, + "time_per_iteration": 4.888776540756226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_mlp": 1.10310864, + "epoch": 0.049826856483262794, + "flos": 626498393088.0, + "grad_norm": 0.032236291487241595, + "language_loss": 0.9680413, + "learning_rate": 0.0009989706585723202, + "loss": 0.98086333, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.79003906, + "step": 259, + "time_per_iteration": 2.776397705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280186, + "balance_loss_mlp": 1.10175359, + "epoch": 0.05001923816852635, + "flos": 505155945984.0, + "grad_norm": 0.03442249770662494, + "language_loss": 1.03026366, + "learning_rate": 0.0009989505813633442, + "loss": 1.04306555, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.78271484, + "step": 260, + "time_per_iteration": 2.651773691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281097, + "balance_loss_mlp": 1.10295069, + "epoch": 0.05021161985378992, + "flos": 588467132928.0, + "grad_norm": 0.024781843968885862, + "language_loss": 1.02880228, + "learning_rate": 0.000998930310444573, + "loss": 1.04161322, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.78125, + "step": 261, + "time_per_iteration": 2.730717420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_mlp": 1.08796966, + "epoch": 0.05040400153905348, + "flos": 634402341888.0, + "grad_norm": 0.028473185138455738, + "language_loss": 1.01351452, + "learning_rate": 0.0009989098458238765, + "loss": 1.02617574, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.77929688, + "step": 262, + "time_per_iteration": 2.7717010974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272128, + "balance_loss_mlp": 1.09407711, + "epoch": 0.050596383224317046, + "flos": 554808176640.0, + "grad_norm": 0.03464065468219783, + "language_loss": 1.00597906, + "learning_rate": 0.0009988891875091998, + "loss": 1.01870036, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.77880859, + "step": 263, + "time_per_iteration": 2.8842556476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012725, + "balance_loss_mlp": 1.09444928, + "epoch": 0.050788764909580605, + "flos": 550761512448.0, + "grad_norm": 0.02541343292713684, + "language_loss": 0.95014787, + "learning_rate": 0.0009988683355085636, + "loss": 0.96287298, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.77880859, + "step": 264, + "time_per_iteration": 2.7466378211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_mlp": 1.09527469, + "epoch": 0.05098114659484417, + "flos": 606344388096.0, + "grad_norm": 0.02024934595994547, + "language_loss": 1.03858495, + "learning_rate": 0.000998847289830063, + "loss": 1.05131388, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.77587891, + "step": 265, + "time_per_iteration": 2.821997880935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285574, + "balance_loss_mlp": 1.10761857, + "epoch": 0.05117352828010773, + "flos": 439472236032.0, + "grad_norm": 0.026937538773041583, + "language_loss": 0.97004128, + "learning_rate": 0.0009988260504818682, + "loss": 0.98289704, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.77832031, + "step": 266, + "time_per_iteration": 2.557830333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277028, + "balance_loss_mlp": 1.09907281, + "epoch": 0.0513659099653713, + "flos": 506030807040.0, + "grad_norm": 0.02494960853942852, + "language_loss": 1.03986156, + "learning_rate": 0.000998804617472226, + "loss": 1.05263186, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.77832031, + "step": 267, + "time_per_iteration": 2.644099235534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_mlp": 1.09254682, + "epoch": 0.05155829165063486, + "flos": 696714862080.0, + "grad_norm": 0.027664306986101984, + "language_loss": 0.98796493, + "learning_rate": 0.0009987829908094568, + "loss": 1.00066042, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.76953125, + "step": 268, + "time_per_iteration": 2.8291003704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_mlp": 1.08817983, + "epoch": 0.051750673335898424, + "flos": 1350300294144.0, + "grad_norm": 0.03385083640642466, + "language_loss": 1.06218576, + "learning_rate": 0.0009987611705019569, + "loss": 1.07483661, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.76855469, + "step": 269, + "time_per_iteration": 4.150776624679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264769, + "balance_loss_mlp": 1.08795822, + "epoch": 0.051943055021161984, + "flos": 490589481984.0, + "grad_norm": 0.028250493976035247, + "language_loss": 1.04104686, + "learning_rate": 0.0009987391565581978, + "loss": 1.05369449, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.76757812, + "step": 270, + "time_per_iteration": 2.5921454429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266977, + "balance_loss_mlp": 1.09092879, + "epoch": 0.05213543670642555, + "flos": 546880032768.0, + "grad_norm": 0.026669721507250346, + "language_loss": 0.96455419, + "learning_rate": 0.000998716948986726, + "loss": 0.97722399, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.75976562, + "step": 271, + "time_per_iteration": 2.7835500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_mlp": 1.09264266, + "epoch": 0.05232781839168911, + "flos": 604672528896.0, + "grad_norm": 0.03568520247936263, + "language_loss": 0.99334317, + "learning_rate": 0.0009986945477961633, + "loss": 1.00602722, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.75683594, + "step": 272, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_mlp": 1.0953902, + "epoch": 0.052520200076952676, + "flos": 539655561216.0, + "grad_norm": 0.02343402151836954, + "language_loss": 1.0317328, + "learning_rate": 0.0009986719529952066, + "loss": 1.04444528, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.7578125, + "step": 273, + "time_per_iteration": 2.908298969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_mlp": 1.09266984, + "epoch": 0.052712581762216236, + "flos": 464332916736.0, + "grad_norm": 0.028493663433316604, + "language_loss": 1.03350449, + "learning_rate": 0.000998649164592628, + "loss": 1.0461911, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.75927734, + "step": 274, + "time_per_iteration": 2.5805718898773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_mlp": 1.08735609, + "epoch": 0.0529049634474798, + "flos": 549105116160.0, + "grad_norm": 0.024462560446863554, + "language_loss": 1.01155043, + "learning_rate": 0.0009986261825972748, + "loss": 1.02418458, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.75976562, + "step": 275, + "time_per_iteration": 2.675705909729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_mlp": 1.09334803, + "epoch": 0.05309734513274336, + "flos": 619200061440.0, + "grad_norm": 0.026443817532743642, + "language_loss": 1.03055406, + "learning_rate": 0.000998603007018069, + "loss": 1.04324436, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.75585938, + "step": 276, + "time_per_iteration": 2.77298903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264217, + "balance_loss_mlp": 1.08893192, + "epoch": 0.05328972681800693, + "flos": 606617634816.0, + "grad_norm": 0.022439827576013177, + "language_loss": 1.00613213, + "learning_rate": 0.0009985796378640089, + "loss": 1.01877427, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.75195312, + "step": 277, + "time_per_iteration": 2.693049669265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_mlp": 1.08963549, + "epoch": 0.05348210850327049, + "flos": 605730038784.0, + "grad_norm": 0.02549683888178727, + "language_loss": 1.01102281, + "learning_rate": 0.0009985560751441665, + "loss": 1.02366924, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.74902344, + "step": 278, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262716, + "balance_loss_mlp": 1.08757329, + "epoch": 0.053674490188534055, + "flos": 631997337600.0, + "grad_norm": 0.025192100126554, + "language_loss": 1.03316271, + "learning_rate": 0.00099853231886769, + "loss": 1.04578984, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.75048828, + "step": 279, + "time_per_iteration": 2.8228564262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262121, + "balance_loss_mlp": 1.08712184, + "epoch": 0.053866871873797614, + "flos": 480173741568.0, + "grad_norm": 0.02583251996588833, + "language_loss": 1.02629757, + "learning_rate": 0.0009985083690438024, + "loss": 1.03891873, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.74902344, + "step": 280, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260843, + "balance_loss_mlp": 1.08655906, + "epoch": 0.054059253559061174, + "flos": 789489065472.0, + "grad_norm": 0.023704628566171972, + "language_loss": 0.9340027, + "learning_rate": 0.0009984842256818016, + "loss": 0.94661117, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.74169922, + "step": 281, + "time_per_iteration": 3.084801435470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257985, + "balance_loss_mlp": 1.08379591, + "epoch": 0.05425163524432474, + "flos": 629505011712.0, + "grad_norm": 0.027462270528210347, + "language_loss": 1.04308844, + "learning_rate": 0.0009984598887910613, + "loss": 1.05566835, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.74072266, + "step": 282, + "time_per_iteration": 2.729063034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_mlp": 1.08855665, + "epoch": 0.0544440169295883, + "flos": 616992442368.0, + "grad_norm": 0.02580860229759897, + "language_loss": 0.99945354, + "learning_rate": 0.0009984353583810297, + "loss": 1.01208091, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.74072266, + "step": 283, + "time_per_iteration": 2.812309741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258383, + "balance_loss_mlp": 1.08433735, + "epoch": 0.05463639861485187, + "flos": 648929874432.0, + "grad_norm": 0.0290705298354334, + "language_loss": 1.01989841, + "learning_rate": 0.0009984106344612302, + "loss": 1.03248215, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.73925781, + "step": 284, + "time_per_iteration": 2.785377264022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_mlp": 1.0907625, + "epoch": 0.054828780300115426, + "flos": 798584782848.0, + "grad_norm": 0.03167011835004719, + "language_loss": 0.97435868, + "learning_rate": 0.0009983857170412615, + "loss": 0.9869982, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.73046875, + "step": 285, + "time_per_iteration": 2.9822604656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258353, + "balance_loss_mlp": 1.08511817, + "epoch": 0.05502116198537899, + "flos": 550798442496.0, + "grad_norm": 0.02077828299254123, + "language_loss": 0.96197385, + "learning_rate": 0.000998360606130798, + "loss": 0.9745574, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.73095703, + "step": 286, + "time_per_iteration": 2.8340489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_mlp": 1.09461975, + "epoch": 0.05521354367064255, + "flos": 1410906931200.0, + "grad_norm": 0.010589673029146669, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70339394, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.71484375, + "step": 287, + "time_per_iteration": 4.893908500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126535, + "balance_loss_mlp": 1.09235394, + "epoch": 0.05540592535590612, + "flos": 646611465216.0, + "grad_norm": 0.04031113274469801, + "language_loss": 1.02544129, + "learning_rate": 0.0009983098038774552, + "loss": 1.03809476, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.72851562, + "step": 288, + "time_per_iteration": 2.800687551498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_mlp": 1.08712769, + "epoch": 0.05559830704116968, + "flos": 1514315727360.0, + "grad_norm": 0.011752943348929798, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79428822, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.71289062, + "step": 289, + "time_per_iteration": 4.802466630935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_mlp": 1.08869088, + "epoch": 0.055790688726433245, + "flos": 509334867456.0, + "grad_norm": 0.03460900762027919, + "language_loss": 1.00913107, + "learning_rate": 0.0009982582277800948, + "loss": 1.02174735, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.72802734, + "step": 290, + "time_per_iteration": 2.574007749557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255451, + "balance_loss_mlp": 1.08326483, + "epoch": 0.055983070411696804, + "flos": 659074369536.0, + "grad_norm": 0.03439417592421578, + "language_loss": 1.07703924, + "learning_rate": 0.0009982321495648908, + "loss": 1.08959377, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.72021484, + "step": 291, + "time_per_iteration": 2.8004326820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257264, + "balance_loss_mlp": 1.08503067, + "epoch": 0.05617545209696037, + "flos": 588475865088.0, + "grad_norm": 0.024241847728240208, + "language_loss": 0.9905349, + "learning_rate": 0.0009982058779188115, + "loss": 1.00310755, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.72070312, + "step": 292, + "time_per_iteration": 2.763096570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257503, + "balance_loss_mlp": 1.0853169, + "epoch": 0.05636783378222393, + "flos": 612787324416.0, + "grad_norm": 0.027188079674348095, + "language_loss": 1.06693649, + "learning_rate": 0.0009981794128520567, + "loss": 1.07951164, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.72021484, + "step": 293, + "time_per_iteration": 2.7630960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253426, + "balance_loss_mlp": 1.08123958, + "epoch": 0.0565602154674875, + "flos": 669422980608.0, + "grad_norm": 0.030197403892147204, + "language_loss": 1.03523457, + "learning_rate": 0.000998152754374901, + "loss": 1.04776871, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.72021484, + "step": 294, + "time_per_iteration": 2.8583314418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249713, + "balance_loss_mlp": 1.07743168, + "epoch": 0.05675259715275106, + "flos": 618364131840.0, + "grad_norm": 0.026289358543143387, + "language_loss": 0.99071473, + "learning_rate": 0.0009981259024976943, + "loss": 1.00321186, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.72119141, + "step": 295, + "time_per_iteration": 2.719881534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250566, + "balance_loss_mlp": 1.07814193, + "epoch": 0.05694497883801462, + "flos": 753153133056.0, + "grad_norm": 0.03148267511857758, + "language_loss": 0.97962338, + "learning_rate": 0.0009980988572308612, + "loss": 0.99212909, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.72265625, + "step": 296, + "time_per_iteration": 2.9828195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250905, + "balance_loss_mlp": 1.0789572, + "epoch": 0.05713736052327818, + "flos": 713380882944.0, + "grad_norm": 0.02524811137395651, + "language_loss": 1.00250125, + "learning_rate": 0.0009980716185849015, + "loss": 1.01501024, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.71777344, + "step": 297, + "time_per_iteration": 2.9749252796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251066, + "balance_loss_mlp": 1.07959557, + "epoch": 0.05732974220854175, + "flos": 469935920640.0, + "grad_norm": 0.024054663695119705, + "language_loss": 0.96916056, + "learning_rate": 0.0009980441865703904, + "loss": 0.98167121, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.71289062, + "step": 298, + "time_per_iteration": 2.598325252532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250911, + "balance_loss_mlp": 1.07939255, + "epoch": 0.05752212389380531, + "flos": 602540771328.0, + "grad_norm": 0.025930022992042723, + "language_loss": 1.05563986, + "learning_rate": 0.000998016561197978, + "loss": 1.06814897, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.71337891, + "step": 299, + "time_per_iteration": 2.690300703048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250529, + "balance_loss_mlp": 1.07924938, + "epoch": 0.057714505579068875, + "flos": 679949511168.0, + "grad_norm": 0.025847674874905035, + "language_loss": 0.97115421, + "learning_rate": 0.0009979887424783895, + "loss": 0.98365951, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.7109375, + "step": 300, + "time_per_iteration": 2.863856554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249286, + "balance_loss_mlp": 1.07810116, + "epoch": 0.057906887264332435, + "flos": 597011627520.0, + "grad_norm": 0.02594453351976595, + "language_loss": 0.96475613, + "learning_rate": 0.0009979607304224248, + "loss": 0.97724897, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.70996094, + "step": 301, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248659, + "balance_loss_mlp": 1.0772841, + "epoch": 0.058099268949596, + "flos": 553164515328.0, + "grad_norm": 0.024492956239426298, + "language_loss": 1.0387162, + "learning_rate": 0.000997932525040959, + "loss": 1.05120289, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.71191406, + "step": 302, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252345, + "balance_loss_mlp": 1.08111238, + "epoch": 0.05829165063485956, + "flos": 509230808064.0, + "grad_norm": 0.038324718957869854, + "language_loss": 1.05616117, + "learning_rate": 0.000997904126344943, + "loss": 1.06868458, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.71044922, + "step": 303, + "time_per_iteration": 2.611621141433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125091, + "balance_loss_mlp": 1.080441, + "epoch": 0.05848403232012313, + "flos": 616362630144.0, + "grad_norm": 0.028818083574726525, + "language_loss": 1.02425826, + "learning_rate": 0.0009978755343454018, + "loss": 1.03676736, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.70263672, + "step": 304, + "time_per_iteration": 2.750213384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245805, + "balance_loss_mlp": 1.07490659, + "epoch": 0.05867641400538669, + "flos": 501079082496.0, + "grad_norm": 0.025195073137535502, + "language_loss": 1.02874422, + "learning_rate": 0.0009978467490534355, + "loss": 1.04120219, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.70703125, + "step": 305, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124905, + "balance_loss_mlp": 1.07853293, + "epoch": 0.05886879569065025, + "flos": 532378696704.0, + "grad_norm": 0.026491629776715375, + "language_loss": 0.99473399, + "learning_rate": 0.00099781777048022, + "loss": 1.00722456, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.703125, + "step": 306, + "time_per_iteration": 2.731084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012482, + "balance_loss_mlp": 1.07782638, + "epoch": 0.05906117737591381, + "flos": 490040260608.0, + "grad_norm": 0.025118942729794178, + "language_loss": 1.01122224, + "learning_rate": 0.0009977885986370057, + "loss": 1.02370417, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.70166016, + "step": 307, + "time_per_iteration": 2.548307418823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247075, + "balance_loss_mlp": 1.0766536, + "epoch": 0.05925355906117737, + "flos": 592709180928.0, + "grad_norm": 0.029001286226925486, + "language_loss": 0.96780527, + "learning_rate": 0.000997759233535118, + "loss": 0.98027599, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.70214844, + "step": 308, + "time_per_iteration": 2.7876322269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247056, + "balance_loss_mlp": 1.07668173, + "epoch": 0.05944594074644094, + "flos": 564787487232.0, + "grad_norm": 0.026648157056946717, + "language_loss": 1.03345561, + "learning_rate": 0.0009977296751859576, + "loss": 1.04592621, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.70166016, + "step": 309, + "time_per_iteration": 2.71488094329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_mlp": 1.07958508, + "epoch": 0.0596383224317045, + "flos": 539807284224.0, + "grad_norm": 0.023775477335694146, + "language_loss": 1.04459929, + "learning_rate": 0.0009976999236009998, + "loss": 1.05709469, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.69726562, + "step": 310, + "time_per_iteration": 2.7919182777404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_mlp": 1.08618629, + "epoch": 0.059830704116968066, + "flos": 562052113920.0, + "grad_norm": 0.02942700961653022, + "language_loss": 1.06853497, + "learning_rate": 0.0009976699787917955, + "loss": 1.08109009, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.69091797, + "step": 311, + "time_per_iteration": 2.6729257106781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012565, + "balance_loss_mlp": 1.08789062, + "epoch": 0.060023085802231625, + "flos": 1574047325184.0, + "grad_norm": 0.029063497479097016, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74699497, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.68359375, + "step": 312, + "time_per_iteration": 4.972649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249775, + "balance_loss_mlp": 1.08021212, + "epoch": 0.06021546748749519, + "flos": 483627523584.0, + "grad_norm": 0.0314235925459163, + "language_loss": 0.98280072, + "learning_rate": 0.0009976095095472243, + "loss": 0.9952985, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.69335938, + "step": 313, + "time_per_iteration": 2.5644209384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125234, + "balance_loss_mlp": 1.08287179, + "epoch": 0.06040784917275875, + "flos": 621423143424.0, + "grad_norm": 0.030123719928355924, + "language_loss": 0.99538821, + "learning_rate": 0.0009975789851353334, + "loss": 1.00791156, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.69238281, + "step": 314, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256592, + "balance_loss_mlp": 1.08741045, + "epoch": 0.06060023085802232, + "flos": 484602441216.0, + "grad_norm": 0.026992074473858402, + "language_loss": 1.01683283, + "learning_rate": 0.0009975482675461487, + "loss": 1.02939868, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.68945312, + "step": 315, + "time_per_iteration": 2.67146897315979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_mlp": 1.08054566, + "epoch": 0.06079261254328588, + "flos": 582985652736.0, + "grad_norm": 0.0292304668639163, + "language_loss": 0.99909455, + "learning_rate": 0.0009975173567915952, + "loss": 1.01158559, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.68310547, + "step": 316, + "time_per_iteration": 2.693526268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124983, + "balance_loss_mlp": 1.08131599, + "epoch": 0.060984994228549444, + "flos": 689008298496.0, + "grad_norm": 0.03272213432041067, + "language_loss": 0.93868685, + "learning_rate": 0.000997486252883674, + "loss": 0.95118511, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.68261719, + "step": 317, + "time_per_iteration": 2.837315082550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252509, + "balance_loss_mlp": 1.08399427, + "epoch": 0.061177375913813004, + "flos": 1316747398656.0, + "grad_norm": 0.031012352820614663, + "language_loss": 0.98949343, + "learning_rate": 0.0009974549558344602, + "loss": 1.00201845, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.68261719, + "step": 318, + "time_per_iteration": 3.686920166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_mlp": 1.08321846, + "epoch": 0.06136975759907657, + "flos": 575400612864.0, + "grad_norm": 0.027925836735275204, + "language_loss": 1.08640313, + "learning_rate": 0.000997423465656105, + "loss": 1.09892082, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.68310547, + "step": 319, + "time_per_iteration": 2.7691538333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250553, + "balance_loss_mlp": 1.08218133, + "epoch": 0.06156213928434013, + "flos": 528564346368.0, + "grad_norm": 0.033042319608268485, + "language_loss": 1.06051123, + "learning_rate": 0.0009973917823608335, + "loss": 1.07301688, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.68115234, + "step": 320, + "time_per_iteration": 2.583859443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251303, + "balance_loss_mlp": 1.08364725, + "epoch": 0.061754520969603696, + "flos": 496589984256.0, + "grad_norm": 0.025351519610416894, + "language_loss": 0.99929821, + "learning_rate": 0.0009973599059609462, + "loss": 1.01181126, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.67382812, + "step": 321, + "time_per_iteration": 2.7139415740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246641, + "balance_loss_mlp": 1.07893777, + "epoch": 0.061946902654867256, + "flos": 441044038656.0, + "grad_norm": 0.025867704850659153, + "language_loss": 0.98033404, + "learning_rate": 0.000997327836468819, + "loss": 0.99280047, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.67431641, + "step": 322, + "time_per_iteration": 2.598400831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250362, + "balance_loss_mlp": 1.08280182, + "epoch": 0.06213928434013082, + "flos": 600042441216.0, + "grad_norm": 0.02535167136018297, + "language_loss": 1.01516175, + "learning_rate": 0.000997295573896902, + "loss": 1.02766538, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.67285156, + "step": 323, + "time_per_iteration": 2.8295648097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125071, + "balance_loss_mlp": 1.0847702, + "epoch": 0.06233166602539438, + "flos": 1453114384896.0, + "grad_norm": 0.012451454042686489, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82446748, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.65625, + "step": 324, + "time_per_iteration": 4.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244164, + "balance_loss_mlp": 1.07803345, + "epoch": 0.06252404771065795, + "flos": 1466628794880.0, + "grad_norm": 0.009026829376029815, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79816103, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.65820312, + "step": 325, + "time_per_iteration": 4.859014272689819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252677, + "balance_loss_mlp": 1.08535445, + "epoch": 0.06271642939592151, + "flos": 465235975680.0, + "grad_norm": 0.02899330239765154, + "language_loss": 0.95714885, + "learning_rate": 0.000997197627828043, + "loss": 0.96967566, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.67041016, + "step": 326, + "time_per_iteration": 2.5137081146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250284, + "balance_loss_mlp": 1.08343852, + "epoch": 0.06290881108118507, + "flos": 533431477248.0, + "grad_norm": 0.02712212536791958, + "language_loss": 0.90827119, + "learning_rate": 0.0009971645930629716, + "loss": 0.92077404, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.66552734, + "step": 327, + "time_per_iteration": 2.6867988109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_mlp": 1.08260453, + "epoch": 0.06310119276644863, + "flos": 674767474176.0, + "grad_norm": 0.026247049513885422, + "language_loss": 1.04735494, + "learning_rate": 0.0009971313652814872, + "loss": 1.0598489, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.66503906, + "step": 328, + "time_per_iteration": 2.845618724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245995, + "balance_loss_mlp": 1.07924485, + "epoch": 0.0632935744517122, + "flos": 772050241536.0, + "grad_norm": 0.03020034978800923, + "language_loss": 1.02482498, + "learning_rate": 0.0009970979444964903, + "loss": 1.03728485, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.66455078, + "step": 329, + "time_per_iteration": 2.967315196990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249674, + "balance_loss_mlp": 1.08316231, + "epoch": 0.06348595613697576, + "flos": 562974638592.0, + "grad_norm": 0.027434293654228625, + "language_loss": 1.03562641, + "learning_rate": 0.0009970643307209556, + "loss": 1.04812312, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.66210938, + "step": 330, + "time_per_iteration": 2.7991747856140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_mlp": 1.0814544, + "epoch": 0.06367833782223932, + "flos": 677383325184.0, + "grad_norm": 0.030236705728133754, + "language_loss": 1.00163436, + "learning_rate": 0.0009970305239679334, + "loss": 1.01411343, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.66162109, + "step": 331, + "time_per_iteration": 2.8012547492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243208, + "balance_loss_mlp": 1.07669675, + "epoch": 0.06387071950750288, + "flos": 496348938240.0, + "grad_norm": 0.029279450628507057, + "language_loss": 1.04491925, + "learning_rate": 0.0009969965242505483, + "loss": 1.05735123, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.66210938, + "step": 332, + "time_per_iteration": 2.658085584640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251001, + "balance_loss_mlp": 1.08463287, + "epoch": 0.06406310119276645, + "flos": 534556116480.0, + "grad_norm": 0.029350032940601952, + "language_loss": 1.00548685, + "learning_rate": 0.0009969623315820007, + "loss": 1.01799679, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.66064453, + "step": 333, + "time_per_iteration": 2.6670596599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238877, + "balance_loss_mlp": 1.07246125, + "epoch": 0.06425548287803001, + "flos": 457164840960.0, + "grad_norm": 0.03277849846880731, + "language_loss": 1.00979996, + "learning_rate": 0.000996927945975565, + "loss": 1.02218866, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 1.66113281, + "step": 334, + "time_per_iteration": 2.5448765754699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245409, + "balance_loss_mlp": 1.0792315, + "epoch": 0.06444786456329357, + "flos": 561122858496.0, + "grad_norm": 0.03573042475309631, + "language_loss": 0.98108363, + "learning_rate": 0.0009968933674445906, + "loss": 0.99353766, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 1.65869141, + "step": 335, + "time_per_iteration": 2.679093360900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242425, + "balance_loss_mlp": 1.07672429, + "epoch": 0.06464024624855713, + "flos": 667356350976.0, + "grad_norm": 0.0316377115871937, + "language_loss": 0.99817598, + "learning_rate": 0.0009968585960025028, + "loss": 1.01060021, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 1.65380859, + "step": 336, + "time_per_iteration": 2.9642832279205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246223, + "balance_loss_mlp": 1.08085632, + "epoch": 0.0648326279338207, + "flos": 1524555549696.0, + "grad_norm": 0.012731648189289846, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78899413, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.65039062, + "step": 337, + "time_per_iteration": 4.799122333526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_mlp": 1.07683408, + "epoch": 0.06502500961908426, + "flos": 1145214959616.0, + "grad_norm": 0.030168792806873873, + "language_loss": 0.98216963, + "learning_rate": 0.0009967884744390583, + "loss": 0.99459207, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 1.65087891, + "step": 338, + "time_per_iteration": 3.513155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243978, + "balance_loss_mlp": 1.07865858, + "epoch": 0.06521739130434782, + "flos": 583693327872.0, + "grad_norm": 0.025823410577593665, + "language_loss": 0.98998213, + "learning_rate": 0.0009967531243449256, + "loss": 1.00242186, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 1.64990234, + "step": 339, + "time_per_iteration": 2.6683707237243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_mlp": 1.07453787, + "epoch": 0.06540977298961138, + "flos": 498658615296.0, + "grad_norm": 0.02384437782241591, + "language_loss": 1.06067204, + "learning_rate": 0.000996717581394126, + "loss": 1.07306671, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 1.64599609, + "step": 340, + "time_per_iteration": 2.5471885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236124, + "balance_loss_mlp": 1.07171023, + "epoch": 0.06560215467487496, + "flos": 543903613440.0, + "grad_norm": 0.02318937955413124, + "language_loss": 1.0712086, + "learning_rate": 0.000996681845600459, + "loss": 1.08356977, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 1.640625, + "step": 341, + "time_per_iteration": 2.651742458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240028, + "balance_loss_mlp": 1.07575738, + "epoch": 0.06579453636013852, + "flos": 414351043584.0, + "grad_norm": 0.026316803994829763, + "language_loss": 0.99228215, + "learning_rate": 0.0009966459169777982, + "loss": 1.00468254, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 1.63916016, + "step": 342, + "time_per_iteration": 2.4996230602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244627, + "balance_loss_mlp": 1.08045232, + "epoch": 0.06598691804540208, + "flos": 561680812032.0, + "grad_norm": 0.03097158399986616, + "language_loss": 1.07124209, + "learning_rate": 0.0009966097955400924, + "loss": 1.08368838, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 1.63818359, + "step": 343, + "time_per_iteration": 2.7243080139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238691, + "balance_loss_mlp": 1.07451606, + "epoch": 0.06617929973066564, + "flos": 573301782528.0, + "grad_norm": 0.022915441754152527, + "language_loss": 1.00964892, + "learning_rate": 0.0009965734813013652, + "loss": 1.02203584, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 1.63818359, + "step": 344, + "time_per_iteration": 2.8087360858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237027, + "balance_loss_mlp": 1.07375824, + "epoch": 0.06637168141592921, + "flos": 491464343040.0, + "grad_norm": 0.024444849604151265, + "language_loss": 1.03758335, + "learning_rate": 0.0009965369742757151, + "loss": 1.04995358, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 1.62890625, + "step": 345, + "time_per_iteration": 2.5691587924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237907, + "balance_loss_mlp": 1.07459044, + "epoch": 0.06656406310119277, + "flos": 1081037924352.0, + "grad_norm": 0.024807678995847144, + "language_loss": 0.99529493, + "learning_rate": 0.0009965002744773152, + "loss": 1.00767398, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 1.62939453, + "step": 346, + "time_per_iteration": 3.507969856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239522, + "balance_loss_mlp": 1.07611036, + "epoch": 0.06675644478645633, + "flos": 514723021824.0, + "grad_norm": 0.02663627628784384, + "language_loss": 0.97097999, + "learning_rate": 0.0009964633819204139, + "loss": 0.98337519, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 1.63037109, + "step": 347, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_mlp": 1.09986115, + "epoch": 0.06694882647171989, + "flos": 1450534189056.0, + "grad_norm": 0.030948258254188146, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83063102, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 1.6171875, + "step": 348, + "time_per_iteration": 5.152506589889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236206, + "balance_loss_mlp": 1.07427216, + "epoch": 0.06714120815698346, + "flos": 1555397266944.0, + "grad_norm": 0.0077968992848742235, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76390088, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.61523438, + "step": 349, + "time_per_iteration": 4.909464120864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242005, + "balance_loss_mlp": 1.07873547, + "epoch": 0.06733358984224702, + "flos": 881615992320.0, + "grad_norm": 0.03432587789196913, + "language_loss": 0.97228402, + "learning_rate": 0.000996351547842304, + "loss": 0.98470408, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 1.62890625, + "step": 350, + "time_per_iteration": 3.1799545288085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240315, + "balance_loss_mlp": 1.0778569, + "epoch": 0.06752597152751058, + "flos": 519917793792.0, + "grad_norm": 0.030803186893757592, + "language_loss": 0.96182388, + "learning_rate": 0.0009963138843953744, + "loss": 0.97422707, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 1.62060547, + "step": 351, + "time_per_iteration": 2.5873348712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238163, + "balance_loss_mlp": 1.07565665, + "epoch": 0.06771835321277414, + "flos": 540882258432.0, + "grad_norm": 0.023778523337364334, + "language_loss": 0.99575555, + "learning_rate": 0.000996276028262306, + "loss": 1.00813723, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 1.62109375, + "step": 352, + "time_per_iteration": 2.7943532466888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238104, + "balance_loss_mlp": 1.07583654, + "epoch": 0.0679107348980377, + "flos": 461615007744.0, + "grad_norm": 0.02720743117278016, + "language_loss": 1.06749547, + "learning_rate": 0.0009962379794577964, + "loss": 1.07987642, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 1.61865234, + "step": 353, + "time_per_iteration": 2.589200973510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239427, + "balance_loss_mlp": 1.07711196, + "epoch": 0.06810311658330127, + "flos": 637207572480.0, + "grad_norm": 0.02321502152829773, + "language_loss": 0.95908678, + "learning_rate": 0.000996199737996617, + "loss": 0.97148108, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 1.61914062, + "step": 354, + "time_per_iteration": 2.8822708129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123871, + "balance_loss_mlp": 1.07687151, + "epoch": 0.06829549826856483, + "flos": 465626743296.0, + "grad_norm": 0.030894548658215056, + "language_loss": 1.05554581, + "learning_rate": 0.0009961613038936149, + "loss": 1.06793284, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 1.61425781, + "step": 355, + "time_per_iteration": 2.576930522918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.07456315, + "epoch": 0.06848787995382839, + "flos": 635896281600.0, + "grad_norm": 0.0286185110148739, + "language_loss": 0.9730283, + "learning_rate": 0.000996122677163711, + "loss": 0.98538941, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 1.61132812, + "step": 356, + "time_per_iteration": 2.850829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237686, + "balance_loss_mlp": 1.07637215, + "epoch": 0.06868026163909195, + "flos": 807780556800.0, + "grad_norm": 0.03078602082995562, + "language_loss": 1.03526855, + "learning_rate": 0.000996083857821902, + "loss": 1.04764557, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 1.60888672, + "step": 357, + "time_per_iteration": 3.124053716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237273, + "balance_loss_mlp": 1.07605469, + "epoch": 0.06887264332435553, + "flos": 440151713280.0, + "grad_norm": 0.02263887650004652, + "language_loss": 1.01701617, + "learning_rate": 0.0009960448458832588, + "loss": 1.0293889, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 1.60791016, + "step": 358, + "time_per_iteration": 2.6918816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242041, + "balance_loss_mlp": 1.08077514, + "epoch": 0.06906502500961909, + "flos": 485785477632.0, + "grad_norm": 0.021707311176365728, + "language_loss": 1.01897752, + "learning_rate": 0.000996005641362927, + "loss": 1.03139794, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 1.60839844, + "step": 359, + "time_per_iteration": 2.601358652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238764, + "balance_loss_mlp": 1.07725942, + "epoch": 0.06925740669488265, + "flos": 734885110272.0, + "grad_norm": 0.024380378407611886, + "language_loss": 1.04387617, + "learning_rate": 0.0009959662442761274, + "loss": 1.05626392, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 1.61083984, + "step": 360, + "time_per_iteration": 2.9404215812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236823, + "balance_loss_mlp": 1.07589066, + "epoch": 0.0694497883801462, + "flos": 553570745856.0, + "grad_norm": 0.023221163769242582, + "language_loss": 0.97943044, + "learning_rate": 0.000995926654638155, + "loss": 0.99179876, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 1.60498047, + "step": 361, + "time_per_iteration": 2.811624526977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234495, + "balance_loss_mlp": 1.07413495, + "epoch": 0.06964217006540978, + "flos": 679243837440.0, + "grad_norm": 0.025577226237571565, + "language_loss": 1.00741839, + "learning_rate": 0.00099588687246438, + "loss": 1.01976323, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 1.59912109, + "step": 362, + "time_per_iteration": 2.826204538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235331, + "balance_loss_mlp": 1.0749228, + "epoch": 0.06983455175067334, + "flos": 525260285952.0, + "grad_norm": 0.054619150892928216, + "language_loss": 1.0805161, + "learning_rate": 0.0009958468977702471, + "loss": 1.09286952, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 1.59960938, + "step": 363, + "time_per_iteration": 2.5742297172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_mlp": 1.11000061, + "epoch": 0.0700269334359369, + "flos": 1580173353984.0, + "grad_norm": 0.0347214045967213, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81004167, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.59179688, + "step": 364, + "time_per_iteration": 4.815373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234235, + "balance_loss_mlp": 1.07420838, + "epoch": 0.07021931512120046, + "flos": 1014856659456.0, + "grad_norm": 0.027565425727799023, + "language_loss": 0.95424879, + "learning_rate": 0.0009957663708830612, + "loss": 0.96659118, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 1.59667969, + "step": 365, + "time_per_iteration": 3.3032214641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238249, + "balance_loss_mlp": 1.07874703, + "epoch": 0.07041169680646403, + "flos": 824431114752.0, + "grad_norm": 0.03609893162101238, + "language_loss": 0.99641442, + "learning_rate": 0.0009957258187212714, + "loss": 1.00879693, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 1.59228516, + "step": 366, + "time_per_iteration": 3.143951654434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_mlp": 1.0748291, + "epoch": 0.07060407849172759, + "flos": 1417290743808.0, + "grad_norm": 0.015479474187128486, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80427808, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.578125, + "step": 367, + "time_per_iteration": 4.856614112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232866, + "balance_loss_mlp": 1.07417488, + "epoch": 0.07079646017699115, + "flos": 513941486592.0, + "grad_norm": 0.03158452537667852, + "language_loss": 0.9606331, + "learning_rate": 0.0009956441370400167, + "loss": 0.97296178, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 1.58398438, + "step": 368, + "time_per_iteration": 2.6471550464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231431, + "balance_loss_mlp": 1.07288289, + "epoch": 0.07098884186225471, + "flos": 541548274176.0, + "grad_norm": 0.03366854249700899, + "language_loss": 1.02536654, + "learning_rate": 0.0009956030075522636, + "loss": 1.03768086, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 1.58251953, + "step": 369, + "time_per_iteration": 2.764350175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_mlp": 1.07183695, + "epoch": 0.07118122354751828, + "flos": 549738931200.0, + "grad_norm": 0.025388205653796188, + "language_loss": 1.02520657, + "learning_rate": 0.0009955616856543587, + "loss": 1.03751087, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 1.58300781, + "step": 370, + "time_per_iteration": 2.6488449573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233332, + "balance_loss_mlp": 1.07483125, + "epoch": 0.07137360523278184, + "flos": 622076424192.0, + "grad_norm": 0.025131147277089937, + "language_loss": 0.94016552, + "learning_rate": 0.0009955201713623448, + "loss": 0.95249885, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 1.58203125, + "step": 371, + "time_per_iteration": 2.7475128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231201, + "balance_loss_mlp": 1.07594299, + "epoch": 0.0715659869180454, + "flos": 1505973347328.0, + "grad_norm": 0.011087848535678398, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77903926, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 1.55664062, + "step": 372, + "time_per_iteration": 4.930227518081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.0769937, + "epoch": 0.07175836860330896, + "flos": 496481195520.0, + "grad_norm": 0.02946804107059058, + "language_loss": 1.07406306, + "learning_rate": 0.0009954365656605333, + "loss": 1.08641148, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 1.57910156, + "step": 373, + "time_per_iteration": 2.5494606494903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235693, + "balance_loss_mlp": 1.07862246, + "epoch": 0.07195075028857253, + "flos": 787081333248.0, + "grad_norm": 0.030340412148976308, + "language_loss": 1.00769055, + "learning_rate": 0.0009953944742831947, + "loss": 1.02004743, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 1.57519531, + "step": 374, + "time_per_iteration": 2.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234053, + "balance_loss_mlp": 1.07707787, + "epoch": 0.0721431319738361, + "flos": 594346111488.0, + "grad_norm": 0.024760984543104554, + "language_loss": 1.04227853, + "learning_rate": 0.0009953521905766642, + "loss": 1.05461907, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 1.57421875, + "step": 375, + "time_per_iteration": 2.9470102787017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233349, + "balance_loss_mlp": 1.07642198, + "epoch": 0.07233551365909965, + "flos": 549328697856.0, + "grad_norm": 0.025099095391344205, + "language_loss": 1.02903581, + "learning_rate": 0.0009953097145573577, + "loss": 1.04136944, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 1.57373047, + "step": 376, + "time_per_iteration": 2.656438112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232315, + "balance_loss_mlp": 1.0754832, + "epoch": 0.07252789534436321, + "flos": 959167723008.0, + "grad_norm": 0.028756244795243427, + "language_loss": 1.01008701, + "learning_rate": 0.000995267046241766, + "loss": 1.02241015, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 1.57275391, + "step": 377, + "time_per_iteration": 3.2601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226098, + "balance_loss_mlp": 1.06931448, + "epoch": 0.07272027702962677, + "flos": 508655390208.0, + "grad_norm": 0.025279277167219092, + "language_loss": 1.00209188, + "learning_rate": 0.0009952241856464547, + "loss": 1.01435292, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 1.57226562, + "step": 378, + "time_per_iteration": 2.616483688354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228279, + "balance_loss_mlp": 1.07159042, + "epoch": 0.07291265871489035, + "flos": 613551395328.0, + "grad_norm": 0.025059419305224793, + "language_loss": 1.0761106, + "learning_rate": 0.0009951811327880632, + "loss": 1.08839345, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 1.57128906, + "step": 379, + "time_per_iteration": 2.7666382789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_mlp": 1.07063651, + "epoch": 0.0731050404001539, + "flos": 496741707264.0, + "grad_norm": 0.032880990240464036, + "language_loss": 1.00766444, + "learning_rate": 0.0009951378876833063, + "loss": 1.01993108, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 1.56445312, + "step": 380, + "time_per_iteration": 2.5504086017608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230504, + "balance_loss_mlp": 1.07433975, + "epoch": 0.07329742208541747, + "flos": 641129985024.0, + "grad_norm": 0.0343074889031262, + "language_loss": 1.0780232, + "learning_rate": 0.0009950944503489736, + "loss": 1.0903281, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 1.56591797, + "step": 381, + "time_per_iteration": 2.7695260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231248, + "balance_loss_mlp": 1.07537043, + "epoch": 0.07348980377068103, + "flos": 817740401664.0, + "grad_norm": 0.027198888726283066, + "language_loss": 1.01785743, + "learning_rate": 0.0009950508208019285, + "loss": 1.03016996, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 1.56298828, + "step": 382, + "time_per_iteration": 2.9918277263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227944, + "balance_loss_mlp": 1.07187521, + "epoch": 0.0736821854559446, + "flos": 509669239296.0, + "grad_norm": 0.03113985633155724, + "language_loss": 1.05612254, + "learning_rate": 0.0009950069990591096, + "loss": 1.06840205, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 1.56494141, + "step": 383, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_mlp": 1.09392548, + "epoch": 0.07387456714120816, + "flos": 1558048046592.0, + "grad_norm": 0.03338671968111017, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77649409, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 1.54492188, + "step": 384, + "time_per_iteration": 4.854166269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229749, + "balance_loss_mlp": 1.0736798, + "epoch": 0.07406694882647172, + "flos": 526643435520.0, + "grad_norm": 0.03274978311793036, + "language_loss": 0.98781282, + "learning_rate": 0.0009949187790542777, + "loss": 1.00011039, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 1.56494141, + "step": 385, + "time_per_iteration": 2.728701591491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123258, + "balance_loss_mlp": 1.07636821, + "epoch": 0.07425933051173528, + "flos": 498823799808.0, + "grad_norm": 0.026908846939264777, + "language_loss": 0.94723004, + "learning_rate": 0.0009948743808265148, + "loss": 0.95955586, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 1.56640625, + "step": 386, + "time_per_iteration": 2.6850693225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231135, + "balance_loss_mlp": 1.07511437, + "epoch": 0.07445171219699885, + "flos": 506057003520.0, + "grad_norm": 0.05633654869747302, + "language_loss": 1.04553366, + "learning_rate": 0.0009948297904714782, + "loss": 1.05784488, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 1.56445312, + "step": 387, + "time_per_iteration": 2.6746010780334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231627, + "balance_loss_mlp": 1.07555866, + "epoch": 0.07464409388226241, + "flos": 555116352000.0, + "grad_norm": 0.03450843374667126, + "language_loss": 0.9665134, + "learning_rate": 0.0009947850080064796, + "loss": 0.97882968, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 1.56494141, + "step": 388, + "time_per_iteration": 2.7839057445526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230193, + "balance_loss_mlp": 1.07431459, + "epoch": 0.07483647556752597, + "flos": 778274325504.0, + "grad_norm": 0.021592891008175935, + "language_loss": 1.01240289, + "learning_rate": 0.0009947400334489047, + "loss": 1.02470493, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 1.56298828, + "step": 389, + "time_per_iteration": 2.9945342540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_mlp": 1.07411718, + "epoch": 0.07502885725278953, + "flos": 613681651200.0, + "grad_norm": 0.023383004705128753, + "language_loss": 0.92341155, + "learning_rate": 0.0009946948668162145, + "loss": 0.93570244, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 1.55371094, + "step": 390, + "time_per_iteration": 2.7355024814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122989, + "balance_loss_mlp": 1.07496524, + "epoch": 0.0752212389380531, + "flos": 689854961664.0, + "grad_norm": 0.026752200694656208, + "language_loss": 0.97335494, + "learning_rate": 0.0009946495081259441, + "loss": 0.98565376, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 1.55322266, + "step": 391, + "time_per_iteration": 2.799938678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227768, + "balance_loss_mlp": 1.07303405, + "epoch": 0.07541362062331666, + "flos": 767050853376.0, + "grad_norm": 0.02596026064524479, + "language_loss": 1.01604676, + "learning_rate": 0.0009946039573957035, + "loss": 1.02832437, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 1.55126953, + "step": 392, + "time_per_iteration": 2.932504415512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123199, + "balance_loss_mlp": 1.07768571, + "epoch": 0.07560600230858022, + "flos": 589908679680.0, + "grad_norm": 0.028382748029943367, + "language_loss": 0.97495323, + "learning_rate": 0.000994558214643177, + "loss": 0.98727316, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.752694845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228178, + "balance_loss_mlp": 1.07425475, + "epoch": 0.07579838399384378, + "flos": 751144900608.0, + "grad_norm": 0.028291982513743617, + "language_loss": 0.99160051, + "learning_rate": 0.000994512279886123, + "loss": 1.00388229, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 1.54296875, + "step": 394, + "time_per_iteration": 3.06592059135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228894, + "balance_loss_mlp": 1.07530475, + "epoch": 0.07599076567910736, + "flos": 524550609408.0, + "grad_norm": 0.023352712612718218, + "language_loss": 0.98641121, + "learning_rate": 0.0009944661531423758, + "loss": 0.99870014, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 1.53955078, + "step": 395, + "time_per_iteration": 2.6720728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122919, + "balance_loss_mlp": 1.07555354, + "epoch": 0.07618314736437092, + "flos": 552185594880.0, + "grad_norm": 0.026216962171459895, + "language_loss": 0.97914684, + "learning_rate": 0.000994419834429843, + "loss": 0.99143875, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 1.54003906, + "step": 396, + "time_per_iteration": 2.6652910709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226861, + "balance_loss_mlp": 1.07308066, + "epoch": 0.07637552904963447, + "flos": 699432771072.0, + "grad_norm": 0.029361663168223213, + "language_loss": 1.03114796, + "learning_rate": 0.0009943733237665069, + "loss": 1.0434165, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 1.54150391, + "step": 397, + "time_per_iteration": 2.808711290359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227023, + "balance_loss_mlp": 1.07329071, + "epoch": 0.07656791073489803, + "flos": 580635042816.0, + "grad_norm": 0.02000560632750303, + "language_loss": 1.01598048, + "learning_rate": 0.0009943266211704248, + "loss": 1.02825069, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 1.54101562, + "step": 398, + "time_per_iteration": 2.9420461654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226854, + "balance_loss_mlp": 1.0732646, + "epoch": 0.0767602924201616, + "flos": 418037139456.0, + "grad_norm": 0.02425852476792673, + "language_loss": 1.03237891, + "learning_rate": 0.000994279726659728, + "loss": 1.04464746, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 1.53955078, + "step": 399, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230296, + "balance_loss_mlp": 1.07675469, + "epoch": 0.07695267410542517, + "flos": 483888035328.0, + "grad_norm": 0.030174375239475117, + "language_loss": 1.02145576, + "learning_rate": 0.0009942326402526231, + "loss": 1.03375876, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 1.5390625, + "step": 400, + "time_per_iteration": 2.5265390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224857, + "balance_loss_mlp": 1.07184029, + "epoch": 0.07714505579068873, + "flos": 532026860544.0, + "grad_norm": 0.024483465572707617, + "language_loss": 0.99344772, + "learning_rate": 0.0009941853619673902, + "loss": 1.0056963, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 1.53369141, + "step": 401, + "time_per_iteration": 2.660491704940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224912, + "balance_loss_mlp": 1.07218146, + "epoch": 0.07733743747595229, + "flos": 806439066624.0, + "grad_norm": 0.032921156451595594, + "language_loss": 1.03587961, + "learning_rate": 0.0009941378918223844, + "loss": 1.04812872, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 1.53076172, + "step": 402, + "time_per_iteration": 3.078272819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222316, + "balance_loss_mlp": 1.06972802, + "epoch": 0.07752981916121585, + "flos": 623613298176.0, + "grad_norm": 0.02596227047756477, + "language_loss": 0.96322513, + "learning_rate": 0.0009940902298360354, + "loss": 0.97544825, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 1.52929688, + "step": 403, + "time_per_iteration": 2.78222918510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224993, + "balance_loss_mlp": 1.07288182, + "epoch": 0.07772220084647942, + "flos": 729542618112.0, + "grad_norm": 0.031231063897144088, + "language_loss": 1.06544566, + "learning_rate": 0.0009940423760268473, + "loss": 1.07769561, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 1.52441406, + "step": 404, + "time_per_iteration": 2.8572018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226552, + "balance_loss_mlp": 1.07472658, + "epoch": 0.07791458253174298, + "flos": 556468575744.0, + "grad_norm": 0.029548764371286118, + "language_loss": 0.99639893, + "learning_rate": 0.0009939943304133982, + "loss": 1.00866449, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 1.52148438, + "step": 405, + "time_per_iteration": 2.607412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226106, + "balance_loss_mlp": 1.07409084, + "epoch": 0.07810696421700654, + "flos": 554234760192.0, + "grad_norm": 0.031141101296471768, + "language_loss": 1.06411445, + "learning_rate": 0.0009939460930143416, + "loss": 1.07637548, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 1.5234375, + "step": 406, + "time_per_iteration": 2.6132876873016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223027, + "balance_loss_mlp": 1.07120168, + "epoch": 0.0782993459022701, + "flos": 651878095872.0, + "grad_norm": 0.023437908852709077, + "language_loss": 1.00106847, + "learning_rate": 0.0009938976638484043, + "loss": 1.01329875, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 1.52148438, + "step": 407, + "time_per_iteration": 2.905681610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218745, + "balance_loss_mlp": 1.06691968, + "epoch": 0.07849172758753367, + "flos": 497160672768.0, + "grad_norm": 0.02891290096917658, + "language_loss": 0.99991584, + "learning_rate": 0.0009938490429343887, + "loss": 1.01210332, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 1.52148438, + "step": 408, + "time_per_iteration": 2.539567708969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222677, + "balance_loss_mlp": 1.07066166, + "epoch": 0.07868410927279723, + "flos": 579075975168.0, + "grad_norm": 0.030601656563413092, + "language_loss": 0.99965751, + "learning_rate": 0.0009938002302911709, + "loss": 1.01188421, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 1.5234375, + "step": 409, + "time_per_iteration": 2.732064962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220028, + "balance_loss_mlp": 1.0680126, + "epoch": 0.07887649095806079, + "flos": 524066515968.0, + "grad_norm": 0.03256443285635905, + "language_loss": 1.03146362, + "learning_rate": 0.0009937512259377015, + "loss": 1.04366398, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 1.5234375, + "step": 410, + "time_per_iteration": 2.6500303745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221864, + "balance_loss_mlp": 1.07013464, + "epoch": 0.07906887264332435, + "flos": 558437876736.0, + "grad_norm": 0.023780630120827737, + "language_loss": 1.01466393, + "learning_rate": 0.000993702029893006, + "loss": 1.02688265, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 1.52050781, + "step": 411, + "time_per_iteration": 2.7921671867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221791, + "balance_loss_mlp": 1.07010949, + "epoch": 0.07926125432858792, + "flos": 823362871296.0, + "grad_norm": 0.04077078343290612, + "language_loss": 1.01153946, + "learning_rate": 0.0009936526421761838, + "loss": 1.02375734, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 1.52001953, + "step": 412, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217861, + "balance_loss_mlp": 1.06632257, + "epoch": 0.07945363601385148, + "flos": 563393604096.0, + "grad_norm": 0.02717343044282308, + "language_loss": 1.04004121, + "learning_rate": 0.000993603062806409, + "loss": 1.05221987, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 1.51855469, + "step": 413, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219172, + "balance_loss_mlp": 1.06844354, + "epoch": 0.07964601769911504, + "flos": 518884478976.0, + "grad_norm": 0.031245789494761384, + "language_loss": 1.07179379, + "learning_rate": 0.0009935532918029298, + "loss": 1.08398533, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 1.51025391, + "step": 414, + "time_per_iteration": 2.668151617050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224604, + "balance_loss_mlp": 1.07387555, + "epoch": 0.0798383993843786, + "flos": 540300109824.0, + "grad_norm": 0.025221671350570463, + "language_loss": 0.99906069, + "learning_rate": 0.0009935033291850694, + "loss": 1.01130676, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 1.51025391, + "step": 415, + "time_per_iteration": 2.64747953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.06547058, + "epoch": 0.08003078106964218, + "flos": 486121850880.0, + "grad_norm": 0.027121462600521052, + "language_loss": 1.02766061, + "learning_rate": 0.0009934531749722247, + "loss": 1.03982067, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 1.50830078, + "step": 416, + "time_per_iteration": 2.5705764293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121625, + "balance_loss_mlp": 1.06576049, + "epoch": 0.08022316275490574, + "flos": 519275246592.0, + "grad_norm": 0.027391361962933233, + "language_loss": 1.00515926, + "learning_rate": 0.0009934028291838672, + "loss": 1.01732171, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 1.5078125, + "step": 417, + "time_per_iteration": 2.7232770919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219761, + "balance_loss_mlp": 1.0695101, + "epoch": 0.0804155444401693, + "flos": 495046379520.0, + "grad_norm": 0.028534904701295792, + "language_loss": 0.95904237, + "learning_rate": 0.0009933522918395433, + "loss": 0.97123998, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 1.50537109, + "step": 418, + "time_per_iteration": 2.670992374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_mlp": 1.11595154, + "epoch": 0.08060792612543285, + "flos": 1584853833216.0, + "grad_norm": 0.03473829356439328, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79516399, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 1.49609375, + "step": 419, + "time_per_iteration": 4.9051830768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222046, + "balance_loss_mlp": 1.07246244, + "epoch": 0.08080030781069643, + "flos": 526358728704.0, + "grad_norm": 0.03232182071246488, + "language_loss": 1.15746891, + "learning_rate": 0.000993250642561551, + "loss": 1.16968942, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 1.49853516, + "step": 420, + "time_per_iteration": 2.596930503845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224313, + "balance_loss_mlp": 1.07487273, + "epoch": 0.08099268949595999, + "flos": 547756895232.0, + "grad_norm": 0.03306568774928502, + "language_loss": 1.00193918, + "learning_rate": 0.0009931995306673466, + "loss": 1.01418233, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 1.49707031, + "step": 421, + "time_per_iteration": 2.704012155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223697, + "balance_loss_mlp": 1.0744468, + "epoch": 0.08118507118122355, + "flos": 511373299200.0, + "grad_norm": 0.026268861479682264, + "language_loss": 1.0597651, + "learning_rate": 0.000993148227296103, + "loss": 1.07200205, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 1.49511719, + "step": 422, + "time_per_iteration": 2.6110117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224578, + "balance_loss_mlp": 1.0751853, + "epoch": 0.08137745286648711, + "flos": 722001239040.0, + "grad_norm": 0.024088300997991936, + "language_loss": 0.92380643, + "learning_rate": 0.000993096732467738, + "loss": 0.9360522, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 1.49658203, + "step": 423, + "time_per_iteration": 2.9790220260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224383, + "balance_loss_mlp": 1.0753237, + "epoch": 0.08156983455175067, + "flos": 680817641472.0, + "grad_norm": 0.029818930066630327, + "language_loss": 1.0177561, + "learning_rate": 0.0009930450462022435, + "loss": 1.02999997, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 1.49316406, + "step": 424, + "time_per_iteration": 2.8023674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223, + "balance_loss_mlp": 1.07518005, + "epoch": 0.08176221623701424, + "flos": 1456588359168.0, + "grad_norm": 0.012435251357338771, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80412811, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.48046875, + "step": 425, + "time_per_iteration": 4.96533989906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219597, + "balance_loss_mlp": 1.0711571, + "epoch": 0.0819545979222778, + "flos": 1558883071488.0, + "grad_norm": 0.04204100969257126, + "language_loss": 1.00605047, + "learning_rate": 0.0009929410994402065, + "loss": 1.01824641, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 1.48681641, + "step": 426, + "time_per_iteration": 3.850475311279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220758, + "balance_loss_mlp": 1.07236588, + "epoch": 0.08214697960754136, + "flos": 513800497152.0, + "grad_norm": 0.03975912273964659, + "language_loss": 1.03955805, + "learning_rate": 0.0009928888389840196, + "loss": 1.05176568, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 1.48632812, + "step": 427, + "time_per_iteration": 2.6892385482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224824, + "balance_loss_mlp": 1.07633698, + "epoch": 0.08233936129280492, + "flos": 596221360128.0, + "grad_norm": 0.02633667259549893, + "language_loss": 1.0604248, + "learning_rate": 0.0009928363871714147, + "loss": 1.07267296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 1.48730469, + "step": 428, + "time_per_iteration": 2.666851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224039, + "balance_loss_mlp": 1.07550442, + "epoch": 0.08253174297806849, + "flos": 573164795904.0, + "grad_norm": 0.03052010415677114, + "language_loss": 0.99677718, + "learning_rate": 0.0009927837440227556, + "loss": 1.00901759, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 1.48779297, + "step": 429, + "time_per_iteration": 2.810197591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228416, + "balance_loss_mlp": 1.07992899, + "epoch": 0.08272412466333205, + "flos": 624642610176.0, + "grad_norm": 0.029909202440675912, + "language_loss": 0.93710327, + "learning_rate": 0.0009927309095584798, + "loss": 0.94938743, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 1.48730469, + "step": 430, + "time_per_iteration": 2.98052978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122165, + "balance_loss_mlp": 1.07316256, + "epoch": 0.08291650634859561, + "flos": 514994267136.0, + "grad_norm": 0.038201439099628094, + "language_loss": 1.07072532, + "learning_rate": 0.0009926778837991, + "loss": 1.08294177, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 1.48730469, + "step": 431, + "time_per_iteration": 2.613912582397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223506, + "balance_loss_mlp": 1.07516193, + "epoch": 0.08310888803385917, + "flos": 668541388800.0, + "grad_norm": 0.02618037233016902, + "language_loss": 1.04762018, + "learning_rate": 0.000992624666765202, + "loss": 1.05985522, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 1.48583984, + "step": 432, + "time_per_iteration": 2.785602331161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224029, + "balance_loss_mlp": 1.07659137, + "epoch": 0.08330126971912274, + "flos": 584490326016.0, + "grad_norm": 0.023129420064945467, + "language_loss": 1.02043724, + "learning_rate": 0.000992571258477447, + "loss": 1.03267753, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 1.4765625, + "step": 433, + "time_per_iteration": 2.7774012088775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225333, + "balance_loss_mlp": 1.07799041, + "epoch": 0.0834936514043863, + "flos": 562497275904.0, + "grad_norm": 0.02412369992445121, + "language_loss": 0.95710295, + "learning_rate": 0.0009925176589565695, + "loss": 0.9693563, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 1.47558594, + "step": 434, + "time_per_iteration": 2.7975149154663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224713, + "balance_loss_mlp": 1.07751381, + "epoch": 0.08368603308964986, + "flos": 495513008640.0, + "grad_norm": 0.023499028814372425, + "language_loss": 1.06310439, + "learning_rate": 0.0009924638682233791, + "loss": 1.07535148, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 1.47412109, + "step": 435, + "time_per_iteration": 2.5623626708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247864, + "balance_loss_mlp": 1.10328674, + "epoch": 0.08387841477491342, + "flos": 1391808983040.0, + "grad_norm": 0.0329185074425942, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80812454, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.44726562, + "step": 436, + "time_per_iteration": 4.5364601612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219037, + "balance_loss_mlp": 1.07174218, + "epoch": 0.084070796460177, + "flos": 800353970688.0, + "grad_norm": 0.025226905267595717, + "language_loss": 0.95941472, + "learning_rate": 0.0009923557132036668, + "loss": 0.97160506, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 1.47509766, + "step": 437, + "time_per_iteration": 3.031538963317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219746, + "balance_loss_mlp": 1.07226074, + "epoch": 0.08426317814544056, + "flos": 560096274432.0, + "grad_norm": 0.024291343012928023, + "language_loss": 0.99699497, + "learning_rate": 0.0009923013489591345, + "loss": 1.00919247, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 1.47705078, + "step": 438, + "time_per_iteration": 2.741021156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217749, + "balance_loss_mlp": 1.07073975, + "epoch": 0.08445555983070412, + "flos": 811883616768.0, + "grad_norm": 0.02787309358423107, + "language_loss": 0.97740996, + "learning_rate": 0.0009922467935862681, + "loss": 0.98958743, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 1.47216797, + "step": 439, + "time_per_iteration": 3.0727341175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215984, + "balance_loss_mlp": 1.06907046, + "epoch": 0.08464794151596768, + "flos": 511169183232.0, + "grad_norm": 0.02418736148641671, + "language_loss": 1.01547837, + "learning_rate": 0.0009921920471062478, + "loss": 1.0276382, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 1.47119141, + "step": 440, + "time_per_iteration": 2.5793957710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214442, + "balance_loss_mlp": 1.06805265, + "epoch": 0.08484032320123125, + "flos": 557473692672.0, + "grad_norm": 0.02549300900866748, + "language_loss": 0.99590349, + "learning_rate": 0.0009921371095403281, + "loss": 1.00804806, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 1.46582031, + "step": 441, + "time_per_iteration": 2.633976936340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215261, + "balance_loss_mlp": 1.06887233, + "epoch": 0.08503270488649481, + "flos": 528360230400.0, + "grad_norm": 0.023285649852896013, + "language_loss": 1.02823853, + "learning_rate": 0.0009920819809098379, + "loss": 1.04039121, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 1.46582031, + "step": 442, + "time_per_iteration": 2.5975728034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213611, + "balance_loss_mlp": 1.06722176, + "epoch": 0.08522508657175837, + "flos": 615385711104.0, + "grad_norm": 0.021771679570127336, + "language_loss": 0.97986722, + "learning_rate": 0.0009920266612361798, + "loss": 0.99200332, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 1.46582031, + "step": 443, + "time_per_iteration": 2.7284042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214332, + "balance_loss_mlp": 1.06803846, + "epoch": 0.08541746825702193, + "flos": 620986713600.0, + "grad_norm": 0.024601404202987703, + "language_loss": 0.97963679, + "learning_rate": 0.0009919711505408308, + "loss": 0.9917801, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 1.46484375, + "step": 444, + "time_per_iteration": 2.797030448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216522, + "balance_loss_mlp": 1.07051492, + "epoch": 0.08560984994228549, + "flos": 483888035328.0, + "grad_norm": 0.023417740932750293, + "language_loss": 0.96522343, + "learning_rate": 0.000991915448845342, + "loss": 0.97738856, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 1.46191406, + "step": 445, + "time_per_iteration": 2.544638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_mlp": 1.06945765, + "epoch": 0.08580223162754906, + "flos": 518176803840.0, + "grad_norm": 0.025018627604332305, + "language_loss": 1.05275297, + "learning_rate": 0.000991859556171339, + "loss": 1.0649066, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 1.4609375, + "step": 446, + "time_per_iteration": 2.5865097045898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214045, + "balance_loss_mlp": 1.06856191, + "epoch": 0.08599461331281262, + "flos": 532519686144.0, + "grad_norm": 0.025883227843611877, + "language_loss": 1.07190132, + "learning_rate": 0.000991803472540521, + "loss": 1.08404183, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 1.45654297, + "step": 447, + "time_per_iteration": 2.6001055240631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213992, + "balance_loss_mlp": 1.06879497, + "epoch": 0.08618699499807618, + "flos": 791633558016.0, + "grad_norm": 0.022461373320799196, + "language_loss": 1.02303076, + "learning_rate": 0.0009917471979746615, + "loss": 1.03517067, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 1.45361328, + "step": 448, + "time_per_iteration": 2.9621376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218395, + "balance_loss_mlp": 1.07300746, + "epoch": 0.08637937668333974, + "flos": 567114628608.0, + "grad_norm": 0.02449904215267775, + "language_loss": 1.00404847, + "learning_rate": 0.0009916907324956086, + "loss": 1.01623249, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 1.45556641, + "step": 449, + "time_per_iteration": 2.691150188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214944, + "balance_loss_mlp": 1.0697943, + "epoch": 0.08657175836860331, + "flos": 446117286912.0, + "grad_norm": 0.025714213043280993, + "language_loss": 0.97109705, + "learning_rate": 0.0009916340761252837, + "loss": 0.98324645, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 1.453125, + "step": 450, + "time_per_iteration": 2.6118698120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212599, + "balance_loss_mlp": 1.067307, + "epoch": 0.08676414005386687, + "flos": 845588235264.0, + "grad_norm": 0.02612794411743426, + "language_loss": 0.94540501, + "learning_rate": 0.0009915772288856832, + "loss": 0.95753098, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 1.45458984, + "step": 451, + "time_per_iteration": 3.0883219242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213701, + "balance_loss_mlp": 1.06926715, + "epoch": 0.08695652173913043, + "flos": 604483875840.0, + "grad_norm": 0.02003375948944636, + "language_loss": 0.95739877, + "learning_rate": 0.000991520190798877, + "loss": 0.96953583, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 1.44580078, + "step": 452, + "time_per_iteration": 2.8387818336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213572, + "balance_loss_mlp": 1.06928122, + "epoch": 0.08714890342439399, + "flos": 732000015360.0, + "grad_norm": 0.027770143088691506, + "language_loss": 1.06693339, + "learning_rate": 0.0009914629618870089, + "loss": 1.07906914, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 1.44433594, + "step": 453, + "time_per_iteration": 2.9403207302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_mlp": 1.0905838, + "epoch": 0.08734128510965757, + "flos": 1485454044672.0, + "grad_norm": 0.02536208637588336, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79910266, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.43945312, + "step": 454, + "time_per_iteration": 4.803662061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121994, + "balance_loss_mlp": 1.07631683, + "epoch": 0.08753366679492113, + "flos": 1526266340352.0, + "grad_norm": 0.01817690946373191, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82647902, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.4375, + "step": 455, + "time_per_iteration": 4.812621355056763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213204, + "balance_loss_mlp": 1.06919885, + "epoch": 0.08772604848018468, + "flos": 722524263936.0, + "grad_norm": 0.030160618436618963, + "language_loss": 0.98162878, + "learning_rate": 0.0009912901304235883, + "loss": 0.99376082, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 1.44140625, + "step": 456, + "time_per_iteration": 2.9147355556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217818, + "balance_loss_mlp": 1.07386112, + "epoch": 0.08791843016544824, + "flos": 709466476032.0, + "grad_norm": 0.03064824893295274, + "language_loss": 0.96399593, + "learning_rate": 0.000991232138434397, + "loss": 0.97617412, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 1.44091797, + "step": 457, + "time_per_iteration": 2.8735082149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121922, + "balance_loss_mlp": 1.07540572, + "epoch": 0.08811081185071182, + "flos": 474021516288.0, + "grad_norm": 0.03193385229896835, + "language_loss": 1.03185177, + "learning_rate": 0.000991173955731976, + "loss": 1.04404402, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 1.43945312, + "step": 458, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220724, + "balance_loss_mlp": 1.07762539, + "epoch": 0.08830319353597538, + "flos": 686314584576.0, + "grad_norm": 0.057581270182385194, + "language_loss": 1.06524456, + "learning_rate": 0.0009911155823389137, + "loss": 1.07745171, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 1.43212891, + "step": 459, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218235, + "balance_loss_mlp": 1.07513571, + "epoch": 0.08849557522123894, + "flos": 574608344064.0, + "grad_norm": 0.027044136096108284, + "language_loss": 1.01923048, + "learning_rate": 0.000991057018277873, + "loss": 1.03141284, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 1.43212891, + "step": 460, + "time_per_iteration": 2.746169090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212445, + "balance_loss_mlp": 1.0693934, + "epoch": 0.0886879569065025, + "flos": 565627419648.0, + "grad_norm": 0.031092379840733354, + "language_loss": 1.03267121, + "learning_rate": 0.0009909982635715898, + "loss": 1.04479575, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 1.43164062, + "step": 461, + "time_per_iteration": 2.6196396350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212854, + "balance_loss_mlp": 1.06956458, + "epoch": 0.08888033859176607, + "flos": 564956674560.0, + "grad_norm": 0.030181357689894217, + "language_loss": 1.02059078, + "learning_rate": 0.0009909393182428751, + "loss": 1.03271937, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 1.43408203, + "step": 462, + "time_per_iteration": 2.679793357849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216843, + "balance_loss_mlp": 1.07345808, + "epoch": 0.08907272027702963, + "flos": 466742650368.0, + "grad_norm": 0.029240136547664795, + "language_loss": 0.9639132, + "learning_rate": 0.000990880182314614, + "loss": 0.97608161, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 1.43505859, + "step": 463, + "time_per_iteration": 2.712097644805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212421, + "balance_loss_mlp": 1.06922734, + "epoch": 0.08926510196229319, + "flos": 682843338240.0, + "grad_norm": 0.026287763165510035, + "language_loss": 0.96174729, + "learning_rate": 0.0009908208558097643, + "loss": 0.97387147, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 1.43310547, + "step": 464, + "time_per_iteration": 2.906903028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217208, + "balance_loss_mlp": 1.07406175, + "epoch": 0.08945748364755675, + "flos": 597821360640.0, + "grad_norm": 0.024374741633963998, + "language_loss": 0.98668623, + "learning_rate": 0.000990761338751359, + "loss": 0.99885827, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 1.43261719, + "step": 465, + "time_per_iteration": 2.7994933128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225639, + "balance_loss_mlp": 1.08506775, + "epoch": 0.08964986533282032, + "flos": 1589340930048.0, + "grad_norm": 0.02575129149720033, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74885261, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.40625, + "step": 466, + "time_per_iteration": 4.9763429164886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221953, + "balance_loss_mlp": 1.07861578, + "epoch": 0.08984224701808388, + "flos": 534549385728.0, + "grad_norm": 0.024628184063577727, + "language_loss": 1.01551545, + "learning_rate": 0.0009906417330663815, + "loss": 1.02773499, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 1.43457031, + "step": 467, + "time_per_iteration": 2.614560842514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232523, + "balance_loss_mlp": 1.08994913, + "epoch": 0.09003462870334744, + "flos": 479850103296.0, + "grad_norm": 0.03230737833956583, + "language_loss": 0.98222148, + "learning_rate": 0.0009905816444862442, + "loss": 0.99454677, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 1.42675781, + "step": 468, + "time_per_iteration": 2.598146438598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223867, + "balance_loss_mlp": 1.08124495, + "epoch": 0.090227010388611, + "flos": 654902178816.0, + "grad_norm": 0.027522185030294237, + "language_loss": 0.95659769, + "learning_rate": 0.0009905213654454216, + "loss": 0.96883637, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 1.42724609, + "step": 469, + "time_per_iteration": 2.8876352310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219852, + "balance_loss_mlp": 1.07737279, + "epoch": 0.09041939207387456, + "flos": 619358515200.0, + "grad_norm": 0.023282407360439072, + "language_loss": 1.03878951, + "learning_rate": 0.0009904608959673158, + "loss": 1.0509882, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 1.42578125, + "step": 470, + "time_per_iteration": 2.7882330417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213781, + "balance_loss_mlp": 1.0718745, + "epoch": 0.09061177375913813, + "flos": 455295596544.0, + "grad_norm": 0.02882877970469751, + "language_loss": 1.04707062, + "learning_rate": 0.000990400236075403, + "loss": 1.05920839, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 1.41992188, + "step": 471, + "time_per_iteration": 2.5016987323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_mlp": 1.07574117, + "epoch": 0.0908041554444017, + "flos": 545308230144.0, + "grad_norm": 0.02444258884202674, + "language_loss": 1.01020849, + "learning_rate": 0.0009903393857932338, + "loss": 1.02238584, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 1.42089844, + "step": 472, + "time_per_iteration": 2.644397497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218613, + "balance_loss_mlp": 1.07732654, + "epoch": 0.09099653712966525, + "flos": 565466964480.0, + "grad_norm": 0.02685769494428931, + "language_loss": 0.99245131, + "learning_rate": 0.0009902783451444317, + "loss": 1.00463748, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 1.41357422, + "step": 473, + "time_per_iteration": 2.7087745666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214499, + "balance_loss_mlp": 1.07292593, + "epoch": 0.09118891881492881, + "flos": 475501994496.0, + "grad_norm": 0.029476649456104027, + "language_loss": 1.02896917, + "learning_rate": 0.0009902171141526956, + "loss": 1.04111421, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 1.41650391, + "step": 474, + "time_per_iteration": 2.5271990299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215154, + "balance_loss_mlp": 1.07410538, + "epoch": 0.09138130050019239, + "flos": 546990822912.0, + "grad_norm": 0.02490932279529465, + "language_loss": 0.89845926, + "learning_rate": 0.000990155692841797, + "loss": 0.9106108, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 1.41113281, + "step": 475, + "time_per_iteration": 2.958740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214039, + "balance_loss_mlp": 1.07303798, + "epoch": 0.09157368218545595, + "flos": 733973319168.0, + "grad_norm": 0.02740759839690251, + "language_loss": 1.01869047, + "learning_rate": 0.0009900940812355818, + "loss": 1.03083086, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 1.41064453, + "step": 476, + "time_per_iteration": 2.891787528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205639, + "balance_loss_mlp": 1.06478107, + "epoch": 0.0917660638707195, + "flos": 612072918528.0, + "grad_norm": 0.029261712768775452, + "language_loss": 0.99624813, + "learning_rate": 0.00099003227935797, + "loss": 1.0083046, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 1.40917969, + "step": 477, + "time_per_iteration": 2.7569031715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207057, + "balance_loss_mlp": 1.06605613, + "epoch": 0.09195844555598306, + "flos": 657018473472.0, + "grad_norm": 0.026965523070242428, + "language_loss": 1.02860427, + "learning_rate": 0.000989970287232955, + "loss": 1.04067481, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 1.41064453, + "step": 478, + "time_per_iteration": 2.7705225944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212938, + "balance_loss_mlp": 1.07212758, + "epoch": 0.09215082724124664, + "flos": 477540426240.0, + "grad_norm": 0.02578247385618595, + "language_loss": 0.99767786, + "learning_rate": 0.0009899081048846043, + "loss": 1.00980723, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 1.40869141, + "step": 479, + "time_per_iteration": 2.5488922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215229, + "balance_loss_mlp": 1.07437098, + "epoch": 0.0923432089265102, + "flos": 525325413888.0, + "grad_norm": 0.029009434883925433, + "language_loss": 1.05276799, + "learning_rate": 0.0009898457323370593, + "loss": 1.06492031, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 1.40917969, + "step": 480, + "time_per_iteration": 2.5628790855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213957, + "balance_loss_mlp": 1.07314658, + "epoch": 0.09253559061177376, + "flos": 546638986752.0, + "grad_norm": 0.030643020391807937, + "language_loss": 1.01694977, + "learning_rate": 0.000989783169614535, + "loss": 1.02908933, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 1.40869141, + "step": 481, + "time_per_iteration": 2.6431851387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206421, + "balance_loss_mlp": 1.06718445, + "epoch": 0.09272797229703732, + "flos": 1541334362112.0, + "grad_norm": 0.00793715508899474, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79959178, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.39257812, + "step": 482, + "time_per_iteration": 4.84259295463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211177, + "balance_loss_mlp": 1.07041514, + "epoch": 0.09292035398230089, + "flos": 691064194560.0, + "grad_norm": 0.029391602229229655, + "language_loss": 0.99036419, + "learning_rate": 0.000989657473741779, + "loss": 1.00247598, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 1.40820312, + "step": 483, + "time_per_iteration": 2.8193717002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210505, + "balance_loss_mlp": 1.06964695, + "epoch": 0.09311273566756445, + "flos": 510822076416.0, + "grad_norm": 0.026713621627667553, + "language_loss": 1.0060308, + "learning_rate": 0.0009895943406403465, + "loss": 1.01813591, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 1.40917969, + "step": 484, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210956, + "balance_loss_mlp": 1.07071841, + "epoch": 0.09330511735282801, + "flos": 660583045632.0, + "grad_norm": 0.02538483632370611, + "language_loss": 0.94170594, + "learning_rate": 0.0009895310174615338, + "loss": 0.95381546, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 1.40283203, + "step": 485, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210991, + "balance_loss_mlp": 1.0725174, + "epoch": 0.09349749903809157, + "flos": 1456021673472.0, + "grad_norm": 0.008074315810691821, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.7692951, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.38476562, + "step": 486, + "time_per_iteration": 4.652726888656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208546, + "balance_loss_mlp": 1.06868994, + "epoch": 0.09368988072335514, + "flos": 521899829760.0, + "grad_norm": 0.021962490795067104, + "language_loss": 0.97574425, + "learning_rate": 0.0009894038009701782, + "loss": 0.98782969, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 1.39892578, + "step": 487, + "time_per_iteration": 2.647747755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207771, + "balance_loss_mlp": 1.06786692, + "epoch": 0.0938822624086187, + "flos": 498751941120.0, + "grad_norm": 0.02403393711112831, + "language_loss": 1.01297927, + "learning_rate": 0.0009893399077070253, + "loss": 1.02505696, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 1.39941406, + "step": 488, + "time_per_iteration": 2.5559775829315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209251, + "balance_loss_mlp": 1.07006216, + "epoch": 0.09407464409388226, + "flos": 534223746048.0, + "grad_norm": 0.02465812888810929, + "language_loss": 0.94380867, + "learning_rate": 0.0009892758244652718, + "loss": 0.95590127, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 1.39208984, + "step": 489, + "time_per_iteration": 2.6696364879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203933, + "balance_loss_mlp": 1.06398153, + "epoch": 0.09426702577914582, + "flos": 587090714112.0, + "grad_norm": 0.02607881729553482, + "language_loss": 1.01920152, + "learning_rate": 0.0009892115512697968, + "loss": 1.03124094, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 1.39990234, + "step": 490, + "time_per_iteration": 2.645073652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205245, + "balance_loss_mlp": 1.06524527, + "epoch": 0.0944594074644094, + "flos": 504463733760.0, + "grad_norm": 0.02086232355550113, + "language_loss": 1.01703966, + "learning_rate": 0.0009891470881455537, + "loss": 1.02909207, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 1.40039062, + "step": 491, + "time_per_iteration": 2.669978618621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207443, + "balance_loss_mlp": 1.06777763, + "epoch": 0.09465178914967295, + "flos": 572114016768.0, + "grad_norm": 0.026976181820206353, + "language_loss": 1.00743008, + "learning_rate": 0.0009890824351175692, + "loss": 1.01950443, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 1.39697266, + "step": 492, + "time_per_iteration": 2.6572952270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207157, + "balance_loss_mlp": 1.06796801, + "epoch": 0.09484417083493651, + "flos": 550418408448.0, + "grad_norm": 0.023611014675858334, + "language_loss": 1.04079592, + "learning_rate": 0.0009890175922109435, + "loss": 1.05286753, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 1.39208984, + "step": 493, + "time_per_iteration": 2.622361183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120413, + "balance_loss_mlp": 1.06498933, + "epoch": 0.09503655252020007, + "flos": 825271047168.0, + "grad_norm": 0.02510100112233158, + "language_loss": 1.0275588, + "learning_rate": 0.0009889525594508513, + "loss": 1.03960025, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 1.39160156, + "step": 494, + "time_per_iteration": 3.0307581424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202477, + "balance_loss_mlp": 1.06333554, + "epoch": 0.09522893420546363, + "flos": 405517839360.0, + "grad_norm": 0.02234367718934989, + "language_loss": 0.96151906, + "learning_rate": 0.0009888873368625404, + "loss": 0.97354376, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 1.39160156, + "step": 495, + "time_per_iteration": 2.4793317317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205465, + "balance_loss_mlp": 1.06665742, + "epoch": 0.0954213158907272, + "flos": 692255963136.0, + "grad_norm": 0.025506351191757377, + "language_loss": 1.00908709, + "learning_rate": 0.0009888219244713326, + "loss": 1.02114165, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 1.38818359, + "step": 496, + "time_per_iteration": 2.865914821624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206499, + "balance_loss_mlp": 1.06773937, + "epoch": 0.09561369757599077, + "flos": 520074246144.0, + "grad_norm": 0.030124833611481355, + "language_loss": 1.02319717, + "learning_rate": 0.0009887563223026229, + "loss": 1.03526211, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 1.38671875, + "step": 497, + "time_per_iteration": 2.689708948135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210899, + "balance_loss_mlp": 1.07376099, + "epoch": 0.09580607926125433, + "flos": 1388781623808.0, + "grad_norm": 0.014650036919455408, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80279064, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 1.37109375, + "step": 498, + "time_per_iteration": 4.940208196640015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203477, + "balance_loss_mlp": 1.06476545, + "epoch": 0.09599846094651789, + "flos": 718825433088.0, + "grad_norm": 0.028840614245688557, + "language_loss": 0.98952407, + "learning_rate": 0.0009886245487346482, + "loss": 1.00155878, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 1.38427734, + "step": 499, + "time_per_iteration": 3.023056745529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205479, + "balance_loss_mlp": 1.06690967, + "epoch": 0.09619084263178146, + "flos": 386893977600.0, + "grad_norm": 0.031706482821381415, + "language_loss": 1.0340035, + "learning_rate": 0.0009885583773865422, + "loss": 1.04605842, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 1.38183594, + "step": 500, + "time_per_iteration": 2.422914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202787, + "balance_loss_mlp": 1.06479073, + "epoch": 0.09638322431704502, + "flos": 535172467200.0, + "grad_norm": 0.02878579188863982, + "language_loss": 0.99392897, + "learning_rate": 0.0009884920163632524, + "loss": 1.00595689, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 1.37988281, + "step": 501, + "time_per_iteration": 2.6820154190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203474, + "balance_loss_mlp": 1.0655731, + "epoch": 0.09657560600230858, + "flos": 501656501760.0, + "grad_norm": 0.02635733095705931, + "language_loss": 1.03128934, + "learning_rate": 0.000988425465690543, + "loss": 1.04332411, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 1.37890625, + "step": 502, + "time_per_iteration": 2.605536699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204627, + "balance_loss_mlp": 1.06677341, + "epoch": 0.09676798768757214, + "flos": 530331532800.0, + "grad_norm": 0.023374032620567947, + "language_loss": 1.00861204, + "learning_rate": 0.0009883587253942505, + "loss": 1.02065825, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 1.37841797, + "step": 503, + "time_per_iteration": 2.7548091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204765, + "balance_loss_mlp": 1.06686366, + "epoch": 0.09696036937283571, + "flos": 464556498432.0, + "grad_norm": 0.029206950172382878, + "language_loss": 1.0685035, + "learning_rate": 0.0009882917955002862, + "loss": 1.08055115, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 1.37890625, + "step": 504, + "time_per_iteration": 2.520970344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200777, + "balance_loss_mlp": 1.06297076, + "epoch": 0.09715275105809927, + "flos": 536010398208.0, + "grad_norm": 0.02484338661637091, + "language_loss": 0.9770751, + "learning_rate": 0.0009882246760346343, + "loss": 0.98908287, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 1.37695312, + "step": 505, + "time_per_iteration": 2.6314897537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204578, + "balance_loss_mlp": 1.06672478, + "epoch": 0.09734513274336283, + "flos": 455881747968.0, + "grad_norm": 0.02756591702740651, + "language_loss": 1.04990697, + "learning_rate": 0.0009881573670233533, + "loss": 1.06195283, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 1.37451172, + "step": 506, + "time_per_iteration": 2.492464780807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203948, + "balance_loss_mlp": 1.06619, + "epoch": 0.09753751442862639, + "flos": 509827693056.0, + "grad_norm": 0.02954706972608782, + "language_loss": 0.97619581, + "learning_rate": 0.0009880898684925747, + "loss": 0.98823535, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 1.37353516, + "step": 507, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120195, + "balance_loss_mlp": 1.06438243, + "epoch": 0.09772989611388996, + "flos": 485246989824.0, + "grad_norm": 0.02487380392257162, + "language_loss": 0.96617985, + "learning_rate": 0.0009880221804685037, + "loss": 0.97819936, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 1.37158203, + "step": 508, + "time_per_iteration": 2.5352439880371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209412, + "balance_loss_mlp": 1.0741806, + "epoch": 0.09792227779915352, + "flos": 1569316454400.0, + "grad_norm": 0.016823619827393988, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80553836, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.3515625, + "step": 509, + "time_per_iteration": 4.694217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205455, + "balance_loss_mlp": 1.06831706, + "epoch": 0.09811465948441708, + "flos": 588914296320.0, + "grad_norm": 0.032012577058462416, + "language_loss": 1.03636336, + "learning_rate": 0.0009878862360456733, + "loss": 1.04841793, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 1.37011719, + "step": 510, + "time_per_iteration": 2.73879337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208431, + "balance_loss_mlp": 1.07148337, + "epoch": 0.09830704116968064, + "flos": 614128814592.0, + "grad_norm": 0.028115444050206044, + "language_loss": 0.94855493, + "learning_rate": 0.0009878179796996922, + "loss": 0.96063924, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 1.36914062, + "step": 511, + "time_per_iteration": 2.6949734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207361, + "balance_loss_mlp": 1.07050836, + "epoch": 0.09849942285494422, + "flos": 539935538688.0, + "grad_norm": 0.022608937638108787, + "language_loss": 0.9790619, + "learning_rate": 0.0009877495339659754, + "loss": 0.99113548, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 1.36816406, + "step": 512, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214076, + "balance_loss_mlp": 1.0773195, + "epoch": 0.09869180454020778, + "flos": 621603064320.0, + "grad_norm": 0.029833187637910333, + "language_loss": 0.94261241, + "learning_rate": 0.000987680898871096, + "loss": 0.95475316, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 1.3671875, + "step": 513, + "time_per_iteration": 2.6975760459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120845, + "balance_loss_mlp": 1.07145417, + "epoch": 0.09888418622547133, + "flos": 813059922432.0, + "grad_norm": 0.032512892127392744, + "language_loss": 0.9726817, + "learning_rate": 0.0009876120744417, + "loss": 0.98476619, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 1.36767578, + "step": 514, + "time_per_iteration": 2.9514927864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214576, + "balance_loss_mlp": 1.07762837, + "epoch": 0.0990765679107349, + "flos": 536857061376.0, + "grad_norm": 0.028495408786163776, + "language_loss": 1.0346663, + "learning_rate": 0.0009875430607045078, + "loss": 1.04681206, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 1.36523438, + "step": 515, + "time_per_iteration": 2.669271230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209323, + "balance_loss_mlp": 1.07242322, + "epoch": 0.09926894959599845, + "flos": 588970692096.0, + "grad_norm": 0.026228231589839293, + "language_loss": 0.98752952, + "learning_rate": 0.000987473857686313, + "loss": 0.9996227, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 1.36474609, + "step": 516, + "time_per_iteration": 2.7055716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120601, + "balance_loss_mlp": 1.06934881, + "epoch": 0.09946133128126203, + "flos": 642386881536.0, + "grad_norm": 0.0302129460476142, + "language_loss": 1.04248524, + "learning_rate": 0.0009874044654139824, + "loss": 1.05454528, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 1.36230469, + "step": 517, + "time_per_iteration": 2.726618528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200307, + "balance_loss_mlp": 1.06340742, + "epoch": 0.09965371296652559, + "flos": 466725186048.0, + "grad_norm": 0.03251153136411229, + "language_loss": 1.02563679, + "learning_rate": 0.0009873348839144563, + "loss": 1.03763986, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 1.36474609, + "step": 518, + "time_per_iteration": 2.5855953693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200913, + "balance_loss_mlp": 1.06439471, + "epoch": 0.09984609465178915, + "flos": 484558780416.0, + "grad_norm": 0.029627125773621466, + "language_loss": 1.03352094, + "learning_rate": 0.000987265113214749, + "loss": 1.04552996, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 1.36279297, + "step": 519, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201703, + "balance_loss_mlp": 1.06566191, + "epoch": 0.1000384763370527, + "flos": 570095050752.0, + "grad_norm": 0.028931775658430137, + "language_loss": 1.07544637, + "learning_rate": 0.0009871951533419476, + "loss": 1.08746338, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 1.35986328, + "step": 520, + "time_per_iteration": 2.6423709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200484, + "balance_loss_mlp": 1.06439495, + "epoch": 0.10023085802231628, + "flos": 546925694976.0, + "grad_norm": 0.025491893219336172, + "language_loss": 0.95403761, + "learning_rate": 0.0009871250043232132, + "loss": 0.96604246, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 1.36035156, + "step": 521, + "time_per_iteration": 2.7604362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198813, + "balance_loss_mlp": 1.06205583, + "epoch": 0.10042323970757984, + "flos": 504439538688.0, + "grad_norm": 0.029888360913216814, + "language_loss": 0.96113187, + "learning_rate": 0.0009870546661857797, + "loss": 0.97311997, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 1.36328125, + "step": 522, + "time_per_iteration": 2.578458547592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195212, + "balance_loss_mlp": 1.05931365, + "epoch": 0.1006156213928434, + "flos": 771724601856.0, + "grad_norm": 0.029426081780707294, + "language_loss": 1.05752206, + "learning_rate": 0.0009869841389569553, + "loss": 1.0694741, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 1.35839844, + "step": 523, + "time_per_iteration": 2.958531618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.05846703, + "epoch": 0.10080800307810696, + "flos": 491008447488.0, + "grad_norm": 0.024593893632090205, + "language_loss": 0.96497846, + "learning_rate": 0.0009869134226641206, + "loss": 0.97692204, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 1.35839844, + "step": 524, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196113, + "balance_loss_mlp": 1.06030965, + "epoch": 0.10100038476337053, + "flos": 455712560640.0, + "grad_norm": 0.026556514945601337, + "language_loss": 0.98348475, + "learning_rate": 0.0009868425173347303, + "loss": 0.99544585, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 1.35742188, + "step": 525, + "time_per_iteration": 2.6460907459259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196515, + "balance_loss_mlp": 1.06099772, + "epoch": 0.10119276644863409, + "flos": 557573749248.0, + "grad_norm": 0.022458491608374247, + "language_loss": 1.03332829, + "learning_rate": 0.0009867714229963125, + "loss": 1.04529333, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 1.35449219, + "step": 526, + "time_per_iteration": 2.693362236022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119647, + "balance_loss_mlp": 1.0609529, + "epoch": 0.10138514813389765, + "flos": 517219350528.0, + "grad_norm": 0.028969258136437262, + "language_loss": 1.0161202, + "learning_rate": 0.000986700139676468, + "loss": 1.02808487, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 1.35449219, + "step": 527, + "time_per_iteration": 2.5826644897460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202893, + "balance_loss_mlp": 1.06742311, + "epoch": 0.10157752981916121, + "flos": 501563175936.0, + "grad_norm": 0.023004964960346017, + "language_loss": 0.98490077, + "learning_rate": 0.0009866286674028717, + "loss": 0.99692971, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 1.35400391, + "step": 528, + "time_per_iteration": 2.626595973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204326, + "balance_loss_mlp": 1.06876123, + "epoch": 0.10176991150442478, + "flos": 658093447680.0, + "grad_norm": 0.024381421822087013, + "language_loss": 0.95674849, + "learning_rate": 0.0009865570062032717, + "loss": 0.96879184, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 1.35498047, + "step": 529, + "time_per_iteration": 2.916924238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203456, + "balance_loss_mlp": 1.0680815, + "epoch": 0.10196229318968834, + "flos": 574402226688.0, + "grad_norm": 0.021344584600364362, + "language_loss": 0.99175954, + "learning_rate": 0.0009864851561054893, + "loss": 1.00379407, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 1.35302734, + "step": 530, + "time_per_iteration": 2.750075578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203649, + "balance_loss_mlp": 1.06856096, + "epoch": 0.1021546748749519, + "flos": 519255780864.0, + "grad_norm": 0.027896087186932737, + "language_loss": 0.99157, + "learning_rate": 0.0009864131171374191, + "loss": 1.00360656, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 1.35009766, + "step": 531, + "time_per_iteration": 2.6506359577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202329, + "balance_loss_mlp": 1.06728852, + "epoch": 0.10234705656021546, + "flos": 610953008640.0, + "grad_norm": 0.021304730024267197, + "language_loss": 0.98848057, + "learning_rate": 0.0009863408893270292, + "loss": 1.0005039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 1.34960938, + "step": 532, + "time_per_iteration": 2.827632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202805, + "balance_loss_mlp": 1.06776476, + "epoch": 0.10253943824547904, + "flos": 602912073216.0, + "grad_norm": 0.02650069508154076, + "language_loss": 0.95645475, + "learning_rate": 0.0009862684727023605, + "loss": 0.96848285, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 1.34960938, + "step": 533, + "time_per_iteration": 2.730771541595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206135, + "balance_loss_mlp": 1.07152414, + "epoch": 0.1027318199307426, + "flos": 664156349952.0, + "grad_norm": 0.02579556790717569, + "language_loss": 0.96718729, + "learning_rate": 0.0009861958672915283, + "loss": 0.97924864, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 1.34521484, + "step": 534, + "time_per_iteration": 2.825239419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202189, + "balance_loss_mlp": 1.06776834, + "epoch": 0.10292420161600616, + "flos": 684529933824.0, + "grad_norm": 0.02492376876437301, + "language_loss": 0.95656139, + "learning_rate": 0.0009861230731227201, + "loss": 0.96858335, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 1.34326172, + "step": 535, + "time_per_iteration": 2.858596086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203815, + "balance_loss_mlp": 1.06958508, + "epoch": 0.10311658330126972, + "flos": 491268959232.0, + "grad_norm": 0.02833674325523021, + "language_loss": 0.99709427, + "learning_rate": 0.0009860500902241973, + "loss": 1.00913239, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 1.34130859, + "step": 536, + "time_per_iteration": 2.5780303478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197149, + "balance_loss_mlp": 1.06291902, + "epoch": 0.10330896498653329, + "flos": 432686195712.0, + "grad_norm": 0.024484943889946764, + "language_loss": 1.03652823, + "learning_rate": 0.0009859769186242942, + "loss": 1.0484997, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 1.34130859, + "step": 537, + "time_per_iteration": 2.5104598999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119791, + "balance_loss_mlp": 1.06415713, + "epoch": 0.10350134667179685, + "flos": 550641990144.0, + "grad_norm": 0.0271300181774947, + "language_loss": 0.97886324, + "learning_rate": 0.0009859035583514187, + "loss": 0.99084234, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 1.33642578, + "step": 538, + "time_per_iteration": 2.6156880855560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197994, + "balance_loss_mlp": 1.06395507, + "epoch": 0.10369372835706041, + "flos": 641826926592.0, + "grad_norm": 0.024416305433678544, + "language_loss": 1.00991774, + "learning_rate": 0.0009858300094340517, + "loss": 1.02189767, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 1.33935547, + "step": 539, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198436, + "balance_loss_mlp": 1.06468332, + "epoch": 0.10388611004232397, + "flos": 522765958656.0, + "grad_norm": 0.025798430155835095, + "language_loss": 0.9342165, + "learning_rate": 0.0009857562719007473, + "loss": 0.94620085, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 1.33642578, + "step": 540, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204547, + "balance_loss_mlp": 1.07122386, + "epoch": 0.10407849172758753, + "flos": 703739947008.0, + "grad_norm": 0.023593197084580173, + "language_loss": 0.95331407, + "learning_rate": 0.0009856823457801331, + "loss": 0.96535957, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 1.33203125, + "step": 541, + "time_per_iteration": 2.889531373977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202711, + "balance_loss_mlp": 1.06924474, + "epoch": 0.1042708734128511, + "flos": 503944711680.0, + "grad_norm": 0.023957714626313076, + "language_loss": 1.02856565, + "learning_rate": 0.00098560823110091, + "loss": 1.04059267, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 1.33349609, + "step": 542, + "time_per_iteration": 2.6067047119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205134, + "balance_loss_mlp": 1.07185781, + "epoch": 0.10446325509811466, + "flos": 486640872960.0, + "grad_norm": 0.0231214260398276, + "language_loss": 1.01405394, + "learning_rate": 0.000985533927891851, + "loss": 1.02610517, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 1.33154297, + "step": 543, + "time_per_iteration": 2.6622776985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201388, + "balance_loss_mlp": 1.06820762, + "epoch": 0.10465563678337822, + "flos": 569713015296.0, + "grad_norm": 0.023482705287667723, + "language_loss": 1.01015687, + "learning_rate": 0.0009854594361818044, + "loss": 1.02217078, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 1.33056641, + "step": 544, + "time_per_iteration": 2.7061924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195672, + "balance_loss_mlp": 1.06244385, + "epoch": 0.10484801846864178, + "flos": 627242998272.0, + "grad_norm": 0.023194608242680787, + "language_loss": 0.99799937, + "learning_rate": 0.0009853847559996897, + "loss": 1.00995612, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 1.33105469, + "step": 545, + "time_per_iteration": 2.742445707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192128, + "balance_loss_mlp": 1.05885231, + "epoch": 0.10504040015390535, + "flos": 744812754432.0, + "grad_norm": 0.025865682249952955, + "language_loss": 0.99192667, + "learning_rate": 0.0009853098873745, + "loss": 1.00384796, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 1.33154297, + "step": 546, + "time_per_iteration": 3.0260400772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192867, + "balance_loss_mlp": 1.05997264, + "epoch": 0.10523278183916891, + "flos": 587842050048.0, + "grad_norm": 0.02599355243407578, + "language_loss": 0.98197657, + "learning_rate": 0.0009852348303353027, + "loss": 0.99390525, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 1.32763672, + "step": 547, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191481, + "balance_loss_mlp": 1.05844367, + "epoch": 0.10542516352443247, + "flos": 871145857536.0, + "grad_norm": 0.02495252935664815, + "language_loss": 0.91398883, + "learning_rate": 0.000985159584911237, + "loss": 0.92590368, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 1.32910156, + "step": 548, + "time_per_iteration": 3.1012043952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119193, + "balance_loss_mlp": 1.05913138, + "epoch": 0.10561754520969603, + "flos": 506412842496.0, + "grad_norm": 0.025955858684814606, + "language_loss": 0.9925828, + "learning_rate": 0.0009850841511315162, + "loss": 1.00450206, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 1.32666016, + "step": 549, + "time_per_iteration": 2.626220464706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192876, + "balance_loss_mlp": 1.06022012, + "epoch": 0.1058099268949596, + "flos": 561147053568.0, + "grad_norm": 0.02554357007654854, + "language_loss": 0.98952115, + "learning_rate": 0.0009850085290254256, + "loss": 1.00144982, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 1.32519531, + "step": 550, + "time_per_iteration": 2.7464635372161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.06161487, + "epoch": 0.10600230858022316, + "flos": 563159288832.0, + "grad_norm": 0.020736613501838204, + "language_loss": 0.9519307, + "learning_rate": 0.0009849327186223246, + "loss": 0.9638744, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 1.32617188, + "step": 551, + "time_per_iteration": 2.7678163051605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199655, + "balance_loss_mlp": 1.06728542, + "epoch": 0.10619469026548672, + "flos": 495317624832.0, + "grad_norm": 0.02236411826292933, + "language_loss": 1.02411103, + "learning_rate": 0.000984856719951646, + "loss": 1.03610754, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 1.32226562, + "step": 552, + "time_per_iteration": 2.5607285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.06404662, + "epoch": 0.10638707195075028, + "flos": 677463916032.0, + "grad_norm": 0.025808282690500464, + "language_loss": 1.00531495, + "learning_rate": 0.0009847805330428943, + "loss": 1.01727724, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 1.3203125, + "step": 553, + "time_per_iteration": 2.8748667240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190787, + "balance_loss_mlp": 1.05860806, + "epoch": 0.10657945363601386, + "flos": 489035143680.0, + "grad_norm": 0.02571681940882287, + "language_loss": 1.04715252, + "learning_rate": 0.0009847041579256481, + "loss": 1.05906045, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 1.3203125, + "step": 554, + "time_per_iteration": 2.56693696975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191519, + "balance_loss_mlp": 1.05948246, + "epoch": 0.10677183532127742, + "flos": 483970627584.0, + "grad_norm": 0.020874824601389917, + "language_loss": 1.01746583, + "learning_rate": 0.0009846275946295592, + "loss": 1.02938092, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 1.31884766, + "step": 555, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195781, + "balance_loss_mlp": 1.06369734, + "epoch": 0.10696421700654098, + "flos": 657581156352.0, + "grad_norm": 0.023085993180182653, + "language_loss": 0.93557143, + "learning_rate": 0.0009845508431843518, + "loss": 0.94752926, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 1.31933594, + "step": 556, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192823, + "balance_loss_mlp": 1.06088233, + "epoch": 0.10715659869180454, + "flos": 568792492032.0, + "grad_norm": 0.026087632201688016, + "language_loss": 0.9692713, + "learning_rate": 0.0009844739036198233, + "loss": 0.9811995, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 1.31787109, + "step": 557, + "time_per_iteration": 2.6583988666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192362, + "balance_loss_mlp": 1.06051683, + "epoch": 0.10734898037706811, + "flos": 541743657984.0, + "grad_norm": 0.02708275038302545, + "language_loss": 1.03564882, + "learning_rate": 0.0009843967759658448, + "loss": 1.04757237, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 1.31689453, + "step": 558, + "time_per_iteration": 2.6571173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209854, + "balance_loss_mlp": 1.07920074, + "epoch": 0.10754136206233167, + "flos": 1479731518464.0, + "grad_norm": 0.021017403581586082, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73977602, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.30664062, + "step": 559, + "time_per_iteration": 4.901749134063721 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191994, + "balance_loss_mlp": 1.06024349, + "epoch": 0.10773374374759523, + "flos": 513411730944.0, + "grad_norm": 0.02623387515623986, + "language_loss": 1.03025067, + "learning_rate": 0.000984241956509384, + "loss": 1.04217052, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 1.31591797, + "step": 560, + "time_per_iteration": 2.642380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011916, + "balance_loss_mlp": 1.06013584, + "epoch": 0.10792612543285879, + "flos": 497477580288.0, + "grad_norm": 0.029111560342126648, + "language_loss": 1.01683569, + "learning_rate": 0.0009841642647670078, + "loss": 1.02875161, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 1.31298828, + "step": 561, + "time_per_iteration": 2.5994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.06027901, + "epoch": 0.10811850711812235, + "flos": 736836946944.0, + "grad_norm": 0.027918527501713815, + "language_loss": 0.94711685, + "learning_rate": 0.0009840863850553944, + "loss": 0.95903373, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 1.3125, + "step": 562, + "time_per_iteration": 2.980377435684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193377, + "balance_loss_mlp": 1.06215191, + "epoch": 0.10831088880338592, + "flos": 612676534272.0, + "grad_norm": 0.025174626098757973, + "language_loss": 0.99795747, + "learning_rate": 0.0009840083174047782, + "loss": 1.00989127, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 1.31054688, + "step": 563, + "time_per_iteration": 2.7209153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194645, + "balance_loss_mlp": 1.0633713, + "epoch": 0.10850327048864948, + "flos": 557497887744.0, + "grad_norm": 0.021851565940339403, + "language_loss": 0.93414235, + "learning_rate": 0.0009839300618454685, + "loss": 0.94608879, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 1.31103516, + "step": 564, + "time_per_iteration": 2.833120584487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194873, + "balance_loss_mlp": 1.06402934, + "epoch": 0.10869565217391304, + "flos": 604436212224.0, + "grad_norm": 0.021697209366751603, + "language_loss": 0.98980927, + "learning_rate": 0.0009838516184078466, + "loss": 1.00175798, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 1.30664062, + "step": 565, + "time_per_iteration": 2.805722236633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193483, + "balance_loss_mlp": 1.06263876, + "epoch": 0.1088880338591766, + "flos": 527205391872.0, + "grad_norm": 0.024778377976546286, + "language_loss": 0.97356248, + "learning_rate": 0.0009837729871223669, + "loss": 0.98549736, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 1.30664062, + "step": 566, + "time_per_iteration": 2.652186155319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119656, + "balance_loss_mlp": 1.0658114, + "epoch": 0.10908041554444017, + "flos": 621416412672.0, + "grad_norm": 0.023487449334803984, + "language_loss": 0.99301046, + "learning_rate": 0.0009836941680195568, + "loss": 1.00497603, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 1.30566406, + "step": 567, + "time_per_iteration": 2.7732484340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.06144011, + "epoch": 0.10927279722970373, + "flos": 899673168384.0, + "grad_norm": 0.026216288845653656, + "language_loss": 0.95416081, + "learning_rate": 0.0009836151611300166, + "loss": 0.96608174, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 1.3046875, + "step": 568, + "time_per_iteration": 3.174981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.06049693, + "epoch": 0.10946517891496729, + "flos": 529699719168.0, + "grad_norm": 0.02336242427092275, + "language_loss": 1.03071296, + "learning_rate": 0.0009835359664844194, + "loss": 1.04262161, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 1.30273438, + "step": 569, + "time_per_iteration": 2.595041513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190102, + "balance_loss_mlp": 1.06173706, + "epoch": 0.10965756060023085, + "flos": 1563991426560.0, + "grad_norm": 0.006726678932110135, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82226908, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 4.911731719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193915, + "balance_loss_mlp": 1.0634526, + "epoch": 0.10984994228549443, + "flos": 514099940352.0, + "grad_norm": 0.027266515996607284, + "language_loss": 1.00165153, + "learning_rate": 0.0009833770140481118, + "loss": 1.01359057, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 1.30273438, + "step": 571, + "time_per_iteration": 2.6079747676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197777, + "balance_loss_mlp": 1.06741011, + "epoch": 0.11004232397075799, + "flos": 956273895936.0, + "grad_norm": 0.026548665437539986, + "language_loss": 0.90315044, + "learning_rate": 0.000983297256319112, + "loss": 0.91512823, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 1.30175781, + "step": 572, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_mlp": 1.05776477, + "epoch": 0.11023470565602154, + "flos": 489228526080.0, + "grad_norm": 0.026034490292812715, + "language_loss": 0.95817071, + "learning_rate": 0.000983217310957477, + "loss": 0.97005343, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 1.30322266, + "step": 573, + "time_per_iteration": 2.7447898387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190883, + "balance_loss_mlp": 1.06056309, + "epoch": 0.1104270873412851, + "flos": 656990275584.0, + "grad_norm": 0.026590820610190004, + "language_loss": 1.00224817, + "learning_rate": 0.000983137177994244, + "loss": 1.01415706, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 1.30126953, + "step": 574, + "time_per_iteration": 2.846140146255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185115, + "balance_loss_mlp": 1.0552249, + "epoch": 0.11061946902654868, + "flos": 724747345920.0, + "grad_norm": 0.019709272455133778, + "language_loss": 0.93286896, + "learning_rate": 0.0009830568574605235, + "loss": 0.94472009, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 1.29736328, + "step": 575, + "time_per_iteration": 2.922821044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185727, + "balance_loss_mlp": 1.05569339, + "epoch": 0.11081185071181224, + "flos": 836867822592.0, + "grad_norm": 0.025292755419638515, + "language_loss": 0.97880363, + "learning_rate": 0.0009829763493874992, + "loss": 0.99066085, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 1.29833984, + "step": 576, + "time_per_iteration": 3.022394895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183726, + "balance_loss_mlp": 1.05412149, + "epoch": 0.1110042323970758, + "flos": 610282263552.0, + "grad_norm": 0.023453623229808367, + "language_loss": 1.02838886, + "learning_rate": 0.0009828956538064264, + "loss": 1.04022622, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 1.29541016, + "step": 577, + "time_per_iteration": 2.817147970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182671, + "balance_loss_mlp": 1.05316234, + "epoch": 0.11119661408233936, + "flos": 597039825408.0, + "grad_norm": 0.025026186935027953, + "language_loss": 0.99076784, + "learning_rate": 0.0009828147707486344, + "loss": 1.00259459, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 1.29492188, + "step": 578, + "time_per_iteration": 2.6778078079223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186939, + "balance_loss_mlp": 1.05752516, + "epoch": 0.11138899576760293, + "flos": 556887541248.0, + "grad_norm": 0.027590262528076937, + "language_loss": 0.96720088, + "learning_rate": 0.0009827337002455245, + "loss": 0.97907031, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 1.29394531, + "step": 579, + "time_per_iteration": 2.6259562969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188781, + "balance_loss_mlp": 1.05951095, + "epoch": 0.11158137745286649, + "flos": 691062193152.0, + "grad_norm": 0.0223692175133054, + "language_loss": 0.94567806, + "learning_rate": 0.0009826524423285712, + "loss": 0.9575659, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 1.29150391, + "step": 580, + "time_per_iteration": 2.9144554138183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118328, + "balance_loss_mlp": 1.05386627, + "epoch": 0.11177375913813005, + "flos": 764306747904.0, + "grad_norm": 0.02877171771660235, + "language_loss": 0.97941083, + "learning_rate": 0.0009825709970293218, + "loss": 0.9912436, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 1.29296875, + "step": 581, + "time_per_iteration": 2.8999927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181128, + "balance_loss_mlp": 1.05223894, + "epoch": 0.11196614082339361, + "flos": 808030334976.0, + "grad_norm": 0.029325346048851512, + "language_loss": 1.03732872, + "learning_rate": 0.0009824893643793956, + "loss": 1.04913998, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 1.28857422, + "step": 582, + "time_per_iteration": 3.0697131156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.05731773, + "epoch": 0.11215852250865718, + "flos": 559724972544.0, + "grad_norm": 0.028740695003145394, + "language_loss": 0.98446089, + "learning_rate": 0.0009824075444104857, + "loss": 0.99632728, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 1.29150391, + "step": 583, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190407, + "balance_loss_mlp": 1.06147003, + "epoch": 0.11235090419392074, + "flos": 514575301632.0, + "grad_norm": 0.02293328270345756, + "language_loss": 1.02460003, + "learning_rate": 0.000982325537154357, + "loss": 1.03650403, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 1.28808594, + "step": 584, + "time_per_iteration": 2.590156078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188149, + "balance_loss_mlp": 1.05954635, + "epoch": 0.1125432858791843, + "flos": 492432529920.0, + "grad_norm": 0.028214107652977688, + "language_loss": 1.0381788, + "learning_rate": 0.0009822433426428484, + "loss": 1.05006027, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 1.28564453, + "step": 585, + "time_per_iteration": 2.566488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188321, + "balance_loss_mlp": 1.05957532, + "epoch": 0.11273566756444786, + "flos": 511727136768.0, + "grad_norm": 0.027438709113267498, + "language_loss": 0.95940274, + "learning_rate": 0.0009821609609078697, + "loss": 0.971286, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 1.28710938, + "step": 586, + "time_per_iteration": 2.6117701530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189545, + "balance_loss_mlp": 1.06098938, + "epoch": 0.11292804924971142, + "flos": 623639494656.0, + "grad_norm": 0.025949033694362005, + "language_loss": 0.97216725, + "learning_rate": 0.0009820783919814045, + "loss": 0.98406273, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 1.28515625, + "step": 587, + "time_per_iteration": 2.798182249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181783, + "balance_loss_mlp": 1.05360925, + "epoch": 0.113120430934975, + "flos": 479038368768.0, + "grad_norm": 0.03012596671256698, + "language_loss": 0.94172156, + "learning_rate": 0.0009819956358955095, + "loss": 0.95353937, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 1.28125, + "step": 588, + "time_per_iteration": 2.54179310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197707, + "balance_loss_mlp": 1.06905663, + "epoch": 0.11331281262023855, + "flos": 467990814720.0, + "grad_norm": 0.02502737191739997, + "language_loss": 0.9542653, + "learning_rate": 0.0009819126926823127, + "loss": 0.96624243, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 1.28613281, + "step": 589, + "time_per_iteration": 2.5262975692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191554, + "balance_loss_mlp": 1.06333208, + "epoch": 0.11350519430550211, + "flos": 651610853376.0, + "grad_norm": 0.023462259875113876, + "language_loss": 0.96713853, + "learning_rate": 0.000981829562374016, + "loss": 0.97905409, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 1.28173828, + "step": 590, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192039, + "balance_loss_mlp": 1.06415117, + "epoch": 0.11369757599076567, + "flos": 558860845056.0, + "grad_norm": 0.030341732837715945, + "language_loss": 1.07369685, + "learning_rate": 0.0009817462450028933, + "loss": 1.08561718, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 1.27832031, + "step": 591, + "time_per_iteration": 2.638333559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_mlp": 1.06215453, + "epoch": 0.11388995767602925, + "flos": 572305397760.0, + "grad_norm": 0.0238596111294556, + "language_loss": 0.94198918, + "learning_rate": 0.0009816627406012916, + "loss": 0.9538886, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 1.27734375, + "step": 592, + "time_per_iteration": 2.800842523574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191939, + "balance_loss_mlp": 1.06395626, + "epoch": 0.1140823393612928, + "flos": 741743009280.0, + "grad_norm": 0.025351621893671843, + "language_loss": 0.93787777, + "learning_rate": 0.0009815790492016295, + "loss": 0.94979715, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 1.27929688, + "step": 593, + "time_per_iteration": 2.9331579208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191026, + "balance_loss_mlp": 1.06337643, + "epoch": 0.11427472104655637, + "flos": 700251236352.0, + "grad_norm": 0.02689478502881467, + "language_loss": 0.96601468, + "learning_rate": 0.0009814951708363993, + "loss": 0.97792494, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 1.27587891, + "step": 594, + "time_per_iteration": 2.832094192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200218, + "balance_loss_mlp": 1.07414246, + "epoch": 0.11446710273181993, + "flos": 1480352598528.0, + "grad_norm": 0.020191453180706247, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79191208, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 1.25976562, + "step": 595, + "time_per_iteration": 4.752530574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187485, + "balance_loss_mlp": 1.06026483, + "epoch": 0.1146594844170835, + "flos": 495912508416.0, + "grad_norm": 0.02910362847653251, + "language_loss": 0.97498882, + "learning_rate": 0.0009813268533395648, + "loss": 0.98686367, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 1.27148438, + "step": 596, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187961, + "balance_loss_mlp": 1.06093144, + "epoch": 0.11485186610234706, + "flos": 475790704128.0, + "grad_norm": 0.02927093575191284, + "language_loss": 0.98108673, + "learning_rate": 0.0009812424142733073, + "loss": 0.99296629, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 1.26953125, + "step": 597, + "time_per_iteration": 2.5622098445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187255, + "balance_loss_mlp": 1.06046438, + "epoch": 0.11504424778761062, + "flos": 732619094016.0, + "grad_norm": 0.02047017320895946, + "language_loss": 0.92490959, + "learning_rate": 0.000981157788372175, + "loss": 0.93678212, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 1.26708984, + "step": 598, + "time_per_iteration": 3.017120599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185489, + "balance_loss_mlp": 1.05855536, + "epoch": 0.11523662947287418, + "flos": 546962625024.0, + "grad_norm": 0.02044602685826044, + "language_loss": 0.96609688, + "learning_rate": 0.0009810729756690223, + "loss": 0.97795177, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 1.26855469, + "step": 599, + "time_per_iteration": 2.7182610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190213, + "balance_loss_mlp": 1.06323159, + "epoch": 0.11542901115813775, + "flos": 776387616768.0, + "grad_norm": 0.023703305464208416, + "language_loss": 0.99939269, + "learning_rate": 0.0009809879761967766, + "loss": 1.01129484, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 1.26904297, + "step": 600, + "time_per_iteration": 2.9586148262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189892, + "balance_loss_mlp": 1.06319618, + "epoch": 0.11562139284340131, + "flos": 732212863488.0, + "grad_norm": 0.024193120208057816, + "language_loss": 0.99113685, + "learning_rate": 0.0009809027899884378, + "loss": 1.00303578, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 1.26611328, + "step": 601, + "time_per_iteration": 2.885070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183816, + "balance_loss_mlp": 1.05731082, + "epoch": 0.11581377452866487, + "flos": 537039710208.0, + "grad_norm": 0.022696091128935367, + "language_loss": 0.96568906, + "learning_rate": 0.0009808174170770779, + "loss": 0.97752714, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 1.26416016, + "step": 602, + "time_per_iteration": 2.7809743881225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191742, + "balance_loss_mlp": 1.0662384, + "epoch": 0.11600615621392843, + "flos": 1559211617280.0, + "grad_norm": 0.013792800863456836, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86089987, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 1.25390625, + "step": 603, + "time_per_iteration": 4.860181570053101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187966, + "balance_loss_mlp": 1.06169963, + "epoch": 0.116198537899192, + "flos": 538467795456.0, + "grad_norm": 0.022659628017063727, + "language_loss": 1.02766323, + "learning_rate": 0.0009806461112779462, + "loss": 1.03954291, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 1.26171875, + "step": 604, + "time_per_iteration": 2.614189863204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187324, + "balance_loss_mlp": 1.06091404, + "epoch": 0.11639091958445556, + "flos": 455137142784.0, + "grad_norm": 0.0301649070939891, + "language_loss": 1.00891566, + "learning_rate": 0.0009805601784566814, + "loss": 1.02078903, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 1.26318359, + "step": 605, + "time_per_iteration": 2.470878839492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119223, + "balance_loss_mlp": 1.06658351, + "epoch": 0.11658330126971912, + "flos": 556151668224.0, + "grad_norm": 0.025758302551065336, + "language_loss": 1.05099356, + "learning_rate": 0.0009804740590654089, + "loss": 1.0629158, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 1.25537109, + "step": 606, + "time_per_iteration": 2.631462812423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_mlp": 1.06588733, + "epoch": 0.11677568295498268, + "flos": 717600737280.0, + "grad_norm": 0.02545612001836415, + "language_loss": 0.99629396, + "learning_rate": 0.0009803877531375635, + "loss": 1.00821078, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 1.25683594, + "step": 607, + "time_per_iteration": 2.879645586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191881, + "balance_loss_mlp": 1.06613898, + "epoch": 0.11696806464024626, + "flos": 610898614272.0, + "grad_norm": 0.023619167708177922, + "language_loss": 0.99668628, + "learning_rate": 0.0009803012607066523, + "loss": 1.008605, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 1.25634766, + "step": 608, + "time_per_iteration": 2.717660427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189661, + "balance_loss_mlp": 1.06406212, + "epoch": 0.11716044632550981, + "flos": 521415736320.0, + "grad_norm": 0.023557070356346427, + "language_loss": 0.97414643, + "learning_rate": 0.0009802145818062543, + "loss": 0.98604298, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 1.25488281, + "step": 609, + "time_per_iteration": 2.7209720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190685, + "balance_loss_mlp": 1.064991, + "epoch": 0.11735282801077337, + "flos": 508488204288.0, + "grad_norm": 0.03039581956620226, + "language_loss": 1.01476204, + "learning_rate": 0.0009801277164700212, + "loss": 1.02666891, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 1.25585938, + "step": 610, + "time_per_iteration": 2.5900633335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190447, + "balance_loss_mlp": 1.06489623, + "epoch": 0.11754520969603693, + "flos": 687835995648.0, + "grad_norm": 0.028512829376260446, + "language_loss": 0.97853899, + "learning_rate": 0.0009800406647316776, + "loss": 0.99044347, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 1.25439453, + "step": 611, + "time_per_iteration": 2.8018290996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_mlp": 1.06088257, + "epoch": 0.1177375913813005, + "flos": 1545756331008.0, + "grad_norm": 0.00764509792440145, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78099126, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 1.24023438, + "step": 612, + "time_per_iteration": 4.767510175704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_mlp": 1.05974686, + "epoch": 0.11792997306656407, + "flos": 521537260032.0, + "grad_norm": 0.0290479345737112, + "language_loss": 0.97953087, + "learning_rate": 0.000979866002183916, + "loss": 0.99138713, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 1.2578125, + "step": 613, + "time_per_iteration": 2.6752681732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182111, + "balance_loss_mlp": 1.05632174, + "epoch": 0.11812235475182763, + "flos": 667488608256.0, + "grad_norm": 0.030776001440310688, + "language_loss": 0.9883132, + "learning_rate": 0.0009797783914423082, + "loss": 1.00013435, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 1.25683594, + "step": 614, + "time_per_iteration": 2.8556718826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182577, + "balance_loss_mlp": 1.05697787, + "epoch": 0.11831473643709119, + "flos": 622504121856.0, + "grad_norm": 0.02739500646081478, + "language_loss": 0.93579996, + "learning_rate": 0.0009796905944342094, + "loss": 0.94762576, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 1.25488281, + "step": 615, + "time_per_iteration": 2.80253267288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187072, + "balance_loss_mlp": 1.06152117, + "epoch": 0.11850711812235475, + "flos": 457694596608.0, + "grad_norm": 0.020858577781052552, + "language_loss": 0.96166766, + "learning_rate": 0.0009796026111937057, + "loss": 0.9735384, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 1.25439453, + "step": 616, + "time_per_iteration": 2.5763044357299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189497, + "balance_loss_mlp": 1.06404102, + "epoch": 0.11869949980761832, + "flos": 514927137792.0, + "grad_norm": 0.022050319992180305, + "language_loss": 0.96050835, + "learning_rate": 0.0009795144417549552, + "loss": 0.97240329, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 1.25341797, + "step": 617, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186044, + "balance_loss_mlp": 1.06092167, + "epoch": 0.11889188149288188, + "flos": 536156116992.0, + "grad_norm": 0.0238791856796517, + "language_loss": 0.97532642, + "learning_rate": 0.0009794260861521883, + "loss": 0.98718691, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 1.25292969, + "step": 618, + "time_per_iteration": 2.784257173538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_mlp": 1.06445491, + "epoch": 0.11908426317814544, + "flos": 499644266496.0, + "grad_norm": 0.024260475486046627, + "language_loss": 0.96495152, + "learning_rate": 0.0009793375444197075, + "loss": 0.97684348, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 1.25, + "step": 619, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189567, + "balance_loss_mlp": 1.06482673, + "epoch": 0.119276644863409, + "flos": 661067139072.0, + "grad_norm": 0.023292068214373615, + "language_loss": 0.96012962, + "learning_rate": 0.000979248816591888, + "loss": 0.97202522, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 1.25, + "step": 620, + "time_per_iteration": 2.783372640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184512, + "balance_loss_mlp": 1.06001019, + "epoch": 0.11946902654867257, + "flos": 760152021504.0, + "grad_norm": 0.02911418191745056, + "language_loss": 0.95521206, + "learning_rate": 0.0009791599027031766, + "loss": 0.96705711, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 1.24755859, + "step": 621, + "time_per_iteration": 3.04338002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185972, + "balance_loss_mlp": 1.06156564, + "epoch": 0.11966140823393613, + "flos": 682213526016.0, + "grad_norm": 0.0317276180850791, + "language_loss": 0.96021026, + "learning_rate": 0.0009790708027880932, + "loss": 0.97206998, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 1.24658203, + "step": 622, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184547, + "balance_loss_mlp": 1.06171417, + "epoch": 0.11985378991919969, + "flos": 1454298147840.0, + "grad_norm": 0.011779966077399251, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78611839, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 1.23046875, + "step": 623, + "time_per_iteration": 4.88221549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.06291461, + "epoch": 0.12004617160446325, + "flos": 528898718208.0, + "grad_norm": 0.0243802584204396, + "language_loss": 1.01341891, + "learning_rate": 0.0009788920450172487, + "loss": 1.0252955, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 1.25, + "step": 624, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190724, + "balance_loss_mlp": 1.06655562, + "epoch": 0.12023855328972682, + "flos": 475176354816.0, + "grad_norm": 0.025839680970612892, + "language_loss": 0.99598378, + "learning_rate": 0.0009788023872308875, + "loss": 1.00789118, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 1.24414062, + "step": 625, + "time_per_iteration": 2.5168616771698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_mlp": 1.06723785, + "epoch": 0.12043093497499038, + "flos": 1535051880960.0, + "grad_norm": 0.008994278182213968, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76618505, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 1.22460938, + "step": 626, + "time_per_iteration": 4.739393472671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194547, + "balance_loss_mlp": 1.07128501, + "epoch": 0.12062331666025394, + "flos": 540914459136.0, + "grad_norm": 0.025390703641747513, + "language_loss": 1.01758838, + "learning_rate": 0.0009786225140303285, + "loss": 1.02953386, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 1.23486328, + "step": 627, + "time_per_iteration": 2.627995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_mlp": 1.06683803, + "epoch": 0.1208156983455175, + "flos": 512999496192.0, + "grad_norm": 0.027559316114759484, + "language_loss": 1.00245547, + "learning_rate": 0.0009785322986859634, + "loss": 1.0143609, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 1.23925781, + "step": 628, + "time_per_iteration": 2.657465696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011787, + "balance_loss_mlp": 1.05481803, + "epoch": 0.12100808003078108, + "flos": 597589046784.0, + "grad_norm": 0.024406659961039724, + "language_loss": 1.01031506, + "learning_rate": 0.0009784418975588838, + "loss": 1.02210212, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 1.24121094, + "step": 629, + "time_per_iteration": 2.6953535079956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187008, + "balance_loss_mlp": 1.063555, + "epoch": 0.12120046171604464, + "flos": 524066515968.0, + "grad_norm": 0.02180733694842763, + "language_loss": 0.99517697, + "learning_rate": 0.0009783513106841862, + "loss": 1.00704694, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 1.23681641, + "step": 630, + "time_per_iteration": 2.7234978675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189331, + "balance_loss_mlp": 1.06687927, + "epoch": 0.1213928434013082, + "flos": 1557907057152.0, + "grad_norm": 0.011472153843238986, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77922034, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 1.2265625, + "step": 631, + "time_per_iteration": 4.975109100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184278, + "balance_loss_mlp": 1.06072986, + "epoch": 0.12158522508657175, + "flos": 496387869696.0, + "grad_norm": 0.025959921000511615, + "language_loss": 0.96498066, + "learning_rate": 0.0009781695798326854, + "loss": 0.97682351, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 1.23779297, + "step": 632, + "time_per_iteration": 2.5740485191345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_mlp": 1.0608983, + "epoch": 0.12177760677183531, + "flos": 476589703680.0, + "grad_norm": 0.025554774573744533, + "language_loss": 0.96275663, + "learning_rate": 0.0009780784359264365, + "loss": 0.9746002, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 1.23681641, + "step": 633, + "time_per_iteration": 2.604390859603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_mlp": 1.05543518, + "epoch": 0.12196998845709889, + "flos": 1471784635392.0, + "grad_norm": 0.009598735556444526, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75365245, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 1.21289062, + "step": 634, + "time_per_iteration": 4.757449626922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_mlp": 1.05424869, + "epoch": 0.12216237014236245, + "flos": 587748724224.0, + "grad_norm": 0.021555120902870813, + "language_loss": 0.93822527, + "learning_rate": 0.000977895591329867, + "loss": 0.94999647, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 1.23095703, + "step": 635, + "time_per_iteration": 2.7859792709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_mlp": 1.05851305, + "epoch": 0.12235475182762601, + "flos": 599106455040.0, + "grad_norm": 0.023775729584682537, + "language_loss": 0.96009773, + "learning_rate": 0.000977803890710533, + "loss": 0.97191262, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 1.23193359, + "step": 636, + "time_per_iteration": 2.76069712638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180701, + "balance_loss_mlp": 1.05762947, + "epoch": 0.12254713351288957, + "flos": 498760673280.0, + "grad_norm": 0.024707427516876792, + "language_loss": 1.00440359, + "learning_rate": 0.0009777120045912774, + "loss": 1.01621056, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 1.23291016, + "step": 637, + "time_per_iteration": 2.5980072021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118065, + "balance_loss_mlp": 1.05772126, + "epoch": 0.12273951519815314, + "flos": 606980204544.0, + "grad_norm": 0.02489341207380848, + "language_loss": 0.99891078, + "learning_rate": 0.0009776199330077736, + "loss": 1.01071739, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 1.23144531, + "step": 638, + "time_per_iteration": 2.704040288925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181154, + "balance_loss_mlp": 1.05841601, + "epoch": 0.1229318968834167, + "flos": 598984931328.0, + "grad_norm": 0.02631208797714665, + "language_loss": 1.02141118, + "learning_rate": 0.0009775276759957667, + "loss": 1.03322268, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 1.22949219, + "step": 639, + "time_per_iteration": 2.7442896366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.05700564, + "epoch": 0.12312427856868026, + "flos": 679588942848.0, + "grad_norm": 0.026802425502252814, + "language_loss": 1.01084137, + "learning_rate": 0.0009774352335910745, + "loss": 1.02264071, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 1.23144531, + "step": 640, + "time_per_iteration": 2.8294076919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117918, + "balance_loss_mlp": 1.05625129, + "epoch": 0.12331666025394382, + "flos": 610043218944.0, + "grad_norm": 0.020742791942005383, + "language_loss": 1.02118182, + "learning_rate": 0.000977342605829586, + "loss": 1.03297377, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 1.23144531, + "step": 641, + "time_per_iteration": 2.7078418731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180028, + "balance_loss_mlp": 1.05748129, + "epoch": 0.12350904193920739, + "flos": 763840118784.0, + "grad_norm": 0.025027209312251563, + "language_loss": 0.94737858, + "learning_rate": 0.0009772497927472623, + "loss": 0.95917892, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 1.22753906, + "step": 642, + "time_per_iteration": 3.0655579566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177096, + "balance_loss_mlp": 1.05454898, + "epoch": 0.12370142362447095, + "flos": 542049831936.0, + "grad_norm": 0.02608476880613399, + "language_loss": 0.96273685, + "learning_rate": 0.0009771567943801368, + "loss": 0.97450781, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 1.22753906, + "step": 643, + "time_per_iteration": 2.7343406677246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179725, + "balance_loss_mlp": 1.05727291, + "epoch": 0.12389380530973451, + "flos": 549252836352.0, + "grad_norm": 0.02435000122960196, + "language_loss": 0.99357152, + "learning_rate": 0.0009770636107643152, + "loss": 1.00536871, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 1.2265625, + "step": 644, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_mlp": 1.05516136, + "epoch": 0.12408618699499807, + "flos": 541352890368.0, + "grad_norm": 0.02246298440278387, + "language_loss": 0.95392644, + "learning_rate": 0.0009769702419359738, + "loss": 0.96570063, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 1.22460938, + "step": 645, + "time_per_iteration": 2.674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.05904841, + "epoch": 0.12427856868026164, + "flos": 747159361536.0, + "grad_norm": 0.023095982047370255, + "language_loss": 0.97586024, + "learning_rate": 0.000976876687931362, + "loss": 0.98767477, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 1.22607422, + "step": 646, + "time_per_iteration": 2.9833688735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189298, + "balance_loss_mlp": 1.06703711, + "epoch": 0.1244709503655252, + "flos": 534744769536.0, + "grad_norm": 0.03060863164707411, + "language_loss": 0.94044995, + "learning_rate": 0.0009767829487868005, + "loss": 0.95234299, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 1.22460938, + "step": 647, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182997, + "balance_loss_mlp": 1.06073558, + "epoch": 0.12466333205078876, + "flos": 509111285760.0, + "grad_norm": 0.028982594733012217, + "language_loss": 0.98960567, + "learning_rate": 0.000976689024538682, + "loss": 1.00143564, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 1.22460938, + "step": 648, + "time_per_iteration": 2.5837948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183924, + "balance_loss_mlp": 1.06171107, + "epoch": 0.12485571373605232, + "flos": 682639222272.0, + "grad_norm": 0.03213416167398649, + "language_loss": 0.97804081, + "learning_rate": 0.0009765949152234716, + "loss": 0.98988008, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 1.22412109, + "step": 649, + "time_per_iteration": 2.876009702682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_mlp": 1.07243347, + "epoch": 0.1250480954213159, + "flos": 1333198748160.0, + "grad_norm": 0.014891788740719425, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79879445, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 1.2109375, + "step": 650, + "time_per_iteration": 4.675558805465698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_mlp": 1.06152093, + "epoch": 0.12524047710657946, + "flos": 940196754432.0, + "grad_norm": 0.027794334398077363, + "language_loss": 0.91408408, + "learning_rate": 0.0009764061415379919, + "loss": 0.9259119, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 1.21435547, + "step": 651, + "time_per_iteration": 3.260758399963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184193, + "balance_loss_mlp": 1.06288576, + "epoch": 0.12543285879184302, + "flos": 514900941312.0, + "grad_norm": 0.027655948956122736, + "language_loss": 0.97430605, + "learning_rate": 0.0009763114772410109, + "loss": 0.986148, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 1.21484375, + "step": 652, + "time_per_iteration": 2.60402512550354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179616, + "balance_loss_mlp": 1.05849957, + "epoch": 0.12562524047710658, + "flos": 719682829824.0, + "grad_norm": 0.022040452281994895, + "language_loss": 0.94100869, + "learning_rate": 0.0009762166280235146, + "loss": 0.95280486, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 1.21289062, + "step": 653, + "time_per_iteration": 2.953866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177042, + "balance_loss_mlp": 1.05592513, + "epoch": 0.12581762216237014, + "flos": 564798220800.0, + "grad_norm": 0.026345633512325176, + "language_loss": 0.96725851, + "learning_rate": 0.0009761215939223267, + "loss": 0.97902894, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 1.21289062, + "step": 654, + "time_per_iteration": 2.6936216354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176243, + "balance_loss_mlp": 1.0553174, + "epoch": 0.1260100038476337, + "flos": 482900382720.0, + "grad_norm": 0.0302310026354778, + "language_loss": 0.97697163, + "learning_rate": 0.0009760263749743428, + "loss": 0.98873413, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 1.2109375, + "step": 655, + "time_per_iteration": 2.5425992012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173716, + "balance_loss_mlp": 1.05302835, + "epoch": 0.12620238553289725, + "flos": 576701170176.0, + "grad_norm": 0.026173940013352312, + "language_loss": 0.96703827, + "learning_rate": 0.0009759309712165299, + "loss": 0.97877538, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 1.20849609, + "step": 656, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182641, + "balance_loss_mlp": 1.06185794, + "epoch": 0.12639476721816084, + "flos": 532185314304.0, + "grad_norm": 0.024272217680215723, + "language_loss": 1.00863099, + "learning_rate": 0.0009758353826859272, + "loss": 1.02045751, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 1.20947266, + "step": 657, + "time_per_iteration": 2.621317148208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183764, + "balance_loss_mlp": 1.06288576, + "epoch": 0.1265871489034244, + "flos": 691231380480.0, + "grad_norm": 0.02639198012969831, + "language_loss": 0.9913975, + "learning_rate": 0.0009757396094196456, + "loss": 1.00323522, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 1.21044922, + "step": 658, + "time_per_iteration": 2.8867759704589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183942, + "balance_loss_mlp": 1.06311166, + "epoch": 0.12677953058868796, + "flos": 538242212352.0, + "grad_norm": 0.02343039495549204, + "language_loss": 0.91435432, + "learning_rate": 0.0009756436514548673, + "loss": 0.92619371, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 1.20996094, + "step": 659, + "time_per_iteration": 2.8055155277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179962, + "balance_loss_mlp": 1.05903614, + "epoch": 0.12697191227395152, + "flos": 520119908352.0, + "grad_norm": 0.02147737158217614, + "language_loss": 0.94944704, + "learning_rate": 0.0009755475088288466, + "loss": 0.96124667, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 1.2109375, + "step": 660, + "time_per_iteration": 2.713801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179144, + "balance_loss_mlp": 1.05826533, + "epoch": 0.12716429395921508, + "flos": 567665851392.0, + "grad_norm": 0.026687699897107686, + "language_loss": 0.99289566, + "learning_rate": 0.0009754511815789095, + "loss": 1.00468707, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 1.21044922, + "step": 661, + "time_per_iteration": 2.739250898361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176549, + "balance_loss_mlp": 1.05590951, + "epoch": 0.12735667564447864, + "flos": 515141987328.0, + "grad_norm": 0.028028480179563667, + "language_loss": 0.94950283, + "learning_rate": 0.0009753546697424533, + "loss": 0.96126837, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 1.20800781, + "step": 662, + "time_per_iteration": 2.71746826171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180242, + "balance_loss_mlp": 1.05941188, + "epoch": 0.1275490573297422, + "flos": 542321077248.0, + "grad_norm": 0.02443290319898258, + "language_loss": 0.98755229, + "learning_rate": 0.0009752579733569475, + "loss": 0.99935466, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 1.20996094, + "step": 663, + "time_per_iteration": 2.631284713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06030273, + "epoch": 0.12774143901500576, + "flos": 1562024853504.0, + "grad_norm": 0.010147906106003043, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76060903, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 1.19335938, + "step": 664, + "time_per_iteration": 4.941519260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188286, + "balance_loss_mlp": 1.06783676, + "epoch": 0.12793382070026935, + "flos": 614873419776.0, + "grad_norm": 0.028758292375382164, + "language_loss": 1.00255466, + "learning_rate": 0.0009750640270890217, + "loss": 1.01443744, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 1.20605469, + "step": 665, + "time_per_iteration": 2.7382516860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185033, + "balance_loss_mlp": 1.06458378, + "epoch": 0.1281262023855329, + "flos": 709117367808.0, + "grad_norm": 0.02727882395737353, + "language_loss": 1.05972624, + "learning_rate": 0.0009749667772818983, + "loss": 1.0715766, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 1.20605469, + "step": 666, + "time_per_iteration": 2.961103677749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117968, + "balance_loss_mlp": 1.06104279, + "epoch": 0.12831858407079647, + "flos": 1428182572032.0, + "grad_norm": 0.005713660367986308, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78115624, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 1.1875, + "step": 667, + "time_per_iteration": 4.799788475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180825, + "balance_loss_mlp": 1.06056714, + "epoch": 0.12851096575606002, + "flos": 450018232320.0, + "grad_norm": 0.027450705632443572, + "language_loss": 1.04045725, + "learning_rate": 0.0009747717245101093, + "loss": 1.05226541, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 1.20410156, + "step": 668, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181103, + "balance_loss_mlp": 1.0609405, + "epoch": 0.12870334744132358, + "flos": 480909614592.0, + "grad_norm": 0.024743463193645603, + "language_loss": 0.94192064, + "learning_rate": 0.00097467392162117, + "loss": 0.95373166, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 1.203125, + "step": 669, + "time_per_iteration": 2.6341683864593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176215, + "balance_loss_mlp": 1.05609953, + "epoch": 0.12889572912658714, + "flos": 640151064576.0, + "grad_norm": 0.020470833753638586, + "language_loss": 0.98179239, + "learning_rate": 0.0009745759344474708, + "loss": 0.99355447, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 1.20263672, + "step": 670, + "time_per_iteration": 2.8753654956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175464, + "balance_loss_mlp": 1.05530083, + "epoch": 0.1290881108118507, + "flos": 510954333696.0, + "grad_norm": 0.02496408481001148, + "language_loss": 0.98669916, + "learning_rate": 0.0009744777630270536, + "loss": 0.99845386, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 1.203125, + "step": 671, + "time_per_iteration": 2.601480484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173739, + "balance_loss_mlp": 1.05381489, + "epoch": 0.12928049249711426, + "flos": 672290611200.0, + "grad_norm": 0.0267777739546368, + "language_loss": 1.0349828, + "learning_rate": 0.000974379407398032, + "loss": 1.04672015, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 1.20068359, + "step": 672, + "time_per_iteration": 2.8746023178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176311, + "balance_loss_mlp": 1.05633891, + "epoch": 0.12947287418237785, + "flos": 794998743552.0, + "grad_norm": 0.021070447178693698, + "language_loss": 0.89884377, + "learning_rate": 0.0009742808675985913, + "loss": 0.91060686, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 1.20117188, + "step": 673, + "time_per_iteration": 3.106855869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178925, + "balance_loss_mlp": 1.05895269, + "epoch": 0.1296652558676414, + "flos": 486447490560.0, + "grad_norm": 0.028552559493613055, + "language_loss": 1.00707459, + "learning_rate": 0.0009741821436669876, + "loss": 1.0188638, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 1.20117188, + "step": 674, + "time_per_iteration": 2.6221611499786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_mlp": 1.06097043, + "epoch": 0.12985763755290497, + "flos": 454392537600.0, + "grad_norm": 0.03163366532216525, + "language_loss": 1.04449701, + "learning_rate": 0.0009740832356415492, + "loss": 1.05630445, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 1.19921875, + "step": 675, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179614, + "balance_loss_mlp": 1.05968916, + "epoch": 0.13005001923816853, + "flos": 826434617856.0, + "grad_norm": 0.02755997498495484, + "language_loss": 0.99148017, + "learning_rate": 0.0009739841435606756, + "loss": 1.00327623, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 1.20068359, + "step": 676, + "time_per_iteration": 3.026420831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180175, + "balance_loss_mlp": 1.06058431, + "epoch": 0.1302424009234321, + "flos": 532480754688.0, + "grad_norm": 0.02275953253130011, + "language_loss": 0.97366607, + "learning_rate": 0.0009738848674628377, + "loss": 0.98546779, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 1.19726562, + "step": 677, + "time_per_iteration": 2.710205554962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179059, + "balance_loss_mlp": 1.05927801, + "epoch": 0.13043478260869565, + "flos": 526916682240.0, + "grad_norm": 0.02441501439452981, + "language_loss": 0.97902691, + "learning_rate": 0.000973785407386578, + "loss": 0.99081755, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 1.19921875, + "step": 678, + "time_per_iteration": 2.7785394191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184892, + "balance_loss_mlp": 1.06553924, + "epoch": 0.1306271642939592, + "flos": 627416914944.0, + "grad_norm": 0.023801085732510874, + "language_loss": 0.94469249, + "learning_rate": 0.0009736857633705103, + "loss": 0.95654142, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 1.19482422, + "step": 679, + "time_per_iteration": 2.8619470596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177483, + "balance_loss_mlp": 1.05827415, + "epoch": 0.13081954597922277, + "flos": 551840489472.0, + "grad_norm": 0.024512943765722366, + "language_loss": 1.01033652, + "learning_rate": 0.0009735859354533196, + "loss": 1.02211142, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 1.19335938, + "step": 680, + "time_per_iteration": 2.6954457759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176387, + "balance_loss_mlp": 1.05755925, + "epoch": 0.13101192766448633, + "flos": 537955504128.0, + "grad_norm": 0.029188130773433643, + "language_loss": 1.02405858, + "learning_rate": 0.0009734859236737628, + "loss": 1.03582239, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 1.18945312, + "step": 681, + "time_per_iteration": 2.606597661972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172364, + "balance_loss_mlp": 1.05353606, + "epoch": 0.13120430934974991, + "flos": 504513398784.0, + "grad_norm": 0.02625319928532985, + "language_loss": 1.02007055, + "learning_rate": 0.0009733857280706678, + "loss": 1.03179431, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 1.18945312, + "step": 682, + "time_per_iteration": 2.626211404800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_mlp": 1.05010605, + "epoch": 0.13139669103501347, + "flos": 615422641152.0, + "grad_norm": 0.025135553656080285, + "language_loss": 0.9321503, + "learning_rate": 0.000973285348682934, + "loss": 0.94383633, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 1.18603516, + "step": 683, + "time_per_iteration": 2.71779727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190269, + "balance_loss_mlp": 1.07296753, + "epoch": 0.13158907272027703, + "flos": 1488215614464.0, + "grad_norm": 0.025067429703540995, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7908864, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 1.17382812, + "step": 684, + "time_per_iteration": 4.811431169509888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168738, + "balance_loss_mlp": 1.05048192, + "epoch": 0.1317814544055406, + "flos": 987117614592.0, + "grad_norm": 0.026136533405527674, + "language_loss": 0.93269205, + "learning_rate": 0.0009730840387095046, + "loss": 0.94437939, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 1.18359375, + "step": 685, + "time_per_iteration": 3.3154938220977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117288, + "balance_loss_mlp": 1.05443382, + "epoch": 0.13197383609080415, + "flos": 612628870656.0, + "grad_norm": 0.026271684435729213, + "language_loss": 0.99177825, + "learning_rate": 0.0009729831082019642, + "loss": 1.00350702, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 1.18554688, + "step": 686, + "time_per_iteration": 2.79620623588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.06093395, + "epoch": 0.1321662177760677, + "flos": 495554668032.0, + "grad_norm": 0.02508782879826625, + "language_loss": 0.97052312, + "learning_rate": 0.0009728819940660958, + "loss": 0.98231786, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 1.18652344, + "step": 687, + "time_per_iteration": 2.779193162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178983, + "balance_loss_mlp": 1.06067955, + "epoch": 0.13235859946133127, + "flos": 496843765248.0, + "grad_norm": 0.02705130625621755, + "language_loss": 0.97550011, + "learning_rate": 0.0009727806963411557, + "loss": 0.98728997, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 1.18408203, + "step": 688, + "time_per_iteration": 2.5702319145202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.05883551, + "epoch": 0.13255098114659483, + "flos": 512767182336.0, + "grad_norm": 0.022910122085290585, + "language_loss": 0.96022904, + "learning_rate": 0.000972679215066471, + "loss": 0.97200048, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 1.18408203, + "step": 689, + "time_per_iteration": 2.64780592918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178761, + "balance_loss_mlp": 1.06050563, + "epoch": 0.13274336283185842, + "flos": 548399442432.0, + "grad_norm": 0.030606528220640358, + "language_loss": 1.08985806, + "learning_rate": 0.0009725775502814401, + "loss": 1.10164571, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 1.18359375, + "step": 690, + "time_per_iteration": 2.5830535888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06120849, + "epoch": 0.13293574451712198, + "flos": 642002844672.0, + "grad_norm": 0.023439513257655937, + "language_loss": 0.94635952, + "learning_rate": 0.0009724757020255327, + "loss": 0.95815468, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 1.18408203, + "step": 691, + "time_per_iteration": 2.827944278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183334, + "balance_loss_mlp": 1.06517375, + "epoch": 0.13312812620238554, + "flos": 492469459968.0, + "grad_norm": 0.028212898490696088, + "language_loss": 0.96836531, + "learning_rate": 0.0009723736703382902, + "loss": 0.98019874, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 1.18261719, + "step": 692, + "time_per_iteration": 2.6144213676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180114, + "balance_loss_mlp": 1.06200123, + "epoch": 0.1333205078876491, + "flos": 509949216768.0, + "grad_norm": 0.023005533645913036, + "language_loss": 0.90654016, + "learning_rate": 0.0009722714552593244, + "loss": 0.91834128, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 1.18212891, + "step": 693, + "time_per_iteration": 2.600128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180549, + "balance_loss_mlp": 1.06262743, + "epoch": 0.13351288957291266, + "flos": 419591477760.0, + "grad_norm": 0.029950659996273835, + "language_loss": 1.05475199, + "learning_rate": 0.000972169056828319, + "loss": 1.06655741, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 1.18017578, + "step": 694, + "time_per_iteration": 2.466643810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178338, + "balance_loss_mlp": 1.0606066, + "epoch": 0.13370527125817622, + "flos": 617050839552.0, + "grad_norm": 0.021764231653516302, + "language_loss": 0.95444119, + "learning_rate": 0.0009720664750850283, + "loss": 0.96622455, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 1.17822266, + "step": 695, + "time_per_iteration": 2.7776308059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173328, + "balance_loss_mlp": 1.05578816, + "epoch": 0.13389765294343978, + "flos": 627169138176.0, + "grad_norm": 0.026088042391715836, + "language_loss": 1.0165019, + "learning_rate": 0.0009719637100692784, + "loss": 1.0282352, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 1.17626953, + "step": 696, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175294, + "balance_loss_mlp": 1.0578016, + "epoch": 0.13409003462870334, + "flos": 610896612864.0, + "grad_norm": 0.027090913840535472, + "language_loss": 0.92017978, + "learning_rate": 0.0009718607618209661, + "loss": 0.93193275, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 1.17578125, + "step": 697, + "time_per_iteration": 2.8413584232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179845, + "balance_loss_mlp": 1.06235278, + "epoch": 0.13428241631396692, + "flos": 685087887360.0, + "grad_norm": 0.024883061853709334, + "language_loss": 0.95573747, + "learning_rate": 0.0009717576303800595, + "loss": 0.96753585, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 1.17578125, + "step": 698, + "time_per_iteration": 3.047100782394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175386, + "balance_loss_mlp": 1.05794048, + "epoch": 0.13447479799923048, + "flos": 509818960896.0, + "grad_norm": 0.024888049065051182, + "language_loss": 0.95325053, + "learning_rate": 0.0009716543157865975, + "loss": 0.96500432, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 1.17529297, + "step": 699, + "time_per_iteration": 2.7481272220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_mlp": 1.05878782, + "epoch": 0.13466717968449404, + "flos": 899058819072.0, + "grad_norm": 0.023872779385430955, + "language_loss": 0.92076075, + "learning_rate": 0.0009715508180806907, + "loss": 0.93252313, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 1.17529297, + "step": 700, + "time_per_iteration": 3.2107367515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173529, + "balance_loss_mlp": 1.05660856, + "epoch": 0.1348595613697576, + "flos": 991694034432.0, + "grad_norm": 0.023513798430807663, + "language_loss": 1.00262749, + "learning_rate": 0.0009714471373025202, + "loss": 1.01436281, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 1.16992188, + "step": 701, + "time_per_iteration": 3.3966751098632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173715, + "balance_loss_mlp": 1.0566988, + "epoch": 0.13505194305502116, + "flos": 488811561984.0, + "grad_norm": 0.028001983236069502, + "language_loss": 0.99373382, + "learning_rate": 0.0009713432734923386, + "loss": 1.00547099, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 1.17089844, + "step": 702, + "time_per_iteration": 2.615107536315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171408, + "balance_loss_mlp": 1.05439234, + "epoch": 0.13524432474028472, + "flos": 614519582208.0, + "grad_norm": 0.024192478681639117, + "language_loss": 0.96606487, + "learning_rate": 0.0009712392266904696, + "loss": 0.97777891, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 1.17089844, + "step": 703, + "time_per_iteration": 2.7448034286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174325, + "balance_loss_mlp": 1.05740499, + "epoch": 0.13543670642554828, + "flos": 906274558464.0, + "grad_norm": 0.025492480769094515, + "language_loss": 0.96012545, + "learning_rate": 0.0009711349969373076, + "loss": 0.97186869, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 1.16992188, + "step": 704, + "time_per_iteration": 3.1337268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172794, + "balance_loss_mlp": 1.05596876, + "epoch": 0.13562908811081184, + "flos": 551747163648.0, + "grad_norm": 0.026772975251671254, + "language_loss": 0.91034031, + "learning_rate": 0.0009710305842733178, + "loss": 0.9220683, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 1.16894531, + "step": 705, + "time_per_iteration": 2.7571139335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_mlp": 1.05031061, + "epoch": 0.1358214697960754, + "flos": 509037425664.0, + "grad_norm": 0.024292049069741084, + "language_loss": 0.98220038, + "learning_rate": 0.0009709259887390373, + "loss": 0.99387223, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 1.16943359, + "step": 706, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168004, + "balance_loss_mlp": 1.05141699, + "epoch": 0.136013851481339, + "flos": 529923300864.0, + "grad_norm": 0.025926611739077732, + "language_loss": 1.00068641, + "learning_rate": 0.0009708212103750737, + "loss": 1.01236641, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 1.16650391, + "step": 707, + "time_per_iteration": 2.6197190284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168587, + "balance_loss_mlp": 1.05219126, + "epoch": 0.13620623316660255, + "flos": 660320532480.0, + "grad_norm": 0.02235622943703988, + "language_loss": 0.96270919, + "learning_rate": 0.0009707162492221051, + "loss": 0.97439504, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 1.16455078, + "step": 708, + "time_per_iteration": 2.8917648792266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171818, + "balance_loss_mlp": 1.05542207, + "epoch": 0.1363986148518661, + "flos": 673082880000.0, + "grad_norm": 0.027649047287573853, + "language_loss": 0.98132068, + "learning_rate": 0.0009706111053208815, + "loss": 0.99303889, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 1.16455078, + "step": 709, + "time_per_iteration": 2.7827165126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173191, + "balance_loss_mlp": 1.05669987, + "epoch": 0.13659099653712967, + "flos": 474004051968.0, + "grad_norm": 0.02773643003805471, + "language_loss": 0.94597077, + "learning_rate": 0.0009705057787122232, + "loss": 0.9577027, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 1.16552734, + "step": 710, + "time_per_iteration": 2.542836904525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169067, + "balance_loss_mlp": 1.05286229, + "epoch": 0.13678337822239323, + "flos": 453647932416.0, + "grad_norm": 0.0248615327032158, + "language_loss": 0.9884814, + "learning_rate": 0.0009704002694370216, + "loss": 1.00017214, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 1.16259766, + "step": 711, + "time_per_iteration": 2.550527811050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164533, + "balance_loss_mlp": 1.04842281, + "epoch": 0.13697575990765679, + "flos": 520625468928.0, + "grad_norm": 0.0274811578413112, + "language_loss": 0.97066599, + "learning_rate": 0.0009702945775362388, + "loss": 0.98231125, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 1.16162109, + "step": 712, + "time_per_iteration": 2.56953501701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116862, + "balance_loss_mlp": 1.05246294, + "epoch": 0.13716814159292035, + "flos": 481365510144.0, + "grad_norm": 0.025544817797380492, + "language_loss": 0.98621845, + "learning_rate": 0.0009701887030509086, + "loss": 0.99790466, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 1.16210938, + "step": 713, + "time_per_iteration": 2.6443872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_mlp": 1.05663013, + "epoch": 0.1373605232781839, + "flos": 546749776896.0, + "grad_norm": 0.02672517687154734, + "language_loss": 1.02031791, + "learning_rate": 0.0009700826460221346, + "loss": 1.03204811, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 1.16455078, + "step": 714, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_mlp": 1.05508566, + "epoch": 0.1375529049634475, + "flos": 710070091776.0, + "grad_norm": 0.027473841831572973, + "language_loss": 1.03736091, + "learning_rate": 0.0009699764064910921, + "loss": 1.04907441, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 1.16308594, + "step": 715, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_mlp": 1.05281401, + "epoch": 0.13774528664871105, + "flos": 487676189184.0, + "grad_norm": 0.02500038679906112, + "language_loss": 0.96403199, + "learning_rate": 0.0009698699844990268, + "loss": 0.9757241, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 1.16455078, + "step": 716, + "time_per_iteration": 2.638272762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116569, + "balance_loss_mlp": 1.04972363, + "epoch": 0.1379376683339746, + "flos": 681458187264.0, + "grad_norm": 0.024933229917961583, + "language_loss": 0.9565106, + "learning_rate": 0.0009697633800872555, + "loss": 0.96816742, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 1.16015625, + "step": 717, + "time_per_iteration": 2.8989553451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168974, + "balance_loss_mlp": 1.05310297, + "epoch": 0.13813005001923817, + "flos": 612225368064.0, + "grad_norm": 0.02330012063083705, + "language_loss": 1.0130372, + "learning_rate": 0.0009696565932971655, + "loss": 1.02472687, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 1.15917969, + "step": 718, + "time_per_iteration": 2.8472671508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171117, + "balance_loss_mlp": 1.05524576, + "epoch": 0.13832243170450173, + "flos": 589926144000.0, + "grad_norm": 0.027418468702626427, + "language_loss": 0.98498988, + "learning_rate": 0.0009695496241702153, + "loss": 0.99670106, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 1.15917969, + "step": 719, + "time_per_iteration": 2.786895990371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167345, + "balance_loss_mlp": 1.05180764, + "epoch": 0.1385148133897653, + "flos": 701319479808.0, + "grad_norm": 0.026285913371991803, + "language_loss": 0.94868541, + "learning_rate": 0.0009694424727479339, + "loss": 0.96035892, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 1.15576172, + "step": 720, + "time_per_iteration": 2.921644926071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117298, + "balance_loss_mlp": 1.05729949, + "epoch": 0.13870719507502885, + "flos": 599366966784.0, + "grad_norm": 0.024279001882637877, + "language_loss": 0.97845113, + "learning_rate": 0.0009693351390719213, + "loss": 0.99018097, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 1.15722656, + "step": 721, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168632, + "balance_loss_mlp": 1.05304694, + "epoch": 0.1388995767602924, + "flos": 587748724224.0, + "grad_norm": 0.03212240351747381, + "language_loss": 0.98596126, + "learning_rate": 0.000969227623183848, + "loss": 0.99764758, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 1.15625, + "step": 722, + "time_per_iteration": 2.7723541259765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_mlp": 1.05205071, + "epoch": 0.139091958445556, + "flos": 652362189312.0, + "grad_norm": 0.025655198862846312, + "language_loss": 0.99224544, + "learning_rate": 0.0009691199251254554, + "loss": 1.00392079, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 1.15527344, + "step": 723, + "time_per_iteration": 2.8426058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165537, + "balance_loss_mlp": 1.05019021, + "epoch": 0.13928434013081956, + "flos": 576905286144.0, + "grad_norm": 0.022500478429048027, + "language_loss": 0.9243086, + "learning_rate": 0.0009690120449385555, + "loss": 0.93596393, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 1.15380859, + "step": 724, + "time_per_iteration": 2.7558276653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168709, + "balance_loss_mlp": 1.05307627, + "epoch": 0.13947672181608312, + "flos": 564314127360.0, + "grad_norm": 0.02294482348940274, + "language_loss": 1.00981367, + "learning_rate": 0.0009689039826650312, + "loss": 1.02150071, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 1.15673828, + "step": 725, + "time_per_iteration": 2.784708261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211281, + "balance_loss_mlp": 1.09550476, + "epoch": 0.13966910350134668, + "flos": 1524949045248.0, + "grad_norm": 0.02639881420994122, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77734339, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 1.15820312, + "step": 726, + "time_per_iteration": 4.9523255825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171441, + "balance_loss_mlp": 1.05604661, + "epoch": 0.13986148518661023, + "flos": 500855500800.0, + "grad_norm": 0.0321160389091748, + "language_loss": 0.98954523, + "learning_rate": 0.0009686873120259941, + "loss": 1.00125957, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 1.15429688, + "step": 727, + "time_per_iteration": 2.584141731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173326, + "balance_loss_mlp": 1.05850363, + "epoch": 0.1400538668718738, + "flos": 599849058816.0, + "grad_norm": 0.027531106684590426, + "language_loss": 0.93834305, + "learning_rate": 0.0009685787037446004, + "loss": 0.95007634, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 1.1484375, + "step": 728, + "time_per_iteration": 2.770592451095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_mlp": 1.05520177, + "epoch": 0.14024624855713735, + "flos": 595168579584.0, + "grad_norm": 0.026051179565135866, + "language_loss": 0.98294961, + "learning_rate": 0.0009684699135448201, + "loss": 0.99465179, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 1.15039062, + "step": 729, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_mlp": 1.04985154, + "epoch": 0.1404386302424009, + "flos": 507585145344.0, + "grad_norm": 0.02205061924934426, + "language_loss": 0.98307908, + "learning_rate": 0.0009683609414688895, + "loss": 0.99472773, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 1.15039062, + "step": 730, + "time_per_iteration": 2.700016975402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.05254078, + "epoch": 0.14063101192766447, + "flos": 574515018240.0, + "grad_norm": 0.021243768346974407, + "language_loss": 0.95329058, + "learning_rate": 0.0009682517875591154, + "loss": 0.96496415, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 1.1484375, + "step": 731, + "time_per_iteration": 2.743590831756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.05264843, + "epoch": 0.14082339361292806, + "flos": 565764406272.0, + "grad_norm": 0.02284757167221282, + "language_loss": 0.93998873, + "learning_rate": 0.0009681424518578749, + "loss": 0.95166153, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 1.14648438, + "step": 732, + "time_per_iteration": 2.757690668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166596, + "balance_loss_mlp": 1.05215514, + "epoch": 0.14101577529819162, + "flos": 464582694912.0, + "grad_norm": 0.02112517179619274, + "language_loss": 0.95363593, + "learning_rate": 0.000968032934407616, + "loss": 0.96530199, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 1.14453125, + "step": 733, + "time_per_iteration": 2.6260647773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_mlp": 1.05257201, + "epoch": 0.14120815698345518, + "flos": 597261405696.0, + "grad_norm": 0.02235342076428548, + "language_loss": 0.90822989, + "learning_rate": 0.0009679232352508571, + "loss": 0.91990006, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 1.14453125, + "step": 734, + "time_per_iteration": 2.7677996158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167689, + "balance_loss_mlp": 1.05334342, + "epoch": 0.14140053866871874, + "flos": 536231978496.0, + "grad_norm": 0.023954026934244203, + "language_loss": 0.90350544, + "learning_rate": 0.0009678133544301871, + "loss": 0.91518235, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 1.14355469, + "step": 735, + "time_per_iteration": 2.6668286323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165912, + "balance_loss_mlp": 1.05147135, + "epoch": 0.1415929203539823, + "flos": 521276748288.0, + "grad_norm": 0.01836780541558419, + "language_loss": 0.98091269, + "learning_rate": 0.0009677032919882658, + "loss": 0.99257177, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 1.14453125, + "step": 736, + "time_per_iteration": 2.654975652694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_mlp": 1.0601368, + "epoch": 0.14178530203924586, + "flos": 483301883904.0, + "grad_norm": 0.025248480485652293, + "language_loss": 1.00008237, + "learning_rate": 0.000967593047967823, + "loss": 1.01183295, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 1.14941406, + "step": 737, + "time_per_iteration": 2.529147148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167635, + "balance_loss_mlp": 1.05319452, + "epoch": 0.14197768372450942, + "flos": 677839220736.0, + "grad_norm": 0.02278890168576414, + "language_loss": 0.9561522, + "learning_rate": 0.0009674826224116593, + "loss": 0.96782857, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 1.14453125, + "step": 738, + "time_per_iteration": 2.8032455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.05606639, + "epoch": 0.14217006540977298, + "flos": 446992147968.0, + "grad_norm": 0.026055784762538982, + "language_loss": 0.97800839, + "learning_rate": 0.0009673720153626455, + "loss": 0.989712, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 1.14306641, + "step": 739, + "time_per_iteration": 2.629868984222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172861, + "balance_loss_mlp": 1.05889642, + "epoch": 0.14236244709503657, + "flos": 497477580288.0, + "grad_norm": 0.02475738760241807, + "language_loss": 0.95941108, + "learning_rate": 0.0009672612268637235, + "loss": 0.97113973, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 1.13964844, + "step": 740, + "time_per_iteration": 2.6037824153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170194, + "balance_loss_mlp": 1.05618262, + "epoch": 0.14255482878030012, + "flos": 649479095808.0, + "grad_norm": 0.03387034378547869, + "language_loss": 0.95329261, + "learning_rate": 0.0009671502569579048, + "loss": 0.96499455, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 1.14013672, + "step": 741, + "time_per_iteration": 2.7700846195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.05657792, + "epoch": 0.14274721046556368, + "flos": 537274025472.0, + "grad_norm": 0.02433568326488268, + "language_loss": 0.98081231, + "learning_rate": 0.0009670391056882719, + "loss": 0.99251777, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 1.13964844, + "step": 742, + "time_per_iteration": 2.696019172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174036, + "balance_loss_mlp": 1.06002402, + "epoch": 0.14293959215082724, + "flos": 958583572992.0, + "grad_norm": 0.027423351639808666, + "language_loss": 0.96458268, + "learning_rate": 0.0009669277730979776, + "loss": 0.97632295, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 1.14013672, + "step": 743, + "time_per_iteration": 3.2084367275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174905, + "balance_loss_mlp": 1.06103587, + "epoch": 0.1431319738360908, + "flos": 694385719296.0, + "grad_norm": 0.02304461389980259, + "language_loss": 0.94654781, + "learning_rate": 0.0009668162592302449, + "loss": 0.9582969, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 1.13867188, + "step": 744, + "time_per_iteration": 2.8862292766571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184206, + "balance_loss_mlp": 1.07009852, + "epoch": 0.14332435552135436, + "flos": 566502280704.0, + "grad_norm": 0.024928546312887438, + "language_loss": 0.9473027, + "learning_rate": 0.0009667045641283676, + "loss": 0.95914471, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 1.14111328, + "step": 745, + "time_per_iteration": 2.6714677810668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_mlp": 1.05672932, + "epoch": 0.14351673720661792, + "flos": 739695845376.0, + "grad_norm": 0.027004630074695047, + "language_loss": 1.03854704, + "learning_rate": 0.0009665926878357092, + "loss": 1.05025315, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 1.13867188, + "step": 746, + "time_per_iteration": 2.9414963722229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168037, + "balance_loss_mlp": 1.05416811, + "epoch": 0.14370911889188148, + "flos": 550351279104.0, + "grad_norm": 0.024394803732961844, + "language_loss": 0.99195439, + "learning_rate": 0.0009664806303957043, + "loss": 1.00363481, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 1.13867188, + "step": 747, + "time_per_iteration": 2.6798276901245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175063, + "balance_loss_mlp": 1.06109881, + "epoch": 0.14390150057714507, + "flos": 591589271040.0, + "grad_norm": 0.028912253716933817, + "language_loss": 0.96970344, + "learning_rate": 0.0009663683918518571, + "loss": 0.98145401, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 1.13964844, + "step": 748, + "time_per_iteration": 2.894670248031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172034, + "balance_loss_mlp": 1.05845118, + "epoch": 0.14409388226240863, + "flos": 592144496640.0, + "grad_norm": 0.025560266799661176, + "language_loss": 0.96381319, + "learning_rate": 0.0009662559722477428, + "loss": 0.97553355, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 1.13574219, + "step": 749, + "time_per_iteration": 2.702796220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193848, + "balance_loss_mlp": 1.08131409, + "epoch": 0.1442862639476722, + "flos": 1514654828544.0, + "grad_norm": 0.02305864885865106, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77356815, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 1.125, + "step": 750, + "time_per_iteration": 5.010634660720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_mlp": 1.05287659, + "epoch": 0.14447864563293575, + "flos": 497855612928.0, + "grad_norm": 0.023714468612350204, + "language_loss": 0.97989428, + "learning_rate": 0.0009660305900333632, + "loss": 0.99155927, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 1.13623047, + "step": 751, + "time_per_iteration": 2.7064144611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_mlp": 1.05845106, + "epoch": 0.1446710273181993, + "flos": 590794274304.0, + "grad_norm": 0.03190287595859636, + "language_loss": 0.91963172, + "learning_rate": 0.0009659176275105992, + "loss": 0.93135297, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 1.13671875, + "step": 752, + "time_per_iteration": 2.7171401977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171619, + "balance_loss_mlp": 1.05803668, + "epoch": 0.14486340900346287, + "flos": 587012851200.0, + "grad_norm": 0.023715921645424867, + "language_loss": 0.93508279, + "learning_rate": 0.0009658044841025701, + "loss": 0.94679892, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 1.13574219, + "step": 753, + "time_per_iteration": 2.77504563331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172686, + "balance_loss_mlp": 1.05900788, + "epoch": 0.14505579068872643, + "flos": 505740096000.0, + "grad_norm": 0.025730958483317315, + "language_loss": 0.9055903, + "learning_rate": 0.0009656911598532021, + "loss": 0.91731715, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 1.13671875, + "step": 754, + "time_per_iteration": 2.642886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172881, + "balance_loss_mlp": 1.05925071, + "epoch": 0.14524817237399, + "flos": 487815177216.0, + "grad_norm": 0.025261406861214447, + "language_loss": 0.98625988, + "learning_rate": 0.0009655776548064917, + "loss": 0.9979887, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 1.13623047, + "step": 755, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169342, + "balance_loss_mlp": 1.05571139, + "epoch": 0.14544055405925355, + "flos": 729449292288.0, + "grad_norm": 0.025093779151575485, + "language_loss": 0.97407329, + "learning_rate": 0.0009654639690065054, + "loss": 0.98576677, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 1.13623047, + "step": 756, + "time_per_iteration": 2.867976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173831, + "balance_loss_mlp": 1.06024873, + "epoch": 0.14563293574451713, + "flos": 594786544128.0, + "grad_norm": 0.02769433731610086, + "language_loss": 0.96328217, + "learning_rate": 0.00096535010249738, + "loss": 0.97502041, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 1.13574219, + "step": 757, + "time_per_iteration": 2.718595266342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171947, + "balance_loss_mlp": 1.05879402, + "epoch": 0.1458253174297807, + "flos": 561622414848.0, + "grad_norm": 0.027253539371253223, + "language_loss": 0.93671888, + "learning_rate": 0.0009652360553233224, + "loss": 0.94843829, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 1.13134766, + "step": 758, + "time_per_iteration": 2.732665538787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_mlp": 1.06835938, + "epoch": 0.14601769911504425, + "flos": 1561186922496.0, + "grad_norm": 0.016548141494889222, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74954832, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 1.12695312, + "step": 759, + "time_per_iteration": 4.9278404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_mlp": 1.04840457, + "epoch": 0.1462100808003078, + "flos": 867822331392.0, + "grad_norm": 0.024551380524627048, + "language_loss": 0.89752859, + "learning_rate": 0.0009650074191575883, + "loss": 0.90914273, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 1.12988281, + "step": 760, + "time_per_iteration": 3.18084716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011658, + "balance_loss_mlp": 1.05302811, + "epoch": 0.14640246248557137, + "flos": 524029585920.0, + "grad_norm": 0.025729752682943422, + "language_loss": 0.95023656, + "learning_rate": 0.0009648928302546766, + "loss": 0.96189463, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 1.12744141, + "step": 761, + "time_per_iteration": 2.707385301589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161728, + "balance_loss_mlp": 1.04895639, + "epoch": 0.14659484417083493, + "flos": 1032241089024.0, + "grad_norm": 0.022974522077421757, + "language_loss": 0.94352418, + "learning_rate": 0.0009647780608643613, + "loss": 0.95514143, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 1.12744141, + "step": 762, + "time_per_iteration": 3.357776165008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116078, + "balance_loss_mlp": 1.04848516, + "epoch": 0.1467872258560985, + "flos": 501656501760.0, + "grad_norm": 0.027279773355913427, + "language_loss": 0.99627388, + "learning_rate": 0.0009646631110312001, + "loss": 1.00788176, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 1.12255859, + "step": 763, + "time_per_iteration": 2.629650115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159049, + "balance_loss_mlp": 1.04665887, + "epoch": 0.14697960754136205, + "flos": 548935928832.0, + "grad_norm": 0.020644179018096606, + "language_loss": 0.95446718, + "learning_rate": 0.0009645479807998203, + "loss": 0.96605766, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 1.12353516, + "step": 764, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_mlp": 1.04510117, + "epoch": 0.14717198922662564, + "flos": 518901943296.0, + "grad_norm": 0.021535065255329562, + "language_loss": 0.99812603, + "learning_rate": 0.0009644326702149196, + "loss": 1.00970435, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 1.12695312, + "step": 765, + "time_per_iteration": 2.711500406265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158907, + "balance_loss_mlp": 1.04618227, + "epoch": 0.1473643709118892, + "flos": 733483221504.0, + "grad_norm": 0.02504361772442387, + "language_loss": 0.95452881, + "learning_rate": 0.0009643171793212653, + "loss": 0.96611786, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 1.12695312, + "step": 766, + "time_per_iteration": 3.130798578262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163931, + "balance_loss_mlp": 1.05115891, + "epoch": 0.14755675259715276, + "flos": 621668192256.0, + "grad_norm": 0.027740201354691706, + "language_loss": 0.99870968, + "learning_rate": 0.0009642015081636952, + "loss": 1.01034904, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 1.12744141, + "step": 767, + "time_per_iteration": 2.701939344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160055, + "balance_loss_mlp": 1.04761696, + "epoch": 0.14774913428241632, + "flos": 453172571136.0, + "grad_norm": 0.025159341457135456, + "language_loss": 0.98449206, + "learning_rate": 0.0009640856567871166, + "loss": 0.99609256, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 1.12402344, + "step": 768, + "time_per_iteration": 2.516721725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_mlp": 1.05262613, + "epoch": 0.14794151596767988, + "flos": 838654474752.0, + "grad_norm": 0.02612823197324643, + "language_loss": 0.99416363, + "learning_rate": 0.0009639696252365072, + "loss": 1.00581241, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 1.12207031, + "step": 769, + "time_per_iteration": 3.06074857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167068, + "balance_loss_mlp": 1.05472481, + "epoch": 0.14813389765294344, + "flos": 687404295168.0, + "grad_norm": 0.02602975967937929, + "language_loss": 0.89651555, + "learning_rate": 0.0009638534135569144, + "loss": 0.90818626, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 1.12304688, + "step": 770, + "time_per_iteration": 2.9440436363220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169876, + "balance_loss_mlp": 1.05753326, + "epoch": 0.148326279338207, + "flos": 510943600128.0, + "grad_norm": 0.028093178265757666, + "language_loss": 1.01150489, + "learning_rate": 0.0009637370217934554, + "loss": 1.02320373, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 1.12304688, + "step": 771, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166681, + "balance_loss_mlp": 1.05443311, + "epoch": 0.14851866102347056, + "flos": 589331260416.0, + "grad_norm": 0.028336871459981, + "language_loss": 0.90924722, + "learning_rate": 0.0009636204499913175, + "loss": 0.92091405, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 1.12207031, + "step": 772, + "time_per_iteration": 2.8592941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157961, + "balance_loss_mlp": 1.04609525, + "epoch": 0.14871104270873411, + "flos": 692247230976.0, + "grad_norm": 0.030313888046816524, + "language_loss": 0.95830965, + "learning_rate": 0.0009635036981957581, + "loss": 0.96988928, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 1.11816406, + "step": 773, + "time_per_iteration": 2.8690600395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160765, + "balance_loss_mlp": 1.04904246, + "epoch": 0.1489034243939977, + "flos": 656282600448.0, + "grad_norm": 0.02808100337337059, + "language_loss": 0.98035401, + "learning_rate": 0.0009633867664521043, + "loss": 0.99196172, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 1.11669922, + "step": 774, + "time_per_iteration": 2.812833070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159463, + "balance_loss_mlp": 1.04788363, + "epoch": 0.14909580607926126, + "flos": 476795821056.0, + "grad_norm": 0.030787585825694654, + "language_loss": 0.97385693, + "learning_rate": 0.0009632696548057527, + "loss": 0.98545158, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 1.11523438, + "step": 775, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_mlp": 1.04910243, + "epoch": 0.14928818776452482, + "flos": 612283765248.0, + "grad_norm": 0.030552265213122824, + "language_loss": 0.94746792, + "learning_rate": 0.0009631523633021704, + "loss": 0.95907569, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 1.11621094, + "step": 776, + "time_per_iteration": 2.789336919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.04408133, + "epoch": 0.14948056944978838, + "flos": 562916241408.0, + "grad_norm": 0.02653866309736765, + "language_loss": 0.98006344, + "learning_rate": 0.0009630348919868936, + "loss": 0.99161637, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 1.11132812, + "step": 777, + "time_per_iteration": 2.708918571472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115506, + "balance_loss_mlp": 1.04395676, + "epoch": 0.14967295113505194, + "flos": 450111558144.0, + "grad_norm": 0.02761804701826243, + "language_loss": 0.92444694, + "learning_rate": 0.0009629172409055293, + "loss": 0.93599755, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 1.11035156, + "step": 778, + "time_per_iteration": 2.522322177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_mlp": 1.0435462, + "epoch": 0.1498653328203155, + "flos": 572428922880.0, + "grad_norm": 0.02112796064723151, + "language_loss": 0.9446094, + "learning_rate": 0.0009627994101037531, + "loss": 0.9561559, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 1.11035156, + "step": 779, + "time_per_iteration": 2.7606184482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154399, + "balance_loss_mlp": 1.0433439, + "epoch": 0.15005771450557906, + "flos": 632407570944.0, + "grad_norm": 0.02232887996041627, + "language_loss": 0.98232067, + "learning_rate": 0.0009626813996273114, + "loss": 0.99386466, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 1.10986328, + "step": 780, + "time_per_iteration": 2.8442463874816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_mlp": 1.04553461, + "epoch": 0.15025009619084262, + "flos": 579165298176.0, + "grad_norm": 0.021576328362923832, + "language_loss": 0.96611506, + "learning_rate": 0.0009625632095220198, + "loss": 0.97768044, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 1.109375, + "step": 781, + "time_per_iteration": 2.823941469192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156174, + "balance_loss_mlp": 1.04492784, + "epoch": 0.1504424778761062, + "flos": 484856222208.0, + "grad_norm": 0.023769174200548453, + "language_loss": 0.96595448, + "learning_rate": 0.0009624448398337637, + "loss": 0.97751617, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 1.11181641, + "step": 782, + "time_per_iteration": 2.517115354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153917, + "balance_loss_mlp": 1.04286146, + "epoch": 0.15063485956136977, + "flos": 763894513152.0, + "grad_norm": 0.022118467112767815, + "language_loss": 0.97773027, + "learning_rate": 0.0009623262906084984, + "loss": 0.98926944, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 1.10986328, + "step": 783, + "time_per_iteration": 2.9971072673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156171, + "balance_loss_mlp": 1.04554462, + "epoch": 0.15082724124663333, + "flos": 498676079616.0, + "grad_norm": 0.021733375764601555, + "language_loss": 0.99047554, + "learning_rate": 0.0009622075618922486, + "loss": 1.00203729, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 1.10546875, + "step": 784, + "time_per_iteration": 2.7209272384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161923, + "balance_loss_mlp": 1.05110586, + "epoch": 0.15101962293189689, + "flos": 510722019840.0, + "grad_norm": 0.02414763506099098, + "language_loss": 0.95223093, + "learning_rate": 0.0009620886537311091, + "loss": 0.96385014, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 1.10742188, + "step": 785, + "time_per_iteration": 2.668501138687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154688, + "balance_loss_mlp": 1.04406226, + "epoch": 0.15121200461716044, + "flos": 458701714944.0, + "grad_norm": 0.026890312379790088, + "language_loss": 0.97208995, + "learning_rate": 0.000961969566171244, + "loss": 0.98363686, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 1.10546875, + "step": 786, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153217, + "balance_loss_mlp": 1.04278123, + "epoch": 0.151404386302424, + "flos": 539017016832.0, + "grad_norm": 0.02528800532756524, + "language_loss": 1.00058115, + "learning_rate": 0.0009618502992588873, + "loss": 1.01211333, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 1.10351562, + "step": 787, + "time_per_iteration": 2.6463584899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154208, + "balance_loss_mlp": 1.04358232, + "epoch": 0.15159676798768756, + "flos": 689616643584.0, + "grad_norm": 0.023869082053813537, + "language_loss": 0.98612797, + "learning_rate": 0.0009617308530403424, + "loss": 0.99766994, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 1.10546875, + "step": 788, + "time_per_iteration": 3.065110921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158206, + "balance_loss_mlp": 1.04758012, + "epoch": 0.15178914967295112, + "flos": 546432869376.0, + "grad_norm": 0.025092696297707027, + "language_loss": 0.95288265, + "learning_rate": 0.0009616112275619825, + "loss": 0.96446472, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 1.10546875, + "step": 789, + "time_per_iteration": 2.7197253704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_mlp": 1.0478847, + "epoch": 0.1519815313582147, + "flos": 512814845952.0, + "grad_norm": 0.020890571468345706, + "language_loss": 0.90545368, + "learning_rate": 0.0009614914228702503, + "loss": 0.91703737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 1.10400391, + "step": 790, + "time_per_iteration": 2.6894142627716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158071, + "balance_loss_mlp": 1.04782641, + "epoch": 0.15217391304347827, + "flos": 685457187840.0, + "grad_norm": 0.02448742031060442, + "language_loss": 0.96480352, + "learning_rate": 0.0009613714390116581, + "loss": 0.97638422, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 1.1015625, + "step": 791, + "time_per_iteration": 2.9898860454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155788, + "balance_loss_mlp": 1.04568636, + "epoch": 0.15236629472874183, + "flos": 645445893120.0, + "grad_norm": 0.023088199171654812, + "language_loss": 0.93995309, + "learning_rate": 0.0009612512760327879, + "loss": 0.95151103, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 1.10009766, + "step": 792, + "time_per_iteration": 2.855648994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154532, + "balance_loss_mlp": 1.0444783, + "epoch": 0.1525586764140054, + "flos": 413764892160.0, + "grad_norm": 0.024948238648346503, + "language_loss": 0.97790802, + "learning_rate": 0.0009611309339802909, + "loss": 0.98945332, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 1.09960938, + "step": 793, + "time_per_iteration": 2.4684345722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153777, + "balance_loss_mlp": 1.04372334, + "epoch": 0.15275105809926895, + "flos": 804233448960.0, + "grad_norm": 0.02131820977076166, + "language_loss": 0.93039513, + "learning_rate": 0.0009610104129008881, + "loss": 0.94193292, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 1.09960938, + "step": 794, + "time_per_iteration": 3.1013269424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155691, + "balance_loss_mlp": 1.04554129, + "epoch": 0.1529434397845325, + "flos": 613542663168.0, + "grad_norm": 0.024012716250022468, + "language_loss": 0.97966266, + "learning_rate": 0.0009608897128413701, + "loss": 0.99121952, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 1.10058594, + "step": 795, + "time_per_iteration": 2.729837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154149, + "balance_loss_mlp": 1.04419053, + "epoch": 0.15313582146979607, + "flos": 616471418880.0, + "grad_norm": 0.02134077894827986, + "language_loss": 0.93399352, + "learning_rate": 0.0009607688338485965, + "loss": 0.945535, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 1.09863281, + "step": 796, + "time_per_iteration": 2.8517422676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04409015, + "epoch": 0.15332820315505963, + "flos": 794992012800.0, + "grad_norm": 0.02204541106277596, + "language_loss": 0.98951191, + "learning_rate": 0.0009606477759694969, + "loss": 1.00104761, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 1.09375, + "step": 797, + "time_per_iteration": 3.0313384532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153537, + "balance_loss_mlp": 1.0440551, + "epoch": 0.1535205848403232, + "flos": 551256339456.0, + "grad_norm": 0.028291975879130113, + "language_loss": 0.99155664, + "learning_rate": 0.0009605265392510703, + "loss": 1.00309205, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 1.09375, + "step": 798, + "time_per_iteration": 2.6558592319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150991, + "balance_loss_mlp": 1.04122281, + "epoch": 0.15371296652558677, + "flos": 536978585088.0, + "grad_norm": 0.02676367025649214, + "language_loss": 1.00762391, + "learning_rate": 0.0009604051237403846, + "loss": 1.01913381, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 1.09667969, + "step": 799, + "time_per_iteration": 2.6129424571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151653, + "balance_loss_mlp": 1.04198015, + "epoch": 0.15390534821085033, + "flos": 396089751552.0, + "grad_norm": 0.02759928767191203, + "language_loss": 0.9523741, + "learning_rate": 0.0009602835294845776, + "loss": 0.96389061, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 1.09570312, + "step": 800, + "time_per_iteration": 2.4865612983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152453, + "balance_loss_mlp": 1.04297161, + "epoch": 0.1540977298961139, + "flos": 536885259264.0, + "grad_norm": 0.0240348205061721, + "language_loss": 0.99338514, + "learning_rate": 0.0009601617565308565, + "loss": 1.00490952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 1.09375, + "step": 801, + "time_per_iteration": 2.646925449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155144, + "balance_loss_mlp": 1.04551864, + "epoch": 0.15429011158137745, + "flos": 725090449920.0, + "grad_norm": 0.022214532903779557, + "language_loss": 0.94821054, + "learning_rate": 0.0009600398049264977, + "loss": 0.95976186, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 1.09521484, + "step": 802, + "time_per_iteration": 3.0287652015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.04627085, + "epoch": 0.154482493266641, + "flos": 621748783104.0, + "grad_norm": 0.025430739734688717, + "language_loss": 1.02679133, + "learning_rate": 0.0009599176747188469, + "loss": 1.03834927, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 1.09423828, + "step": 803, + "time_per_iteration": 2.8240089416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156206, + "balance_loss_mlp": 1.0467242, + "epoch": 0.15467487495190457, + "flos": 526719297024.0, + "grad_norm": 0.024483654101252486, + "language_loss": 0.90705526, + "learning_rate": 0.0009597953659553196, + "loss": 0.91861731, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 1.09375, + "step": 804, + "time_per_iteration": 2.745878219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153494, + "balance_loss_mlp": 1.04386926, + "epoch": 0.15486725663716813, + "flos": 528759730176.0, + "grad_norm": 0.02516296775651391, + "language_loss": 0.97286022, + "learning_rate": 0.0009596728786833997, + "loss": 0.98439509, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 1.09521484, + "step": 805, + "time_per_iteration": 2.6471030712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_mlp": 1.04244983, + "epoch": 0.1550596383224317, + "flos": 1050278799360.0, + "grad_norm": 0.026563720364072098, + "language_loss": 0.9858942, + "learning_rate": 0.0009595502129506415, + "loss": 0.99741489, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 1.09521484, + "step": 806, + "time_per_iteration": 3.3734352588653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115037, + "balance_loss_mlp": 1.04088783, + "epoch": 0.15525202000769528, + "flos": 614836489728.0, + "grad_norm": 0.02624405223250092, + "language_loss": 0.91745955, + "learning_rate": 0.0009594273688046678, + "loss": 0.92896324, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 1.09375, + "step": 807, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153708, + "balance_loss_mlp": 1.04441667, + "epoch": 0.15544440169295884, + "flos": 534102222336.0, + "grad_norm": 0.028049278390969077, + "language_loss": 0.97350299, + "learning_rate": 0.000959304346293171, + "loss": 0.98504007, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 1.09179688, + "step": 808, + "time_per_iteration": 2.7285830974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164275, + "balance_loss_mlp": 1.05464995, + "epoch": 0.1556367833782224, + "flos": 645886325760.0, + "grad_norm": 0.033021349518653896, + "language_loss": 0.99046445, + "learning_rate": 0.0009591811454639125, + "loss": 1.00210714, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 1.09521484, + "step": 809, + "time_per_iteration": 2.842867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155411, + "balance_loss_mlp": 1.04612005, + "epoch": 0.15582916506348596, + "flos": 544952391168.0, + "grad_norm": 0.02421082053858415, + "language_loss": 0.95793635, + "learning_rate": 0.0009590577663647234, + "loss": 0.96949041, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 1.09179688, + "step": 810, + "time_per_iteration": 2.8207406997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158015, + "balance_loss_mlp": 1.04877126, + "epoch": 0.15602154674874952, + "flos": 581214463488.0, + "grad_norm": 0.022734781081273227, + "language_loss": 0.95110512, + "learning_rate": 0.0009589342090435036, + "loss": 0.96268523, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 1.09130859, + "step": 811, + "time_per_iteration": 2.8413872718811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170356, + "balance_loss_mlp": 1.06068361, + "epoch": 0.15621392843401308, + "flos": 536316572160.0, + "grad_norm": 0.026628933906638022, + "language_loss": 0.97807872, + "learning_rate": 0.0009588104735482223, + "loss": 0.98978221, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 1.09570312, + "step": 812, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164587, + "balance_loss_mlp": 1.05524826, + "epoch": 0.15640631011927664, + "flos": 551981478912.0, + "grad_norm": 0.027865461759282353, + "language_loss": 0.94247007, + "learning_rate": 0.0009586865599269177, + "loss": 0.95411587, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 1.09228516, + "step": 813, + "time_per_iteration": 2.655217409133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159004, + "balance_loss_mlp": 1.04985571, + "epoch": 0.1565986918045402, + "flos": 638635657728.0, + "grad_norm": 0.024501009698068087, + "language_loss": 0.98888743, + "learning_rate": 0.0009585624682276977, + "loss": 1.00047755, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 1.09033203, + "step": 814, + "time_per_iteration": 2.7572293281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160029, + "balance_loss_mlp": 1.05073786, + "epoch": 0.15679107348980378, + "flos": 491781250560.0, + "grad_norm": 0.02545428800843787, + "language_loss": 0.97158241, + "learning_rate": 0.0009584381984987386, + "loss": 0.98318267, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 1.09179688, + "step": 815, + "time_per_iteration": 2.554208517074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160766, + "balance_loss_mlp": 1.05185616, + "epoch": 0.15698345517506734, + "flos": 531002277888.0, + "grad_norm": 0.022736041606184667, + "language_loss": 0.98151159, + "learning_rate": 0.0009583137507882864, + "loss": 0.99311924, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 1.08789062, + "step": 816, + "time_per_iteration": 2.6635444164276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158696, + "balance_loss_mlp": 1.04978669, + "epoch": 0.1571758368603309, + "flos": 547077417984.0, + "grad_norm": 0.024009976747476527, + "language_loss": 0.90921289, + "learning_rate": 0.000958189125144656, + "loss": 0.92079985, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 1.08789062, + "step": 817, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156061, + "balance_loss_mlp": 1.04719925, + "epoch": 0.15736821854559446, + "flos": 566743326720.0, + "grad_norm": 0.021547949482456395, + "language_loss": 0.97883654, + "learning_rate": 0.0009580643216162313, + "loss": 0.99039721, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 1.08740234, + "step": 818, + "time_per_iteration": 2.673997640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157698, + "balance_loss_mlp": 1.04888415, + "epoch": 0.15756060023085802, + "flos": 501953943552.0, + "grad_norm": 0.023826624353146583, + "language_loss": 0.90112716, + "learning_rate": 0.0009579393402514652, + "loss": 0.91270417, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 1.08691406, + "step": 819, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156999, + "balance_loss_mlp": 1.04823244, + "epoch": 0.15775298191612158, + "flos": 520271631360.0, + "grad_norm": 0.023927295219635936, + "language_loss": 0.99075627, + "learning_rate": 0.0009578141810988801, + "loss": 1.00232625, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 1.08642578, + "step": 820, + "time_per_iteration": 2.591036558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.04111433, + "epoch": 0.15794536360138514, + "flos": 467087755776.0, + "grad_norm": 0.026283029611425073, + "language_loss": 1.00067806, + "learning_rate": 0.0009576888442070668, + "loss": 1.01217794, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 1.08740234, + "step": 821, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151894, + "balance_loss_mlp": 1.04279363, + "epoch": 0.1581377452866487, + "flos": 518168071680.0, + "grad_norm": 0.02399653039287492, + "language_loss": 1.01290274, + "learning_rate": 0.0009575633296246854, + "loss": 1.02442169, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 1.08984375, + "step": 822, + "time_per_iteration": 2.579575300216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.04312956, + "epoch": 0.15833012697191226, + "flos": 550837373952.0, + "grad_norm": 0.02407632334340799, + "language_loss": 0.91124117, + "learning_rate": 0.0009574376374004652, + "loss": 0.92275965, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 1.0859375, + "step": 823, + "time_per_iteration": 2.661754608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162901, + "balance_loss_mlp": 1.05446815, + "epoch": 0.15852250865717585, + "flos": 488466456576.0, + "grad_norm": 0.026327967105985502, + "language_loss": 0.90841949, + "learning_rate": 0.000957311767583204, + "loss": 0.92004848, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 1.08300781, + "step": 824, + "time_per_iteration": 2.7887372970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156753, + "balance_loss_mlp": 1.04956055, + "epoch": 0.1587148903424394, + "flos": 1312696909824.0, + "grad_norm": 0.010620587901871582, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.8322835, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 1.0703125, + "step": 825, + "time_per_iteration": 4.766167640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151145, + "balance_loss_mlp": 1.04304576, + "epoch": 0.15890727202770297, + "flos": 467832360960.0, + "grad_norm": 0.02959471781097451, + "language_loss": 1.0376749, + "learning_rate": 0.0009570594953650961, + "loss": 1.04918623, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 1.07958984, + "step": 826, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_mlp": 1.04354417, + "epoch": 0.15909965371296653, + "flos": 778606695936.0, + "grad_norm": 0.024366848241159877, + "language_loss": 0.8923949, + "learning_rate": 0.00095693309306219, + "loss": 0.90391278, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 1.08105469, + "step": 827, + "time_per_iteration": 3.1078274250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_mlp": 1.04449332, + "epoch": 0.1592920353982301, + "flos": 1079962950144.0, + "grad_norm": 0.02547465125103231, + "language_loss": 0.98567259, + "learning_rate": 0.0009568065133621244, + "loss": 0.99719906, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 1.08007812, + "step": 828, + "time_per_iteration": 3.3287436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147109, + "balance_loss_mlp": 1.03872418, + "epoch": 0.15948441708349365, + "flos": 726889837056.0, + "grad_norm": 0.026992334830630314, + "language_loss": 0.93815649, + "learning_rate": 0.0009566797563140422, + "loss": 0.94962764, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 1.08251953, + "step": 829, + "time_per_iteration": 2.8641507625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.03788006, + "epoch": 0.1596767987687572, + "flos": 580075087872.0, + "grad_norm": 0.026140449767567974, + "language_loss": 0.96191794, + "learning_rate": 0.0009565528219671547, + "loss": 0.97337818, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 1.08007812, + "step": 830, + "time_per_iteration": 2.9082329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147169, + "balance_loss_mlp": 1.03902268, + "epoch": 0.15986918045402077, + "flos": 530025358848.0, + "grad_norm": 0.02186736495212519, + "language_loss": 0.93771887, + "learning_rate": 0.0009564257103707418, + "loss": 0.94919056, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 1.08007812, + "step": 831, + "time_per_iteration": 4.109540700912476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.04519463, + "epoch": 0.16006156213928435, + "flos": 575669856768.0, + "grad_norm": 0.025156765484562034, + "language_loss": 1.01463771, + "learning_rate": 0.0009562984215741533, + "loss": 1.02617025, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 1.07910156, + "step": 832, + "time_per_iteration": 2.634381055831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148637, + "balance_loss_mlp": 1.0408721, + "epoch": 0.1602539438245479, + "flos": 516674858496.0, + "grad_norm": 0.023022886756030446, + "language_loss": 0.90665066, + "learning_rate": 0.0009561709556268065, + "loss": 0.91813707, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 1.07617188, + "step": 833, + "time_per_iteration": 2.7094552516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115539, + "balance_loss_mlp": 1.04752922, + "epoch": 0.16044632550981147, + "flos": 622161017856.0, + "grad_norm": 0.02456985500743924, + "language_loss": 1.0306673, + "learning_rate": 0.0009560433125781884, + "loss": 1.04222107, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 1.07714844, + "step": 834, + "time_per_iteration": 2.7217955589294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_mlp": 1.04794765, + "epoch": 0.16063870719507503, + "flos": 562127975424.0, + "grad_norm": 0.02550250825542428, + "language_loss": 1.02622008, + "learning_rate": 0.0009559154924778544, + "loss": 1.03778291, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 1.08203125, + "step": 835, + "time_per_iteration": 4.0438151359558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153381, + "balance_loss_mlp": 1.04509139, + "epoch": 0.1608310888803386, + "flos": 806560590336.0, + "grad_norm": 0.023331498233936678, + "language_loss": 0.93980491, + "learning_rate": 0.0009557874953754284, + "loss": 0.95133871, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 1.08154297, + "step": 836, + "time_per_iteration": 3.0253541469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155161, + "balance_loss_mlp": 1.04739583, + "epoch": 0.16102347056560215, + "flos": 601694108160.0, + "grad_norm": 0.024039154316001603, + "language_loss": 0.9449209, + "learning_rate": 0.0009556593213206038, + "loss": 0.95647246, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 1.07617188, + "step": 837, + "time_per_iteration": 2.815293788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148071, + "balance_loss_mlp": 1.04049647, + "epoch": 0.1612158522508657, + "flos": 554614794240.0, + "grad_norm": 0.024490980939479982, + "language_loss": 0.96443379, + "learning_rate": 0.0009555309703631414, + "loss": 0.9759146, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 1.07421875, + "step": 838, + "time_per_iteration": 2.7353601455688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148397, + "balance_loss_mlp": 1.0406791, + "epoch": 0.16140823393612927, + "flos": 557017797120.0, + "grad_norm": 0.026558461299776022, + "language_loss": 0.98485982, + "learning_rate": 0.0009554024425528722, + "loss": 0.99634379, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 1.07568359, + "step": 839, + "time_per_iteration": 2.801539182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146915, + "balance_loss_mlp": 1.03924477, + "epoch": 0.16160061562139286, + "flos": 544908730368.0, + "grad_norm": 0.023933605454050468, + "language_loss": 0.96992832, + "learning_rate": 0.0009552737379396948, + "loss": 0.98139745, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 1.07519531, + "step": 840, + "time_per_iteration": 2.613037586212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148515, + "balance_loss_mlp": 1.04122651, + "epoch": 0.16179299730665642, + "flos": 605006900736.0, + "grad_norm": 0.020652206840645122, + "language_loss": 0.95695615, + "learning_rate": 0.0009551448565735767, + "loss": 0.96844131, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 1.07128906, + "step": 841, + "time_per_iteration": 2.779979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149052, + "balance_loss_mlp": 1.04128659, + "epoch": 0.16198537899191998, + "flos": 788551077888.0, + "grad_norm": 0.02358864683094414, + "language_loss": 0.96423578, + "learning_rate": 0.0009550157985045543, + "loss": 0.97572625, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 1.07617188, + "step": 842, + "time_per_iteration": 3.0352344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148245, + "balance_loss_mlp": 1.04086173, + "epoch": 0.16217776067718354, + "flos": 520829584896.0, + "grad_norm": 0.02127918945612936, + "language_loss": 0.95624614, + "learning_rate": 0.0009548865637827321, + "loss": 0.96772861, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 1.07226562, + "step": 843, + "time_per_iteration": 2.695211172103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.04027128, + "epoch": 0.1623701423624471, + "flos": 506254388736.0, + "grad_norm": 0.02427958482397641, + "language_loss": 0.99469078, + "learning_rate": 0.0009547571524582838, + "loss": 1.00617111, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 1.07617188, + "step": 844, + "time_per_iteration": 2.586859941482544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_mlp": 1.03842914, + "epoch": 0.16256252404771065, + "flos": 498157057536.0, + "grad_norm": 0.025657026114593633, + "language_loss": 1.02873135, + "learning_rate": 0.0009546275645814512, + "loss": 1.04018748, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 1.0703125, + "step": 845, + "time_per_iteration": 2.735323190689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147597, + "balance_loss_mlp": 1.04040384, + "epoch": 0.16275490573297421, + "flos": 503286701568.0, + "grad_norm": 0.024743383464961046, + "language_loss": 1.00377154, + "learning_rate": 0.0009544978002025446, + "loss": 1.01524746, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 1.0703125, + "step": 846, + "time_per_iteration": 2.5876121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_mlp": 1.04189885, + "epoch": 0.16294728741823777, + "flos": 508353945600.0, + "grad_norm": 0.020876938588178177, + "language_loss": 0.94877481, + "learning_rate": 0.0009543678593719434, + "loss": 0.9602648, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 1.06933594, + "step": 847, + "time_per_iteration": 2.69250750541687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159847, + "balance_loss_mlp": 1.05274892, + "epoch": 0.16313966910350133, + "flos": 510756948480.0, + "grad_norm": 0.020936629725758764, + "language_loss": 0.95534647, + "learning_rate": 0.0009542377421400945, + "loss": 0.96694493, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 1.06933594, + "step": 848, + "time_per_iteration": 2.7832183837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146796, + "balance_loss_mlp": 1.03965068, + "epoch": 0.16333205078876492, + "flos": 545056450560.0, + "grad_norm": 0.023544058946573278, + "language_loss": 0.94486761, + "learning_rate": 0.0009541074485575145, + "loss": 0.95633554, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 1.06982422, + "step": 849, + "time_per_iteration": 2.7163026332855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147161, + "balance_loss_mlp": 1.03996801, + "epoch": 0.16352443247402848, + "flos": 508711785984.0, + "grad_norm": 0.023080110816121054, + "language_loss": 1.00550437, + "learning_rate": 0.0009539769786747874, + "loss": 1.01697588, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 1.0703125, + "step": 850, + "time_per_iteration": 2.5918350219726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152854, + "balance_loss_mlp": 1.04547, + "epoch": 0.16371681415929204, + "flos": 543222134784.0, + "grad_norm": 0.022593715242085626, + "language_loss": 0.90895152, + "learning_rate": 0.0009538463325425665, + "loss": 0.92048007, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 1.07226562, + "step": 851, + "time_per_iteration": 2.701662063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146583, + "balance_loss_mlp": 1.03939056, + "epoch": 0.1639091958445556, + "flos": 521760841728.0, + "grad_norm": 0.025319624949764974, + "language_loss": 0.95562863, + "learning_rate": 0.0009537155102115728, + "loss": 0.96709442, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 1.0703125, + "step": 852, + "time_per_iteration": 2.577416181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.03871727, + "epoch": 0.16410157752981916, + "flos": 548482034688.0, + "grad_norm": 0.022217218078565786, + "language_loss": 0.92332971, + "learning_rate": 0.0009535845117325961, + "loss": 0.93478549, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 1.06689453, + "step": 853, + "time_per_iteration": 2.643528699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148166, + "balance_loss_mlp": 1.04135406, + "epoch": 0.16429395921508272, + "flos": 584025698304.0, + "grad_norm": 0.02024018106959617, + "language_loss": 1.00128078, + "learning_rate": 0.0009534533371564946, + "loss": 1.01276231, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 1.06640625, + "step": 854, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150543, + "balance_loss_mlp": 1.04377949, + "epoch": 0.16448634090034628, + "flos": 531961732608.0, + "grad_norm": 0.02843561601072028, + "language_loss": 1.00094676, + "learning_rate": 0.0009533219865341949, + "loss": 1.01245213, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 1.06591797, + "step": 855, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.05014503, + "epoch": 0.16467872258560984, + "flos": 492960284160.0, + "grad_norm": 0.026495144396752456, + "language_loss": 0.95923662, + "learning_rate": 0.0009531904599166916, + "loss": 0.97080612, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 1.06640625, + "step": 856, + "time_per_iteration": 2.638528823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.04101861, + "epoch": 0.16487110427087343, + "flos": 507259505664.0, + "grad_norm": 0.02303677132947941, + "language_loss": 0.95950538, + "learning_rate": 0.0009530587573550478, + "loss": 0.97098505, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 1.06787109, + "step": 857, + "time_per_iteration": 2.5788354873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.04592896, + "epoch": 0.16506348595613698, + "flos": 1436108714496.0, + "grad_norm": 0.011861304780107247, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75470984, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 1.0546875, + "step": 858, + "time_per_iteration": 5.003005027770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153597, + "balance_loss_mlp": 1.04673755, + "epoch": 0.16525586764140054, + "flos": 478089647616.0, + "grad_norm": 0.02595402254221991, + "language_loss": 0.98057735, + "learning_rate": 0.0009527948246039337, + "loss": 0.99211335, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 1.06689453, + "step": 859, + "time_per_iteration": 2.541255474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152748, + "balance_loss_mlp": 1.04622293, + "epoch": 0.1654482493266641, + "flos": 882540518400.0, + "grad_norm": 0.024187417777422206, + "language_loss": 0.96476752, + "learning_rate": 0.000952662594516931, + "loss": 0.97629499, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 1.06347656, + "step": 860, + "time_per_iteration": 3.102233409881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154678, + "balance_loss_mlp": 1.04791439, + "epoch": 0.16564063101192766, + "flos": 628105124352.0, + "grad_norm": 0.02242324391324738, + "language_loss": 0.93166292, + "learning_rate": 0.0009525301886907234, + "loss": 0.94320977, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 1.06591797, + "step": 861, + "time_per_iteration": 2.871971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151309, + "balance_loss_mlp": 1.04487896, + "epoch": 0.16583301269719122, + "flos": 562592603136.0, + "grad_norm": 0.02248996903194516, + "language_loss": 0.97140592, + "learning_rate": 0.0009523976071767155, + "loss": 0.98291898, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 1.0625, + "step": 862, + "time_per_iteration": 2.653031349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146763, + "balance_loss_mlp": 1.04038036, + "epoch": 0.16602539438245478, + "flos": 568983873024.0, + "grad_norm": 0.020794335354585358, + "language_loss": 0.9646408, + "learning_rate": 0.00095226485002638, + "loss": 0.97610843, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 1.06201172, + "step": 863, + "time_per_iteration": 2.7685163021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147042, + "balance_loss_mlp": 1.04075551, + "epoch": 0.16621777606771834, + "flos": 576021692928.0, + "grad_norm": 0.021581021962121343, + "language_loss": 0.96560466, + "learning_rate": 0.0009521319172912576, + "loss": 0.9770751, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 1.06103516, + "step": 864, + "time_per_iteration": 2.762233257293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149511, + "balance_loss_mlp": 1.0432713, + "epoch": 0.16641015775298193, + "flos": 515597882880.0, + "grad_norm": 0.029880870913045234, + "language_loss": 1.0375855, + "learning_rate": 0.0009519988090229579, + "loss": 1.04908061, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 1.06054688, + "step": 865, + "time_per_iteration": 2.7156929969787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148426, + "balance_loss_mlp": 1.04199588, + "epoch": 0.1666025394382455, + "flos": 622849227264.0, + "grad_norm": 0.023088954173990716, + "language_loss": 0.96669209, + "learning_rate": 0.0009518655252731576, + "loss": 0.9781763, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 1.0625, + "step": 866, + "time_per_iteration": 2.76474928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147261, + "balance_loss_mlp": 1.04102135, + "epoch": 0.16679492112350905, + "flos": 549932313600.0, + "grad_norm": 0.021458749489738967, + "language_loss": 0.98467255, + "learning_rate": 0.0009517320660936022, + "loss": 0.99614513, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 1.06054688, + "step": 867, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151692, + "balance_loss_mlp": 1.04545259, + "epoch": 0.1669873028087726, + "flos": 666865526784.0, + "grad_norm": 0.02209258354681387, + "language_loss": 0.92114806, + "learning_rate": 0.0009515984315361051, + "loss": 0.93266487, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 1.06054688, + "step": 868, + "time_per_iteration": 2.845388412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.04563451, + "epoch": 0.16717968449403617, + "flos": 539603168256.0, + "grad_norm": 0.02501334283432316, + "language_loss": 0.95751995, + "learning_rate": 0.000951464621652548, + "loss": 0.96903574, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 1.05761719, + "step": 869, + "time_per_iteration": 2.623375415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148167, + "balance_loss_mlp": 1.04216599, + "epoch": 0.16737206617929973, + "flos": 531278252544.0, + "grad_norm": 0.02062860382438808, + "language_loss": 0.87610328, + "learning_rate": 0.0009513306364948804, + "loss": 0.88758498, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 1.05810547, + "step": 870, + "time_per_iteration": 2.792346239089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148065, + "balance_loss_mlp": 1.04206407, + "epoch": 0.1675644478645633, + "flos": 481756277760.0, + "grad_norm": 0.023236257285911367, + "language_loss": 0.98118269, + "learning_rate": 0.0009511964761151197, + "loss": 0.99266338, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 1.05810547, + "step": 871, + "time_per_iteration": 2.572923183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152601, + "balance_loss_mlp": 1.04669595, + "epoch": 0.16775682954982685, + "flos": 495541206528.0, + "grad_norm": 0.026661505796453877, + "language_loss": 0.99311042, + "learning_rate": 0.0009510621405653521, + "loss": 1.00463641, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 1.05712891, + "step": 872, + "time_per_iteration": 2.6296472549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_mlp": 1.04484987, + "epoch": 0.1679492112350904, + "flos": 753404912640.0, + "grad_norm": 0.029291148216183213, + "language_loss": 0.93300939, + "learning_rate": 0.0009509276298977309, + "loss": 0.94451261, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 1.05273438, + "step": 873, + "time_per_iteration": 3.0177366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150817, + "balance_loss_mlp": 1.04543638, + "epoch": 0.168141592920354, + "flos": 1137731977728.0, + "grad_norm": 0.021155110884158303, + "language_loss": 0.9134444, + "learning_rate": 0.0009507929441644778, + "loss": 0.92495263, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 1.05175781, + "step": 874, + "time_per_iteration": 3.53277325630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160399, + "balance_loss_mlp": 1.05501771, + "epoch": 0.16833397460561755, + "flos": 633553677312.0, + "grad_norm": 0.025508723945600786, + "language_loss": 0.94342184, + "learning_rate": 0.0009506580834178826, + "loss": 0.95502585, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 1.05175781, + "step": 875, + "time_per_iteration": 2.763296365737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151031, + "balance_loss_mlp": 1.04560196, + "epoch": 0.1685263562908811, + "flos": 542542657536.0, + "grad_norm": 0.0234395143242784, + "language_loss": 1.00066125, + "learning_rate": 0.0009505230477103028, + "loss": 1.01217151, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 1.05224609, + "step": 876, + "time_per_iteration": 2.7256453037261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143495, + "balance_loss_mlp": 1.03801847, + "epoch": 0.16871873797614467, + "flos": 620485155840.0, + "grad_norm": 0.02951425183806971, + "language_loss": 0.91949958, + "learning_rate": 0.0009503878370941641, + "loss": 0.93093449, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 1.05273438, + "step": 877, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143733, + "balance_loss_mlp": 1.038257, + "epoch": 0.16891111966140823, + "flos": 607455565824.0, + "grad_norm": 0.02526909046796152, + "language_loss": 0.99137431, + "learning_rate": 0.0009502524516219595, + "loss": 1.00281167, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 1.05273438, + "step": 878, + "time_per_iteration": 2.7107326984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145725, + "balance_loss_mlp": 1.04005778, + "epoch": 0.1691035013466718, + "flos": 553405561344.0, + "grad_norm": 0.023246247090994255, + "language_loss": 0.99022686, + "learning_rate": 0.0009501168913462506, + "loss": 1.00168419, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 1.0546875, + "step": 879, + "time_per_iteration": 2.654356002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04866791, + "epoch": 0.16929588303193535, + "flos": 1479305822208.0, + "grad_norm": 0.014844444469597292, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.802755, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 1.046875, + "step": 880, + "time_per_iteration": 4.877387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114571, + "balance_loss_mlp": 1.04042399, + "epoch": 0.1694882647171989, + "flos": 927846641664.0, + "grad_norm": 0.023879743421000837, + "language_loss": 0.93963408, + "learning_rate": 0.0009498452465949042, + "loss": 0.95109117, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 1.05078125, + "step": 881, + "time_per_iteration": 3.241151809692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0447762, + "epoch": 0.1696806464024625, + "flos": 547151278080.0, + "grad_norm": 0.02293023114251512, + "language_loss": 0.98854458, + "learning_rate": 0.0009497091622247285, + "loss": 1.0000447, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 1.05029297, + "step": 882, + "time_per_iteration": 2.720453977584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145786, + "balance_loss_mlp": 1.0406431, + "epoch": 0.16987302808772606, + "flos": 530294602752.0, + "grad_norm": 0.02459483675822623, + "language_loss": 1.0302248, + "learning_rate": 0.0009495729032619723, + "loss": 1.04168272, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 1.04931641, + "step": 883, + "time_per_iteration": 2.717176675796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151842, + "balance_loss_mlp": 1.04731977, + "epoch": 0.17006540977298962, + "flos": 756478660608.0, + "grad_norm": 0.02507713686866634, + "language_loss": 0.9295364, + "learning_rate": 0.0009494364697595354, + "loss": 0.94105482, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 1.04589844, + "step": 884, + "time_per_iteration": 2.924898147583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157567, + "balance_loss_mlp": 1.05271089, + "epoch": 0.17025779145825318, + "flos": 559874694144.0, + "grad_norm": 0.025110060032482954, + "language_loss": 0.98774076, + "learning_rate": 0.0009492998617703867, + "loss": 0.99931645, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 1.04833984, + "step": 885, + "time_per_iteration": 2.6759417057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.05104423, + "epoch": 0.17045017314351674, + "flos": 513216347136.0, + "grad_norm": 0.0280627140127875, + "language_loss": 0.96898842, + "learning_rate": 0.0009491630793475619, + "loss": 0.98054218, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 1.04492188, + "step": 886, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149096, + "balance_loss_mlp": 1.04452574, + "epoch": 0.1706425548287803, + "flos": 510012343296.0, + "grad_norm": 0.023090423796267925, + "language_loss": 0.94873035, + "learning_rate": 0.0009490261225441643, + "loss": 0.96022129, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 1.04638672, + "step": 887, + "time_per_iteration": 2.960139513015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_mlp": 1.04508829, + "epoch": 0.17083493651404386, + "flos": 718714642944.0, + "grad_norm": 0.024954435208077393, + "language_loss": 0.98478651, + "learning_rate": 0.0009488889914133656, + "loss": 0.99628592, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 1.04833984, + "step": 888, + "time_per_iteration": 3.0498712062835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_mlp": 1.04649353, + "epoch": 0.17102731819930742, + "flos": 560200333824.0, + "grad_norm": 0.020862133880352407, + "language_loss": 0.97394216, + "learning_rate": 0.0009487516860084047, + "loss": 0.98545229, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 1.046875, + "step": 889, + "time_per_iteration": 2.799579381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115955, + "balance_loss_mlp": 1.0542171, + "epoch": 0.17121969988457098, + "flos": 495764788224.0, + "grad_norm": 0.030159167385703775, + "language_loss": 0.99659365, + "learning_rate": 0.0009486142063825884, + "loss": 1.0081892, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 1.05126953, + "step": 890, + "time_per_iteration": 2.5897767543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05718231, + "epoch": 0.17141208156983456, + "flos": 1552105941504.0, + "grad_norm": 0.012289453069715352, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73586774, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 1.03515625, + "step": 891, + "time_per_iteration": 4.971697807312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05527556, + "epoch": 0.17160446325509812, + "flos": 620700005376.0, + "grad_norm": 0.02677753623279009, + "language_loss": 1.00227833, + "learning_rate": 0.0009483387246819542, + "loss": 1.01388383, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 1.05078125, + "step": 892, + "time_per_iteration": 2.7142419815063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153152, + "balance_loss_mlp": 1.04977417, + "epoch": 0.17179684494036168, + "flos": 1384693300224.0, + "grad_norm": 0.011012484205567044, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.8343873, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 1.03515625, + "step": 893, + "time_per_iteration": 4.678752183914185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159751, + "balance_loss_mlp": 1.05446541, + "epoch": 0.17198922662562524, + "flos": 493641762816.0, + "grad_norm": 0.02464509578240857, + "language_loss": 0.9638195, + "learning_rate": 0.0009480625467392688, + "loss": 0.97541702, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 1.05175781, + "step": 894, + "time_per_iteration": 2.6579103469848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158279, + "balance_loss_mlp": 1.05490112, + "epoch": 0.1721816083108888, + "flos": 1461485689344.0, + "grad_norm": 0.014844728137103481, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79152954, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 1.03515625, + "step": 895, + "time_per_iteration": 4.754615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157074, + "balance_loss_mlp": 1.0523603, + "epoch": 0.17237398999615236, + "flos": 529204892160.0, + "grad_norm": 0.024157534092911288, + "language_loss": 0.95005947, + "learning_rate": 0.0009477856729834196, + "loss": 0.96163023, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 1.046875, + "step": 896, + "time_per_iteration": 2.7640984058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.05742288, + "epoch": 0.17256637168141592, + "flos": 605026366464.0, + "grad_norm": 0.02447501108745492, + "language_loss": 0.9782356, + "learning_rate": 0.0009476469753098809, + "loss": 0.98985219, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 1.04394531, + "step": 897, + "time_per_iteration": 2.7016282081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153769, + "balance_loss_mlp": 1.04957986, + "epoch": 0.17275875336667948, + "flos": 510693821952.0, + "grad_norm": 0.025419887327313116, + "language_loss": 0.94868481, + "learning_rate": 0.0009475081038443738, + "loss": 0.96022242, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 1.04345703, + "step": 898, + "time_per_iteration": 2.5731348991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148609, + "balance_loss_mlp": 1.0446589, + "epoch": 0.17295113505194307, + "flos": 666500955648.0, + "grad_norm": 0.02623291269769982, + "language_loss": 0.95752573, + "learning_rate": 0.0009473690586408124, + "loss": 0.96901178, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 1.04101562, + "step": 899, + "time_per_iteration": 2.8549156188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146417, + "balance_loss_mlp": 1.04227531, + "epoch": 0.17314351673720663, + "flos": 556431645696.0, + "grad_norm": 0.022300666942289, + "language_loss": 0.94826102, + "learning_rate": 0.0009472298397531792, + "loss": 0.9597252, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 1.04296875, + "step": 900, + "time_per_iteration": 2.7165167331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145486, + "balance_loss_mlp": 1.04124928, + "epoch": 0.17333589842247019, + "flos": 504606724608.0, + "grad_norm": 0.023477361471443404, + "language_loss": 0.95443118, + "learning_rate": 0.0009470904472355235, + "loss": 0.96588612, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 1.04394531, + "step": 901, + "time_per_iteration": 2.668320655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_mlp": 1.03967023, + "epoch": 0.17352828010773375, + "flos": 557350167552.0, + "grad_norm": 0.02470997420275152, + "language_loss": 0.90534914, + "learning_rate": 0.0009469508811419626, + "loss": 0.91678727, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 1.04296875, + "step": 902, + "time_per_iteration": 2.714174747467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_mlp": 1.05331421, + "epoch": 0.1737206617929973, + "flos": 1557791537664.0, + "grad_norm": 0.011695515468407039, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7276957, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 1.02539062, + "step": 903, + "time_per_iteration": 4.783574104309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146888, + "balance_loss_mlp": 1.04308009, + "epoch": 0.17391304347826086, + "flos": 517755836928.0, + "grad_norm": 0.027522671456014093, + "language_loss": 0.94518518, + "learning_rate": 0.0009466712284439292, + "loss": 0.95665407, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 1.03955078, + "step": 904, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011486, + "balance_loss_mlp": 1.04503071, + "epoch": 0.17410542516352442, + "flos": 542160622080.0, + "grad_norm": 0.027186859166075866, + "language_loss": 0.99262786, + "learning_rate": 0.0009465311419480276, + "loss": 1.00411391, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 1.03710938, + "step": 905, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153491, + "balance_loss_mlp": 1.05011249, + "epoch": 0.17429780684878798, + "flos": 625081041408.0, + "grad_norm": 0.028950662808853365, + "language_loss": 0.96674442, + "learning_rate": 0.0009463908820933622, + "loss": 0.97827929, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 1.03515625, + "step": 906, + "time_per_iteration": 2.8291828632354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151914, + "balance_loss_mlp": 1.04844034, + "epoch": 0.17449018853405157, + "flos": 576848890368.0, + "grad_norm": 0.03002954803612974, + "language_loss": 0.90420532, + "learning_rate": 0.0009462504489343868, + "loss": 0.91572446, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 1.03613281, + "step": 907, + "time_per_iteration": 2.8554108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_mlp": 1.04341269, + "epoch": 0.17468257021931513, + "flos": 534772967424.0, + "grad_norm": 0.024073731406752365, + "language_loss": 1.01002121, + "learning_rate": 0.0009461098425256222, + "loss": 1.02149189, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 1.03808594, + "step": 908, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114306, + "balance_loss_mlp": 1.03930068, + "epoch": 0.1748749519045787, + "flos": 541808785920.0, + "grad_norm": 0.02493910110608304, + "language_loss": 0.93412566, + "learning_rate": 0.0009459690629216567, + "loss": 0.94555628, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 1.0390625, + "step": 909, + "time_per_iteration": 2.670389413833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150977, + "balance_loss_mlp": 1.04688334, + "epoch": 0.17506733358984225, + "flos": 499626802176.0, + "grad_norm": 0.02402970341263653, + "language_loss": 0.96272469, + "learning_rate": 0.0009458281101771457, + "loss": 0.97423446, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 1.04248047, + "step": 910, + "time_per_iteration": 2.6256320476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153015, + "balance_loss_mlp": 1.04906452, + "epoch": 0.1752597152751058, + "flos": 624132320256.0, + "grad_norm": 0.023679811966199643, + "language_loss": 0.91450173, + "learning_rate": 0.0009456869843468122, + "loss": 0.92603183, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 1.04101562, + "step": 911, + "time_per_iteration": 2.863004207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158204, + "balance_loss_mlp": 1.05434883, + "epoch": 0.17545209696036937, + "flos": 521993155584.0, + "grad_norm": 0.029813530713564303, + "language_loss": 0.92364156, + "learning_rate": 0.0009455456854854459, + "loss": 0.93522358, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 1.04003906, + "step": 912, + "time_per_iteration": 2.616231918334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_mlp": 1.04612815, + "epoch": 0.17564447864563293, + "flos": 462945764352.0, + "grad_norm": 0.02810445184103091, + "language_loss": 0.92624664, + "learning_rate": 0.0009454042136479039, + "loss": 0.93774742, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 1.04101562, + "step": 913, + "time_per_iteration": 2.5944247245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.05199766, + "epoch": 0.1758368603308965, + "flos": 481617289728.0, + "grad_norm": 0.02706355326928303, + "language_loss": 0.91841793, + "learning_rate": 0.0009452625688891103, + "loss": 0.92997456, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 1.03808594, + "step": 914, + "time_per_iteration": 2.580941915512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144051, + "balance_loss_mlp": 1.04200745, + "epoch": 0.17602924201616005, + "flos": 1482084856320.0, + "grad_norm": 0.009713749524187035, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79878789, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 1.02148438, + "step": 915, + "time_per_iteration": 4.592097997665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148996, + "balance_loss_mlp": 1.04523647, + "epoch": 0.17622162370142364, + "flos": 603470026752.0, + "grad_norm": 0.02797967110469985, + "language_loss": 1.03421283, + "learning_rate": 0.0009449787608278015, + "loss": 1.0457027, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 1.0390625, + "step": 916, + "time_per_iteration": 2.755580425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_mlp": 1.04677713, + "epoch": 0.1764140053866872, + "flos": 443605495296.0, + "grad_norm": 0.024189441248888145, + "language_loss": 1.00777316, + "learning_rate": 0.0009448365976354704, + "loss": 1.01927423, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 1.03466797, + "step": 917, + "time_per_iteration": 2.4922571182250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_mlp": 1.04567707, + "epoch": 0.17660638707195075, + "flos": 501591373824.0, + "grad_norm": 0.028333637349232343, + "language_loss": 1.01507974, + "learning_rate": 0.0009446942617422558, + "loss": 1.02657032, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 1.03515625, + "step": 918, + "time_per_iteration": 2.574998378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148191, + "balance_loss_mlp": 1.0448128, + "epoch": 0.17679876875721431, + "flos": 539983202304.0, + "grad_norm": 0.02432410226762854, + "language_loss": 0.94564992, + "learning_rate": 0.0009445517532034176, + "loss": 0.9571318, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 1.03515625, + "step": 919, + "time_per_iteration": 2.7170355319976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153425, + "balance_loss_mlp": 1.05009484, + "epoch": 0.17699115044247787, + "flos": 498715011072.0, + "grad_norm": 0.026165935935680888, + "language_loss": 0.99032271, + "learning_rate": 0.0009444090720742824, + "loss": 1.00185692, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 1.03466797, + "step": 920, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149157, + "balance_loss_mlp": 1.04587448, + "epoch": 0.17718353212774143, + "flos": 663915303936.0, + "grad_norm": 0.025722324934358026, + "language_loss": 0.98290348, + "learning_rate": 0.0009442662184102439, + "loss": 0.99439508, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 1.03417969, + "step": 921, + "time_per_iteration": 2.7612035274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145605, + "balance_loss_mlp": 1.04251313, + "epoch": 0.177375913813005, + "flos": 583847778816.0, + "grad_norm": 0.021564117555322487, + "language_loss": 0.93569565, + "learning_rate": 0.000944123192266763, + "loss": 0.94715166, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 1.03222656, + "step": 922, + "time_per_iteration": 2.8110268115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141792, + "balance_loss_mlp": 1.03865182, + "epoch": 0.17756829549826855, + "flos": 553683537408.0, + "grad_norm": 0.021487036209533367, + "language_loss": 0.92858881, + "learning_rate": 0.0009439799936993671, + "loss": 0.94000673, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 1.03271484, + "step": 923, + "time_per_iteration": 2.7440245151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142202, + "balance_loss_mlp": 1.03901482, + "epoch": 0.17776067718353214, + "flos": 557371634688.0, + "grad_norm": 0.02463154633112553, + "language_loss": 0.97990632, + "learning_rate": 0.0009438366227636511, + "loss": 0.99132836, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 1.03320312, + "step": 924, + "time_per_iteration": 2.7032759189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140208, + "balance_loss_mlp": 1.03721154, + "epoch": 0.1779530588687957, + "flos": 659651788800.0, + "grad_norm": 0.022941473179093813, + "language_loss": 0.94988692, + "learning_rate": 0.0009436930795152763, + "loss": 0.96128899, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 1.03125, + "step": 925, + "time_per_iteration": 2.854522943496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143555, + "balance_loss_mlp": 1.04084456, + "epoch": 0.17814544055405926, + "flos": 645671476224.0, + "grad_norm": 0.02421412975678805, + "language_loss": 0.95479, + "learning_rate": 0.0009435493640099713, + "loss": 0.9662255, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 1.02832031, + "step": 926, + "time_per_iteration": 2.8268251419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143389, + "balance_loss_mlp": 1.04077399, + "epoch": 0.17833782223932282, + "flos": 461884251648.0, + "grad_norm": 0.0252062590806445, + "language_loss": 0.94177145, + "learning_rate": 0.0009434054763035314, + "loss": 0.95320535, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 1.02734375, + "step": 927, + "time_per_iteration": 2.629499673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139685, + "balance_loss_mlp": 1.03706956, + "epoch": 0.17853020392458638, + "flos": 760852965888.0, + "grad_norm": 0.02122720378042075, + "language_loss": 0.93181551, + "learning_rate": 0.0009432614164518185, + "loss": 0.94321233, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 1.02734375, + "step": 928, + "time_per_iteration": 2.9364700317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140803, + "balance_loss_mlp": 1.03818727, + "epoch": 0.17872258560984994, + "flos": 784055248896.0, + "grad_norm": 0.023477252169520995, + "language_loss": 0.93520033, + "learning_rate": 0.000943117184510762, + "loss": 0.94660836, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 1.02734375, + "step": 929, + "time_per_iteration": 3.07600474357605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150169, + "balance_loss_mlp": 1.04831696, + "epoch": 0.1789149672951135, + "flos": 1463031295488.0, + "grad_norm": 0.013755703560815407, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7994014, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 1.01953125, + "step": 930, + "time_per_iteration": 5.029282808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153323, + "balance_loss_mlp": 1.05099344, + "epoch": 0.17910734898037706, + "flos": 504930362880.0, + "grad_norm": 0.023999213273897636, + "language_loss": 0.96652937, + "learning_rate": 0.0009428282045846674, + "loss": 0.97806263, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 1.02441406, + "step": 931, + "time_per_iteration": 2.7112410068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145421, + "balance_loss_mlp": 1.04275823, + "epoch": 0.17929973066564064, + "flos": 747669651456.0, + "grad_norm": 0.02006943819739268, + "language_loss": 0.96385491, + "learning_rate": 0.0009426834567118214, + "loss": 0.97530913, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 1.02783203, + "step": 932, + "time_per_iteration": 3.0711913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143693, + "balance_loss_mlp": 1.04098177, + "epoch": 0.1794921123509042, + "flos": 714572651520.0, + "grad_norm": 0.021210123960592832, + "language_loss": 0.89608383, + "learning_rate": 0.0009425385369740155, + "loss": 0.90752071, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 1.02832031, + "step": 933, + "time_per_iteration": 3.059857130050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114727, + "balance_loss_mlp": 1.0451318, + "epoch": 0.17968449403616776, + "flos": 634361409024.0, + "grad_norm": 0.02299955090486112, + "language_loss": 0.96636283, + "learning_rate": 0.0009423934454275125, + "loss": 0.97783554, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 1.02246094, + "step": 934, + "time_per_iteration": 2.85917592048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146917, + "balance_loss_mlp": 1.04477859, + "epoch": 0.17987687572143132, + "flos": 537378084864.0, + "grad_norm": 0.02461268142415081, + "language_loss": 1.01075852, + "learning_rate": 0.0009422481821286418, + "loss": 1.02222764, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 1.02246094, + "step": 935, + "time_per_iteration": 2.7314486503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150005, + "balance_loss_mlp": 1.04777098, + "epoch": 0.18006925740669488, + "flos": 539119074816.0, + "grad_norm": 0.026258801194945027, + "language_loss": 0.98970592, + "learning_rate": 0.0009421027471337998, + "loss": 1.00120604, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 1.0234375, + "step": 936, + "time_per_iteration": 2.6354496479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151337, + "balance_loss_mlp": 1.04891205, + "epoch": 0.18026163909195844, + "flos": 540534425088.0, + "grad_norm": 0.029056123283387615, + "language_loss": 0.94782555, + "learning_rate": 0.0009419571404994493, + "loss": 0.9593389, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 1.02539062, + "step": 937, + "time_per_iteration": 2.6368348598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_mlp": 1.04649317, + "epoch": 0.180454020777222, + "flos": 501682698240.0, + "grad_norm": 0.026973093946582868, + "language_loss": 1.00715971, + "learning_rate": 0.00094181136228212, + "loss": 1.01864934, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 1.02587891, + "step": 938, + "time_per_iteration": 2.710451602935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145832, + "balance_loss_mlp": 1.043455, + "epoch": 0.18064640246248556, + "flos": 500006836224.0, + "grad_norm": 0.02510488837562242, + "language_loss": 0.93535352, + "learning_rate": 0.0009416654125384077, + "loss": 0.9468118, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 1.02490234, + "step": 939, + "time_per_iteration": 2.728480577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145905, + "balance_loss_mlp": 1.04424286, + "epoch": 0.18083878414774912, + "flos": 1522290808320.0, + "grad_norm": 0.01070150853185005, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80918276, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 1.01757812, + "step": 940, + "time_per_iteration": 4.915560007095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145419, + "balance_loss_mlp": 1.04318535, + "epoch": 0.1810311658330127, + "flos": 728665755648.0, + "grad_norm": 0.023936590350452012, + "language_loss": 0.92724693, + "learning_rate": 0.000941372998698552, + "loss": 0.93870103, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 1.0234375, + "step": 941, + "time_per_iteration": 2.993441343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140689, + "balance_loss_mlp": 1.0385505, + "epoch": 0.18122354751827627, + "flos": 566044383744.0, + "grad_norm": 0.025062658148163358, + "language_loss": 0.94270039, + "learning_rate": 0.0009412265347159336, + "loss": 0.95410728, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 1.02246094, + "step": 942, + "time_per_iteration": 2.731416702270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140669, + "balance_loss_mlp": 1.03848326, + "epoch": 0.18141592920353983, + "flos": 520317293568.0, + "grad_norm": 0.024682729806918415, + "language_loss": 0.94559634, + "learning_rate": 0.0009410798994339829, + "loss": 0.95700312, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 1.02294922, + "step": 943, + "time_per_iteration": 2.6001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.03650522, + "epoch": 0.1816083108888034, + "flos": 513476858880.0, + "grad_norm": 0.022579221317186333, + "language_loss": 0.95589852, + "learning_rate": 0.000940933092909628, + "loss": 0.96728498, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 1.02246094, + "step": 944, + "time_per_iteration": 2.6360957622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_mlp": 1.04550409, + "epoch": 0.18180069257406695, + "flos": 493372518912.0, + "grad_norm": 0.02569410792888805, + "language_loss": 0.9276287, + "learning_rate": 0.0009407861151998649, + "loss": 0.93910229, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 1.01953125, + "step": 945, + "time_per_iteration": 2.6910903453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147749, + "balance_loss_mlp": 1.04608703, + "epoch": 0.1819930742593305, + "flos": 571230423552.0, + "grad_norm": 0.024877151530798884, + "language_loss": 0.95025092, + "learning_rate": 0.0009406389663617552, + "loss": 0.96172833, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 1.01757812, + "step": 946, + "time_per_iteration": 2.689232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_mlp": 1.03669131, + "epoch": 0.18218545594459407, + "flos": 607110460416.0, + "grad_norm": 0.026141117268158143, + "language_loss": 0.96229172, + "learning_rate": 0.000940491646452427, + "loss": 0.97367907, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 1.02148438, + "step": 947, + "time_per_iteration": 2.720996618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136776, + "balance_loss_mlp": 1.03473294, + "epoch": 0.18237783762985763, + "flos": 549738931200.0, + "grad_norm": 0.02114848591843324, + "language_loss": 0.99382234, + "learning_rate": 0.000940344155529075, + "loss": 1.00519001, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 1.02148438, + "step": 948, + "time_per_iteration": 2.655764102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136656, + "balance_loss_mlp": 1.03489935, + "epoch": 0.1825702193151212, + "flos": 451674628608.0, + "grad_norm": 0.027816765537183038, + "language_loss": 0.98392528, + "learning_rate": 0.0009401964936489605, + "loss": 0.99529195, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 1.01855469, + "step": 949, + "time_per_iteration": 2.5372273921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_mlp": 1.03615081, + "epoch": 0.18276260100038477, + "flos": 590384040960.0, + "grad_norm": 0.023066854335363023, + "language_loss": 0.93237805, + "learning_rate": 0.0009400486608694108, + "loss": 0.94375616, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 1.01757812, + "step": 950, + "time_per_iteration": 2.7370681762695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139002, + "balance_loss_mlp": 1.03719783, + "epoch": 0.18295498268564833, + "flos": 788709531648.0, + "grad_norm": 0.02337801281240106, + "language_loss": 0.97100747, + "learning_rate": 0.0009399006572478195, + "loss": 0.98239744, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 1.01904297, + "step": 951, + "time_per_iteration": 3.1136744022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144566, + "balance_loss_mlp": 1.04276168, + "epoch": 0.1831473643709119, + "flos": 579225696768.0, + "grad_norm": 0.024500893588447415, + "language_loss": 0.99522519, + "learning_rate": 0.0009397524828416468, + "loss": 1.00667083, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 1.01904297, + "step": 952, + "time_per_iteration": 2.680551767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.03664696, + "epoch": 0.18333974605617545, + "flos": 567963293184.0, + "grad_norm": 0.023361368133084506, + "language_loss": 1.04812968, + "learning_rate": 0.0009396041377084192, + "loss": 1.05951309, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 1.01806641, + "step": 953, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136097, + "balance_loss_mlp": 1.03443527, + "epoch": 0.183532127741439, + "flos": 528069519360.0, + "grad_norm": 0.02324700647994909, + "language_loss": 0.98137838, + "learning_rate": 0.0009394556219057295, + "loss": 0.99273932, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 1.01757812, + "step": 954, + "time_per_iteration": 2.6928489208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147999, + "balance_loss_mlp": 1.04671907, + "epoch": 0.18372450942670257, + "flos": 595643940864.0, + "grad_norm": 0.02338261009959255, + "language_loss": 0.93879586, + "learning_rate": 0.0009393069354912362, + "loss": 0.95027584, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 1.01367188, + "step": 955, + "time_per_iteration": 2.7496042251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.05067647, + "epoch": 0.18391689111196613, + "flos": 646283824128.0, + "grad_norm": 0.029421035614033756, + "language_loss": 0.90626895, + "learning_rate": 0.0009391580785226649, + "loss": 0.91778857, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 1.01367188, + "step": 956, + "time_per_iteration": 2.9440600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.05253601, + "epoch": 0.18410927279722972, + "flos": 1460391975936.0, + "grad_norm": 0.020211591247266292, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80492932, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 1.0, + "step": 957, + "time_per_iteration": 4.738964796066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138037, + "balance_loss_mlp": 1.03623211, + "epoch": 0.18430165448249328, + "flos": 660003624960.0, + "grad_norm": 0.026926680065899915, + "language_loss": 0.95339954, + "learning_rate": 0.0009388598531545196, + "loss": 0.96477991, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 1.01904297, + "step": 958, + "time_per_iteration": 2.859509229660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138629, + "balance_loss_mlp": 1.03687191, + "epoch": 0.18449403616775684, + "flos": 518949606912.0, + "grad_norm": 0.029778126611616895, + "language_loss": 0.94583583, + "learning_rate": 0.000938710484870727, + "loss": 0.9572221, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 1.01855469, + "step": 959, + "time_per_iteration": 2.565548896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137101, + "balance_loss_mlp": 1.03543901, + "epoch": 0.1846864178530204, + "flos": 553824526848.0, + "grad_norm": 0.027283874554685776, + "language_loss": 0.94945395, + "learning_rate": 0.0009385609462644189, + "loss": 0.96082497, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 1.01757812, + "step": 960, + "time_per_iteration": 2.676379919052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138569, + "balance_loss_mlp": 1.03709817, + "epoch": 0.18487879953828396, + "flos": 467115953664.0, + "grad_norm": 0.025693285519799033, + "language_loss": 0.96468461, + "learning_rate": 0.0009384112373936514, + "loss": 0.97607034, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 1.015625, + "step": 961, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154728, + "balance_loss_mlp": 1.05325735, + "epoch": 0.18507118122354752, + "flos": 649683211776.0, + "grad_norm": 0.02725538915325764, + "language_loss": 1.0098747, + "learning_rate": 0.0009382613583165467, + "loss": 1.02142203, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 1.015625, + "step": 962, + "time_per_iteration": 2.8268754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116263, + "balance_loss_mlp": 1.06125438, + "epoch": 0.18526356290881107, + "flos": 627922475520.0, + "grad_norm": 0.027998512126097927, + "language_loss": 0.99849832, + "learning_rate": 0.0009381113090912928, + "loss": 1.01012468, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 1.01464844, + "step": 963, + "time_per_iteration": 2.7762861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147698, + "balance_loss_mlp": 1.04679894, + "epoch": 0.18545594459407463, + "flos": 433645650432.0, + "grad_norm": 0.027008272304904758, + "language_loss": 0.98634118, + "learning_rate": 0.000937961089776144, + "loss": 0.99781811, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 1.00976562, + "step": 964, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149635, + "balance_loss_mlp": 1.04844999, + "epoch": 0.1856483262793382, + "flos": 750426491904.0, + "grad_norm": 0.028502333826765886, + "language_loss": 0.91998804, + "learning_rate": 0.0009378107004294208, + "loss": 0.93148446, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 1.01269531, + "step": 965, + "time_per_iteration": 2.964561939239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_mlp": 1.05057883, + "epoch": 0.18584070796460178, + "flos": 531401777664.0, + "grad_norm": 0.02451376704559663, + "language_loss": 1.00210857, + "learning_rate": 0.0009376601411095096, + "loss": 1.01362348, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 1.00976562, + "step": 966, + "time_per_iteration": 2.6664164066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150482, + "balance_loss_mlp": 1.04953575, + "epoch": 0.18603308964986534, + "flos": 484083419136.0, + "grad_norm": 0.02282308899195351, + "language_loss": 0.93174511, + "learning_rate": 0.0009375094118748622, + "loss": 0.94324994, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 1.01025391, + "step": 967, + "time_per_iteration": 2.544952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142823, + "balance_loss_mlp": 1.041924, + "epoch": 0.1862254713351289, + "flos": 802681112064.0, + "grad_norm": 0.02495680742184495, + "language_loss": 1.00251484, + "learning_rate": 0.0009373585127839976, + "loss": 1.01394308, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 1.00976562, + "step": 968, + "time_per_iteration": 2.973095417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142113, + "balance_loss_mlp": 1.0413574, + "epoch": 0.18641785302039246, + "flos": 479290148352.0, + "grad_norm": 0.02509872783632802, + "language_loss": 0.9944787, + "learning_rate": 0.0009372074438954994, + "loss": 1.00589979, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 1.00830078, + "step": 969, + "time_per_iteration": 2.5303025245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142663, + "balance_loss_mlp": 1.04181159, + "epoch": 0.18661023470565602, + "flos": 389779072512.0, + "grad_norm": 0.02439046514561532, + "language_loss": 1.00939226, + "learning_rate": 0.0009370562052680181, + "loss": 1.02081895, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 1.00927734, + "step": 970, + "time_per_iteration": 2.5023443698883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.04929316, + "epoch": 0.18680261639091958, + "flos": 565775139840.0, + "grad_norm": 0.02213336285369191, + "language_loss": 0.95379293, + "learning_rate": 0.0009369047969602695, + "loss": 0.96529102, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 1.00585938, + "step": 971, + "time_per_iteration": 2.722823143005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154865, + "balance_loss_mlp": 1.05420506, + "epoch": 0.18699499807618314, + "flos": 480230137344.0, + "grad_norm": 0.029574405329312194, + "language_loss": 0.9913702, + "learning_rate": 0.0009367532190310357, + "loss": 1.00291884, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 1.00732422, + "step": 972, + "time_per_iteration": 2.633387327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.0490092, + "epoch": 0.1871873797614467, + "flos": 554328086016.0, + "grad_norm": 0.02905569815438633, + "language_loss": 0.99535728, + "learning_rate": 0.0009366014715391644, + "loss": 1.00685072, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 1.00390625, + "step": 973, + "time_per_iteration": 2.6549065113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153264, + "balance_loss_mlp": 1.05293763, + "epoch": 0.18737976144671029, + "flos": 553952781312.0, + "grad_norm": 0.023481989115367276, + "language_loss": 0.9123525, + "learning_rate": 0.0009364495545435693, + "loss": 0.92388517, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 1.00390625, + "step": 974, + "time_per_iteration": 4.409714221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_mlp": 1.05479944, + "epoch": 0.18757214313197385, + "flos": 503247770112.0, + "grad_norm": 0.022955013749569684, + "language_loss": 0.97297812, + "learning_rate": 0.0009362974681032297, + "loss": 0.98452938, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 1.00390625, + "step": 975, + "time_per_iteration": 2.61857533454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153706, + "balance_loss_mlp": 1.05352271, + "epoch": 0.1877645248172374, + "flos": 676291613184.0, + "grad_norm": 0.028784531937469084, + "language_loss": 0.98011422, + "learning_rate": 0.0009361452122771907, + "loss": 0.9916513, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 1.00244141, + "step": 976, + "time_per_iteration": 2.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.04923177, + "epoch": 0.18795690650250096, + "flos": 405862944768.0, + "grad_norm": 0.029616845561456457, + "language_loss": 0.95658362, + "learning_rate": 0.0009359927871245635, + "loss": 0.9680773, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 1.00195312, + "step": 977, + "time_per_iteration": 2.563232183456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149302, + "balance_loss_mlp": 1.04916573, + "epoch": 0.18814928818776452, + "flos": 639063355392.0, + "grad_norm": 0.027239481801034963, + "language_loss": 0.98439831, + "learning_rate": 0.0009358401927045246, + "loss": 0.99589127, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 1.00195312, + "step": 978, + "time_per_iteration": 2.8147568702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.04498518, + "epoch": 0.18834166987302808, + "flos": 1140115514880.0, + "grad_norm": 0.022094320674951175, + "language_loss": 0.96123868, + "learning_rate": 0.0009356874290763166, + "loss": 0.9726885, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 1.00048828, + "step": 979, + "time_per_iteration": 3.4719691276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149894, + "balance_loss_mlp": 1.04971051, + "epoch": 0.18853405155829164, + "flos": 505815957504.0, + "grad_norm": 0.02560863383472628, + "language_loss": 0.98637187, + "learning_rate": 0.0009355344962992474, + "loss": 0.99787074, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 1.00244141, + "step": 980, + "time_per_iteration": 2.6199324131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139646, + "balance_loss_mlp": 1.03931963, + "epoch": 0.1887264332435552, + "flos": 609370472448.0, + "grad_norm": 0.02150131271194909, + "language_loss": 0.97900265, + "learning_rate": 0.0009353813944326908, + "loss": 0.99039912, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 1.00390625, + "step": 981, + "time_per_iteration": 2.8862478733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143203, + "balance_loss_mlp": 1.04287672, + "epoch": 0.1889188149288188, + "flos": 553592212992.0, + "grad_norm": 0.027403519760576756, + "language_loss": 0.92598587, + "learning_rate": 0.0009352281235360863, + "loss": 0.93741786, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 1.00390625, + "step": 982, + "time_per_iteration": 2.680797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142003, + "balance_loss_mlp": 1.04167616, + "epoch": 0.18911119661408235, + "flos": 419469954048.0, + "grad_norm": 0.02481781093748577, + "language_loss": 0.92531025, + "learning_rate": 0.0009350746836689389, + "loss": 0.93673027, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 1.00390625, + "step": 983, + "time_per_iteration": 2.5687928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152649, + "balance_loss_mlp": 1.05289459, + "epoch": 0.1893035782993459, + "flos": 1485317784576.0, + "grad_norm": 0.01747927461324531, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82591867, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.99804688, + "step": 984, + "time_per_iteration": 4.978898048400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115218, + "balance_loss_mlp": 1.05237782, + "epoch": 0.18949595998460947, + "flos": 509456391168.0, + "grad_norm": 0.033971943902626214, + "language_loss": 0.94133711, + "learning_rate": 0.0009347672972613634, + "loss": 0.95285892, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.99853516, + "step": 985, + "time_per_iteration": 2.5850014686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153583, + "balance_loss_mlp": 1.05382824, + "epoch": 0.18968834166987303, + "flos": 532192045056.0, + "grad_norm": 0.027626772825507382, + "language_loss": 0.93152702, + "learning_rate": 0.0009346133508402735, + "loss": 0.9430629, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.99804688, + "step": 986, + "time_per_iteration": 2.7262227535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.04658782, + "epoch": 0.1898807233551366, + "flos": 500753442816.0, + "grad_norm": 0.02768975875157221, + "language_loss": 0.95335174, + "learning_rate": 0.0009344592356873166, + "loss": 0.96481234, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.99511719, + "step": 987, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149829, + "balance_loss_mlp": 1.05002666, + "epoch": 0.19007310504040015, + "flos": 603359236608.0, + "grad_norm": 0.02899497531058058, + "language_loss": 0.87347138, + "learning_rate": 0.0009343049518623255, + "loss": 0.88496965, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.99853516, + "step": 988, + "time_per_iteration": 2.726668119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143975, + "balance_loss_mlp": 1.04407787, + "epoch": 0.1902654867256637, + "flos": 602764353024.0, + "grad_norm": 0.022945627178248204, + "language_loss": 0.90576518, + "learning_rate": 0.0009341504994251985, + "loss": 0.91720492, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.99951172, + "step": 989, + "time_per_iteration": 2.8518989086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.05247498, + "epoch": 0.19045786841092727, + "flos": 1579231363584.0, + "grad_norm": 0.011944448483625032, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74672347, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.99414062, + "step": 990, + "time_per_iteration": 5.084089517593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144398, + "balance_loss_mlp": 1.04445326, + "epoch": 0.19065025009619085, + "flos": 683054184960.0, + "grad_norm": 0.025253455013724026, + "language_loss": 0.88680583, + "learning_rate": 0.0009338410889544574, + "loss": 0.8982498, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 1.0, + "step": 991, + "time_per_iteration": 3.007277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_mlp": 1.03949153, + "epoch": 0.1908426317814544, + "flos": 603441828864.0, + "grad_norm": 0.02514183514150974, + "language_loss": 0.96243769, + "learning_rate": 0.000933686131040967, + "loss": 0.97383535, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 1.00341797, + "step": 992, + "time_per_iteration": 2.7673017978668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_mlp": 1.04441845, + "epoch": 0.19103501346671797, + "flos": 587433818112.0, + "grad_norm": 0.025095383977303525, + "language_loss": 0.99126339, + "learning_rate": 0.0009335310047555883, + "loss": 1.00270796, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 1.00097656, + "step": 993, + "time_per_iteration": 2.782841920852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145602, + "balance_loss_mlp": 1.04565716, + "epoch": 0.19122739515198153, + "flos": 546834370560.0, + "grad_norm": 0.0365250692916995, + "language_loss": 0.97246122, + "learning_rate": 0.0009333757101585467, + "loss": 0.98391724, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 1.0, + "step": 994, + "time_per_iteration": 2.6937174797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142239, + "balance_loss_mlp": 1.04229414, + "epoch": 0.1914197768372451, + "flos": 522549107712.0, + "grad_norm": 0.02399514581888075, + "language_loss": 1.00362575, + "learning_rate": 0.0009332202473101329, + "loss": 1.01504803, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 1.0, + "step": 995, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137582, + "balance_loss_mlp": 1.03763652, + "epoch": 0.19161215852250865, + "flos": 612387824640.0, + "grad_norm": 0.024864495797513732, + "language_loss": 0.91319168, + "learning_rate": 0.0009330646162707028, + "loss": 0.92456746, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 1.0, + "step": 996, + "time_per_iteration": 2.7450180053710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113947, + "balance_loss_mlp": 1.03962064, + "epoch": 0.1918045402077722, + "flos": 848182619136.0, + "grad_norm": 0.02592603597590215, + "language_loss": 0.92579019, + "learning_rate": 0.0009329088171006779, + "loss": 0.93718487, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.99902344, + "step": 997, + "time_per_iteration": 3.1890194416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_mlp": 1.04457617, + "epoch": 0.19199692189303577, + "flos": 466892371968.0, + "grad_norm": 0.027577096255712943, + "language_loss": 0.95194477, + "learning_rate": 0.0009327528498605446, + "loss": 0.96338999, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 1.0, + "step": 998, + "time_per_iteration": 2.6845622062683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141712, + "balance_loss_mlp": 1.04143262, + "epoch": 0.19218930357829936, + "flos": 532613011968.0, + "grad_norm": 0.026795980657526523, + "language_loss": 0.98209792, + "learning_rate": 0.0009325967146108548, + "loss": 0.99351501, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 1.00341797, + "step": 999, + "time_per_iteration": 2.690363883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145933, + "balance_loss_mlp": 1.04589295, + "epoch": 0.19238168526356292, + "flos": 602727422976.0, + "grad_norm": 0.025877996038880184, + "language_loss": 0.97816348, + "learning_rate": 0.0009324404114122258, + "loss": 0.98962283, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 1.00097656, + "step": 1000, + "time_per_iteration": 2.717535972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139683, + "balance_loss_mlp": 1.03969073, + "epoch": 0.19257406694882648, + "flos": 573154062336.0, + "grad_norm": 0.0251308575536182, + "language_loss": 0.95425117, + "learning_rate": 0.0009322839403253397, + "loss": 0.96564806, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 1.00048828, + "step": 1001, + "time_per_iteration": 2.8128621578216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147687, + "balance_loss_mlp": 1.04793251, + "epoch": 0.19276644863409004, + "flos": 803156473344.0, + "grad_norm": 0.02827819499351052, + "language_loss": 0.93752921, + "learning_rate": 0.0009321273014109439, + "loss": 0.94900608, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.99804688, + "step": 1002, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115103, + "balance_loss_mlp": 1.05127609, + "epoch": 0.1929588303193536, + "flos": 564479311872.0, + "grad_norm": 0.02425681225612504, + "language_loss": 0.92063946, + "learning_rate": 0.0009319704947298513, + "loss": 0.93214977, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.99804688, + "step": 1003, + "time_per_iteration": 2.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148634, + "balance_loss_mlp": 1.04887998, + "epoch": 0.19315121200461716, + "flos": 627987603456.0, + "grad_norm": 0.023688885680104285, + "language_loss": 0.95116329, + "learning_rate": 0.0009318135203429393, + "loss": 0.96264958, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.99804688, + "step": 1004, + "time_per_iteration": 2.7953245639801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146221, + "balance_loss_mlp": 1.04646707, + "epoch": 0.19334359368988072, + "flos": 518583034368.0, + "grad_norm": 0.02448547542723696, + "language_loss": 0.95706153, + "learning_rate": 0.0009316563783111511, + "loss": 0.9685238, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.99804688, + "step": 1005, + "time_per_iteration": 2.7417562007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141812, + "balance_loss_mlp": 1.04224837, + "epoch": 0.19353597537514428, + "flos": 695399568384.0, + "grad_norm": 0.022656832097962477, + "language_loss": 0.91614294, + "learning_rate": 0.0009314990686954943, + "loss": 0.9275611, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.99609375, + "step": 1006, + "time_per_iteration": 2.921147584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143701, + "balance_loss_mlp": 1.04413795, + "epoch": 0.19372835706040784, + "flos": 1212199226880.0, + "grad_norm": 0.0213605480211332, + "language_loss": 0.89449364, + "learning_rate": 0.000931341591557042, + "loss": 0.90593064, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.99609375, + "step": 1007, + "time_per_iteration": 3.6934237480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142723, + "balance_loss_mlp": 1.04292154, + "epoch": 0.19392073874567142, + "flos": 521684980224.0, + "grad_norm": 0.02492230683936131, + "language_loss": 0.9970367, + "learning_rate": 0.0009311839469569325, + "loss": 1.00846386, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.99853516, + "step": 1008, + "time_per_iteration": 2.66283917427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141437, + "balance_loss_mlp": 1.04187346, + "epoch": 0.19411312043093498, + "flos": 589910681088.0, + "grad_norm": 0.028572464719479444, + "language_loss": 0.9835515, + "learning_rate": 0.0009310261349563687, + "loss": 0.99496591, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.99609375, + "step": 1009, + "time_per_iteration": 2.6913864612579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139912, + "balance_loss_mlp": 1.04034853, + "epoch": 0.19430550211619854, + "flos": 580571916288.0, + "grad_norm": 0.022224830980977262, + "language_loss": 0.9288035, + "learning_rate": 0.0009308681556166186, + "loss": 0.94020259, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.99609375, + "step": 1010, + "time_per_iteration": 2.8937342166900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_mlp": 1.04234338, + "epoch": 0.1944978838014621, + "flos": 622245611520.0, + "grad_norm": 0.028831874511777204, + "language_loss": 1.01060331, + "learning_rate": 0.0009307100089990152, + "loss": 1.02202237, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.99609375, + "step": 1011, + "time_per_iteration": 2.7086822986602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114452, + "balance_loss_mlp": 1.04495597, + "epoch": 0.19469026548672566, + "flos": 599814130176.0, + "grad_norm": 0.02434118582542042, + "language_loss": 0.95591187, + "learning_rate": 0.0009305516951649568, + "loss": 0.96735704, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.99609375, + "step": 1012, + "time_per_iteration": 2.7046425342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114359, + "balance_loss_mlp": 1.04402685, + "epoch": 0.19488264717198922, + "flos": 553247107584.0, + "grad_norm": 0.020712874248618226, + "language_loss": 0.93779677, + "learning_rate": 0.0009303932141759057, + "loss": 0.94923264, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.7684950828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145994, + "balance_loss_mlp": 1.0468123, + "epoch": 0.19507502885725278, + "flos": 667312690176.0, + "grad_norm": 0.029421944235057496, + "language_loss": 0.94045115, + "learning_rate": 0.0009302345660933902, + "loss": 0.95191121, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.9921875, + "step": 1014, + "time_per_iteration": 2.8242082595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.04442382, + "epoch": 0.19526741054251634, + "flos": 672327541248.0, + "grad_norm": 0.024449615989116238, + "language_loss": 0.93477654, + "learning_rate": 0.0009300757509790026, + "loss": 0.94621253, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.9921875, + "step": 1015, + "time_per_iteration": 2.840658664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144964, + "balance_loss_mlp": 1.04578233, + "epoch": 0.19545979222777993, + "flos": 448146986496.0, + "grad_norm": 0.028637929544829934, + "language_loss": 1.02226353, + "learning_rate": 0.0009299167688944005, + "loss": 1.0337131, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.9921875, + "step": 1016, + "time_per_iteration": 2.505427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114266, + "balance_loss_mlp": 1.04376352, + "epoch": 0.1956521739130435, + "flos": 570168910848.0, + "grad_norm": 0.02609870742448671, + "language_loss": 0.93148959, + "learning_rate": 0.0009297576199013063, + "loss": 0.94291621, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.98925781, + "step": 1017, + "time_per_iteration": 2.7357168197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155182, + "balance_loss_mlp": 1.05752563, + "epoch": 0.19584455559830705, + "flos": 1458880571904.0, + "grad_norm": 0.02028337436206496, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74157315, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.9765625, + "step": 1018, + "time_per_iteration": 5.09963059425354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.04962921, + "epoch": 0.1960369372835706, + "flos": 1594481307648.0, + "grad_norm": 0.015251553743586253, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80573392, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.97460938, + "step": 1019, + "time_per_iteration": 6.03454852104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146546, + "balance_loss_mlp": 1.0477457, + "epoch": 0.19622931896883417, + "flos": 617252954112.0, + "grad_norm": 0.02445318741287071, + "language_loss": 0.94190967, + "learning_rate": 0.0009292791720892659, + "loss": 0.9533751, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.98828125, + "step": 1020, + "time_per_iteration": 2.8369834423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147421, + "balance_loss_mlp": 1.0486201, + "epoch": 0.19642170065409773, + "flos": 467207278080.0, + "grad_norm": 0.027280190942869837, + "language_loss": 0.98824823, + "learning_rate": 0.0009291193560807218, + "loss": 0.99972242, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.98828125, + "step": 1021, + "time_per_iteration": 2.5833048820495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.0458802, + "epoch": 0.19661408233936128, + "flos": 516288093696.0, + "grad_norm": 0.025303886608753337, + "language_loss": 0.95740455, + "learning_rate": 0.0009289593734732688, + "loss": 0.96885145, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.98828125, + "step": 1022, + "time_per_iteration": 2.5913774967193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149525, + "balance_loss_mlp": 1.05058122, + "epoch": 0.19680646402462484, + "flos": 393493366272.0, + "grad_norm": 0.0253763529676381, + "language_loss": 1.01103711, + "learning_rate": 0.0009287992243290175, + "loss": 1.02253246, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.98974609, + "step": 1023, + "time_per_iteration": 2.4793736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115501, + "balance_loss_mlp": 1.05635238, + "epoch": 0.19699884570988843, + "flos": 627623032320.0, + "grad_norm": 0.02508480994731895, + "language_loss": 0.99886519, + "learning_rate": 0.0009286389087101435, + "loss": 1.01041532, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.98681641, + "step": 1024, + "time_per_iteration": 2.7772202491760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153446, + "balance_loss_mlp": 1.05483615, + "epoch": 0.197191227395152, + "flos": 559073693184.0, + "grad_norm": 0.02445444816711275, + "language_loss": 0.98426372, + "learning_rate": 0.0009284784266788864, + "loss": 0.99579823, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.98632812, + "step": 1025, + "time_per_iteration": 2.6955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150264, + "balance_loss_mlp": 1.05165374, + "epoch": 0.19738360908041555, + "flos": 666249176064.0, + "grad_norm": 0.021666801749132464, + "language_loss": 0.99231869, + "learning_rate": 0.0009283177782975512, + "loss": 1.00382137, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.98632812, + "step": 1026, + "time_per_iteration": 2.9886229038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05529749, + "epoch": 0.1975759907656791, + "flos": 523510563840.0, + "grad_norm": 0.025961932589349316, + "language_loss": 0.98509014, + "learning_rate": 0.000928156963628507, + "loss": 0.99662918, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.98632812, + "step": 1027, + "time_per_iteration": 2.586740493774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149439, + "balance_loss_mlp": 1.05097175, + "epoch": 0.19776837245094267, + "flos": 463484252160.0, + "grad_norm": 0.02550253779434718, + "language_loss": 0.96135926, + "learning_rate": 0.0009279959827341877, + "loss": 0.97285366, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.98486328, + "step": 1028, + "time_per_iteration": 2.723517894744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146754, + "balance_loss_mlp": 1.04852605, + "epoch": 0.19796075413620623, + "flos": 504057503232.0, + "grad_norm": 0.02160335630411572, + "language_loss": 0.96627682, + "learning_rate": 0.0009278348356770915, + "loss": 0.97774434, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.98242188, + "step": 1029, + "time_per_iteration": 2.566802501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144801, + "balance_loss_mlp": 1.04666746, + "epoch": 0.1981531358214698, + "flos": 508570796544.0, + "grad_norm": 0.024261507948164947, + "language_loss": 0.9528529, + "learning_rate": 0.0009276735225197814, + "loss": 0.96430099, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.98144531, + "step": 1030, + "time_per_iteration": 2.6009340286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145205, + "balance_loss_mlp": 1.04702377, + "epoch": 0.19834551750673335, + "flos": 532639208448.0, + "grad_norm": 0.023062563394134136, + "language_loss": 0.95906407, + "learning_rate": 0.0009275120433248847, + "loss": 0.97051609, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.98193359, + "step": 1031, + "time_per_iteration": 2.684858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145757, + "balance_loss_mlp": 1.0477196, + "epoch": 0.1985378991919969, + "flos": 776969765376.0, + "grad_norm": 0.02469129884935611, + "language_loss": 0.94986421, + "learning_rate": 0.0009273503981550931, + "loss": 0.96132183, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.98046875, + "step": 1032, + "time_per_iteration": 3.058094024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.04737103, + "epoch": 0.1987302808772605, + "flos": 435191256576.0, + "grad_norm": 0.025952536265860523, + "language_loss": 0.96777844, + "learning_rate": 0.0009271885870731626, + "loss": 0.9792316, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.97949219, + "step": 1033, + "time_per_iteration": 2.493664503097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153962, + "balance_loss_mlp": 1.05592442, + "epoch": 0.19892266256252406, + "flos": 554653725696.0, + "grad_norm": 0.029222795446194067, + "language_loss": 1.0035603, + "learning_rate": 0.0009270266101419143, + "loss": 1.01509976, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.98046875, + "step": 1034, + "time_per_iteration": 2.626612901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145521, + "balance_loss_mlp": 1.04748368, + "epoch": 0.19911504424778761, + "flos": 550948164096.0, + "grad_norm": 0.02425528851980561, + "language_loss": 0.92802572, + "learning_rate": 0.0009268644674242328, + "loss": 0.9394809, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.98046875, + "step": 1035, + "time_per_iteration": 2.683253288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148174, + "balance_loss_mlp": 1.04994512, + "epoch": 0.19930742593305117, + "flos": 519312176640.0, + "grad_norm": 0.02646778626346152, + "language_loss": 0.91577774, + "learning_rate": 0.0009267021589830678, + "loss": 0.9272595, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.98242188, + "step": 1036, + "time_per_iteration": 2.7614338397979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218948, + "balance_loss_mlp": 1.11824036, + "epoch": 0.19949980761831473, + "flos": 1512637863936.0, + "grad_norm": 0.02467753292442409, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78846025, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 1.0078125, + "step": 1037, + "time_per_iteration": 4.962339878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114988, + "balance_loss_mlp": 1.05184233, + "epoch": 0.1996921893035783, + "flos": 699439501824.0, + "grad_norm": 0.02757683731024766, + "language_loss": 1.02362621, + "learning_rate": 0.000926377045182406, + "loss": 1.03512502, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.98046875, + "step": 1038, + "time_per_iteration": 2.916594982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155504, + "balance_loss_mlp": 1.05727601, + "epoch": 0.19988457098884185, + "flos": 728394510336.0, + "grad_norm": 0.024851830352508646, + "language_loss": 0.97729039, + "learning_rate": 0.0009262142399491296, + "loss": 0.98884547, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.98242188, + "step": 1039, + "time_per_iteration": 3.0976781845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156606, + "balance_loss_mlp": 1.05837739, + "epoch": 0.2000769526741054, + "flos": 561624416256.0, + "grad_norm": 0.025662568358030838, + "language_loss": 0.98388815, + "learning_rate": 0.0009260512692448105, + "loss": 0.99545419, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.98242188, + "step": 1040, + "time_per_iteration": 2.715479850769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.05308211, + "epoch": 0.200269334359369, + "flos": 573164795904.0, + "grad_norm": 0.022253887646478135, + "language_loss": 0.93097693, + "learning_rate": 0.000925888133132719, + "loss": 0.9424901, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.98242188, + "step": 1041, + "time_per_iteration": 2.7987864017486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011912, + "balance_loss_mlp": 1.0923996, + "epoch": 0.20046171604463256, + "flos": 1489152875520.0, + "grad_norm": 0.020655335232781416, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80801636, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.98828125, + "step": 1042, + "time_per_iteration": 4.944507360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154531, + "balance_loss_mlp": 1.05644536, + "epoch": 0.20065409772989612, + "flos": 497577636864.0, + "grad_norm": 0.02609736880654102, + "language_loss": 0.92129564, + "learning_rate": 0.0009255613649386244, + "loss": 0.932841, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.98095703, + "step": 1043, + "time_per_iteration": 2.6478612422943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157191, + "balance_loss_mlp": 1.05915368, + "epoch": 0.20084647941515968, + "flos": 580463127552.0, + "grad_norm": 0.02650777474930283, + "language_loss": 0.87469566, + "learning_rate": 0.0009253977329834838, + "loss": 0.88626754, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.98046875, + "step": 1044, + "time_per_iteration": 2.7641594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161195, + "balance_loss_mlp": 1.06315744, + "epoch": 0.20103886110042324, + "flos": 643287939072.0, + "grad_norm": 0.030624079602620518, + "language_loss": 0.9713465, + "learning_rate": 0.0009252339358742965, + "loss": 0.98295844, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.98046875, + "step": 1045, + "time_per_iteration": 2.811687707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.0594964, + "epoch": 0.2012312427856868, + "flos": 442969678848.0, + "grad_norm": 0.023268596270985206, + "language_loss": 0.93283701, + "learning_rate": 0.000925069973674654, + "loss": 0.94440854, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.9765625, + "step": 1046, + "time_per_iteration": 2.6709671020507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157527, + "balance_loss_mlp": 1.05948889, + "epoch": 0.20142362447095036, + "flos": 555472190976.0, + "grad_norm": 0.022730221646095148, + "language_loss": 0.96496689, + "learning_rate": 0.000924905846448212, + "loss": 0.97654217, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.98046875, + "step": 1047, + "time_per_iteration": 2.7338547706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115317, + "balance_loss_mlp": 1.05522716, + "epoch": 0.20161600615621392, + "flos": 671554738176.0, + "grad_norm": 0.026697286803692055, + "language_loss": 0.96143991, + "learning_rate": 0.0009247415542586906, + "loss": 0.97297156, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.97949219, + "step": 1048, + "time_per_iteration": 2.849416494369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149865, + "balance_loss_mlp": 1.05216146, + "epoch": 0.2018083878414775, + "flos": 574306899456.0, + "grad_norm": 0.021371049275305663, + "language_loss": 0.91504782, + "learning_rate": 0.0009245770971698735, + "loss": 0.92654645, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.97705078, + "step": 1049, + "time_per_iteration": 2.8751590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151512, + "balance_loss_mlp": 1.05376041, + "epoch": 0.20200076952674106, + "flos": 426794482176.0, + "grad_norm": 0.027360075371486055, + "language_loss": 0.97835737, + "learning_rate": 0.0009244124752456087, + "loss": 0.98987252, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.97753906, + "step": 1050, + "time_per_iteration": 2.4985499382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153257, + "balance_loss_mlp": 1.05531442, + "epoch": 0.20219315121200462, + "flos": 537684258816.0, + "grad_norm": 0.025856302906645603, + "language_loss": 0.95370412, + "learning_rate": 0.0009242476885498081, + "loss": 0.96523666, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.97949219, + "step": 1051, + "time_per_iteration": 2.7127723693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150827, + "balance_loss_mlp": 1.05297983, + "epoch": 0.20238553289726818, + "flos": 478834252800.0, + "grad_norm": 0.02631802181941096, + "language_loss": 0.90995431, + "learning_rate": 0.0009240827371464474, + "loss": 0.92146254, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.97851562, + "step": 1052, + "time_per_iteration": 2.527918577194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144335, + "balance_loss_mlp": 1.04667878, + "epoch": 0.20257791458253174, + "flos": 1153846049280.0, + "grad_norm": 0.025276400477213575, + "language_loss": 0.92167991, + "learning_rate": 0.0009239176210995666, + "loss": 0.93312329, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.9765625, + "step": 1053, + "time_per_iteration": 3.4556469917297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144677, + "balance_loss_mlp": 1.04682982, + "epoch": 0.2027702962677953, + "flos": 668148619776.0, + "grad_norm": 0.025342755763179396, + "language_loss": 1.04358864, + "learning_rate": 0.0009237523404732695, + "loss": 1.05503547, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.97851562, + "step": 1054, + "time_per_iteration": 2.894198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144665, + "balance_loss_mlp": 1.04676986, + "epoch": 0.20296267795305886, + "flos": 642452009472.0, + "grad_norm": 0.02468028394334187, + "language_loss": 0.94787639, + "learning_rate": 0.0009235868953317235, + "loss": 0.95932305, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.97900391, + "step": 1055, + "time_per_iteration": 2.812633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148717, + "balance_loss_mlp": 1.05082273, + "epoch": 0.20315505963832242, + "flos": 932129622528.0, + "grad_norm": 0.02533903757078053, + "language_loss": 0.93907225, + "learning_rate": 0.0009234212857391602, + "loss": 0.95055938, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.97900391, + "step": 1056, + "time_per_iteration": 3.2061142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147493, + "balance_loss_mlp": 1.0496459, + "epoch": 0.20334744132358598, + "flos": 563287543296.0, + "grad_norm": 0.019686870604104637, + "language_loss": 0.97330248, + "learning_rate": 0.000923255511759875, + "loss": 0.98477745, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.97851562, + "step": 1057, + "time_per_iteration": 2.7639002799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150039, + "balance_loss_mlp": 1.05219197, + "epoch": 0.20353982300884957, + "flos": 645428428800.0, + "grad_norm": 0.023252811049323967, + "language_loss": 0.95256209, + "learning_rate": 0.000923089573458227, + "loss": 0.96406245, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.97851562, + "step": 1058, + "time_per_iteration": 2.857612133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114962, + "balance_loss_mlp": 1.05177307, + "epoch": 0.20373220469411313, + "flos": 652705293312.0, + "grad_norm": 0.02395962669603635, + "language_loss": 0.93332446, + "learning_rate": 0.0009229234708986392, + "loss": 0.94482064, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.97851562, + "step": 1059, + "time_per_iteration": 2.877995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150688, + "balance_loss_mlp": 1.05436707, + "epoch": 0.2039245863793767, + "flos": 1440396973056.0, + "grad_norm": 0.013896761524226428, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82817578, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.96289062, + "step": 1060, + "time_per_iteration": 4.659267902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142187, + "balance_loss_mlp": 1.04434025, + "epoch": 0.20411696806464025, + "flos": 598127534592.0, + "grad_norm": 0.026599581611848343, + "language_loss": 0.93894625, + "learning_rate": 0.0009225907732636548, + "loss": 0.95036817, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.97851562, + "step": 1061, + "time_per_iteration": 2.7480902671813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115027, + "balance_loss_mlp": 1.05242312, + "epoch": 0.2043093497499038, + "flos": 574897053696.0, + "grad_norm": 0.026136319737411078, + "language_loss": 0.96460152, + "learning_rate": 0.0009224241783174227, + "loss": 0.97610414, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.97851562, + "step": 1062, + "time_per_iteration": 2.676877021789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146738, + "balance_loss_mlp": 1.04874802, + "epoch": 0.20450173143516737, + "flos": 631523977728.0, + "grad_norm": 0.02709710709634581, + "language_loss": 0.94472104, + "learning_rate": 0.0009222574193715802, + "loss": 0.95618844, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.97998047, + "step": 1063, + "time_per_iteration": 2.7604472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141026, + "balance_loss_mlp": 1.04298854, + "epoch": 0.20469411312043093, + "flos": 575146831872.0, + "grad_norm": 0.022769515120839894, + "language_loss": 0.95189404, + "learning_rate": 0.000922090496490869, + "loss": 0.96330428, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.98046875, + "step": 1064, + "time_per_iteration": 2.728154182434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141583, + "balance_loss_mlp": 1.04383183, + "epoch": 0.20488649480569449, + "flos": 638279818752.0, + "grad_norm": 0.022393105289594414, + "language_loss": 0.97629392, + "learning_rate": 0.0009219234097400937, + "loss": 0.9877097, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.97753906, + "step": 1065, + "time_per_iteration": 2.889946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.03989744, + "epoch": 0.20507887649095807, + "flos": 977437747200.0, + "grad_norm": 0.024872828726298618, + "language_loss": 0.9305777, + "learning_rate": 0.0009217561591841237, + "loss": 0.94195515, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.97851562, + "step": 1066, + "time_per_iteration": 3.296248435974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144611, + "balance_loss_mlp": 1.04681206, + "epoch": 0.20527125817622163, + "flos": 487155165696.0, + "grad_norm": 0.024567371957878288, + "language_loss": 0.90358436, + "learning_rate": 0.0009215887448878913, + "loss": 0.91503048, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.97802734, + "step": 1067, + "time_per_iteration": 2.5662190914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137303, + "balance_loss_mlp": 1.03945625, + "epoch": 0.2054636398614852, + "flos": 528210508800.0, + "grad_norm": 0.02249486638659544, + "language_loss": 0.94470721, + "learning_rate": 0.0009214211669163922, + "loss": 0.9560802, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.97851562, + "step": 1068, + "time_per_iteration": 2.6912589073181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139468, + "balance_loss_mlp": 1.04162145, + "epoch": 0.20565602154674875, + "flos": 559323471360.0, + "grad_norm": 0.022635174506508055, + "language_loss": 1.02501464, + "learning_rate": 0.0009212534253346862, + "loss": 1.03640926, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.97851562, + "step": 1069, + "time_per_iteration": 2.708683490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135123, + "balance_loss_mlp": 1.03746641, + "epoch": 0.2058484032320123, + "flos": 505221073920.0, + "grad_norm": 0.02479403914192968, + "language_loss": 0.95383358, + "learning_rate": 0.0009210855202078964, + "loss": 0.96518481, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.9765625, + "step": 1070, + "time_per_iteration": 2.6434948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132203, + "balance_loss_mlp": 1.03478527, + "epoch": 0.20604078491727587, + "flos": 434047151616.0, + "grad_norm": 0.024632817960327506, + "language_loss": 0.96572351, + "learning_rate": 0.0009209174516012091, + "loss": 0.97704554, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.97412109, + "step": 1071, + "time_per_iteration": 2.4891347885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148822, + "balance_loss_mlp": 1.05130851, + "epoch": 0.20623316660253943, + "flos": 609874031616.0, + "grad_norm": 0.024395492192686875, + "language_loss": 0.97482872, + "learning_rate": 0.0009207492195798747, + "loss": 0.98631692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.97509766, + "step": 1072, + "time_per_iteration": 2.758575201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152495, + "balance_loss_mlp": 1.05502975, + "epoch": 0.206425548287803, + "flos": 481393708032.0, + "grad_norm": 0.027205333287948934, + "language_loss": 0.9402262, + "learning_rate": 0.0009205808242092061, + "loss": 0.95175123, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.97460938, + "step": 1073, + "time_per_iteration": 2.6534366607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152896, + "balance_loss_mlp": 1.05562115, + "epoch": 0.20661792997306658, + "flos": 951122784768.0, + "grad_norm": 0.02943422736446298, + "language_loss": 0.93147469, + "learning_rate": 0.0009204122655545808, + "loss": 0.94300359, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.97265625, + "step": 1074, + "time_per_iteration": 3.317518949508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149199, + "balance_loss_mlp": 1.05201948, + "epoch": 0.20681031165833014, + "flos": 604616133120.0, + "grad_norm": 0.024855118115069977, + "language_loss": 0.88961834, + "learning_rate": 0.0009202435436814388, + "loss": 0.90111029, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.97167969, + "step": 1075, + "time_per_iteration": 2.6815345287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142912, + "balance_loss_mlp": 1.04563749, + "epoch": 0.2070026933435937, + "flos": 710265475584.0, + "grad_norm": 0.027130222852878607, + "language_loss": 0.99239773, + "learning_rate": 0.0009200746586552836, + "loss": 1.00382686, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.97265625, + "step": 1076, + "time_per_iteration": 2.9578917026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141976, + "balance_loss_mlp": 1.04451025, + "epoch": 0.20719507502885726, + "flos": 831254085120.0, + "grad_norm": 0.023090334700176834, + "language_loss": 0.92780054, + "learning_rate": 0.0009199056105416825, + "loss": 0.93922031, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.97460938, + "step": 1077, + "time_per_iteration": 3.0944156646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140475, + "balance_loss_mlp": 1.04324794, + "epoch": 0.20738745671412082, + "flos": 639499785216.0, + "grad_norm": 0.023914471883828003, + "language_loss": 0.96186948, + "learning_rate": 0.0009197363994062654, + "loss": 0.97327423, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.97216797, + "step": 1078, + "time_per_iteration": 2.8147799968719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142489, + "balance_loss_mlp": 1.04521394, + "epoch": 0.20757983839938438, + "flos": 686983328256.0, + "grad_norm": 0.02237329029547868, + "language_loss": 0.90686679, + "learning_rate": 0.0009195670253147262, + "loss": 0.91829169, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.97265625, + "step": 1079, + "time_per_iteration": 2.994058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141016, + "balance_loss_mlp": 1.04383624, + "epoch": 0.20777222008464794, + "flos": 520317293568.0, + "grad_norm": 0.026634413874044322, + "language_loss": 0.92195654, + "learning_rate": 0.0009193974883328216, + "loss": 0.93336666, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.97167969, + "step": 1080, + "time_per_iteration": 2.6506502628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140462, + "balance_loss_mlp": 1.04333031, + "epoch": 0.2079646017699115, + "flos": 512469740544.0, + "grad_norm": 0.025261028079588584, + "language_loss": 0.97185814, + "learning_rate": 0.0009192277885263718, + "loss": 0.98326278, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.97119141, + "step": 1081, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143678, + "balance_loss_mlp": 1.04640269, + "epoch": 0.20815698345517505, + "flos": 933467109888.0, + "grad_norm": 0.02363260569338726, + "language_loss": 0.9496327, + "learning_rate": 0.0009190579259612602, + "loss": 0.96106946, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.97265625, + "step": 1082, + "time_per_iteration": 3.2829811573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150642, + "balance_loss_mlp": 1.05336761, + "epoch": 0.20834936514043864, + "flos": 633553677312.0, + "grad_norm": 0.02436625118168465, + "language_loss": 0.97094011, + "learning_rate": 0.000918887900703433, + "loss": 0.98244655, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.97265625, + "step": 1083, + "time_per_iteration": 2.779474973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147642, + "balance_loss_mlp": 1.05079603, + "epoch": 0.2085417468257022, + "flos": 395243088384.0, + "grad_norm": 0.027448171988374206, + "language_loss": 0.98109657, + "learning_rate": 0.0009187177128188999, + "loss": 0.99257296, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.96826172, + "step": 1084, + "time_per_iteration": 2.487755298614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156746, + "balance_loss_mlp": 1.06118774, + "epoch": 0.20873412851096576, + "flos": 1405195138560.0, + "grad_norm": 0.014888537960634525, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78313285, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.95507812, + "step": 1085, + "time_per_iteration": 4.917901515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146704, + "balance_loss_mlp": 1.04981041, + "epoch": 0.20892651019622932, + "flos": 448761335808.0, + "grad_norm": 0.0275038267286557, + "language_loss": 0.93389261, + "learning_rate": 0.000918376849434071, + "loss": 0.94535965, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.96875, + "step": 1086, + "time_per_iteration": 2.5117850303649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153188, + "balance_loss_mlp": 1.05629456, + "epoch": 0.20911889188149288, + "flos": 494080194048.0, + "grad_norm": 0.034273062806107445, + "language_loss": 1.02428699, + "learning_rate": 0.0009182061740661098, + "loss": 1.03581882, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.96875, + "step": 1087, + "time_per_iteration": 2.5270984172821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154258, + "balance_loss_mlp": 1.05736482, + "epoch": 0.20931127356675644, + "flos": 842748802560.0, + "grad_norm": 0.02361505883443172, + "language_loss": 0.92997056, + "learning_rate": 0.0009180353363361127, + "loss": 0.94151306, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.96875, + "step": 1088, + "time_per_iteration": 3.1549112796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154015, + "balance_loss_mlp": 1.05688298, + "epoch": 0.20950365525202, + "flos": 758523823104.0, + "grad_norm": 0.028384526527587387, + "language_loss": 0.93851304, + "learning_rate": 0.0009178643363104044, + "loss": 0.95005322, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.97119141, + "step": 1089, + "time_per_iteration": 4.693684339523315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.05159688, + "epoch": 0.20969603693728356, + "flos": 473491760640.0, + "grad_norm": 0.03411348227976855, + "language_loss": 1.04663801, + "learning_rate": 0.0009176931740553735, + "loss": 1.05812478, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.97070312, + "step": 1090, + "time_per_iteration": 2.5203866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146066, + "balance_loss_mlp": 1.04917288, + "epoch": 0.20988841862254715, + "flos": 978627514368.0, + "grad_norm": 0.027482857176328385, + "language_loss": 0.92998403, + "learning_rate": 0.0009175218496374708, + "loss": 0.94144469, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.96875, + "step": 1091, + "time_per_iteration": 3.362614870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.05544364, + "epoch": 0.2100808003078107, + "flos": 1094818123776.0, + "grad_norm": 0.028049590852478556, + "language_loss": 0.96363866, + "learning_rate": 0.0009173503631232103, + "loss": 0.97516203, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.96875, + "step": 1092, + "time_per_iteration": 3.359970808029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150696, + "balance_loss_mlp": 1.05399334, + "epoch": 0.21027318199307427, + "flos": 1014559217664.0, + "grad_norm": 0.03210489869185377, + "language_loss": 0.94109344, + "learning_rate": 0.0009171787145791691, + "loss": 0.95260036, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.96679688, + "step": 1093, + "time_per_iteration": 3.2180042266845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150028, + "balance_loss_mlp": 1.05323017, + "epoch": 0.21046556367833782, + "flos": 522412121088.0, + "grad_norm": 0.02762257246471406, + "language_loss": 0.92679179, + "learning_rate": 0.000917006904071987, + "loss": 0.93829209, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.96777344, + "step": 1094, + "time_per_iteration": 2.5961859226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152841, + "balance_loss_mlp": 1.0559479, + "epoch": 0.21065794536360138, + "flos": 604839714816.0, + "grad_norm": 0.02570597393175465, + "language_loss": 0.97250223, + "learning_rate": 0.0009168349316683669, + "loss": 0.98403066, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.96875, + "step": 1095, + "time_per_iteration": 2.7164759635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153125, + "balance_loss_mlp": 1.05642295, + "epoch": 0.21085032704886494, + "flos": 604557735936.0, + "grad_norm": 0.022711755724658188, + "language_loss": 0.91088736, + "learning_rate": 0.0009166627974350741, + "loss": 0.92241859, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.96679688, + "step": 1096, + "time_per_iteration": 2.8912341594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05739498, + "epoch": 0.2110427087341285, + "flos": 638831041536.0, + "grad_norm": 0.027939519002465243, + "language_loss": 1.01164758, + "learning_rate": 0.0009164905014389373, + "loss": 1.02318668, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.96484375, + "step": 1097, + "time_per_iteration": 2.758725881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115008, + "balance_loss_mlp": 1.05356789, + "epoch": 0.21123509041939206, + "flos": 523929529344.0, + "grad_norm": 0.027217895626849283, + "language_loss": 0.96537346, + "learning_rate": 0.0009163180437468476, + "loss": 0.97687429, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.96484375, + "step": 1098, + "time_per_iteration": 2.6157684326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011531, + "balance_loss_mlp": 1.05658853, + "epoch": 0.21142747210465565, + "flos": 452193650688.0, + "grad_norm": 0.025540912808389868, + "language_loss": 0.94842321, + "learning_rate": 0.000916145424425759, + "loss": 0.9599542, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.96484375, + "step": 1099, + "time_per_iteration": 2.6368908882141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157233, + "balance_loss_mlp": 1.06081605, + "epoch": 0.2116198537899192, + "flos": 877625723904.0, + "grad_norm": 0.02885196772961066, + "language_loss": 1.02573156, + "learning_rate": 0.0009159726435426885, + "loss": 1.03730392, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.96386719, + "step": 1100, + "time_per_iteration": 3.0916907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011557, + "balance_loss_mlp": 1.05909276, + "epoch": 0.21181223547518277, + "flos": 524674134528.0, + "grad_norm": 0.025603473018395394, + "language_loss": 0.99936807, + "learning_rate": 0.0009157997011647154, + "loss": 1.01092505, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.96582031, + "step": 1101, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152722, + "balance_loss_mlp": 1.05630529, + "epoch": 0.21200461716044633, + "flos": 573425307648.0, + "grad_norm": 0.02306433427515447, + "language_loss": 0.93708789, + "learning_rate": 0.0009156265973589817, + "loss": 0.94861513, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.96386719, + "step": 1102, + "time_per_iteration": 2.786557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148247, + "balance_loss_mlp": 1.05187845, + "epoch": 0.2121969988457099, + "flos": 546174359040.0, + "grad_norm": 0.023119673851329285, + "language_loss": 0.9826746, + "learning_rate": 0.0009154533321926926, + "loss": 0.99415696, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.96337891, + "step": 1103, + "time_per_iteration": 2.6500911712646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150448, + "balance_loss_mlp": 1.05393636, + "epoch": 0.21238938053097345, + "flos": 845353920000.0, + "grad_norm": 0.02523726215492747, + "language_loss": 0.96587884, + "learning_rate": 0.0009152799057331156, + "loss": 0.97738338, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.96484375, + "step": 1104, + "time_per_iteration": 3.1080517768859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148697, + "balance_loss_mlp": 1.05213737, + "epoch": 0.212581762216237, + "flos": 447141869568.0, + "grad_norm": 0.026678256955328494, + "language_loss": 1.00256824, + "learning_rate": 0.0009151063180475805, + "loss": 1.01405525, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.96533203, + "step": 1105, + "time_per_iteration": 2.530207633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153737, + "balance_loss_mlp": 1.05703473, + "epoch": 0.21277414390150057, + "flos": 515385034752.0, + "grad_norm": 0.026680614248996183, + "language_loss": 0.9432478, + "learning_rate": 0.0009149325692034803, + "loss": 0.95478517, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.96679688, + "step": 1106, + "time_per_iteration": 2.576834201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159119, + "balance_loss_mlp": 1.06413269, + "epoch": 0.21296652558676413, + "flos": 1488512329728.0, + "grad_norm": 0.01358013302766655, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80362546, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.94921875, + "step": 1107, + "time_per_iteration": 4.821696996688843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156765, + "balance_loss_mlp": 1.06006265, + "epoch": 0.21315890727202771, + "flos": 847450748928.0, + "grad_norm": 0.031460519319247274, + "language_loss": 0.96369046, + "learning_rate": 0.0009145845883094678, + "loss": 0.97525811, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.96679688, + "step": 1108, + "time_per_iteration": 3.029548168182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159379, + "balance_loss_mlp": 1.06267655, + "epoch": 0.21335128895729127, + "flos": 630555790848.0, + "grad_norm": 0.028067626854192333, + "language_loss": 0.95182431, + "learning_rate": 0.000914410356394654, + "loss": 0.96341801, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.96679688, + "step": 1109, + "time_per_iteration": 2.737241268157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.06352139, + "epoch": 0.21354367064255483, + "flos": 712284441600.0, + "grad_norm": 0.023599510024272945, + "language_loss": 0.92540836, + "learning_rate": 0.0009142359635914709, + "loss": 0.93701446, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.97070312, + "step": 1110, + "time_per_iteration": 3.0267913341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.0645479, + "epoch": 0.2137360523278184, + "flos": 457210503168.0, + "grad_norm": 0.02473497568188501, + "language_loss": 0.9156003, + "learning_rate": 0.0009140614099676245, + "loss": 0.92721474, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.96875, + "step": 1111, + "time_per_iteration": 2.5756866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164266, + "balance_loss_mlp": 1.06727743, + "epoch": 0.21392843401308195, + "flos": 667265026560.0, + "grad_norm": 0.025344438139363285, + "language_loss": 0.90291333, + "learning_rate": 0.0009138866955908821, + "loss": 0.91455603, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.96972656, + "step": 1112, + "time_per_iteration": 2.9406254291534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_mlp": 1.06319368, + "epoch": 0.2141208156983455, + "flos": 750361363968.0, + "grad_norm": 0.02581510235299489, + "language_loss": 0.89949894, + "learning_rate": 0.0009137118205290738, + "loss": 0.91109931, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.96826172, + "step": 1113, + "time_per_iteration": 2.966989278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162558, + "balance_loss_mlp": 1.06547356, + "epoch": 0.21431319738360907, + "flos": 420010443264.0, + "grad_norm": 0.024953242249854055, + "language_loss": 1.00419319, + "learning_rate": 0.0009135367848500924, + "loss": 1.01581883, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.97070312, + "step": 1114, + "time_per_iteration": 2.4954934120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161216, + "balance_loss_mlp": 1.06456113, + "epoch": 0.21450557906887263, + "flos": 610238602752.0, + "grad_norm": 0.030213425802119154, + "language_loss": 0.9839642, + "learning_rate": 0.0009133615886218927, + "loss": 0.99557638, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.96630859, + "step": 1115, + "time_per_iteration": 2.71352219581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152764, + "balance_loss_mlp": 1.05625272, + "epoch": 0.21469796075413622, + "flos": 562974638592.0, + "grad_norm": 0.027635545182738433, + "language_loss": 0.99806535, + "learning_rate": 0.0009131862319124917, + "loss": 1.00959289, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.96484375, + "step": 1116, + "time_per_iteration": 2.630807876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153717, + "balance_loss_mlp": 1.05720496, + "epoch": 0.21489034243939978, + "flos": 595737266688.0, + "grad_norm": 0.024806539819872384, + "language_loss": 0.94489264, + "learning_rate": 0.0009130107147899691, + "loss": 0.95642984, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.96484375, + "step": 1117, + "time_per_iteration": 2.7123875617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154765, + "balance_loss_mlp": 1.05825305, + "epoch": 0.21508272412466334, + "flos": 442850156544.0, + "grad_norm": 0.024517194331867692, + "language_loss": 0.93784142, + "learning_rate": 0.0009128350373224665, + "loss": 0.9493891, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.96484375, + "step": 1118, + "time_per_iteration": 2.5384151935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169045, + "balance_loss_mlp": 1.07348633, + "epoch": 0.2152751058099269, + "flos": 1499232242688.0, + "grad_norm": 0.019396990855708212, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82625473, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.95507812, + "step": 1119, + "time_per_iteration": 4.644891262054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156688, + "balance_loss_mlp": 1.05989027, + "epoch": 0.21546748749519046, + "flos": 494991985152.0, + "grad_norm": 0.030440112014221473, + "language_loss": 0.9407053, + "learning_rate": 0.0009124832016254005, + "loss": 0.95227218, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.96777344, + "step": 1120, + "time_per_iteration": 2.588834285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163526, + "balance_loss_mlp": 1.06691861, + "epoch": 0.21565986918045402, + "flos": 635694167040.0, + "grad_norm": 0.030206495794058562, + "language_loss": 0.96966755, + "learning_rate": 0.0009123070435324316, + "loss": 0.98130286, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.96582031, + "step": 1121, + "time_per_iteration": 2.786072015762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170601, + "balance_loss_mlp": 1.07542419, + "epoch": 0.21585225086571758, + "flos": 1586798939136.0, + "grad_norm": 0.013013152417503263, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.79046386, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.95117188, + "step": 1122, + "time_per_iteration": 4.946362733840942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.0685885, + "epoch": 0.21604463255098114, + "flos": 685322202624.0, + "grad_norm": 0.027822137906457534, + "language_loss": 0.94040322, + "learning_rate": 0.0009119542471995752, + "loss": 0.95205426, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.96484375, + "step": 1123, + "time_per_iteration": 2.8613343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162625, + "balance_loss_mlp": 1.0660181, + "epoch": 0.2162370142362447, + "flos": 782307528192.0, + "grad_norm": 0.029561600436113455, + "language_loss": 0.90709835, + "learning_rate": 0.0009117776090966554, + "loss": 0.9187246, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.96582031, + "step": 1124, + "time_per_iteration": 2.9557414054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170148, + "balance_loss_mlp": 1.07344532, + "epoch": 0.21642939592150828, + "flos": 1003761441792.0, + "grad_norm": 0.032145354222626064, + "language_loss": 0.98171163, + "learning_rate": 0.0009116008111274899, + "loss": 0.99341309, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.96679688, + "step": 1125, + "time_per_iteration": 3.253286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175423, + "balance_loss_mlp": 1.0798645, + "epoch": 0.21662177760677184, + "flos": 1485762220032.0, + "grad_norm": 0.016361962696647775, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80282342, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.95507812, + "step": 1126, + "time_per_iteration": 4.832986831665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168208, + "balance_loss_mlp": 1.07150567, + "epoch": 0.2168141592920354, + "flos": 888859929600.0, + "grad_norm": 0.027606671666099106, + "language_loss": 0.94760346, + "learning_rate": 0.0009112467358650396, + "loss": 0.9592855, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.96679688, + "step": 1127, + "time_per_iteration": 3.1373836994171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164208, + "balance_loss_mlp": 1.06741047, + "epoch": 0.21700654097729896, + "flos": 547084148736.0, + "grad_norm": 0.025712027239217825, + "language_loss": 0.95734817, + "learning_rate": 0.0009110694587092192, + "loss": 0.96899021, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.96777344, + "step": 1128, + "time_per_iteration": 2.752166986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162506, + "balance_loss_mlp": 1.06580317, + "epoch": 0.21719892266256252, + "flos": 510535368192.0, + "grad_norm": 0.02739880514200537, + "language_loss": 0.95310479, + "learning_rate": 0.0009108920219620815, + "loss": 0.96472991, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.96679688, + "step": 1129, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164125, + "balance_loss_mlp": 1.06742299, + "epoch": 0.21739130434782608, + "flos": 544461566976.0, + "grad_norm": 0.023064586598143682, + "language_loss": 0.97784394, + "learning_rate": 0.0009107144256925133, + "loss": 0.9894852, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.96679688, + "step": 1130, + "time_per_iteration": 2.73559308052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165938, + "balance_loss_mlp": 1.06923568, + "epoch": 0.21758368603308964, + "flos": 617982096384.0, + "grad_norm": 0.027176951765382908, + "language_loss": 0.9233678, + "learning_rate": 0.0009105366699694638, + "loss": 0.93502718, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.96679688, + "step": 1131, + "time_per_iteration": 2.7653839588165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166281, + "balance_loss_mlp": 1.06957853, + "epoch": 0.2177760677183532, + "flos": 636334712832.0, + "grad_norm": 0.021107298895209785, + "language_loss": 0.91459304, + "learning_rate": 0.0009103587548619439, + "loss": 0.92625588, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.96679688, + "step": 1132, + "time_per_iteration": 2.8519365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160184, + "balance_loss_mlp": 1.06367195, + "epoch": 0.2179684494036168, + "flos": 533596661760.0, + "grad_norm": 0.022551614427290693, + "language_loss": 0.95995569, + "learning_rate": 0.0009101806804390261, + "loss": 0.97155756, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.96484375, + "step": 1133, + "time_per_iteration": 2.8218026161193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163206, + "balance_loss_mlp": 1.06664658, + "epoch": 0.21816083108888035, + "flos": 476181471744.0, + "grad_norm": 0.0250418684782295, + "language_loss": 1.00355339, + "learning_rate": 0.0009100024467698453, + "loss": 1.01518536, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.96533203, + "step": 1134, + "time_per_iteration": 2.5639142990112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167151, + "balance_loss_mlp": 1.07059181, + "epoch": 0.2183532127741439, + "flos": 578546219520.0, + "grad_norm": 0.029194142239697657, + "language_loss": 0.95151818, + "learning_rate": 0.0009098240539235981, + "loss": 0.96318972, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.96533203, + "step": 1135, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162362, + "balance_loss_mlp": 1.06565976, + "epoch": 0.21854559445940747, + "flos": 595279369728.0, + "grad_norm": 0.022714398939090653, + "language_loss": 0.96190184, + "learning_rate": 0.0009096455019695423, + "loss": 0.9735254, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.96679688, + "step": 1136, + "time_per_iteration": 2.829479217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166866, + "balance_loss_mlp": 1.06997275, + "epoch": 0.21873797614467103, + "flos": 409549040640.0, + "grad_norm": 0.027737994351600712, + "language_loss": 1.01424551, + "learning_rate": 0.000909466790976998, + "loss": 1.02591419, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.96875, + "step": 1137, + "time_per_iteration": 2.4491164684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165251, + "balance_loss_mlp": 1.06869149, + "epoch": 0.21893035782993459, + "flos": 895654702080.0, + "grad_norm": 0.022710058353260835, + "language_loss": 0.90594929, + "learning_rate": 0.0009092879210153473, + "loss": 0.91760182, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.96533203, + "step": 1138, + "time_per_iteration": 3.155076503753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168627, + "balance_loss_mlp": 1.07192433, + "epoch": 0.21912273951519814, + "flos": 468568233984.0, + "grad_norm": 0.024281064631586205, + "language_loss": 0.97427768, + "learning_rate": 0.0009091088921540333, + "loss": 0.98596388, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.96679688, + "step": 1139, + "time_per_iteration": 2.5309600830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172859, + "balance_loss_mlp": 1.07711029, + "epoch": 0.2193151212004617, + "flos": 1535177407488.0, + "grad_norm": 0.009496329971255709, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76681536, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.95703125, + "step": 1140, + "time_per_iteration": 4.911335229873657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172401, + "balance_loss_mlp": 1.07569873, + "epoch": 0.2195075028857253, + "flos": 592274752512.0, + "grad_norm": 0.033335232647672346, + "language_loss": 0.95078719, + "learning_rate": 0.0009087503580104985, + "loss": 0.96251118, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.96679688, + "step": 1141, + "time_per_iteration": 2.7083888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169701, + "balance_loss_mlp": 1.07295096, + "epoch": 0.21969988457098885, + "flos": 637517749248.0, + "grad_norm": 0.02859165000671714, + "language_loss": 0.90439236, + "learning_rate": 0.0009085708528674728, + "loss": 0.91608942, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.96728516, + "step": 1142, + "time_per_iteration": 2.786891222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162201, + "balance_loss_mlp": 1.06549823, + "epoch": 0.2198922662562524, + "flos": 913859598336.0, + "grad_norm": 0.0328462843269242, + "language_loss": 0.98848528, + "learning_rate": 0.0009083911891031745, + "loss": 1.00010729, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.96679688, + "step": 1143, + "time_per_iteration": 3.1019930839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116483, + "balance_loss_mlp": 1.06793654, + "epoch": 0.22008464794151597, + "flos": 824494241280.0, + "grad_norm": 0.023913565571636344, + "language_loss": 1.01496291, + "learning_rate": 0.0009082113667873553, + "loss": 1.02661121, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.96875, + "step": 1144, + "time_per_iteration": 3.104292869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170855, + "balance_loss_mlp": 1.07405746, + "epoch": 0.22027702962677953, + "flos": 460618622976.0, + "grad_norm": 0.029355186834356364, + "language_loss": 1.00543249, + "learning_rate": 0.0009080313859898283, + "loss": 1.0171411, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.96777344, + "step": 1145, + "time_per_iteration": 2.552457332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170139, + "balance_loss_mlp": 1.07343698, + "epoch": 0.2204694113120431, + "flos": 532287372288.0, + "grad_norm": 0.025362278251747628, + "language_loss": 1.01871562, + "learning_rate": 0.0009078512467804684, + "loss": 1.03041708, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.96679688, + "step": 1146, + "time_per_iteration": 2.6138763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170493, + "balance_loss_mlp": 1.07379043, + "epoch": 0.22066179299730665, + "flos": 523686481920.0, + "grad_norm": 0.02553067563602684, + "language_loss": 1.00136042, + "learning_rate": 0.0009076709492292119, + "loss": 1.01306534, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.96679688, + "step": 1147, + "time_per_iteration": 2.6107985973358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163104, + "balance_loss_mlp": 1.0664016, + "epoch": 0.2208541746825702, + "flos": 547505115648.0, + "grad_norm": 0.02505349531569444, + "language_loss": 0.99364072, + "learning_rate": 0.0009074904934060562, + "loss": 1.00527167, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.96679688, + "step": 1148, + "time_per_iteration": 2.680250644683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166008, + "balance_loss_mlp": 1.06873322, + "epoch": 0.22104655636783377, + "flos": 710059358208.0, + "grad_norm": 0.023468083856487864, + "language_loss": 0.93112767, + "learning_rate": 0.0009073098793810607, + "loss": 0.94278765, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.97265625, + "step": 1149, + "time_per_iteration": 2.9064676761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165673, + "balance_loss_mlp": 1.06882739, + "epoch": 0.22123893805309736, + "flos": 585964073472.0, + "grad_norm": 0.028202445852463846, + "language_loss": 0.98436809, + "learning_rate": 0.000907129107224346, + "loss": 0.99602491, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.96826172, + "step": 1150, + "time_per_iteration": 2.670436382293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165906, + "balance_loss_mlp": 1.06901312, + "epoch": 0.22143131973836092, + "flos": 493250995200.0, + "grad_norm": 0.02267098136900654, + "language_loss": 0.95673937, + "learning_rate": 0.0009069481770060939, + "loss": 0.96839839, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.96875, + "step": 1151, + "time_per_iteration": 2.650136947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167632, + "balance_loss_mlp": 1.07092977, + "epoch": 0.22162370142362448, + "flos": 1081467623424.0, + "grad_norm": 0.023887201965423828, + "language_loss": 0.92357147, + "learning_rate": 0.000906767088796548, + "loss": 0.93524778, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.96679688, + "step": 1152, + "time_per_iteration": 3.4331767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174048, + "balance_loss_mlp": 1.07734585, + "epoch": 0.22181608310888803, + "flos": 493511506944.0, + "grad_norm": 0.021211000774135545, + "language_loss": 0.94297695, + "learning_rate": 0.0009065858426660127, + "loss": 0.9547174, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.96679688, + "step": 1153, + "time_per_iteration": 2.6492207050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171336, + "balance_loss_mlp": 1.07458591, + "epoch": 0.2220084647941516, + "flos": 725324765184.0, + "grad_norm": 0.02806046891368227, + "language_loss": 0.95655924, + "learning_rate": 0.0009064044386848543, + "loss": 0.96827257, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.96728516, + "step": 1154, + "time_per_iteration": 2.9135258197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116775, + "balance_loss_mlp": 1.07090425, + "epoch": 0.22220084647941515, + "flos": 490244376576.0, + "grad_norm": 0.029776005734579798, + "language_loss": 1.00600004, + "learning_rate": 0.0009062228769234997, + "loss": 1.01767755, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.96826172, + "step": 1155, + "time_per_iteration": 2.597781181335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171214, + "balance_loss_mlp": 1.07451141, + "epoch": 0.2223932281646787, + "flos": 537295492608.0, + "grad_norm": 0.030445586519746, + "language_loss": 0.93354964, + "learning_rate": 0.0009060411574524376, + "loss": 0.94526184, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.96679688, + "step": 1156, + "time_per_iteration": 2.7325634956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168314, + "balance_loss_mlp": 1.07151604, + "epoch": 0.22258560984994227, + "flos": 932967553536.0, + "grad_norm": 0.0275078677514356, + "language_loss": 0.98614538, + "learning_rate": 0.0009058592803422178, + "loss": 0.99782854, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.96777344, + "step": 1157, + "time_per_iteration": 3.156981945037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169861, + "balance_loss_mlp": 1.0739212, + "epoch": 0.22277799153520586, + "flos": 1202395286016.0, + "grad_norm": 0.00950920896526599, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79880148, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.95898438, + "step": 1158, + "time_per_iteration": 4.7935662269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.07421494, + "epoch": 0.22297037322046942, + "flos": 502316513280.0, + "grad_norm": 0.05502374006765337, + "language_loss": 0.97024429, + "learning_rate": 0.00090549505348681, + "loss": 0.98195159, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.96484375, + "step": 1159, + "time_per_iteration": 2.579418659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167768, + "balance_loss_mlp": 1.07135153, + "epoch": 0.22316275490573298, + "flos": 754112587776.0, + "grad_norm": 0.025312842068973822, + "language_loss": 0.9244132, + "learning_rate": 0.0009053127038830275, + "loss": 0.93609083, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.96386719, + "step": 1160, + "time_per_iteration": 2.970240592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169788, + "balance_loss_mlp": 1.07346714, + "epoch": 0.22335513659099654, + "flos": 515804000256.0, + "grad_norm": 0.02702757021011719, + "language_loss": 0.97474223, + "learning_rate": 0.000905130196922898, + "loss": 0.98644012, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.96289062, + "step": 1161, + "time_per_iteration": 2.558567762374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175493, + "balance_loss_mlp": 1.07917213, + "epoch": 0.2235475182762601, + "flos": 485507501568.0, + "grad_norm": 0.024760780359754056, + "language_loss": 0.947945, + "learning_rate": 0.0009049475326772769, + "loss": 0.95969993, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.96289062, + "step": 1162, + "time_per_iteration": 2.5948867797851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168008, + "balance_loss_mlp": 1.0716871, + "epoch": 0.22373989996152366, + "flos": 471067290624.0, + "grad_norm": 0.0243609738761747, + "language_loss": 0.92091036, + "learning_rate": 0.0009047647112170811, + "loss": 0.93259048, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.96289062, + "step": 1163, + "time_per_iteration": 2.7958250045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165002, + "balance_loss_mlp": 1.06868088, + "epoch": 0.22393228164678722, + "flos": 1273017807360.0, + "grad_norm": 0.0269563070164892, + "language_loss": 0.98098505, + "learning_rate": 0.0009045817326132876, + "loss": 0.99263507, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.96289062, + "step": 1164, + "time_per_iteration": 3.64853835105896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165462, + "balance_loss_mlp": 1.06914091, + "epoch": 0.22412466333205078, + "flos": 597467523072.0, + "grad_norm": 0.02771003139242203, + "language_loss": 0.94602239, + "learning_rate": 0.0009043985969369357, + "loss": 0.95767695, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.96289062, + "step": 1165, + "time_per_iteration": 2.8231425285339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175209, + "balance_loss_mlp": 1.07860184, + "epoch": 0.22431704501731436, + "flos": 609630984192.0, + "grad_norm": 0.02516811505749033, + "language_loss": 0.93514198, + "learning_rate": 0.0009042153042591245, + "loss": 0.94689411, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.96582031, + "step": 1166, + "time_per_iteration": 2.755671501159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174563, + "balance_loss_mlp": 1.07819414, + "epoch": 0.22450942670257792, + "flos": 908106872832.0, + "grad_norm": 0.024247493396408124, + "language_loss": 0.93277276, + "learning_rate": 0.0009040318546510146, + "loss": 0.94451833, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.96337891, + "step": 1167, + "time_per_iteration": 3.126707077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174214, + "balance_loss_mlp": 1.07770181, + "epoch": 0.22470180838784148, + "flos": 566380756992.0, + "grad_norm": 0.02335770706345326, + "language_loss": 0.94522464, + "learning_rate": 0.0009038482481838275, + "loss": 0.95696682, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.96484375, + "step": 1168, + "time_per_iteration": 2.6482362747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171371, + "balance_loss_mlp": 1.07485878, + "epoch": 0.22489419007310504, + "flos": 835917100032.0, + "grad_norm": 0.021740410096357694, + "language_loss": 0.9467479, + "learning_rate": 0.0009036644849288455, + "loss": 0.95846164, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.96484375, + "step": 1169, + "time_per_iteration": 3.0959203243255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168691, + "balance_loss_mlp": 1.07217908, + "epoch": 0.2250865717583686, + "flos": 582138989568.0, + "grad_norm": 0.028400846177611044, + "language_loss": 0.95971251, + "learning_rate": 0.0009034805649574118, + "loss": 0.97139943, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.96484375, + "step": 1170, + "time_per_iteration": 2.65209698677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171761, + "balance_loss_mlp": 1.07515407, + "epoch": 0.22527895344363216, + "flos": 601670639616.0, + "grad_norm": 0.021879369323455276, + "language_loss": 0.92857611, + "learning_rate": 0.0009032964883409308, + "loss": 0.94029367, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.96582031, + "step": 1171, + "time_per_iteration": 2.8586626052856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175461, + "balance_loss_mlp": 1.07990265, + "epoch": 0.22547133512889572, + "flos": 1443731959296.0, + "grad_norm": 0.011387534292379292, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74225998, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.95507812, + "step": 1172, + "time_per_iteration": 4.9882895946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171774, + "balance_loss_mlp": 1.07526255, + "epoch": 0.22566371681415928, + "flos": 491585866752.0, + "grad_norm": 0.025801800464723818, + "language_loss": 0.97062689, + "learning_rate": 0.0009029278654587462, + "loss": 0.98234463, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.96484375, + "step": 1173, + "time_per_iteration": 2.595419406890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171429, + "balance_loss_mlp": 1.07491696, + "epoch": 0.22585609849942284, + "flos": 605751505920.0, + "grad_norm": 0.02576863859493135, + "language_loss": 0.92400688, + "learning_rate": 0.0009027433193361548, + "loss": 0.93572116, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.96484375, + "step": 1174, + "time_per_iteration": 2.738267183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117476, + "balance_loss_mlp": 1.07824779, + "epoch": 0.22604848018468643, + "flos": 636727481856.0, + "grad_norm": 0.028952390928102957, + "language_loss": 0.97668821, + "learning_rate": 0.00090255861685474, + "loss": 0.98843575, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.96484375, + "step": 1175, + "time_per_iteration": 2.7286014556884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117152, + "balance_loss_mlp": 1.07481766, + "epoch": 0.22624086186995, + "flos": 480844486656.0, + "grad_norm": 0.027877026454804697, + "language_loss": 1.02366519, + "learning_rate": 0.0009023737580862095, + "loss": 1.03538048, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.96679688, + "step": 1176, + "time_per_iteration": 2.553281307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170774, + "balance_loss_mlp": 1.07388091, + "epoch": 0.22643324355521355, + "flos": 496806835200.0, + "grad_norm": 0.02249634447584531, + "language_loss": 0.90840948, + "learning_rate": 0.0009021887431023321, + "loss": 0.92011726, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.96875, + "step": 1177, + "time_per_iteration": 2.5862364768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172189, + "balance_loss_mlp": 1.07539093, + "epoch": 0.2266256252404771, + "flos": 562683927552.0, + "grad_norm": 0.02041789434880362, + "language_loss": 0.95725513, + "learning_rate": 0.0009020035719749369, + "loss": 0.96897697, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.96777344, + "step": 1178, + "time_per_iteration": 2.7553560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176357, + "balance_loss_mlp": 1.0796541, + "epoch": 0.22681800692574067, + "flos": 581032541184.0, + "grad_norm": 0.026733278329428435, + "language_loss": 0.89533567, + "learning_rate": 0.0009018182447759136, + "loss": 0.90709925, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.96679688, + "step": 1179, + "time_per_iteration": 3.012024402618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175508, + "balance_loss_mlp": 1.07904434, + "epoch": 0.22701038861100423, + "flos": 741465033216.0, + "grad_norm": 0.025064804828048133, + "language_loss": 0.90941453, + "learning_rate": 0.0009016327615772126, + "loss": 0.92116958, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.96435547, + "step": 1180, + "time_per_iteration": 2.969684600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172378, + "balance_loss_mlp": 1.07577109, + "epoch": 0.2272027702962678, + "flos": 578305173504.0, + "grad_norm": 0.036813558231106436, + "language_loss": 1.00164366, + "learning_rate": 0.0009014471224508451, + "loss": 1.01336741, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.96582031, + "step": 1181, + "time_per_iteration": 2.664487361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173976, + "balance_loss_mlp": 1.0774641, + "epoch": 0.22739515198153135, + "flos": 545290765824.0, + "grad_norm": 0.028585613124224512, + "language_loss": 0.95647848, + "learning_rate": 0.0009012613274688823, + "loss": 0.96821827, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.96484375, + "step": 1182, + "time_per_iteration": 2.647608518600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177492, + "balance_loss_mlp": 1.08078945, + "epoch": 0.22758753366679493, + "flos": 441091702272.0, + "grad_norm": 0.02755397132508441, + "language_loss": 1.00651419, + "learning_rate": 0.0009010753767034565, + "loss": 1.01828909, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.96679688, + "step": 1183, + "time_per_iteration": 2.528580904006958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176952, + "balance_loss_mlp": 1.08053601, + "epoch": 0.2277799153520585, + "flos": 730823709696.0, + "grad_norm": 0.024484618665474616, + "language_loss": 0.90051508, + "learning_rate": 0.0009008892702267599, + "loss": 0.91228461, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.96386719, + "step": 1184, + "time_per_iteration": 2.990344285964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_mlp": 1.08100891, + "epoch": 0.22797229703732205, + "flos": 527913067008.0, + "grad_norm": 0.030622621699729128, + "language_loss": 1.01022232, + "learning_rate": 0.0009007030081110457, + "loss": 1.02199566, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.96289062, + "step": 1185, + "time_per_iteration": 2.5795140266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172726, + "balance_loss_mlp": 1.07592821, + "epoch": 0.2281646787225856, + "flos": 536520688128.0, + "grad_norm": 0.026616575931436976, + "language_loss": 0.93079567, + "learning_rate": 0.000900516590428627, + "loss": 0.942523, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.96777344, + "step": 1186, + "time_per_iteration": 2.6647558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.07628405, + "epoch": 0.22835706040784917, + "flos": 542477529600.0, + "grad_norm": 0.02522496809839962, + "language_loss": 0.99033505, + "learning_rate": 0.0009003300172518778, + "loss": 1.00206637, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.96826172, + "step": 1187, + "time_per_iteration": 2.7046303749084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177624, + "balance_loss_mlp": 1.08073056, + "epoch": 0.22854944209311273, + "flos": 792004859904.0, + "grad_norm": 0.026332453075710083, + "language_loss": 0.94325852, + "learning_rate": 0.0009001432886532321, + "loss": 0.95503473, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.96875, + "step": 1188, + "time_per_iteration": 2.9583094120025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179036, + "balance_loss_mlp": 1.08233392, + "epoch": 0.2287418237783763, + "flos": 470215898112.0, + "grad_norm": 0.025775869396212594, + "language_loss": 0.97465944, + "learning_rate": 0.0008999564047051843, + "loss": 0.98644984, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.96679688, + "step": 1189, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178388, + "balance_loss_mlp": 1.08154237, + "epoch": 0.22893420546363985, + "flos": 469004663808.0, + "grad_norm": 0.023763579929190374, + "language_loss": 0.94691694, + "learning_rate": 0.0008997693654802894, + "loss": 0.95870078, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.96826172, + "step": 1190, + "time_per_iteration": 2.6276731491088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178257, + "balance_loss_mlp": 1.08145857, + "epoch": 0.22912658714890344, + "flos": 627401452032.0, + "grad_norm": 0.023724149848154047, + "language_loss": 0.95182133, + "learning_rate": 0.0008995821710511625, + "loss": 0.96360391, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.96777344, + "step": 1191, + "time_per_iteration": 2.756840705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117993, + "balance_loss_mlp": 1.08308399, + "epoch": 0.229318968834167, + "flos": 504020573184.0, + "grad_norm": 0.024708694220473774, + "language_loss": 0.93247074, + "learning_rate": 0.0008993948214904786, + "loss": 0.94427001, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.96826172, + "step": 1192, + "time_per_iteration": 2.577340602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190514, + "balance_loss_mlp": 1.09533691, + "epoch": 0.22951135051943056, + "flos": 1377713877504.0, + "grad_norm": 0.021264094300491608, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79612726, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.95117188, + "step": 1193, + "time_per_iteration": 4.850237607955933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179442, + "balance_loss_mlp": 1.08316851, + "epoch": 0.22970373220469412, + "flos": 645549952512.0, + "grad_norm": 0.02667568465905087, + "language_loss": 0.92540175, + "learning_rate": 0.0008990196572654427, + "loss": 0.93719625, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.96240234, + "step": 1194, + "time_per_iteration": 2.8638381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180858, + "balance_loss_mlp": 1.08453715, + "epoch": 0.22989611388995768, + "flos": 501272464896.0, + "grad_norm": 0.02416134539694475, + "language_loss": 0.95937514, + "learning_rate": 0.0008988318427467426, + "loss": 0.97118378, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.96289062, + "step": 1195, + "time_per_iteration": 2.7063868045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182741, + "balance_loss_mlp": 1.08589542, + "epoch": 0.23008849557522124, + "flos": 1098333030912.0, + "grad_norm": 0.02922856270819412, + "language_loss": 0.9667449, + "learning_rate": 0.0008986438733877887, + "loss": 0.97857237, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.96826172, + "step": 1196, + "time_per_iteration": 3.4508113861083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.08043683, + "epoch": 0.2302808772604848, + "flos": 684992560128.0, + "grad_norm": 0.022228440588834414, + "language_loss": 0.91545051, + "learning_rate": 0.0008984557492615576, + "loss": 0.92721808, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.96289062, + "step": 1197, + "time_per_iteration": 2.93611741065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08269298, + "epoch": 0.23047325894574835, + "flos": 529960230912.0, + "grad_norm": 0.026499525382426087, + "language_loss": 0.99148774, + "learning_rate": 0.0008982674704410854, + "loss": 1.0032779, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.96289062, + "step": 1198, + "time_per_iteration": 2.7032008171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180823, + "balance_loss_mlp": 1.08450174, + "epoch": 0.23066564063101191, + "flos": 684126431232.0, + "grad_norm": 0.025326379221325218, + "language_loss": 0.86113322, + "learning_rate": 0.0008980790369994682, + "loss": 0.87294143, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.96289062, + "step": 1199, + "time_per_iteration": 2.9629056453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173246, + "balance_loss_mlp": 1.07673466, + "epoch": 0.2308580223162755, + "flos": 559631646720.0, + "grad_norm": 0.02469990042405053, + "language_loss": 0.95889735, + "learning_rate": 0.000897890449009863, + "loss": 0.97062981, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.96484375, + "step": 1200, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178191, + "balance_loss_mlp": 1.08167911, + "epoch": 0.23105040400153906, + "flos": 556729087488.0, + "grad_norm": 0.021551459012756572, + "language_loss": 0.97633696, + "learning_rate": 0.0008977017065454853, + "loss": 0.98811877, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.96484375, + "step": 1201, + "time_per_iteration": 2.6586263179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176954, + "balance_loss_mlp": 1.08048964, + "epoch": 0.23124278568680262, + "flos": 706049624064.0, + "grad_norm": 0.025666519973580538, + "language_loss": 0.89963996, + "learning_rate": 0.0008975128096796121, + "loss": 0.9114095, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.96435547, + "step": 1202, + "time_per_iteration": 2.8599958419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175929, + "balance_loss_mlp": 1.07989419, + "epoch": 0.23143516737206618, + "flos": 613968359424.0, + "grad_norm": 0.02791489713026627, + "language_loss": 0.96485001, + "learning_rate": 0.0008973237584855794, + "loss": 0.97660929, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.95996094, + "step": 1203, + "time_per_iteration": 2.8814125061035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117493, + "balance_loss_mlp": 1.07903779, + "epoch": 0.23162754905732974, + "flos": 390095980032.0, + "grad_norm": 0.02381480195735972, + "language_loss": 0.91340852, + "learning_rate": 0.0008971345530367832, + "loss": 0.92515785, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.95849609, + "step": 1204, + "time_per_iteration": 2.513951301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176133, + "balance_loss_mlp": 1.08024144, + "epoch": 0.2318199307425933, + "flos": 668969086464.0, + "grad_norm": 0.024943516104182908, + "language_loss": 0.94778013, + "learning_rate": 0.0008969451934066799, + "loss": 0.95954144, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.95849609, + "step": 1205, + "time_per_iteration": 2.80454421043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173068, + "balance_loss_mlp": 1.07712853, + "epoch": 0.23201231242785686, + "flos": 667627596288.0, + "grad_norm": 0.029617322009159303, + "language_loss": 0.92493355, + "learning_rate": 0.0008967556796687854, + "loss": 0.93666422, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.95898438, + "step": 1206, + "time_per_iteration": 2.89932918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173146, + "balance_loss_mlp": 1.07720602, + "epoch": 0.23220469411312042, + "flos": 750094121472.0, + "grad_norm": 0.024264467100448908, + "language_loss": 0.94343531, + "learning_rate": 0.0008965660118966752, + "loss": 0.95516682, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.95898438, + "step": 1207, + "time_per_iteration": 2.9768385887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08307481, + "epoch": 0.232397075798384, + "flos": 668261411328.0, + "grad_norm": 0.02512248807118796, + "language_loss": 0.97498, + "learning_rate": 0.0008963761901639851, + "loss": 0.98677015, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.95898438, + "step": 1208, + "time_per_iteration": 2.8175342082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177539, + "balance_loss_mlp": 1.081599, + "epoch": 0.23258945748364757, + "flos": 611345777664.0, + "grad_norm": 0.025244332610569246, + "language_loss": 0.93465042, + "learning_rate": 0.0008961862145444103, + "loss": 0.9464258, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.95898438, + "step": 1209, + "time_per_iteration": 2.707583427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117491, + "balance_loss_mlp": 1.07901847, + "epoch": 0.23278183916891113, + "flos": 490672074240.0, + "grad_norm": 0.025133767455437463, + "language_loss": 0.96175104, + "learning_rate": 0.0008959960851117059, + "loss": 0.97350019, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.95849609, + "step": 1210, + "time_per_iteration": 2.5783777236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174943, + "balance_loss_mlp": 1.07895589, + "epoch": 0.23297422085417469, + "flos": 512673856512.0, + "grad_norm": 0.027877077505007057, + "language_loss": 0.94183683, + "learning_rate": 0.0008958058019396868, + "loss": 0.95358628, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.95947266, + "step": 1211, + "time_per_iteration": 2.7695388793945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118178, + "balance_loss_mlp": 1.08560216, + "epoch": 0.23316660253943824, + "flos": 547531312128.0, + "grad_norm": 0.0259067341075638, + "language_loss": 0.95459378, + "learning_rate": 0.0008956153651022274, + "loss": 0.96641153, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.96142578, + "step": 1212, + "time_per_iteration": 2.7088377475738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.08181643, + "epoch": 0.2333589842247018, + "flos": 511288705536.0, + "grad_norm": 0.023917692799316066, + "language_loss": 0.93208623, + "learning_rate": 0.0008954247746732618, + "loss": 0.94386959, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.96484375, + "step": 1213, + "time_per_iteration": 2.6319668292999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172909, + "balance_loss_mlp": 1.0766834, + "epoch": 0.23355136590996536, + "flos": 664406128128.0, + "grad_norm": 0.02356648487739955, + "language_loss": 0.98858505, + "learning_rate": 0.0008952340307267837, + "loss": 1.00031424, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.96191406, + "step": 1214, + "time_per_iteration": 2.891026735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172334, + "balance_loss_mlp": 1.07629859, + "epoch": 0.23374374759522892, + "flos": 509465123328.0, + "grad_norm": 0.027978905734491046, + "language_loss": 0.94424212, + "learning_rate": 0.0008950431333368468, + "loss": 0.95596552, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.95996094, + "step": 1215, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173288, + "balance_loss_mlp": 1.07730114, + "epoch": 0.2339361292804925, + "flos": 1296428209152.0, + "grad_norm": 0.026145796218117214, + "language_loss": 0.94705772, + "learning_rate": 0.0008948520825775634, + "loss": 0.95879066, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.95947266, + "step": 1216, + "time_per_iteration": 3.6343605518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174216, + "balance_loss_mlp": 1.07808566, + "epoch": 0.23412851096575607, + "flos": 707176264704.0, + "grad_norm": 0.02578801546488365, + "language_loss": 0.93516719, + "learning_rate": 0.0008946608785231067, + "loss": 0.94690937, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.9609375, + "step": 1217, + "time_per_iteration": 2.8923676013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174557, + "balance_loss_mlp": 1.07842624, + "epoch": 0.23432089265101963, + "flos": 439174794240.0, + "grad_norm": 0.024987781095147748, + "language_loss": 0.94467312, + "learning_rate": 0.0008944695212477084, + "loss": 0.95641869, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.9609375, + "step": 1218, + "time_per_iteration": 2.47641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176273, + "balance_loss_mlp": 1.08028615, + "epoch": 0.2345132743362832, + "flos": 481914731520.0, + "grad_norm": 0.02187031641141441, + "language_loss": 0.9320662, + "learning_rate": 0.0008942780108256599, + "loss": 0.94382894, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.95947266, + "step": 1219, + "time_per_iteration": 2.585204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176397, + "balance_loss_mlp": 1.07993269, + "epoch": 0.23470565602154675, + "flos": 412340809728.0, + "grad_norm": 0.02314471919225668, + "language_loss": 0.95930934, + "learning_rate": 0.0008940863473313121, + "loss": 0.97107327, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.96435547, + "step": 1220, + "time_per_iteration": 2.461904764175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174627, + "balance_loss_mlp": 1.07811534, + "epoch": 0.2348980377068103, + "flos": 546499998720.0, + "grad_norm": 0.029389735884218435, + "language_loss": 0.99771547, + "learning_rate": 0.0008938945308390756, + "loss": 1.00946164, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.96484375, + "step": 1221, + "time_per_iteration": 2.6403567790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179462, + "balance_loss_mlp": 1.08295047, + "epoch": 0.23509041939207387, + "flos": 576842159616.0, + "grad_norm": 0.023502241620232074, + "language_loss": 0.96374851, + "learning_rate": 0.00089370256142342, + "loss": 0.97554314, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.96484375, + "step": 1222, + "time_per_iteration": 2.7148585319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178637, + "balance_loss_mlp": 1.08198178, + "epoch": 0.23528280107733743, + "flos": 589947611136.0, + "grad_norm": 0.022852016666186668, + "language_loss": 0.93682569, + "learning_rate": 0.0008935104391588746, + "loss": 0.94861209, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.96630859, + "step": 1223, + "time_per_iteration": 2.7302677631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179799, + "balance_loss_mlp": 1.08338237, + "epoch": 0.235475182762601, + "flos": 824856811008.0, + "grad_norm": 0.02091323276417278, + "language_loss": 0.91087663, + "learning_rate": 0.0008933181641200276, + "loss": 0.9226746, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.96386719, + "step": 1224, + "time_per_iteration": 3.120337724685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183017, + "balance_loss_mlp": 1.08650565, + "epoch": 0.23566756444786457, + "flos": 681366862848.0, + "grad_norm": 0.027323039985709546, + "language_loss": 0.94355077, + "learning_rate": 0.0008931257363815271, + "loss": 0.95538092, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.96484375, + "step": 1225, + "time_per_iteration": 2.893202543258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178928, + "balance_loss_mlp": 1.08251154, + "epoch": 0.23585994613312813, + "flos": 703134329856.0, + "grad_norm": 0.022860929740297704, + "language_loss": 0.96590424, + "learning_rate": 0.0008929331560180798, + "loss": 0.97769356, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.96386719, + "step": 1226, + "time_per_iteration": 2.913858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176734, + "balance_loss_mlp": 1.08017468, + "epoch": 0.2360523278183917, + "flos": 525195158016.0, + "grad_norm": 0.02227272458953822, + "language_loss": 0.99194574, + "learning_rate": 0.0008927404231044525, + "loss": 1.00371313, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.96533203, + "step": 1227, + "time_per_iteration": 2.7194507122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175869, + "balance_loss_mlp": 1.07921374, + "epoch": 0.23624470950365525, + "flos": 525442934784.0, + "grad_norm": 0.02071878597098496, + "language_loss": 0.89412713, + "learning_rate": 0.0008925475377154703, + "loss": 0.90588582, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.96630859, + "step": 1228, + "time_per_iteration": 2.742506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175669, + "balance_loss_mlp": 1.07896686, + "epoch": 0.2364370911889188, + "flos": 597960348672.0, + "grad_norm": 0.023166098266421232, + "language_loss": 0.90900964, + "learning_rate": 0.0008923544999260183, + "loss": 0.92076635, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.96679688, + "step": 1229, + "time_per_iteration": 2.809842109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177841, + "balance_loss_mlp": 1.08113885, + "epoch": 0.23662947287418237, + "flos": 758171986944.0, + "grad_norm": 0.02725464196132968, + "language_loss": 1.00227833, + "learning_rate": 0.00089216130981104, + "loss": 1.0140568, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.96679688, + "step": 1230, + "time_per_iteration": 3.0096282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178297, + "balance_loss_mlp": 1.08159423, + "epoch": 0.23682185455944593, + "flos": 547207673856.0, + "grad_norm": 0.024713012089740163, + "language_loss": 0.91807795, + "learning_rate": 0.000891967967445539, + "loss": 0.92986089, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.96679688, + "step": 1231, + "time_per_iteration": 2.7001702785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185987, + "balance_loss_mlp": 1.08928442, + "epoch": 0.2370142362447095, + "flos": 663522534912.0, + "grad_norm": 0.02265672956199411, + "language_loss": 0.96654546, + "learning_rate": 0.0008917744729045772, + "loss": 0.97840536, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.96679688, + "step": 1232, + "time_per_iteration": 2.8703036308288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184505, + "balance_loss_mlp": 1.08789778, + "epoch": 0.23720661792997308, + "flos": 684911969280.0, + "grad_norm": 0.02632145570598456, + "language_loss": 0.93737417, + "learning_rate": 0.0008915808262632757, + "loss": 0.94921923, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.96582031, + "step": 1233, + "time_per_iteration": 2.839534044265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185605, + "balance_loss_mlp": 1.08928347, + "epoch": 0.23739899961523664, + "flos": 560022414336.0, + "grad_norm": 0.027552675935845497, + "language_loss": 1.01508975, + "learning_rate": 0.0008913870275968148, + "loss": 1.02694583, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.96289062, + "step": 1234, + "time_per_iteration": 2.7176129817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182161, + "balance_loss_mlp": 1.08545852, + "epoch": 0.2375913813005002, + "flos": 891163602432.0, + "grad_norm": 0.02404650352203449, + "language_loss": 0.9583261, + "learning_rate": 0.0008911930769804342, + "loss": 0.97014773, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.96679688, + "step": 1235, + "time_per_iteration": 3.244257688522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179697, + "balance_loss_mlp": 1.08289862, + "epoch": 0.23778376298576376, + "flos": 642365414400.0, + "grad_norm": 0.020226791074773265, + "language_loss": 0.99461335, + "learning_rate": 0.0008909989744894318, + "loss": 1.00641024, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.96777344, + "step": 1236, + "time_per_iteration": 2.8618855476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179012, + "balance_loss_mlp": 1.08230948, + "epoch": 0.23797614467102732, + "flos": 617945166336.0, + "grad_norm": 0.025060145140963254, + "language_loss": 0.91887248, + "learning_rate": 0.0008908047201991649, + "loss": 0.93066257, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.96679688, + "step": 1237, + "time_per_iteration": 2.7335665225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177715, + "balance_loss_mlp": 1.08120298, + "epoch": 0.23816852635629088, + "flos": 625463076864.0, + "grad_norm": 0.02188809519195417, + "language_loss": 0.92642158, + "learning_rate": 0.0008906103141850502, + "loss": 0.93819869, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.96484375, + "step": 1238, + "time_per_iteration": 2.9244723320007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178141, + "balance_loss_mlp": 1.0816294, + "epoch": 0.23836090804155444, + "flos": 522440318976.0, + "grad_norm": 0.025638098136730073, + "language_loss": 0.97356987, + "learning_rate": 0.0008904157565225621, + "loss": 0.98535126, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.96484375, + "step": 1239, + "time_per_iteration": 2.6046018600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186867, + "balance_loss_mlp": 1.09059334, + "epoch": 0.238553289726818, + "flos": 1155854281728.0, + "grad_norm": 0.0279922632366243, + "language_loss": 0.91224372, + "learning_rate": 0.000890221047287235, + "loss": 0.92411238, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.96240234, + "step": 1240, + "time_per_iteration": 3.503387928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.09512079, + "epoch": 0.23874567141208156, + "flos": 500909895168.0, + "grad_norm": 0.02294407067471098, + "language_loss": 0.98687088, + "learning_rate": 0.0008900261865546615, + "loss": 0.99878532, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.96289062, + "step": 1241, + "time_per_iteration": 2.6329948902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188291, + "balance_loss_mlp": 1.09197009, + "epoch": 0.23893805309734514, + "flos": 558049110528.0, + "grad_norm": 0.02727719764566138, + "language_loss": 0.96105886, + "learning_rate": 0.0008898311744004936, + "loss": 0.97294176, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.96289062, + "step": 1242, + "time_per_iteration": 2.6852729320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011866, + "balance_loss_mlp": 1.0902791, + "epoch": 0.2391304347826087, + "flos": 550316350464.0, + "grad_norm": 0.023767912183342704, + "language_loss": 0.95555472, + "learning_rate": 0.0008896360109004414, + "loss": 0.9674207, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.96289062, + "step": 1243, + "time_per_iteration": 2.6607675552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181953, + "balance_loss_mlp": 1.08558464, + "epoch": 0.23932281646787226, + "flos": 517078361088.0, + "grad_norm": 0.022492500831292953, + "language_loss": 0.92156398, + "learning_rate": 0.0008894406961302742, + "loss": 0.93338358, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.96337891, + "step": 1244, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180796, + "balance_loss_mlp": 1.0844276, + "epoch": 0.23951519815313582, + "flos": 745001407488.0, + "grad_norm": 0.0220414301985699, + "language_loss": 0.9171226, + "learning_rate": 0.0008892452301658201, + "loss": 0.92893052, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.96337891, + "step": 1245, + "time_per_iteration": 2.987859010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189345, + "balance_loss_mlp": 1.09302354, + "epoch": 0.23970757983839938, + "flos": 555174749184.0, + "grad_norm": 0.02624868476300941, + "language_loss": 0.92775297, + "learning_rate": 0.0008890496130829653, + "loss": 0.93964636, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.96289062, + "step": 1246, + "time_per_iteration": 2.7285211086273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011891, + "balance_loss_mlp": 1.09287417, + "epoch": 0.23989996152366294, + "flos": 481617289728.0, + "grad_norm": 0.024405638758005322, + "language_loss": 0.93939734, + "learning_rate": 0.0008888538449576555, + "loss": 0.95128834, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.96191406, + "step": 1247, + "time_per_iteration": 2.603447675704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181648, + "balance_loss_mlp": 1.08532703, + "epoch": 0.2400923432089265, + "flos": 486280304640.0, + "grad_norm": 0.02551404288502155, + "language_loss": 0.9456799, + "learning_rate": 0.0008886579258658944, + "loss": 0.9574964, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.96289062, + "step": 1248, + "time_per_iteration": 2.6195995807647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183672, + "balance_loss_mlp": 1.08735096, + "epoch": 0.24028472489419006, + "flos": 624792331776.0, + "grad_norm": 0.02192042043345247, + "language_loss": 0.93244678, + "learning_rate": 0.0008884618558837446, + "loss": 0.94428349, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.96289062, + "step": 1249, + "time_per_iteration": 2.830350399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187022, + "balance_loss_mlp": 1.09113026, + "epoch": 0.24047710657945365, + "flos": 602808013824.0, + "grad_norm": 0.023766863499936387, + "language_loss": 0.96457344, + "learning_rate": 0.0008882656350873273, + "loss": 0.97644365, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.95849609, + "step": 1250, + "time_per_iteration": 2.8691956996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119127, + "balance_loss_mlp": 1.09547377, + "epoch": 0.2406694882647172, + "flos": 843000582144.0, + "grad_norm": 0.03001641023469985, + "language_loss": 1.00300837, + "learning_rate": 0.0008880692635528219, + "loss": 1.01492119, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.95751953, + "step": 1251, + "time_per_iteration": 3.066152572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187351, + "balance_loss_mlp": 1.09155416, + "epoch": 0.24086186994998077, + "flos": 528134647296.0, + "grad_norm": 0.026461260661865858, + "language_loss": 0.98557454, + "learning_rate": 0.0008878727413564669, + "loss": 0.99744809, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.95751953, + "step": 1252, + "time_per_iteration": 2.7665653228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.11519623, + "epoch": 0.24105425163524433, + "flos": 1341459262464.0, + "grad_norm": 0.018061169603452644, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81344825, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.93945312, + "step": 1253, + "time_per_iteration": 4.899695634841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182732, + "balance_loss_mlp": 1.08679259, + "epoch": 0.24124663332050789, + "flos": 615227257344.0, + "grad_norm": 0.02599071752574661, + "language_loss": 0.90657973, + "learning_rate": 0.0008874792452834528, + "loss": 0.91840708, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.95898438, + "step": 1254, + "time_per_iteration": 2.7407760620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179855, + "balance_loss_mlp": 1.08401072, + "epoch": 0.24143901500577145, + "flos": 576592381440.0, + "grad_norm": 0.0285281411485809, + "language_loss": 0.99380314, + "learning_rate": 0.0008872822715595626, + "loss": 1.00560164, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.95800781, + "step": 1255, + "time_per_iteration": 2.7094287872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176059, + "balance_loss_mlp": 1.08007157, + "epoch": 0.241631396691035, + "flos": 496146823680.0, + "grad_norm": 0.026934202036951318, + "language_loss": 0.98012596, + "learning_rate": 0.0008870851474793598, + "loss": 0.9918865, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.95947266, + "step": 1256, + "time_per_iteration": 2.5717930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180992, + "balance_loss_mlp": 1.08500445, + "epoch": 0.24182377837629856, + "flos": 637396225536.0, + "grad_norm": 0.02721147411023071, + "language_loss": 0.97604549, + "learning_rate": 0.0008868878731193752, + "loss": 0.98785543, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.95947266, + "step": 1257, + "time_per_iteration": 2.835613965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180736, + "balance_loss_mlp": 1.08460534, + "epoch": 0.24201616006156215, + "flos": 516349218816.0, + "grad_norm": 0.023847715865297152, + "language_loss": 0.9613235, + "learning_rate": 0.0008866904485561973, + "loss": 0.97313088, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.9609375, + "step": 1258, + "time_per_iteration": 2.697693347930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182815, + "balance_loss_mlp": 1.08682752, + "epoch": 0.2422085417468257, + "flos": 616378093056.0, + "grad_norm": 0.023106527532664196, + "language_loss": 0.92363685, + "learning_rate": 0.000886492873866473, + "loss": 0.93546498, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.95947266, + "step": 1259, + "time_per_iteration": 2.8120577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118033, + "balance_loss_mlp": 1.08424771, + "epoch": 0.24240092343208927, + "flos": 586912794624.0, + "grad_norm": 0.025402415625288076, + "language_loss": 0.9586736, + "learning_rate": 0.000886295149126908, + "loss": 0.97047698, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.96044922, + "step": 1260, + "time_per_iteration": 2.7276840209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184073, + "balance_loss_mlp": 1.08813286, + "epoch": 0.24259330511735283, + "flos": 763570874880.0, + "grad_norm": 0.0207328591517146, + "language_loss": 0.94417751, + "learning_rate": 0.0008860972744142655, + "loss": 0.95601827, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.95898438, + "step": 1261, + "time_per_iteration": 2.898794412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184052, + "balance_loss_mlp": 1.08816016, + "epoch": 0.2427856868026164, + "flos": 628133322240.0, + "grad_norm": 0.02409331705070074, + "language_loss": 0.89591467, + "learning_rate": 0.0008858992498053671, + "loss": 0.90775526, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.95849609, + "step": 1262, + "time_per_iteration": 2.8477351665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183746, + "balance_loss_mlp": 1.08952332, + "epoch": 0.24297806848787995, + "flos": 1514919343104.0, + "grad_norm": 0.012580587939111834, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77772498, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.94140625, + "step": 1263, + "time_per_iteration": 4.826787710189819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180613, + "balance_loss_mlp": 1.0848639, + "epoch": 0.2431704501731435, + "flos": 543072413184.0, + "grad_norm": 0.025826560533695943, + "language_loss": 0.92586392, + "learning_rate": 0.0008855027512063817, + "loss": 0.93767005, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.95703125, + "step": 1264, + "time_per_iteration": 2.722557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179682, + "balance_loss_mlp": 1.08364689, + "epoch": 0.24336283185840707, + "flos": 524878250496.0, + "grad_norm": 0.025894380889017608, + "language_loss": 0.95614499, + "learning_rate": 0.0008853042773702292, + "loss": 0.96794176, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.95996094, + "step": 1265, + "time_per_iteration": 2.7258307933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118145, + "balance_loss_mlp": 1.0855577, + "epoch": 0.24355521354367063, + "flos": 538205282304.0, + "grad_norm": 0.022817154468993458, + "language_loss": 0.98287719, + "learning_rate": 0.0008851056539456896, + "loss": 0.99469173, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.95849609, + "step": 1266, + "time_per_iteration": 2.6970114707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182961, + "balance_loss_mlp": 1.08692622, + "epoch": 0.24374759522893422, + "flos": 932108155392.0, + "grad_norm": 0.024066297062525326, + "language_loss": 0.9148944, + "learning_rate": 0.0008849068810098755, + "loss": 0.92672402, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.95996094, + "step": 1267, + "time_per_iteration": 3.326692819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118368, + "balance_loss_mlp": 1.08764458, + "epoch": 0.24393997691419778, + "flos": 428685193728.0, + "grad_norm": 0.027357648838687767, + "language_loss": 0.94001949, + "learning_rate": 0.0008847079586399575, + "loss": 0.95185632, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.95996094, + "step": 1268, + "time_per_iteration": 2.466787099838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180763, + "balance_loss_mlp": 1.08482289, + "epoch": 0.24413235859946134, + "flos": 579942104064.0, + "grad_norm": 0.026150492080556795, + "language_loss": 0.95411992, + "learning_rate": 0.0008845088869131641, + "loss": 0.96592754, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.95898438, + "step": 1269, + "time_per_iteration": 2.7016899585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175832, + "balance_loss_mlp": 1.07989287, + "epoch": 0.2443247402847249, + "flos": 530900219904.0, + "grad_norm": 0.025309414349457434, + "language_loss": 0.98951483, + "learning_rate": 0.0008843096659067818, + "loss": 1.00127316, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.95898438, + "step": 1270, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179701, + "balance_loss_mlp": 1.08366621, + "epoch": 0.24451712196998845, + "flos": 697624651776.0, + "grad_norm": 0.020400222299851913, + "language_loss": 0.92813951, + "learning_rate": 0.000884110295698155, + "loss": 0.93993652, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.95996094, + "step": 1271, + "time_per_iteration": 2.945749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180344, + "balance_loss_mlp": 1.08435643, + "epoch": 0.24470950365525201, + "flos": 530863289856.0, + "grad_norm": 0.02434814436965663, + "language_loss": 0.97428346, + "learning_rate": 0.0008839107763646861, + "loss": 0.98608696, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.95947266, + "step": 1272, + "time_per_iteration": 2.5816495418548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182389, + "balance_loss_mlp": 1.08630657, + "epoch": 0.24490188534051557, + "flos": 492347936256.0, + "grad_norm": 0.027277570267404832, + "language_loss": 1.00778949, + "learning_rate": 0.0008837111079838353, + "loss": 1.0196135, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.96044922, + "step": 1273, + "time_per_iteration": 2.675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182001, + "balance_loss_mlp": 1.08587062, + "epoch": 0.24509426702577913, + "flos": 475111226880.0, + "grad_norm": 0.024851656777491255, + "language_loss": 0.98025054, + "learning_rate": 0.000883511290633121, + "loss": 0.99207056, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.9609375, + "step": 1274, + "time_per_iteration": 2.5230517387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183988, + "balance_loss_mlp": 1.08747613, + "epoch": 0.24528664871104272, + "flos": 551647107072.0, + "grad_norm": 0.02070792437524093, + "language_loss": 1.00507927, + "learning_rate": 0.000883311324390119, + "loss": 1.01691914, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.96484375, + "step": 1275, + "time_per_iteration": 2.690488338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.08887982, + "epoch": 0.24547903039630628, + "flos": 827335675392.0, + "grad_norm": 0.02978995697497926, + "language_loss": 0.95172417, + "learning_rate": 0.0008831112093324629, + "loss": 0.96357232, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.95898438, + "step": 1276, + "time_per_iteration": 3.0883522033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184816, + "balance_loss_mlp": 1.08839917, + "epoch": 0.24567141208156984, + "flos": 592693718016.0, + "grad_norm": 0.026400385967418116, + "language_loss": 0.99731994, + "learning_rate": 0.0008829109455378444, + "loss": 1.00916803, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.96386719, + "step": 1277, + "time_per_iteration": 2.670658588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184585, + "balance_loss_mlp": 1.08812118, + "epoch": 0.2458637937668334, + "flos": 548929198080.0, + "grad_norm": 0.022333419000210953, + "language_loss": 0.95654261, + "learning_rate": 0.000882710533084013, + "loss": 0.96838844, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.96435547, + "step": 1278, + "time_per_iteration": 2.641019344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189057, + "balance_loss_mlp": 1.09244978, + "epoch": 0.24605617545209696, + "flos": 516911175168.0, + "grad_norm": 0.022487969609205835, + "language_loss": 0.97332817, + "learning_rate": 0.0008825099720487755, + "loss": 0.98521876, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.96582031, + "step": 1279, + "time_per_iteration": 2.626079559326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193596, + "balance_loss_mlp": 1.09880066, + "epoch": 0.24624855713736052, + "flos": 1515058331136.0, + "grad_norm": 0.0162275920205478, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76454735, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.94726562, + "step": 1280, + "time_per_iteration": 4.846211671829224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118811, + "balance_loss_mlp": 1.09350586, + "epoch": 0.24644093882262408, + "flos": 1530746706432.0, + "grad_norm": 0.013716798372908724, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79132223, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.9453125, + "step": 1281, + "time_per_iteration": 4.781409025192261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189694, + "balance_loss_mlp": 1.09351575, + "epoch": 0.24663332050788764, + "flos": 660348730368.0, + "grad_norm": 0.028995521048395968, + "language_loss": 0.998649, + "learning_rate": 0.0008819073982335619, + "loss": 1.01054597, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.96142578, + "step": 1282, + "time_per_iteration": 2.873255729675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187163, + "balance_loss_mlp": 1.09098482, + "epoch": 0.24682570219315123, + "flos": 542805170688.0, + "grad_norm": 0.0289675073475646, + "language_loss": 0.92590028, + "learning_rate": 0.0008817062436519235, + "loss": 0.93777192, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.96142578, + "step": 1283, + "time_per_iteration": 2.6918435096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08852112, + "epoch": 0.24701808387841478, + "flos": 441658387968.0, + "grad_norm": 0.027350099061339322, + "language_loss": 1.00939846, + "learning_rate": 0.0008815049408787788, + "loss": 1.02124548, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.96142578, + "step": 1284, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190183, + "balance_loss_mlp": 1.09443462, + "epoch": 0.24721046556367834, + "flos": 469032861696.0, + "grad_norm": 0.028209143321693456, + "language_loss": 0.95635927, + "learning_rate": 0.0008813034899922805, + "loss": 0.96826112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.95703125, + "step": 1285, + "time_per_iteration": 2.5152530670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193087, + "balance_loss_mlp": 1.09729075, + "epoch": 0.2474028472489419, + "flos": 505407725568.0, + "grad_norm": 0.027111907557838905, + "language_loss": 1.01196301, + "learning_rate": 0.0008811018910706387, + "loss": 1.02389383, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.95751953, + "step": 1286, + "time_per_iteration": 2.5593316555023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_mlp": 1.09255612, + "epoch": 0.24759522893420546, + "flos": 480955276800.0, + "grad_norm": 0.03276846828627927, + "language_loss": 0.9498859, + "learning_rate": 0.0008809001441921211, + "loss": 0.96176893, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.95703125, + "step": 1287, + "time_per_iteration": 2.7347421646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181619, + "balance_loss_mlp": 1.08567917, + "epoch": 0.24778761061946902, + "flos": 534753501696.0, + "grad_norm": 0.025262665654883373, + "language_loss": 0.97019696, + "learning_rate": 0.0008806982494350528, + "loss": 0.98201311, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.95898438, + "step": 1288, + "time_per_iteration": 2.6499245166778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181206, + "balance_loss_mlp": 1.08526671, + "epoch": 0.24797999230473258, + "flos": 560942937600.0, + "grad_norm": 0.021558514258727474, + "language_loss": 0.9849534, + "learning_rate": 0.0008804962068778161, + "loss": 0.99676538, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.95898438, + "step": 1289, + "time_per_iteration": 2.852257490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186476, + "balance_loss_mlp": 1.09053683, + "epoch": 0.24817237398999614, + "flos": 625480541184.0, + "grad_norm": 0.024913990838324927, + "language_loss": 0.90269625, + "learning_rate": 0.0008802940165988511, + "loss": 0.91456103, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.95898438, + "step": 1290, + "time_per_iteration": 2.846277952194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181135, + "balance_loss_mlp": 1.08471859, + "epoch": 0.2483647556752597, + "flos": 613484265984.0, + "grad_norm": 0.02310813532639645, + "language_loss": 0.96774852, + "learning_rate": 0.000880091678676655, + "loss": 0.97955984, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.96386719, + "step": 1291, + "time_per_iteration": 2.8085777759552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180122, + "balance_loss_mlp": 1.0837059, + "epoch": 0.2485571373605233, + "flos": 584687711232.0, + "grad_norm": 0.021422688776258386, + "language_loss": 0.9855839, + "learning_rate": 0.0008798891931897821, + "loss": 0.99738514, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.96386719, + "step": 1292, + "time_per_iteration": 2.7361133098602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183371, + "balance_loss_mlp": 1.08704984, + "epoch": 0.24874951904578685, + "flos": 495736590336.0, + "grad_norm": 0.02424073807687162, + "language_loss": 0.92916596, + "learning_rate": 0.0008796865602168447, + "loss": 0.94099975, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.96289062, + "step": 1293, + "time_per_iteration": 2.5220131874084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186197, + "balance_loss_mlp": 1.09025729, + "epoch": 0.2489419007310504, + "flos": 457173573120.0, + "grad_norm": 0.023099031146870112, + "language_loss": 0.94818902, + "learning_rate": 0.0008794837798365115, + "loss": 0.96005094, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.95898438, + "step": 1294, + "time_per_iteration": 2.6338109970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187707, + "balance_loss_mlp": 1.09191012, + "epoch": 0.24913428241631397, + "flos": 486565011456.0, + "grad_norm": 0.02215078033303108, + "language_loss": 0.96107936, + "learning_rate": 0.0008792808521275089, + "loss": 0.97295642, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.95751953, + "step": 1295, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182687, + "balance_loss_mlp": 1.0869385, + "epoch": 0.24932666410157753, + "flos": 519917793792.0, + "grad_norm": 0.022601932216391857, + "language_loss": 0.96075213, + "learning_rate": 0.0008790777771686206, + "loss": 0.972579, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.95703125, + "step": 1296, + "time_per_iteration": 2.5746819972991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.08610308, + "epoch": 0.2495190457868411, + "flos": 473556888576.0, + "grad_norm": 0.022656020732285023, + "language_loss": 0.93397439, + "learning_rate": 0.0008788745550386872, + "loss": 0.94579285, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.95703125, + "step": 1297, + "time_per_iteration": 2.55985689163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177725, + "balance_loss_mlp": 1.0820719, + "epoch": 0.24971142747210465, + "flos": 747198292992.0, + "grad_norm": 0.023996141347128058, + "language_loss": 0.88372529, + "learning_rate": 0.0008786711858166063, + "loss": 0.89550251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.95605469, + "step": 1298, + "time_per_iteration": 2.9357082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179743, + "balance_loss_mlp": 1.08399367, + "epoch": 0.2499038091573682, + "flos": 750901853184.0, + "grad_norm": 0.025666304870509565, + "language_loss": 0.93355387, + "learning_rate": 0.0008784676695813332, + "loss": 0.9453513, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.95703125, + "step": 1299, + "time_per_iteration": 2.939739942550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187708, + "balance_loss_mlp": 1.09186363, + "epoch": 0.2500961908426318, + "flos": 746342897664.0, + "grad_norm": 0.02448521774653795, + "language_loss": 0.94308037, + "learning_rate": 0.0008782640064118796, + "loss": 0.95495749, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.95800781, + "step": 1300, + "time_per_iteration": 2.882838249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223068, + "balance_loss_mlp": 1.12808228, + "epoch": 0.2502885725278953, + "flos": 1420523672064.0, + "grad_norm": 0.019515623701574104, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77407825, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.94921875, + "step": 1301, + "time_per_iteration": 5.002445220947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180814, + "balance_loss_mlp": 1.08520806, + "epoch": 0.2504809542131589, + "flos": 516231697920.0, + "grad_norm": 0.028413107884204602, + "language_loss": 0.96116567, + "learning_rate": 0.0008778562395867648, + "loss": 0.97297382, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.95556641, + "step": 1302, + "time_per_iteration": 2.6463139057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183586, + "balance_loss_mlp": 1.08783746, + "epoch": 0.25067333589842244, + "flos": 526851554304.0, + "grad_norm": 0.024791221234372676, + "language_loss": 0.9191972, + "learning_rate": 0.0008776521360894127, + "loss": 0.93103302, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.95703125, + "step": 1303, + "time_per_iteration": 2.60622239112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203766, + "balance_loss_mlp": 1.10897064, + "epoch": 0.25086571758368603, + "flos": 1477157326848.0, + "grad_norm": 0.014632010139538269, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80165827, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.94726562, + "step": 1304, + "time_per_iteration": 4.810328006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188508, + "balance_loss_mlp": 1.09285462, + "epoch": 0.2510580992689496, + "flos": 529402277376.0, + "grad_norm": 0.027485922989720333, + "language_loss": 0.99458921, + "learning_rate": 0.0008772434893213186, + "loss": 1.00647426, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.95605469, + "step": 1305, + "time_per_iteration": 2.6031458377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.09155023, + "epoch": 0.25125048095421315, + "flos": 518465513472.0, + "grad_norm": 0.0302061265456268, + "language_loss": 0.93206942, + "learning_rate": 0.0008770389462092276, + "loss": 0.94393957, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.95410156, + "step": 1306, + "time_per_iteration": 2.636845827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118174, + "balance_loss_mlp": 1.0858953, + "epoch": 0.25144286263947674, + "flos": 621674923008.0, + "grad_norm": 0.026354631998576704, + "language_loss": 0.96568018, + "learning_rate": 0.0008768342567176357, + "loss": 0.97749758, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.95800781, + "step": 1307, + "time_per_iteration": 2.797346591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187952, + "balance_loss_mlp": 1.09220326, + "epoch": 0.25163524432474027, + "flos": 504865234944.0, + "grad_norm": 0.024318536510777332, + "language_loss": 0.99895847, + "learning_rate": 0.0008766294209260107, + "loss": 1.01083803, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.95703125, + "step": 1308, + "time_per_iteration": 2.648099184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180717, + "balance_loss_mlp": 1.0850637, + "epoch": 0.25182762601000386, + "flos": 510079472640.0, + "grad_norm": 0.027727924866539442, + "language_loss": 1.0231359, + "learning_rate": 0.0008764244389138767, + "loss": 1.0349431, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.95605469, + "step": 1309, + "time_per_iteration": 2.575963258743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.08396196, + "epoch": 0.2520200076952674, + "flos": 635097282048.0, + "grad_norm": 0.028356059247082867, + "language_loss": 0.93336231, + "learning_rate": 0.000876219310760815, + "loss": 0.94515896, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.95654297, + "step": 1310, + "time_per_iteration": 2.8647706508636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189244, + "balance_loss_mlp": 1.09330475, + "epoch": 0.252212389380531, + "flos": 495651996672.0, + "grad_norm": 0.024396868749396446, + "language_loss": 0.91954494, + "learning_rate": 0.0008760140365464631, + "loss": 0.93143737, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.95898438, + "step": 1311, + "time_per_iteration": 2.592453718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180261, + "balance_loss_mlp": 1.08451247, + "epoch": 0.2524047710657945, + "flos": 491529470976.0, + "grad_norm": 0.026197758988141227, + "language_loss": 0.97483641, + "learning_rate": 0.0008758086163505156, + "loss": 0.98663902, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.95703125, + "step": 1312, + "time_per_iteration": 2.56319260597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181231, + "balance_loss_mlp": 1.08548176, + "epoch": 0.2525971527510581, + "flos": 648612966912.0, + "grad_norm": 0.0242630752619845, + "language_loss": 0.98733318, + "learning_rate": 0.0008756030502527239, + "loss": 0.99914545, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.95703125, + "step": 1313, + "time_per_iteration": 2.858691930770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180546, + "balance_loss_mlp": 1.08455837, + "epoch": 0.2527895344363217, + "flos": 570373026816.0, + "grad_norm": 0.025539383487616106, + "language_loss": 0.99746555, + "learning_rate": 0.0008753973383328954, + "loss": 1.00927103, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.95947266, + "step": 1314, + "time_per_iteration": 2.6683549880981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180137, + "balance_loss_mlp": 1.0841974, + "epoch": 0.2529819161215852, + "flos": 515068127232.0, + "grad_norm": 0.027266475314614652, + "language_loss": 0.95154297, + "learning_rate": 0.0008751914806708952, + "loss": 0.96334434, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.95898438, + "step": 1315, + "time_per_iteration": 2.6008012294769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178852, + "balance_loss_mlp": 1.08310342, + "epoch": 0.2531742978068488, + "flos": 532350498816.0, + "grad_norm": 0.02508848621911812, + "language_loss": 0.91122246, + "learning_rate": 0.0008749854773466439, + "loss": 0.92301095, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.95703125, + "step": 1316, + "time_per_iteration": 2.6595401763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193828, + "balance_loss_mlp": 1.09822178, + "epoch": 0.25336667949211233, + "flos": 597747500544.0, + "grad_norm": 0.027675397486347803, + "language_loss": 0.92894816, + "learning_rate": 0.0008747793284401192, + "loss": 0.9408865, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.95556641, + "step": 1317, + "time_per_iteration": 2.6975109577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187696, + "balance_loss_mlp": 1.09175622, + "epoch": 0.2535590611773759, + "flos": 603255177216.0, + "grad_norm": 0.02603186041930466, + "language_loss": 0.95462376, + "learning_rate": 0.0008745730340313551, + "loss": 0.96650076, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.95898438, + "step": 1318, + "time_per_iteration": 2.805327892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187328, + "balance_loss_mlp": 1.0915786, + "epoch": 0.25375144286263945, + "flos": 496322741760.0, + "grad_norm": 0.027049333310240738, + "language_loss": 0.95645851, + "learning_rate": 0.0008743665942004422, + "loss": 0.96833169, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.95703125, + "step": 1319, + "time_per_iteration": 2.6340737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185781, + "balance_loss_mlp": 1.0896982, + "epoch": 0.25394382454790304, + "flos": 513476858880.0, + "grad_norm": 0.02784781206620994, + "language_loss": 1.02473438, + "learning_rate": 0.0008741600090275277, + "loss": 1.03659225, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.96044922, + "step": 1320, + "time_per_iteration": 2.573155641555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183427, + "balance_loss_mlp": 1.08763099, + "epoch": 0.25413620623316663, + "flos": 960855045120.0, + "grad_norm": 0.03323105604734599, + "language_loss": 0.94160318, + "learning_rate": 0.0008739532785928151, + "loss": 0.95343745, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.95751953, + "step": 1321, + "time_per_iteration": 3.470245122909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190819, + "balance_loss_mlp": 1.09659576, + "epoch": 0.25432858791843016, + "flos": 1580648715264.0, + "grad_norm": 0.017424496497570757, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76084399, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.94140625, + "step": 1322, + "time_per_iteration": 4.8549723625183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184096, + "balance_loss_mlp": 1.08806074, + "epoch": 0.25452096960369375, + "flos": 584893828608.0, + "grad_norm": 0.025099574916072127, + "language_loss": 0.94150972, + "learning_rate": 0.0008735393822590908, + "loss": 0.95335066, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.95996094, + "step": 1323, + "time_per_iteration": 2.6771461963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187145, + "balance_loss_mlp": 1.0910151, + "epoch": 0.2547133512889573, + "flos": 509641041408.0, + "grad_norm": 0.024104352127734364, + "language_loss": 0.95373654, + "learning_rate": 0.0008733322165207681, + "loss": 0.965608, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.9609375, + "step": 1324, + "time_per_iteration": 2.671187400817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191608, + "balance_loss_mlp": 1.09590697, + "epoch": 0.25490573297422087, + "flos": 784035783168.0, + "grad_norm": 0.02719192919889817, + "language_loss": 0.93181324, + "learning_rate": 0.0008731249058420247, + "loss": 0.94372928, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.95654297, + "step": 1325, + "time_per_iteration": 3.0272371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189078, + "balance_loss_mlp": 1.09332883, + "epoch": 0.2550981146594844, + "flos": 510952332288.0, + "grad_norm": 0.024872253546531747, + "language_loss": 1.00651383, + "learning_rate": 0.0008729174503033459, + "loss": 1.0184046, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.95703125, + "step": 1326, + "time_per_iteration": 2.6320900917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187412, + "balance_loss_mlp": 1.09166288, + "epoch": 0.255290496344748, + "flos": 677930545152.0, + "grad_norm": 0.02807770436691079, + "language_loss": 0.93655276, + "learning_rate": 0.0008727098499852728, + "loss": 0.9484269, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.95703125, + "step": 1327, + "time_per_iteration": 2.8246335983276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187202, + "balance_loss_mlp": 1.09116733, + "epoch": 0.2554828780300115, + "flos": 538984816128.0, + "grad_norm": 0.02304152562423393, + "language_loss": 0.97811985, + "learning_rate": 0.0008725021049684034, + "loss": 0.9899919, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.95996094, + "step": 1328, + "time_per_iteration": 2.783276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.08924699, + "epoch": 0.2556752597152751, + "flos": 825622883328.0, + "grad_norm": 0.024322773499976656, + "language_loss": 0.90949428, + "learning_rate": 0.000872294215333391, + "loss": 0.92134333, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.95605469, + "step": 1329, + "time_per_iteration": 3.1658623218536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184378, + "balance_loss_mlp": 1.08867729, + "epoch": 0.2558676414005387, + "flos": 571890435072.0, + "grad_norm": 0.026114012927401953, + "language_loss": 0.91800833, + "learning_rate": 0.0008720861811609457, + "loss": 0.92985213, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.95654297, + "step": 1330, + "time_per_iteration": 2.725680112838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185897, + "balance_loss_mlp": 1.09024334, + "epoch": 0.2560600230858022, + "flos": 487748047872.0, + "grad_norm": 0.02457760145285043, + "language_loss": 0.93800515, + "learning_rate": 0.0008718780025318338, + "loss": 0.94986409, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.95605469, + "step": 1331, + "time_per_iteration": 2.730424404144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08904529, + "epoch": 0.2562524047710658, + "flos": 514119406080.0, + "grad_norm": 0.027688932662206074, + "language_loss": 0.94349414, + "learning_rate": 0.0008716696795268771, + "loss": 0.9553411, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.95605469, + "step": 1332, + "time_per_iteration": 2.6572844982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183855, + "balance_loss_mlp": 1.0881542, + "epoch": 0.25644478645632934, + "flos": 636109129728.0, + "grad_norm": 0.025705757243887913, + "language_loss": 0.96553451, + "learning_rate": 0.0008714612122269538, + "loss": 0.97737306, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.95654297, + "step": 1333, + "time_per_iteration": 2.867598295211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184376, + "balance_loss_mlp": 1.0888176, + "epoch": 0.25663716814159293, + "flos": 437544594432.0, + "grad_norm": 0.025955971973603553, + "language_loss": 1.00358891, + "learning_rate": 0.0008712526007129982, + "loss": 1.01543272, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.95507812, + "step": 1334, + "time_per_iteration": 2.516052484512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186528, + "balance_loss_mlp": 1.0908742, + "epoch": 0.25682954982685646, + "flos": 499242765312.0, + "grad_norm": 0.021880143416013124, + "language_loss": 0.98599482, + "learning_rate": 0.0008710438450660003, + "loss": 0.99786019, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.95605469, + "step": 1335, + "time_per_iteration": 2.659489870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184319, + "balance_loss_mlp": 1.08861768, + "epoch": 0.25702193151212005, + "flos": 458627854848.0, + "grad_norm": 0.028869593177541276, + "language_loss": 0.98979777, + "learning_rate": 0.0008708349453670064, + "loss": 1.00164104, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.95654297, + "step": 1336, + "time_per_iteration": 2.5267841815948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185282, + "balance_loss_mlp": 1.08953345, + "epoch": 0.2572143131973836, + "flos": 599403896832.0, + "grad_norm": 0.021342480544698176, + "language_loss": 0.99445975, + "learning_rate": 0.0008706259016971185, + "loss": 1.00631261, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.95703125, + "step": 1337, + "time_per_iteration": 2.7561397552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118469, + "balance_loss_mlp": 1.08884537, + "epoch": 0.25740669488264717, + "flos": 699526096896.0, + "grad_norm": 0.032203199948080075, + "language_loss": 0.96320713, + "learning_rate": 0.0008704167141374944, + "loss": 0.97505397, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.95800781, + "step": 1338, + "time_per_iteration": 2.7987895011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118993, + "balance_loss_mlp": 1.09432399, + "epoch": 0.25759907656791076, + "flos": 503378025984.0, + "grad_norm": 0.024717846020590344, + "language_loss": 0.97755861, + "learning_rate": 0.0008702073827693482, + "loss": 0.98945785, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.95556641, + "step": 1339, + "time_per_iteration": 2.694470167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186155, + "balance_loss_mlp": 1.0904057, + "epoch": 0.2577914582531743, + "flos": 775241510400.0, + "grad_norm": 0.025036220674882887, + "language_loss": 0.97113985, + "learning_rate": 0.0008699979076739494, + "loss": 0.98300135, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.95703125, + "step": 1340, + "time_per_iteration": 2.962740421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184068, + "balance_loss_mlp": 1.08836627, + "epoch": 0.2579838399384379, + "flos": 460609890816.0, + "grad_norm": 0.026880962232798965, + "language_loss": 0.99139833, + "learning_rate": 0.0008697882889326234, + "loss": 1.00323892, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.95654297, + "step": 1341, + "time_per_iteration": 2.517382860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185483, + "balance_loss_mlp": 1.08987677, + "epoch": 0.2581762216237014, + "flos": 570262236672.0, + "grad_norm": 0.0242955377416103, + "language_loss": 0.96170259, + "learning_rate": 0.0008695785266267515, + "loss": 0.97355735, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.95556641, + "step": 1342, + "time_per_iteration": 2.6961281299591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118536, + "balance_loss_mlp": 1.08961082, + "epoch": 0.258368603308965, + "flos": 605386934784.0, + "grad_norm": 0.023671890991135848, + "language_loss": 0.9337616, + "learning_rate": 0.0008693686208377704, + "loss": 0.94561517, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.95703125, + "step": 1343, + "time_per_iteration": 2.8561604022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184784, + "balance_loss_mlp": 1.08908272, + "epoch": 0.2585609849942285, + "flos": 492486924288.0, + "grad_norm": 0.022133881226187983, + "language_loss": 0.96849036, + "learning_rate": 0.0008691585716471733, + "loss": 0.98033822, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.95654297, + "step": 1344, + "time_per_iteration": 2.6443324089050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185279, + "balance_loss_mlp": 1.08952987, + "epoch": 0.2587533666794921, + "flos": 641957182464.0, + "grad_norm": 0.02305984249039353, + "language_loss": 0.94482636, + "learning_rate": 0.0008689483791365079, + "loss": 0.95667922, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.95703125, + "step": 1345, + "time_per_iteration": 2.8541483879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185515, + "balance_loss_mlp": 1.08976638, + "epoch": 0.2589457483647557, + "flos": 577994996736.0, + "grad_norm": 0.022382124417400225, + "language_loss": 0.97831523, + "learning_rate": 0.0008687380433873786, + "loss": 0.99017042, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.95703125, + "step": 1346, + "time_per_iteration": 2.8148868083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186141, + "balance_loss_mlp": 1.09048796, + "epoch": 0.25913813005001923, + "flos": 536466293760.0, + "grad_norm": 0.024690786073415343, + "language_loss": 0.93800229, + "learning_rate": 0.0008685275644814448, + "loss": 0.94986367, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.95605469, + "step": 1347, + "time_per_iteration": 2.6872267723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188569, + "balance_loss_mlp": 1.0930109, + "epoch": 0.2593305117352828, + "flos": 722346344448.0, + "grad_norm": 0.028015192621825148, + "language_loss": 0.944291, + "learning_rate": 0.0008683169425004216, + "loss": 0.95617664, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.95507812, + "step": 1348, + "time_per_iteration": 2.9036293029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187318, + "balance_loss_mlp": 1.09171176, + "epoch": 0.25952289342054635, + "flos": 711355186176.0, + "grad_norm": 0.028695706473352366, + "language_loss": 0.9867608, + "learning_rate": 0.0008681061775260799, + "loss": 0.99863392, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.95556641, + "step": 1349, + "time_per_iteration": 2.8635356426239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185365, + "balance_loss_mlp": 1.08942509, + "epoch": 0.25971527510580994, + "flos": 456849934848.0, + "grad_norm": 0.028158951385379896, + "language_loss": 1.01652539, + "learning_rate": 0.0008678952696402458, + "loss": 1.02837896, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.95898438, + "step": 1350, + "time_per_iteration": 2.4997899532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184224, + "balance_loss_mlp": 1.08847523, + "epoch": 0.25990765679107347, + "flos": 613753509888.0, + "grad_norm": 0.022929201317296435, + "language_loss": 0.944794, + "learning_rate": 0.000867684218924801, + "loss": 0.95663619, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.95703125, + "step": 1351, + "time_per_iteration": 2.8553221225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190399, + "balance_loss_mlp": 1.09655762, + "epoch": 0.26010003847633706, + "flos": 1541404219392.0, + "grad_norm": 0.011373150433568688, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80137491, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.9375, + "step": 1352, + "time_per_iteration": 4.894901752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185829, + "balance_loss_mlp": 1.0900805, + "epoch": 0.2602924201616006, + "flos": 717544341504.0, + "grad_norm": 0.021521520095987904, + "language_loss": 0.9327749, + "learning_rate": 0.0008672616893328834, + "loss": 0.94463313, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.95703125, + "step": 1353, + "time_per_iteration": 2.9336133003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181557, + "balance_loss_mlp": 1.08571243, + "epoch": 0.2604848018468642, + "flos": 644685825024.0, + "grad_norm": 0.026147354827328006, + "language_loss": 0.99375951, + "learning_rate": 0.0008670502106204512, + "loss": 1.00557506, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.95800781, + "step": 1354, + "time_per_iteration": 2.828476667404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182712, + "balance_loss_mlp": 1.08677256, + "epoch": 0.26067718353212777, + "flos": 518037815808.0, + "grad_norm": 0.024264679119450936, + "language_loss": 0.92830276, + "learning_rate": 0.0008668385894064892, + "loss": 0.94012988, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.95898438, + "step": 1355, + "time_per_iteration": 2.627603054046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183025, + "balance_loss_mlp": 1.08708537, + "epoch": 0.2608695652173913, + "flos": 824224997376.0, + "grad_norm": 0.021603697394371835, + "language_loss": 0.98353279, + "learning_rate": 0.0008666268257731562, + "loss": 0.995363, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.95898438, + "step": 1356, + "time_per_iteration": 3.104410409927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185288, + "balance_loss_mlp": 1.0894438, + "epoch": 0.2610619469026549, + "flos": 1009449039360.0, + "grad_norm": 0.029063247039842262, + "language_loss": 0.98633218, + "learning_rate": 0.0008664149198026662, + "loss": 0.99818504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.95800781, + "step": 1357, + "time_per_iteration": 3.2552602291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184981, + "balance_loss_mlp": 1.08932745, + "epoch": 0.2612543285879184, + "flos": 537825248256.0, + "grad_norm": 0.02677910773484977, + "language_loss": 0.99748302, + "learning_rate": 0.0008662028715772883, + "loss": 1.00933278, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.95605469, + "step": 1358, + "time_per_iteration": 2.6044809818267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186466, + "balance_loss_mlp": 1.09095597, + "epoch": 0.261446710273182, + "flos": 520438817280.0, + "grad_norm": 0.024887857022763207, + "language_loss": 0.95091379, + "learning_rate": 0.0008659906811793467, + "loss": 0.96277845, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.95458984, + "step": 1359, + "time_per_iteration": 2.660039186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118844, + "balance_loss_mlp": 1.09297669, + "epoch": 0.26163909195844554, + "flos": 584399001600.0, + "grad_norm": 0.02478490455868915, + "language_loss": 0.99414921, + "learning_rate": 0.0008657783486912215, + "loss": 1.00603366, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.95410156, + "step": 1360, + "time_per_iteration": 2.710707187652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189735, + "balance_loss_mlp": 1.09412944, + "epoch": 0.2618314736437091, + "flos": 960368223744.0, + "grad_norm": 0.025390417969386195, + "language_loss": 0.99146813, + "learning_rate": 0.0008655658741953472, + "loss": 1.00336552, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.95556641, + "step": 1361, + "time_per_iteration": 3.2610023021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187461, + "balance_loss_mlp": 1.0919987, + "epoch": 0.26202385532897265, + "flos": 575902170624.0, + "grad_norm": 0.01965876060868175, + "language_loss": 0.95685869, + "learning_rate": 0.0008653532577742136, + "loss": 0.96873331, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.95410156, + "step": 1362, + "time_per_iteration": 2.753920793533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190509, + "balance_loss_mlp": 1.09509337, + "epoch": 0.26221623701423624, + "flos": 446397264384.0, + "grad_norm": 0.024702919408059576, + "language_loss": 0.95440364, + "learning_rate": 0.0008651404995103659, + "loss": 0.96630871, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.95361328, + "step": 1363, + "time_per_iteration": 2.532839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184254, + "balance_loss_mlp": 1.088696, + "epoch": 0.26240861869949983, + "flos": 536755003392.0, + "grad_norm": 0.021936659097783043, + "language_loss": 0.95658946, + "learning_rate": 0.0008649275994864041, + "loss": 0.96843195, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.95507812, + "step": 1364, + "time_per_iteration": 2.6723499298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182727, + "balance_loss_mlp": 1.08735919, + "epoch": 0.26260100038476336, + "flos": 566487544320.0, + "grad_norm": 0.02057443182875544, + "language_loss": 0.93747735, + "learning_rate": 0.0008647145577849834, + "loss": 0.94930464, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.953125, + "step": 1365, + "time_per_iteration": 2.817335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184888, + "balance_loss_mlp": 1.089378, + "epoch": 0.26279338207002695, + "flos": 614320195584.0, + "grad_norm": 0.02000370099851243, + "language_loss": 0.90110707, + "learning_rate": 0.0008645013744888139, + "loss": 0.912956, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.95458984, + "step": 1366, + "time_per_iteration": 2.889956474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190369, + "balance_loss_mlp": 1.09452498, + "epoch": 0.2629857637552905, + "flos": 523944992256.0, + "grad_norm": 0.02433762343961203, + "language_loss": 0.96272296, + "learning_rate": 0.0008642880496806607, + "loss": 0.97462666, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.95800781, + "step": 1367, + "time_per_iteration": 2.7868857383728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186128, + "balance_loss_mlp": 1.09028387, + "epoch": 0.26317814544055407, + "flos": 535654559232.0, + "grad_norm": 0.022945771924384736, + "language_loss": 0.9318915, + "learning_rate": 0.0008640745834433437, + "loss": 0.94375277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.95800781, + "step": 1368, + "time_per_iteration": 2.7556509971618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182695, + "balance_loss_mlp": 1.08718467, + "epoch": 0.2633705271258176, + "flos": 556779479040.0, + "grad_norm": 0.024336346931206027, + "language_loss": 0.96858466, + "learning_rate": 0.000863860975859738, + "loss": 0.98041165, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.95458984, + "step": 1369, + "time_per_iteration": 2.9069716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184914, + "balance_loss_mlp": 1.08945167, + "epoch": 0.2635629088110812, + "flos": 553461957120.0, + "grad_norm": 0.02843668952404612, + "language_loss": 1.00276971, + "learning_rate": 0.0008636472270127733, + "loss": 1.01461875, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.95410156, + "step": 1370, + "time_per_iteration": 2.626201868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185086, + "balance_loss_mlp": 1.08952749, + "epoch": 0.2637552904963448, + "flos": 456915062784.0, + "grad_norm": 0.02826867423240315, + "language_loss": 1.01819849, + "learning_rate": 0.0008634333369854345, + "loss": 1.03004944, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.95507812, + "step": 1371, + "time_per_iteration": 2.5906460285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183664, + "balance_loss_mlp": 1.08820105, + "epoch": 0.2639476721816083, + "flos": 614259070464.0, + "grad_norm": 0.024066040008067748, + "language_loss": 0.95210433, + "learning_rate": 0.0008632193058607608, + "loss": 0.96394098, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.95410156, + "step": 1372, + "time_per_iteration": 2.7260935306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180244, + "balance_loss_mlp": 1.08487642, + "epoch": 0.2641400538668719, + "flos": 573025807872.0, + "grad_norm": 0.02730663798923432, + "language_loss": 0.93146777, + "learning_rate": 0.0008630051337218466, + "loss": 0.94327021, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.953125, + "step": 1373, + "time_per_iteration": 2.7155323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193282, + "balance_loss_mlp": 1.09777129, + "epoch": 0.2643324355521354, + "flos": 583339490304.0, + "grad_norm": 0.02802871933703498, + "language_loss": 0.91373825, + "learning_rate": 0.0008627908206518409, + "loss": 0.9256711, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.95458984, + "step": 1374, + "time_per_iteration": 2.7118475437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189674, + "balance_loss_mlp": 1.09621429, + "epoch": 0.264524817237399, + "flos": 1548025075200.0, + "grad_norm": 0.008601814223210932, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76340932, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.93359375, + "step": 1375, + "time_per_iteration": 4.9838175773620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192464, + "balance_loss_mlp": 1.09709656, + "epoch": 0.26471719892266254, + "flos": 519042932736.0, + "grad_norm": 0.024634755338573868, + "language_loss": 0.99606347, + "learning_rate": 0.0008623617720514241, + "loss": 1.0079881, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.953125, + "step": 1376, + "time_per_iteration": 2.5836029052734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191563, + "balance_loss_mlp": 1.09586143, + "epoch": 0.26490958060792613, + "flos": 518205001728.0, + "grad_norm": 0.02740625444526412, + "language_loss": 0.95827538, + "learning_rate": 0.0008621470366875848, + "loss": 0.97019094, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.95654297, + "step": 1377, + "time_per_iteration": 2.574557304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190438, + "balance_loss_mlp": 1.09507096, + "epoch": 0.26510196229318966, + "flos": 597682372608.0, + "grad_norm": 0.02552910213335578, + "language_loss": 0.96441573, + "learning_rate": 0.0008619321607257966, + "loss": 0.97632015, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.953125, + "step": 1378, + "time_per_iteration": 2.680574655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187734, + "balance_loss_mlp": 1.09227157, + "epoch": 0.26529434397845325, + "flos": 687052459008.0, + "grad_norm": 0.024630390251990656, + "language_loss": 0.90670931, + "learning_rate": 0.000861717144249482, + "loss": 0.91858661, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.95410156, + "step": 1379, + "time_per_iteration": 2.8311944007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181951, + "balance_loss_mlp": 1.08672631, + "epoch": 0.26548672566371684, + "flos": 425259609600.0, + "grad_norm": 0.02240925569996582, + "language_loss": 0.98143864, + "learning_rate": 0.0008615019873421175, + "loss": 0.99325812, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.95166016, + "step": 1380, + "time_per_iteration": 2.472280263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182344, + "balance_loss_mlp": 1.08716714, + "epoch": 0.26567910734898037, + "flos": 490849993728.0, + "grad_norm": 0.024166031959674275, + "language_loss": 0.9586165, + "learning_rate": 0.0008612866900872349, + "loss": 0.97043991, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.95117188, + "step": 1381, + "time_per_iteration": 2.5671043395996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181037, + "balance_loss_mlp": 1.08586013, + "epoch": 0.26587148903424396, + "flos": 535228862976.0, + "grad_norm": 0.024625622440273682, + "language_loss": 0.97316492, + "learning_rate": 0.0008610712525684197, + "loss": 0.98497522, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.95117188, + "step": 1382, + "time_per_iteration": 2.6394782066345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179642, + "balance_loss_mlp": 1.08446515, + "epoch": 0.2660638707195075, + "flos": 1019055046656.0, + "grad_norm": 0.02944222863828147, + "language_loss": 0.96464765, + "learning_rate": 0.0008608556748693121, + "loss": 0.97644401, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.95117188, + "step": 1383, + "time_per_iteration": 3.2514846324920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184353, + "balance_loss_mlp": 1.08941519, + "epoch": 0.2662562524047711, + "flos": 525062900736.0, + "grad_norm": 0.024003921212174706, + "language_loss": 0.95956504, + "learning_rate": 0.000860639957073607, + "loss": 0.97140861, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.94873047, + "step": 1384, + "time_per_iteration": 2.6759448051452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190743, + "balance_loss_mlp": 1.09594798, + "epoch": 0.2664486340900346, + "flos": 553479421440.0, + "grad_norm": 0.02584009515603871, + "language_loss": 0.97059226, + "learning_rate": 0.0008604240992650534, + "loss": 0.98249966, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.94726562, + "step": 1385, + "time_per_iteration": 2.6880476474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187786, + "balance_loss_mlp": 1.09260905, + "epoch": 0.2666410157752982, + "flos": 471208280064.0, + "grad_norm": 0.023709316387392747, + "language_loss": 0.98021734, + "learning_rate": 0.0008602081015274545, + "loss": 0.99209523, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.95117188, + "step": 1386, + "time_per_iteration": 2.71233868598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187602, + "balance_loss_mlp": 1.0924257, + "epoch": 0.2668333974605617, + "flos": 571015574016.0, + "grad_norm": 0.021121239598078063, + "language_loss": 0.90840185, + "learning_rate": 0.0008599919639446684, + "loss": 0.92027789, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.95117188, + "step": 1387, + "time_per_iteration": 2.6656363010406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183674, + "balance_loss_mlp": 1.08840239, + "epoch": 0.2670257791458253, + "flos": 399895369728.0, + "grad_norm": 0.029257146370583235, + "language_loss": 0.92911923, + "learning_rate": 0.000859775686600607, + "loss": 0.940956, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.95214844, + "step": 1388, + "time_per_iteration": 2.5366902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186225, + "balance_loss_mlp": 1.09104884, + "epoch": 0.2672181608310889, + "flos": 516891709440.0, + "grad_norm": 0.02488439836403737, + "language_loss": 0.94369394, + "learning_rate": 0.0008595592695792367, + "loss": 0.95555621, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.95117188, + "step": 1389, + "time_per_iteration": 2.6710469722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184466, + "balance_loss_mlp": 1.08928883, + "epoch": 0.26741054251635243, + "flos": 508525134336.0, + "grad_norm": 0.024055725628873734, + "language_loss": 0.99442971, + "learning_rate": 0.0008593427129645778, + "loss": 1.00627434, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.95117188, + "step": 1390, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184607, + "balance_loss_mlp": 1.08919191, + "epoch": 0.267602924201616, + "flos": 577808345088.0, + "grad_norm": 0.025635319637122064, + "language_loss": 0.93523198, + "learning_rate": 0.0008591260168407052, + "loss": 0.94707805, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.95361328, + "step": 1391, + "time_per_iteration": 2.766150712966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118642, + "balance_loss_mlp": 1.09095728, + "epoch": 0.26779530588687955, + "flos": 524999774208.0, + "grad_norm": 0.02196829508666122, + "language_loss": 0.92168128, + "learning_rate": 0.0008589091812917479, + "loss": 0.93354547, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.95410156, + "step": 1392, + "time_per_iteration": 2.6208953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119079, + "balance_loss_mlp": 1.09580445, + "epoch": 0.26798768757214314, + "flos": 557827530240.0, + "grad_norm": 0.02442636530887492, + "language_loss": 0.95854455, + "learning_rate": 0.0008586922064018887, + "loss": 0.97045243, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.94921875, + "step": 1393, + "time_per_iteration": 2.6643927097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190751, + "balance_loss_mlp": 1.09581244, + "epoch": 0.2681800692574067, + "flos": 932094693888.0, + "grad_norm": 0.0254733622090453, + "language_loss": 0.99184585, + "learning_rate": 0.0008584750922553651, + "loss": 1.00375342, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.94873047, + "step": 1394, + "time_per_iteration": 3.1305503845214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192347, + "balance_loss_mlp": 1.09712303, + "epoch": 0.26837245094267026, + "flos": 702317865984.0, + "grad_norm": 0.023340973249423663, + "language_loss": 0.92753315, + "learning_rate": 0.0008582578389364677, + "loss": 0.93945664, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.95166016, + "step": 1395, + "time_per_iteration": 2.8527095317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184756, + "balance_loss_mlp": 1.08953142, + "epoch": 0.26856483262793385, + "flos": 594393775104.0, + "grad_norm": 0.020526468408011762, + "language_loss": 1.00206113, + "learning_rate": 0.0008580404465295422, + "loss": 1.01390874, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.95166016, + "step": 1396, + "time_per_iteration": 2.784592866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184595, + "balance_loss_mlp": 1.08922791, + "epoch": 0.2687572143131974, + "flos": 715588502016.0, + "grad_norm": 0.024818089102904728, + "language_loss": 0.9790895, + "learning_rate": 0.0008578229151189876, + "loss": 0.99093544, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.953125, + "step": 1397, + "time_per_iteration": 2.901818037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185216, + "balance_loss_mlp": 1.0896579, + "epoch": 0.26894959599846097, + "flos": 468670291968.0, + "grad_norm": 0.028086023154021946, + "language_loss": 0.91012216, + "learning_rate": 0.0008576052447892573, + "loss": 0.92197436, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.95507812, + "step": 1398, + "time_per_iteration": 2.5849812030792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09082139, + "epoch": 0.2691419776837245, + "flos": 469629746688.0, + "grad_norm": 0.022530608820729603, + "language_loss": 0.95147502, + "learning_rate": 0.000857387435624858, + "loss": 0.96333838, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.95458984, + "step": 1399, + "time_per_iteration": 2.5274569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011908, + "balance_loss_mlp": 1.09567106, + "epoch": 0.2693343593689881, + "flos": 939284963328.0, + "grad_norm": 0.02095039568010189, + "language_loss": 0.95472848, + "learning_rate": 0.0008571694877103513, + "loss": 0.96663648, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.95068359, + "step": 1400, + "time_per_iteration": 3.2558727264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190554, + "balance_loss_mlp": 1.09542465, + "epoch": 0.2695267410542516, + "flos": 578793996288.0, + "grad_norm": 0.0241215692671091, + "language_loss": 0.95762217, + "learning_rate": 0.0008569514011303515, + "loss": 0.96952766, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.95068359, + "step": 1401, + "time_per_iteration": 2.8175997734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193641, + "balance_loss_mlp": 1.09846401, + "epoch": 0.2697191227395152, + "flos": 557964516864.0, + "grad_norm": 0.02413892998134183, + "language_loss": 0.96554017, + "learning_rate": 0.0008567331759695277, + "loss": 0.97747654, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.95117188, + "step": 1402, + "time_per_iteration": 2.7052927017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192424, + "balance_loss_mlp": 1.09729552, + "epoch": 0.26991150442477874, + "flos": 530314068480.0, + "grad_norm": 0.024237100625486396, + "language_loss": 0.97319567, + "learning_rate": 0.0008565148123126023, + "loss": 0.98511994, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.95068359, + "step": 1403, + "time_per_iteration": 2.6399028301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187922, + "balance_loss_mlp": 1.09274554, + "epoch": 0.2701038861100423, + "flos": 533086371840.0, + "grad_norm": 0.021620674049761555, + "language_loss": 0.93398714, + "learning_rate": 0.0008562963102443516, + "loss": 0.94586635, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.95117188, + "step": 1404, + "time_per_iteration": 2.6793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185578, + "balance_loss_mlp": 1.09035325, + "epoch": 0.2702962677953059, + "flos": 736504576512.0, + "grad_norm": 0.026106257639691363, + "language_loss": 0.94497591, + "learning_rate": 0.0008560776698496056, + "loss": 0.95683169, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.95166016, + "step": 1405, + "time_per_iteration": 2.8884029388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186883, + "balance_loss_mlp": 1.09170628, + "epoch": 0.27048864948056944, + "flos": 576000225792.0, + "grad_norm": 0.025611862530653208, + "language_loss": 0.95929742, + "learning_rate": 0.0008558588912132481, + "loss": 0.97116625, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.95117188, + "step": 1406, + "time_per_iteration": 2.8396451473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190124, + "balance_loss_mlp": 1.09666443, + "epoch": 0.27068103116583303, + "flos": 1426910212608.0, + "grad_norm": 0.014531874927713828, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77649117, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.93359375, + "step": 1407, + "time_per_iteration": 4.898139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119097, + "balance_loss_mlp": 1.09603214, + "epoch": 0.27087341285109656, + "flos": 533031977472.0, + "grad_norm": 0.024689522623330563, + "language_loss": 0.90804136, + "learning_rate": 0.0008554209195555016, + "loss": 0.91995108, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.94873047, + "step": 1408, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189645, + "balance_loss_mlp": 1.09446859, + "epoch": 0.27106579453636015, + "flos": 582464629248.0, + "grad_norm": 0.0247795195650599, + "language_loss": 0.98232609, + "learning_rate": 0.0008552017267041483, + "loss": 0.99422252, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.95117188, + "step": 1409, + "time_per_iteration": 2.6904594898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118886, + "balance_loss_mlp": 1.09368336, + "epoch": 0.2712581762216237, + "flos": 507880585728.0, + "grad_norm": 0.024309295256612126, + "language_loss": 0.90687084, + "learning_rate": 0.0008549823959512549, + "loss": 0.91875941, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.95117188, + "step": 1410, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189943, + "balance_loss_mlp": 1.09481394, + "epoch": 0.27145055790688727, + "flos": 999142087680.0, + "grad_norm": 0.023895808714677214, + "language_loss": 0.95848304, + "learning_rate": 0.0008547629273819728, + "loss": 0.97038245, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.95068359, + "step": 1411, + "time_per_iteration": 3.36985182762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186274, + "balance_loss_mlp": 1.09109735, + "epoch": 0.2716429395921508, + "flos": 547728697344.0, + "grad_norm": 0.02712613780862537, + "language_loss": 0.93229926, + "learning_rate": 0.0008545433210815074, + "loss": 0.94416201, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.95117188, + "step": 1412, + "time_per_iteration": 2.601452350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182035, + "balance_loss_mlp": 1.08685839, + "epoch": 0.2718353212774144, + "flos": 574310902272.0, + "grad_norm": 0.02439507328911507, + "language_loss": 0.95137858, + "learning_rate": 0.0008543235771351176, + "loss": 0.96319902, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.95117188, + "step": 1413, + "time_per_iteration": 2.7132034301757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197126, + "balance_loss_mlp": 1.10209203, + "epoch": 0.272027702962678, + "flos": 645584881152.0, + "grad_norm": 0.02257567173785872, + "language_loss": 0.91220462, + "learning_rate": 0.0008541036956281154, + "loss": 0.92417586, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.94970703, + "step": 1414, + "time_per_iteration": 2.871951103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187874, + "balance_loss_mlp": 1.09284067, + "epoch": 0.2722200846479415, + "flos": 654995504640.0, + "grad_norm": 0.026411231013774135, + "language_loss": 0.93374348, + "learning_rate": 0.0008538836766458665, + "loss": 0.94562221, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.94970703, + "step": 1415, + "time_per_iteration": 2.8673384189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183666, + "balance_loss_mlp": 1.08868039, + "epoch": 0.2724124663332051, + "flos": 580778033664.0, + "grad_norm": 0.027862690716265133, + "language_loss": 0.96171892, + "learning_rate": 0.0008536635202737897, + "loss": 0.97355556, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.94921875, + "step": 1416, + "time_per_iteration": 2.7829935550689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183251, + "balance_loss_mlp": 1.08831298, + "epoch": 0.2726048480184686, + "flos": 538467795456.0, + "grad_norm": 0.025077003090708358, + "language_loss": 0.93469489, + "learning_rate": 0.0008534432265973573, + "loss": 0.94652736, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.94873047, + "step": 1417, + "time_per_iteration": 2.593364715576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183107, + "balance_loss_mlp": 1.08793056, + "epoch": 0.2727972297037322, + "flos": 997548817920.0, + "grad_norm": 0.025553987949566613, + "language_loss": 0.99255168, + "learning_rate": 0.000853222795702095, + "loss": 1.00438273, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.95117188, + "step": 1418, + "time_per_iteration": 3.387162685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119173, + "balance_loss_mlp": 1.09712589, + "epoch": 0.27298961138899575, + "flos": 607334042112.0, + "grad_norm": 0.02541700118612174, + "language_loss": 0.93465757, + "learning_rate": 0.0008530022276735813, + "loss": 0.94657481, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.9453125, + "step": 1419, + "time_per_iteration": 2.7426016330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.0965513, + "epoch": 0.27318199307425933, + "flos": 530396660736.0, + "grad_norm": 0.025702548257077976, + "language_loss": 0.9374572, + "learning_rate": 0.0008527815225974489, + "loss": 0.94937015, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.94677734, + "step": 1420, + "time_per_iteration": 2.6544342041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118326, + "balance_loss_mlp": 1.08865511, + "epoch": 0.2733743747595229, + "flos": 409911610368.0, + "grad_norm": 0.028874111022423956, + "language_loss": 0.99327809, + "learning_rate": 0.0008525606805593829, + "loss": 1.00511074, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.9453125, + "step": 1421, + "time_per_iteration": 2.4215376377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182106, + "balance_loss_mlp": 1.08721578, + "epoch": 0.27356675644478645, + "flos": 517228082688.0, + "grad_norm": 0.026406413504372096, + "language_loss": 0.92442018, + "learning_rate": 0.0008523397016451213, + "loss": 0.93624127, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.94824219, + "step": 1422, + "time_per_iteration": 2.5680603981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184812, + "balance_loss_mlp": 1.09011269, + "epoch": 0.27375913813005004, + "flos": 1054058221056.0, + "grad_norm": 0.02228341429952914, + "language_loss": 0.94973963, + "learning_rate": 0.0008521185859404564, + "loss": 0.96158779, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.94628906, + "step": 1423, + "time_per_iteration": 3.37345814704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179884, + "balance_loss_mlp": 1.08485043, + "epoch": 0.27395151981531357, + "flos": 626003566080.0, + "grad_norm": 0.02387683630357993, + "language_loss": 0.97909242, + "learning_rate": 0.0008518973335312326, + "loss": 0.99089128, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.94970703, + "step": 1424, + "time_per_iteration": 2.8314859867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184763, + "balance_loss_mlp": 1.08982456, + "epoch": 0.27414390150057716, + "flos": 551414793216.0, + "grad_norm": 0.028545098094769822, + "language_loss": 0.95577884, + "learning_rate": 0.0008516759445033477, + "loss": 0.96762645, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.94873047, + "step": 1425, + "time_per_iteration": 2.6086578369140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.08705389, + "epoch": 0.2743362831858407, + "flos": 540951389184.0, + "grad_norm": 0.02677358847245462, + "language_loss": 0.96958816, + "learning_rate": 0.0008514544189427526, + "loss": 0.9814086, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.94921875, + "step": 1426, + "time_per_iteration": 2.6927483081817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.09713852, + "epoch": 0.2745286648711043, + "flos": 469545153024.0, + "grad_norm": 0.025998263163597202, + "language_loss": 0.95807564, + "learning_rate": 0.0008512327569354511, + "loss": 0.96999258, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.94482422, + "step": 1427, + "time_per_iteration": 2.5617682933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119268, + "balance_loss_mlp": 1.09764659, + "epoch": 0.2747210465563678, + "flos": 473871794688.0, + "grad_norm": 0.02733358796633043, + "language_loss": 0.93333006, + "learning_rate": 0.0008510109585675001, + "loss": 0.94525683, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.94970703, + "step": 1428, + "time_per_iteration": 2.7269434928894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205208, + "balance_loss_mlp": 1.11193848, + "epoch": 0.2749134282416314, + "flos": 1318056866304.0, + "grad_norm": 0.019809968329655446, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82358551, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.93164062, + "step": 1429, + "time_per_iteration": 4.731899738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190948, + "balance_loss_mlp": 1.0958662, + "epoch": 0.275105809926895, + "flos": 972531684864.0, + "grad_norm": 0.03147414200634365, + "language_loss": 0.91184711, + "learning_rate": 0.0008505669530941415, + "loss": 0.92375666, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.95019531, + "step": 1430, + "time_per_iteration": 3.3260724544525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189047, + "balance_loss_mlp": 1.09387004, + "epoch": 0.2752981916121585, + "flos": 528368962560.0, + "grad_norm": 0.025580193945061114, + "language_loss": 0.95012403, + "learning_rate": 0.000850344746161112, + "loss": 0.96201456, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.95117188, + "step": 1431, + "time_per_iteration": 2.5820231437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186021, + "balance_loss_mlp": 1.09093964, + "epoch": 0.2754905732974221, + "flos": 454598654976.0, + "grad_norm": 0.024219881250434897, + "language_loss": 0.962569, + "learning_rate": 0.0008501224032121894, + "loss": 0.97442919, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.95019531, + "step": 1432, + "time_per_iteration": 2.501572847366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188894, + "balance_loss_mlp": 1.09362173, + "epoch": 0.27568295498268564, + "flos": 498508893696.0, + "grad_norm": 0.02427263624604226, + "language_loss": 0.90960014, + "learning_rate": 0.0008498999243336946, + "loss": 0.921489, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.95214844, + "step": 1433, + "time_per_iteration": 2.6212003231048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192375, + "balance_loss_mlp": 1.09715116, + "epoch": 0.2758753366679492, + "flos": 609416134656.0, + "grad_norm": 0.024278981864862804, + "language_loss": 0.95570171, + "learning_rate": 0.0008496773096120021, + "loss": 0.9676255, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.95166016, + "step": 1434, + "time_per_iteration": 2.804689407348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118926, + "balance_loss_mlp": 1.09370184, + "epoch": 0.27606771835321275, + "flos": 741436835328.0, + "grad_norm": 0.025697024392157108, + "language_loss": 0.95037985, + "learning_rate": 0.0008494545591335381, + "loss": 0.96227252, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.95507812, + "step": 1435, + "time_per_iteration": 2.9329347610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195816, + "balance_loss_mlp": 1.10068655, + "epoch": 0.27626010003847634, + "flos": 555748165632.0, + "grad_norm": 0.0206290639721941, + "language_loss": 0.927001, + "learning_rate": 0.0008492316729847823, + "loss": 0.93895912, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.95068359, + "step": 1436, + "time_per_iteration": 2.820913553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09245288, + "epoch": 0.2764524817237399, + "flos": 543695494656.0, + "grad_norm": 0.02424730092158954, + "language_loss": 0.88914406, + "learning_rate": 0.0008490086512522664, + "loss": 0.90102232, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.953125, + "step": 1437, + "time_per_iteration": 2.7454309463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186593, + "balance_loss_mlp": 1.09127319, + "epoch": 0.27664486340900346, + "flos": 407128573440.0, + "grad_norm": 0.024912305575595636, + "language_loss": 0.99286187, + "learning_rate": 0.0008487854940225755, + "loss": 1.00472784, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.95263672, + "step": 1438, + "time_per_iteration": 2.4809510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183239, + "balance_loss_mlp": 1.08834839, + "epoch": 0.27683724509426705, + "flos": 523156726272.0, + "grad_norm": 0.025259333782437998, + "language_loss": 0.98154646, + "learning_rate": 0.0008485622013823466, + "loss": 0.99337876, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.94824219, + "step": 1439, + "time_per_iteration": 2.65401554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183688, + "balance_loss_mlp": 1.08865404, + "epoch": 0.2770296267795306, + "flos": 536409897984.0, + "grad_norm": 0.02898674716386243, + "language_loss": 0.9318651, + "learning_rate": 0.00084833877341827, + "loss": 0.94370198, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.94970703, + "step": 1440, + "time_per_iteration": 2.6294455528259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192537, + "balance_loss_mlp": 1.09755075, + "epoch": 0.27722200846479417, + "flos": 488970015744.0, + "grad_norm": 0.027244615130064133, + "language_loss": 0.90653217, + "learning_rate": 0.000848115210217088, + "loss": 0.91845751, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.94921875, + "step": 1441, + "time_per_iteration": 2.5394957065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118987, + "balance_loss_mlp": 1.09493196, + "epoch": 0.2774143901500577, + "flos": 619443108864.0, + "grad_norm": 0.024388639686817183, + "language_loss": 0.9228884, + "learning_rate": 0.0008478915118655952, + "loss": 0.93478709, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.94873047, + "step": 1442, + "time_per_iteration": 2.7634968757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119119, + "balance_loss_mlp": 1.0962522, + "epoch": 0.2776067718353213, + "flos": 514844545536.0, + "grad_norm": 0.021441164984372, + "language_loss": 0.94525409, + "learning_rate": 0.0008476676784506393, + "loss": 0.95716596, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.94873047, + "step": 1443, + "time_per_iteration": 2.6474499702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.09678042, + "epoch": 0.2777991535205848, + "flos": 1006040919552.0, + "grad_norm": 0.026818715625153876, + "language_loss": 0.93016809, + "learning_rate": 0.0008474437100591201, + "loss": 0.94208288, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.94628906, + "step": 1444, + "time_per_iteration": 3.311842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189789, + "balance_loss_mlp": 1.09494591, + "epoch": 0.2779915352058484, + "flos": 551375861760.0, + "grad_norm": 0.021641305677188864, + "language_loss": 0.95129728, + "learning_rate": 0.0008472196067779898, + "loss": 0.96319526, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.94775391, + "step": 1445, + "time_per_iteration": 2.667910575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186263, + "balance_loss_mlp": 1.091277, + "epoch": 0.278183916891112, + "flos": 875215990272.0, + "grad_norm": 0.030449834007814664, + "language_loss": 0.98351109, + "learning_rate": 0.0008469953686942531, + "loss": 0.99537361, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.94921875, + "step": 1446, + "time_per_iteration": 3.100473403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187264, + "balance_loss_mlp": 1.09246826, + "epoch": 0.2783762985763755, + "flos": 625195834368.0, + "grad_norm": 0.025904191205549917, + "language_loss": 0.93646944, + "learning_rate": 0.0008467709958949668, + "loss": 0.94834208, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.94726562, + "step": 1447, + "time_per_iteration": 2.7201731204986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09333074, + "epoch": 0.2785686802616391, + "flos": 582911792640.0, + "grad_norm": 0.026760771702797625, + "language_loss": 0.94447374, + "learning_rate": 0.0008465464884672403, + "loss": 0.9563536, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.94580078, + "step": 1448, + "time_per_iteration": 2.7300403118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118631, + "balance_loss_mlp": 1.09180129, + "epoch": 0.27876106194690264, + "flos": 588538991616.0, + "grad_norm": 0.0212290178255441, + "language_loss": 0.93077391, + "learning_rate": 0.0008463218464982348, + "loss": 0.94263697, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.94433594, + "step": 1449, + "time_per_iteration": 2.86130952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190148, + "balance_loss_mlp": 1.09520972, + "epoch": 0.27895344363216623, + "flos": 877430340096.0, + "grad_norm": 0.02756647509109648, + "language_loss": 0.96903402, + "learning_rate": 0.0008460970700751645, + "loss": 0.98093557, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.94873047, + "step": 1450, + "time_per_iteration": 3.069391965866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188227, + "balance_loss_mlp": 1.0932883, + "epoch": 0.27914582531742976, + "flos": 605035098624.0, + "grad_norm": 0.025261876769304706, + "language_loss": 0.97766632, + "learning_rate": 0.000845872159285295, + "loss": 0.98954856, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.94873047, + "step": 1451, + "time_per_iteration": 2.748164653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197098, + "balance_loss_mlp": 1.10325623, + "epoch": 0.27933820700269335, + "flos": 1501130411520.0, + "grad_norm": 0.012982305827020523, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78963947, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.9375, + "step": 1452, + "time_per_iteration": 4.906180143356323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198876, + "balance_loss_mlp": 1.10408044, + "epoch": 0.2795305886879569, + "flos": 1033517451264.0, + "grad_norm": 0.027093914793319178, + "language_loss": 0.95323974, + "learning_rate": 0.0008454219349544836, + "loss": 0.9652285, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.94726562, + "step": 1453, + "time_per_iteration": 3.333178758621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194793, + "balance_loss_mlp": 1.10014069, + "epoch": 0.27972297037322047, + "flos": 608226367488.0, + "grad_norm": 0.025225525542022995, + "language_loss": 0.8972255, + "learning_rate": 0.000845196621588334, + "loss": 0.90917349, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.94580078, + "step": 1454, + "time_per_iteration": 2.7425026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191631, + "balance_loss_mlp": 1.09697926, + "epoch": 0.27991535205848406, + "flos": 631560907776.0, + "grad_norm": 0.023908777965609074, + "language_loss": 0.86623406, + "learning_rate": 0.0008449711742049706, + "loss": 0.87815034, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.94580078, + "step": 1455, + "time_per_iteration": 2.8148674964904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188728, + "balance_loss_mlp": 1.09369469, + "epoch": 0.2801077337437476, + "flos": 550353280512.0, + "grad_norm": 0.02989232443782136, + "language_loss": 0.94001353, + "learning_rate": 0.0008447455928919196, + "loss": 0.95190072, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.94970703, + "step": 1456, + "time_per_iteration": 2.6030025482177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186748, + "balance_loss_mlp": 1.09166706, + "epoch": 0.2803001154290112, + "flos": 487741317120.0, + "grad_norm": 0.023726139763527557, + "language_loss": 0.95883709, + "learning_rate": 0.0008445198777367595, + "loss": 0.97070462, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.95019531, + "step": 1457, + "time_per_iteration": 2.598212718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188426, + "balance_loss_mlp": 1.09344053, + "epoch": 0.2804924971142747, + "flos": 523091598336.0, + "grad_norm": 0.027291046925092925, + "language_loss": 0.9210875, + "learning_rate": 0.0008442940288271208, + "loss": 0.93297172, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.94921875, + "step": 1458, + "time_per_iteration": 2.617572069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189438, + "balance_loss_mlp": 1.09473801, + "epoch": 0.2806848787995383, + "flos": 528849053184.0, + "grad_norm": 0.02378106137707509, + "language_loss": 0.95258486, + "learning_rate": 0.0008440680462506856, + "loss": 0.96447927, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.94628906, + "step": 1459, + "time_per_iteration": 2.7465641498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191591, + "balance_loss_mlp": 1.09660506, + "epoch": 0.2808772604848018, + "flos": 486484420608.0, + "grad_norm": 0.02248739277997059, + "language_loss": 0.9351486, + "learning_rate": 0.0008438419300951883, + "loss": 0.94706452, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.94921875, + "step": 1460, + "time_per_iteration": 2.6331160068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188162, + "balance_loss_mlp": 1.09303284, + "epoch": 0.2810696421700654, + "flos": 619339049472.0, + "grad_norm": 0.024684272432392865, + "language_loss": 0.96464884, + "learning_rate": 0.0008436156804484148, + "loss": 0.97653049, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.95068359, + "step": 1461, + "time_per_iteration": 2.7740418910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188616, + "balance_loss_mlp": 1.09358263, + "epoch": 0.28126202385532895, + "flos": 455686364160.0, + "grad_norm": 0.026728942288464865, + "language_loss": 0.99464989, + "learning_rate": 0.0008433892973982031, + "loss": 1.00653601, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.94970703, + "step": 1462, + "time_per_iteration": 2.5151000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188441, + "balance_loss_mlp": 1.09345496, + "epoch": 0.28145440554059253, + "flos": 531738150912.0, + "grad_norm": 0.02863032020985732, + "language_loss": 0.95777607, + "learning_rate": 0.0008431627810324431, + "loss": 0.96966046, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.94921875, + "step": 1463, + "time_per_iteration": 2.64477801322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.09298646, + "epoch": 0.2816467872258561, + "flos": 453163838976.0, + "grad_norm": 0.025052425157320847, + "language_loss": 0.90961307, + "learning_rate": 0.000842936131439076, + "loss": 0.92149282, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.94921875, + "step": 1464, + "time_per_iteration": 2.5910096168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186267, + "balance_loss_mlp": 1.09147155, + "epoch": 0.28183916891111965, + "flos": 473704608768.0, + "grad_norm": 0.02627501463847235, + "language_loss": 0.97073281, + "learning_rate": 0.0008427093487060951, + "loss": 0.98259544, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.94726562, + "step": 1465, + "time_per_iteration": 2.6250505447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187944, + "balance_loss_mlp": 1.09300542, + "epoch": 0.28203155059638324, + "flos": 558188098560.0, + "grad_norm": 0.02108937585301408, + "language_loss": 0.91709232, + "learning_rate": 0.000842482432921545, + "loss": 0.92897177, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.94873047, + "step": 1466, + "time_per_iteration": 2.809101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.09139562, + "epoch": 0.28222393228164677, + "flos": 417878685696.0, + "grad_norm": 0.025824876793605126, + "language_loss": 0.96517414, + "learning_rate": 0.0008422553841735225, + "loss": 0.97703695, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.94824219, + "step": 1467, + "time_per_iteration": 2.468773365020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184331, + "balance_loss_mlp": 1.08963072, + "epoch": 0.28241631396691036, + "flos": 606040215552.0, + "grad_norm": 0.02479925640814435, + "language_loss": 0.92490911, + "learning_rate": 0.0008420282025501757, + "loss": 0.93675244, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.94628906, + "step": 1468, + "time_per_iteration": 2.7617123126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184258, + "balance_loss_mlp": 1.08960581, + "epoch": 0.2826086956521739, + "flos": 574050390528.0, + "grad_norm": 0.023359152371130017, + "language_loss": 0.93868291, + "learning_rate": 0.0008418008881397043, + "loss": 0.95052546, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.94580078, + "step": 1469, + "time_per_iteration": 2.681727886199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185359, + "balance_loss_mlp": 1.09056342, + "epoch": 0.2828010773374375, + "flos": 844318603776.0, + "grad_norm": 0.02469333041166596, + "language_loss": 0.92646587, + "learning_rate": 0.0008415734410303595, + "loss": 0.93831944, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.94726562, + "step": 1470, + "time_per_iteration": 3.1949617862701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186089, + "balance_loss_mlp": 1.09124613, + "epoch": 0.28299345902270107, + "flos": 543771356160.0, + "grad_norm": 0.022743934694793657, + "language_loss": 0.98454034, + "learning_rate": 0.0008413458613104444, + "loss": 0.99640119, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.94775391, + "step": 1471, + "time_per_iteration": 2.679994583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184615, + "balance_loss_mlp": 1.08972394, + "epoch": 0.2831858407079646, + "flos": 572754562560.0, + "grad_norm": 0.02381851847695354, + "language_loss": 0.91435039, + "learning_rate": 0.0008411181490683129, + "loss": 0.92619658, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.94824219, + "step": 1472, + "time_per_iteration": 2.7178077697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186226, + "balance_loss_mlp": 1.09152639, + "epoch": 0.2833782223932282, + "flos": 765170875392.0, + "grad_norm": 0.023393787071714342, + "language_loss": 0.92628008, + "learning_rate": 0.0008408903043923707, + "loss": 0.9381423, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.94628906, + "step": 1473, + "time_per_iteration": 3.0261785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184462, + "balance_loss_mlp": 1.0899055, + "epoch": 0.2835706040784917, + "flos": 540087261696.0, + "grad_norm": 0.026141956799832673, + "language_loss": 0.93214488, + "learning_rate": 0.0008406623273710754, + "loss": 0.94398952, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.94482422, + "step": 1474, + "time_per_iteration": 2.62430739402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118759, + "balance_loss_mlp": 1.09312844, + "epoch": 0.2837629857637553, + "flos": 531653557248.0, + "grad_norm": 0.026627011980012938, + "language_loss": 0.91140723, + "learning_rate": 0.0008404342180929351, + "loss": 0.9232831, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.94384766, + "step": 1475, + "time_per_iteration": 2.6201882362365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191029, + "balance_loss_mlp": 1.09666264, + "epoch": 0.28395536744901884, + "flos": 541109842944.0, + "grad_norm": 0.026942213566754976, + "language_loss": 0.91036892, + "learning_rate": 0.00084020597664651, + "loss": 0.92227924, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.94287109, + "step": 1476, + "time_per_iteration": 2.792515516281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191806, + "balance_loss_mlp": 1.09743977, + "epoch": 0.2841477491342824, + "flos": 574801726464.0, + "grad_norm": 0.0281069748307863, + "language_loss": 0.94561875, + "learning_rate": 0.0008399776031204111, + "loss": 0.95753682, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.94287109, + "step": 1477, + "time_per_iteration": 2.7592930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189206, + "balance_loss_mlp": 1.09479237, + "epoch": 0.28434013081954596, + "flos": 573138599424.0, + "grad_norm": 0.025578880464706598, + "language_loss": 0.90985346, + "learning_rate": 0.0008397490976033009, + "loss": 0.92174542, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.94335938, + "step": 1478, + "time_per_iteration": 2.72312331199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193047, + "balance_loss_mlp": 1.10015869, + "epoch": 0.28453251250480954, + "flos": 1556673629184.0, + "grad_norm": 0.009281527310597816, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.7907269, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.92773438, + "step": 1479, + "time_per_iteration": 4.714428901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188304, + "balance_loss_mlp": 1.0943675, + "epoch": 0.28472489419007313, + "flos": 750426491904.0, + "grad_norm": 0.023822673694276757, + "language_loss": 0.93367732, + "learning_rate": 0.0008392916909509525, + "loss": 0.94556034, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.93847656, + "step": 1480, + "time_per_iteration": 3.0365796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183623, + "balance_loss_mlp": 1.08930516, + "epoch": 0.28491727587533666, + "flos": 491138703360.0, + "grad_norm": 0.028675048847138535, + "language_loss": 0.94468164, + "learning_rate": 0.0008390627899932954, + "loss": 0.95651788, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.94238281, + "step": 1481, + "time_per_iteration": 2.562316656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187714, + "balance_loss_mlp": 1.09353888, + "epoch": 0.28510965756060025, + "flos": 730359081984.0, + "grad_norm": 0.028797322451775676, + "language_loss": 0.96514452, + "learning_rate": 0.000838833757399789, + "loss": 0.97702163, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.94091797, + "step": 1482, + "time_per_iteration": 2.955920696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189825, + "balance_loss_mlp": 1.09593546, + "epoch": 0.2853020392458638, + "flos": 552669688320.0, + "grad_norm": 0.027781834693451857, + "language_loss": 0.92148101, + "learning_rate": 0.0008386045932593515, + "loss": 0.93337923, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.93798828, + "step": 1483, + "time_per_iteration": 2.6609442234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185409, + "balance_loss_mlp": 1.09151959, + "epoch": 0.28549442093112737, + "flos": 756096625152.0, + "grad_norm": 0.023489805753692042, + "language_loss": 0.9365592, + "learning_rate": 0.0008383752976609525, + "loss": 0.94841331, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.93798828, + "step": 1484, + "time_per_iteration": 2.914872646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.09480286, + "epoch": 0.2856868026163909, + "flos": 539703224832.0, + "grad_norm": 0.026354969281760218, + "language_loss": 0.9020288, + "learning_rate": 0.0008381458706936123, + "loss": 0.91391522, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.9375, + "step": 1485, + "time_per_iteration": 2.7100982666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190691, + "balance_loss_mlp": 1.09675431, + "epoch": 0.2858791843016545, + "flos": 584920025088.0, + "grad_norm": 0.026556247425645045, + "language_loss": 0.97539783, + "learning_rate": 0.0008379163124464025, + "loss": 0.98730469, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.93847656, + "step": 1486, + "time_per_iteration": 2.7065536975860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192022, + "balance_loss_mlp": 1.0979898, + "epoch": 0.286071565986918, + "flos": 646051510272.0, + "grad_norm": 0.03147840332437955, + "language_loss": 0.84533966, + "learning_rate": 0.0008376866230084452, + "loss": 0.85725987, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.93945312, + "step": 1487, + "time_per_iteration": 2.818673849105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186798, + "balance_loss_mlp": 1.09295619, + "epoch": 0.2862639476721816, + "flos": 492330471936.0, + "grad_norm": 0.02612625436823832, + "language_loss": 0.963471, + "learning_rate": 0.000837456802468914, + "loss": 0.975339, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.9375, + "step": 1488, + "time_per_iteration": 2.5766210556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185712, + "balance_loss_mlp": 1.09187043, + "epoch": 0.2864563293574452, + "flos": 522744491520.0, + "grad_norm": 0.023875595461199783, + "language_loss": 0.96454561, + "learning_rate": 0.0008372268509170331, + "loss": 0.9764027, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.9375, + "step": 1489, + "time_per_iteration": 2.7241337299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117946, + "balance_loss_mlp": 1.08537972, + "epoch": 0.2866487110427087, + "flos": 548256451584.0, + "grad_norm": 0.022999113981848278, + "language_loss": 0.93815279, + "learning_rate": 0.0008369967684420779, + "loss": 0.94994742, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.93994141, + "step": 1490, + "time_per_iteration": 2.7358930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180309, + "balance_loss_mlp": 1.08656251, + "epoch": 0.2868410927279723, + "flos": 483217290240.0, + "grad_norm": 0.024118055050044187, + "language_loss": 0.93676293, + "learning_rate": 0.0008367665551333736, + "loss": 0.94856608, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.93652344, + "step": 1491, + "time_per_iteration": 2.6094913482666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181201, + "balance_loss_mlp": 1.08731139, + "epoch": 0.28703347441323585, + "flos": 726136499712.0, + "grad_norm": 0.03204326630579906, + "language_loss": 0.96034807, + "learning_rate": 0.0008365362110802977, + "loss": 0.9721601, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.93798828, + "step": 1492, + "time_per_iteration": 2.862281322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180339, + "balance_loss_mlp": 1.08630645, + "epoch": 0.28722585609849943, + "flos": 636213189120.0, + "grad_norm": 0.024948941988181064, + "language_loss": 0.92257547, + "learning_rate": 0.0008363057363722773, + "loss": 0.93437886, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.93945312, + "step": 1493, + "time_per_iteration": 2.8364765644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.08695745, + "epoch": 0.28741823778376296, + "flos": 511251775488.0, + "grad_norm": 0.026788978355157977, + "language_loss": 0.94388151, + "learning_rate": 0.0008360751310987906, + "loss": 0.9556905, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.93847656, + "step": 1494, + "time_per_iteration": 2.5825915336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.09244919, + "epoch": 0.28761061946902655, + "flos": 604931039232.0, + "grad_norm": 0.023099591474152015, + "language_loss": 0.92881125, + "learning_rate": 0.0008358443953493666, + "loss": 0.94067132, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.93457031, + "step": 1495, + "time_per_iteration": 2.8426852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190116, + "balance_loss_mlp": 1.09617913, + "epoch": 0.28780300115429014, + "flos": 408059830272.0, + "grad_norm": 0.026469370193436835, + "language_loss": 0.97524667, + "learning_rate": 0.0008356135292135851, + "loss": 0.98714793, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.93847656, + "step": 1496, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187979, + "balance_loss_mlp": 1.09356499, + "epoch": 0.28799538283955367, + "flos": 375744365568.0, + "grad_norm": 0.028081335314896084, + "language_loss": 1.02447343, + "learning_rate": 0.0008353825327810758, + "loss": 1.03635335, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.94335938, + "step": 1497, + "time_per_iteration": 2.4137980937957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188393, + "balance_loss_mlp": 1.09416974, + "epoch": 0.28818776452481726, + "flos": 593019357696.0, + "grad_norm": 0.027570910872340922, + "language_loss": 0.91214752, + "learning_rate": 0.00083515140614152, + "loss": 0.9240315, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.94140625, + "step": 1498, + "time_per_iteration": 2.7084319591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188877, + "balance_loss_mlp": 1.0943675, + "epoch": 0.2883801462100808, + "flos": 536103724032.0, + "grad_norm": 0.024692508476740448, + "language_loss": 0.97239816, + "learning_rate": 0.0008349201493846485, + "loss": 0.9842869, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.94433594, + "step": 1499, + "time_per_iteration": 2.6401236057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190398, + "balance_loss_mlp": 1.09617448, + "epoch": 0.2885725278953444, + "flos": 481076800512.0, + "grad_norm": 0.026282906035864008, + "language_loss": 0.98523659, + "learning_rate": 0.0008346887626002432, + "loss": 0.99714065, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.94140625, + "step": 1500, + "time_per_iteration": 2.52458119392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.09863722, + "epoch": 0.2887649095806079, + "flos": 465029858304.0, + "grad_norm": 0.024051725112114657, + "language_loss": 0.95880306, + "learning_rate": 0.000834457245878137, + "loss": 0.970734, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.94384766, + "step": 1501, + "time_per_iteration": 2.629535436630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192018, + "balance_loss_mlp": 1.09765196, + "epoch": 0.2889572912658715, + "flos": 932639912448.0, + "grad_norm": 0.02596355901590014, + "language_loss": 0.90450358, + "learning_rate": 0.000834225599308212, + "loss": 0.9164238, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.94287109, + "step": 1502, + "time_per_iteration": 3.2340567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189718, + "balance_loss_mlp": 1.09568572, + "epoch": 0.28914967295113503, + "flos": 571256620032.0, + "grad_norm": 0.02412179831144176, + "language_loss": 0.9487462, + "learning_rate": 0.0008339938229804016, + "loss": 0.96064335, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.93945312, + "step": 1503, + "time_per_iteration": 2.710339069366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193321, + "balance_loss_mlp": 1.10081482, + "epoch": 0.2893420546363986, + "flos": 1489872010752.0, + "grad_norm": 0.01509287591883609, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76628143, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.92382812, + "step": 1504, + "time_per_iteration": 4.937675714492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189832, + "balance_loss_mlp": 1.09579968, + "epoch": 0.2895344363216622, + "flos": 471182083584.0, + "grad_norm": 0.02978733186062401, + "language_loss": 0.95586789, + "learning_rate": 0.0008335298814111094, + "loss": 0.96776623, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.93945312, + "step": 1505, + "time_per_iteration": 2.5757808685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.10075009, + "epoch": 0.28972681800692573, + "flos": 649340107776.0, + "grad_norm": 0.024998045510076724, + "language_loss": 0.95390272, + "learning_rate": 0.0008332977163497455, + "loss": 0.96585107, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.93994141, + "step": 1506, + "time_per_iteration": 2.8062288761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190367, + "balance_loss_mlp": 1.09638238, + "epoch": 0.2899191996921893, + "flos": 573305785344.0, + "grad_norm": 0.023440576211443395, + "language_loss": 0.92864263, + "learning_rate": 0.0008330654218907325, + "loss": 0.94054627, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.93896484, + "step": 1507, + "time_per_iteration": 2.6871397495269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195663, + "balance_loss_mlp": 1.10158336, + "epoch": 0.29011158137745285, + "flos": 662636940288.0, + "grad_norm": 0.026311762315396375, + "language_loss": 0.90949756, + "learning_rate": 0.0008328329981242548, + "loss": 0.92145419, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.93994141, + "step": 1508, + "time_per_iteration": 2.870436906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189885, + "balance_loss_mlp": 1.09585261, + "epoch": 0.29030396306271644, + "flos": 537402279936.0, + "grad_norm": 0.02293974263799261, + "language_loss": 0.95641714, + "learning_rate": 0.0008326004451405475, + "loss": 0.96831596, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.93945312, + "step": 1509, + "time_per_iteration": 2.7639336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191857, + "balance_loss_mlp": 1.09815872, + "epoch": 0.29049634474798, + "flos": 512955835392.0, + "grad_norm": 0.025710607890434264, + "language_loss": 0.93112034, + "learning_rate": 0.0008323677630298957, + "loss": 0.94303894, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.93603516, + "step": 1510, + "time_per_iteration": 2.561455726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118953, + "balance_loss_mlp": 1.09592652, + "epoch": 0.29068872643324356, + "flos": 614982208512.0, + "grad_norm": 0.023671610956976636, + "language_loss": 0.92362118, + "learning_rate": 0.0008321349518826345, + "loss": 0.93551642, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.93505859, + "step": 1511, + "time_per_iteration": 2.807711362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191488, + "balance_loss_mlp": 1.09736073, + "epoch": 0.2908811081185071, + "flos": 547468185600.0, + "grad_norm": 0.029262624151918007, + "language_loss": 1.03824317, + "learning_rate": 0.0008319020117891491, + "loss": 1.05015802, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.94042969, + "step": 1512, + "time_per_iteration": 2.626357316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192195, + "balance_loss_mlp": 1.09840155, + "epoch": 0.2910734898037707, + "flos": 605901227520.0, + "grad_norm": 0.026098769068304807, + "language_loss": 0.96355087, + "learning_rate": 0.0008316689428398751, + "loss": 0.97547281, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.93701172, + "step": 1513, + "time_per_iteration": 2.6982998847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190959, + "balance_loss_mlp": 1.09721279, + "epoch": 0.29126587148903427, + "flos": 575835041280.0, + "grad_norm": 0.02240755749123148, + "language_loss": 0.95587385, + "learning_rate": 0.0008314357451252979, + "loss": 0.96778345, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.93652344, + "step": 1514, + "time_per_iteration": 2.7506277561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185358, + "balance_loss_mlp": 1.09170711, + "epoch": 0.2914582531742978, + "flos": 572133482496.0, + "grad_norm": 0.030106635879309524, + "language_loss": 0.98758858, + "learning_rate": 0.0008312024187359527, + "loss": 0.99944222, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.93554688, + "step": 1515, + "time_per_iteration": 2.6389546394348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186161, + "balance_loss_mlp": 1.09265339, + "epoch": 0.2916506348595614, + "flos": 732302186496.0, + "grad_norm": 0.023105382424412787, + "language_loss": 0.95643955, + "learning_rate": 0.000830968963762425, + "loss": 0.96830118, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.93408203, + "step": 1516, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183995, + "balance_loss_mlp": 1.09048688, + "epoch": 0.2918430165448249, + "flos": 511466625024.0, + "grad_norm": 0.027481799845478876, + "language_loss": 0.92072952, + "learning_rate": 0.0008307353802953497, + "loss": 0.93256938, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.93408203, + "step": 1517, + "time_per_iteration": 2.6852073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188929, + "balance_loss_mlp": 1.09546912, + "epoch": 0.2920353982300885, + "flos": 631606569984.0, + "grad_norm": 0.024841994736450757, + "language_loss": 0.95207542, + "learning_rate": 0.0008305016684254125, + "loss": 0.9639647, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.93359375, + "step": 1518, + "time_per_iteration": 2.78326678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185623, + "balance_loss_mlp": 1.0920676, + "epoch": 0.29222777991535204, + "flos": 502670350848.0, + "grad_norm": 0.02442081482663903, + "language_loss": 0.96402657, + "learning_rate": 0.0008302678282433479, + "loss": 0.97588277, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.93457031, + "step": 1519, + "time_per_iteration": 2.580885887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186077, + "balance_loss_mlp": 1.09261727, + "epoch": 0.2924201616006156, + "flos": 487841373696.0, + "grad_norm": 0.025531334181834578, + "language_loss": 0.92434102, + "learning_rate": 0.0008300338598399411, + "loss": 0.93620181, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.93359375, + "step": 1520, + "time_per_iteration": 2.60040020942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182574, + "balance_loss_mlp": 1.08911419, + "epoch": 0.2926125432858792, + "flos": 477410170368.0, + "grad_norm": 0.025034871095789283, + "language_loss": 1.04410791, + "learning_rate": 0.0008297997633060263, + "loss": 1.05593348, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.93359375, + "step": 1521, + "time_per_iteration": 2.5479507446289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184296, + "balance_loss_mlp": 1.09083581, + "epoch": 0.29280492497114274, + "flos": 677867418624.0, + "grad_norm": 0.023158831925944874, + "language_loss": 0.93757105, + "learning_rate": 0.0008295655387324883, + "loss": 0.94941401, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.93359375, + "step": 1522, + "time_per_iteration": 2.80924916267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184597, + "balance_loss_mlp": 1.09113646, + "epoch": 0.29299730665640633, + "flos": 459344262144.0, + "grad_norm": 0.024881330364852117, + "language_loss": 0.95369709, + "learning_rate": 0.0008293311862102609, + "loss": 0.96554303, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.93359375, + "step": 1523, + "time_per_iteration": 2.5006909370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183918, + "balance_loss_mlp": 1.09055364, + "epoch": 0.29318968834166986, + "flos": 447495707136.0, + "grad_norm": 0.027757525537519354, + "language_loss": 0.99242002, + "learning_rate": 0.0008290967058303275, + "loss": 1.00425935, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.93261719, + "step": 1524, + "time_per_iteration": 2.472071409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.09098816, + "epoch": 0.29338207002693345, + "flos": 451255663104.0, + "grad_norm": 0.024483324027042522, + "language_loss": 0.93697757, + "learning_rate": 0.0008288620976837219, + "loss": 0.9488225, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.93408203, + "step": 1525, + "time_per_iteration": 2.486726760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183678, + "balance_loss_mlp": 1.08997941, + "epoch": 0.293574451712197, + "flos": 503284700160.0, + "grad_norm": 0.025672010983446535, + "language_loss": 0.92014909, + "learning_rate": 0.000828627361861527, + "loss": 0.93198591, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.93603516, + "step": 1526, + "time_per_iteration": 2.557725429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183155, + "balance_loss_mlp": 1.089504, + "epoch": 0.29376683339746057, + "flos": 697683048960.0, + "grad_norm": 0.028193197708561973, + "language_loss": 0.94158876, + "learning_rate": 0.0008283924984548752, + "loss": 0.95342028, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.93554688, + "step": 1527, + "time_per_iteration": 2.866138219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182528, + "balance_loss_mlp": 1.08882964, + "epoch": 0.2939592150827241, + "flos": 479541927936.0, + "grad_norm": 0.024215116577050826, + "language_loss": 0.92182994, + "learning_rate": 0.0008281575075549485, + "loss": 0.93365526, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.93603516, + "step": 1528, + "time_per_iteration": 2.5585758686065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202408, + "balance_loss_mlp": 1.1108551, + "epoch": 0.2941515967679877, + "flos": 1488386803200.0, + "grad_norm": 0.02007823063587109, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78555101, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.9140625, + "step": 1529, + "time_per_iteration": 4.658870697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186281, + "balance_loss_mlp": 1.09267783, + "epoch": 0.2943439784532513, + "flos": 675399287808.0, + "grad_norm": 0.027761434636537758, + "language_loss": 0.99164081, + "learning_rate": 0.0008276871436402469, + "loss": 1.00350356, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.93505859, + "step": 1530, + "time_per_iteration": 2.897517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182983, + "balance_loss_mlp": 1.08909357, + "epoch": 0.2945363601385148, + "flos": 577382648832.0, + "grad_norm": 0.025208295044921922, + "language_loss": 0.95561033, + "learning_rate": 0.000827451770808083, + "loss": 0.96744013, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.93798828, + "step": 1531, + "time_per_iteration": 2.667419910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183127, + "balance_loss_mlp": 1.08923733, + "epoch": 0.2947287418237784, + "flos": 481617289728.0, + "grad_norm": 0.0238323033403859, + "language_loss": 0.92856085, + "learning_rate": 0.0008272162708478674, + "loss": 0.94039214, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.93798828, + "step": 1532, + "time_per_iteration": 2.532593250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190087, + "balance_loss_mlp": 1.09638822, + "epoch": 0.2949211235090419, + "flos": 559260344832.0, + "grad_norm": 0.023856250691152107, + "language_loss": 0.9573307, + "learning_rate": 0.000826980643851029, + "loss": 0.96923155, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.93603516, + "step": 1533, + "time_per_iteration": 2.648393154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190115, + "balance_loss_mlp": 1.09665465, + "epoch": 0.2951135051943055, + "flos": 484856222208.0, + "grad_norm": 0.02761517479674983, + "language_loss": 0.9290787, + "learning_rate": 0.0008267448899090464, + "loss": 0.94097984, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.93359375, + "step": 1534, + "time_per_iteration": 2.5158579349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185677, + "balance_loss_mlp": 1.09226477, + "epoch": 0.29530588687956905, + "flos": 551421523968.0, + "grad_norm": 0.024001584155810263, + "language_loss": 0.90244222, + "learning_rate": 0.0008265090091134473, + "loss": 0.91429895, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.93310547, + "step": 1535, + "time_per_iteration": 2.8246946334838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185762, + "balance_loss_mlp": 1.09234965, + "epoch": 0.29549826856483263, + "flos": 674309577216.0, + "grad_norm": 0.021562014940098434, + "language_loss": 0.8727591, + "learning_rate": 0.0008262730015558088, + "loss": 0.88461667, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.93310547, + "step": 1536, + "time_per_iteration": 2.8568825721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189062, + "balance_loss_mlp": 1.09560144, + "epoch": 0.29569065025009617, + "flos": 766135059456.0, + "grad_norm": 0.0253531059084562, + "language_loss": 0.89567208, + "learning_rate": 0.0008260368673277574, + "loss": 0.90756267, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.93359375, + "step": 1537, + "time_per_iteration": 3.1248908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181656, + "balance_loss_mlp": 1.08781409, + "epoch": 0.29588303193535975, + "flos": 544830867456.0, + "grad_norm": 0.02589470547450269, + "language_loss": 0.93808746, + "learning_rate": 0.0008258006065209682, + "loss": 0.94990402, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.9375, + "step": 1538, + "time_per_iteration": 2.7405824661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.0892235, + "epoch": 0.29607541362062334, + "flos": 598144998912.0, + "grad_norm": 0.02499469713889481, + "language_loss": 0.9045589, + "learning_rate": 0.0008255642192271657, + "loss": 0.91638815, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.93603516, + "step": 1539, + "time_per_iteration": 2.7654454708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183976, + "balance_loss_mlp": 1.09032559, + "epoch": 0.29626779530588687, + "flos": 611037602304.0, + "grad_norm": 0.024707919738005703, + "language_loss": 0.92616487, + "learning_rate": 0.0008253277055381241, + "loss": 0.93800461, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.93554688, + "step": 1540, + "time_per_iteration": 2.803755760192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186228, + "balance_loss_mlp": 1.09252918, + "epoch": 0.29646017699115046, + "flos": 868957704192.0, + "grad_norm": 0.02707124240628881, + "language_loss": 0.95315254, + "learning_rate": 0.0008250910655456658, + "loss": 0.96501482, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.93603516, + "step": 1541, + "time_per_iteration": 3.11143159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181572, + "balance_loss_mlp": 1.08787382, + "epoch": 0.296652558676414, + "flos": 496880695296.0, + "grad_norm": 0.02670504880571787, + "language_loss": 0.9343757, + "learning_rate": 0.0008248542993416625, + "loss": 0.94619143, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.93603516, + "step": 1542, + "time_per_iteration": 2.5893712043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181697, + "balance_loss_mlp": 1.08790362, + "epoch": 0.2968449403616776, + "flos": 572626308096.0, + "grad_norm": 0.02711797813063544, + "language_loss": 0.9310621, + "learning_rate": 0.0008246174070180352, + "loss": 0.94287908, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.93701172, + "step": 1543, + "time_per_iteration": 2.677011489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189648, + "balance_loss_mlp": 1.09614003, + "epoch": 0.2970373220469411, + "flos": 795650022912.0, + "grad_norm": 0.029629985597633038, + "language_loss": 0.9263432, + "learning_rate": 0.0008243803886667537, + "loss": 0.93823969, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.93408203, + "step": 1544, + "time_per_iteration": 3.1022729873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188285, + "balance_loss_mlp": 1.09472907, + "epoch": 0.2972297037322047, + "flos": 662248174080.0, + "grad_norm": 0.0271995559284498, + "language_loss": 0.89610922, + "learning_rate": 0.0008241432443798364, + "loss": 0.90799212, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.93457031, + "step": 1545, + "time_per_iteration": 2.8079423904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181998, + "balance_loss_mlp": 1.08868086, + "epoch": 0.29742208541746823, + "flos": 598231593984.0, + "grad_norm": 0.02196679377417612, + "language_loss": 0.91743886, + "learning_rate": 0.0008239059742493512, + "loss": 0.92925882, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.93212891, + "step": 1546, + "time_per_iteration": 2.703385353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182095, + "balance_loss_mlp": 1.08868301, + "epoch": 0.2976144671027318, + "flos": 771338563584.0, + "grad_norm": 0.02555387631372138, + "language_loss": 0.94145298, + "learning_rate": 0.0008236685783674142, + "loss": 0.95327395, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.93310547, + "step": 1547, + "time_per_iteration": 3.0583412647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221115, + "balance_loss_mlp": 1.12822723, + "epoch": 0.2978068487879954, + "flos": 1487911441920.0, + "grad_norm": 0.023679675459363107, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77442312, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.92773438, + "step": 1548, + "time_per_iteration": 4.846614360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192812, + "balance_loss_mlp": 1.09925652, + "epoch": 0.29799923047325894, + "flos": 476329191936.0, + "grad_norm": 0.02691026692614136, + "language_loss": 0.91868371, + "learning_rate": 0.0008231934097178955, + "loss": 0.93061185, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.93457031, + "step": 1549, + "time_per_iteration": 2.600588798522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189437, + "balance_loss_mlp": 1.09573877, + "epoch": 0.2981916121585225, + "flos": 761167872000.0, + "grad_norm": 0.02304182660847759, + "language_loss": 0.93441629, + "learning_rate": 0.0008229556371347903, + "loss": 0.94631064, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.93603516, + "step": 1550, + "time_per_iteration": 2.9500393867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196641, + "balance_loss_mlp": 1.10256064, + "epoch": 0.29838399384378606, + "flos": 876516547584.0, + "grad_norm": 0.029531977965095095, + "language_loss": 0.90478379, + "learning_rate": 0.0008227177391691874, + "loss": 0.91675019, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.93994141, + "step": 1551, + "time_per_iteration": 3.117060422897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192501, + "balance_loss_mlp": 1.09870708, + "epoch": 0.29857637552904964, + "flos": 580751837184.0, + "grad_norm": 0.026349497602305087, + "language_loss": 0.9813534, + "learning_rate": 0.0008224797159134463, + "loss": 0.99327838, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.93701172, + "step": 1552, + "time_per_iteration": 2.694382429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185823, + "balance_loss_mlp": 1.09212494, + "epoch": 0.2987687572143132, + "flos": 837807811584.0, + "grad_norm": 0.022207279660822626, + "language_loss": 0.8985877, + "learning_rate": 0.0008222415674599765, + "loss": 0.91044593, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.93603516, + "step": 1553, + "time_per_iteration": 3.074347972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186024, + "balance_loss_mlp": 1.09203923, + "epoch": 0.29896113889957676, + "flos": 568167409152.0, + "grad_norm": 0.026892838709900748, + "language_loss": 0.93768913, + "learning_rate": 0.0008220032939012349, + "loss": 0.94954944, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.93896484, + "step": 1554, + "time_per_iteration": 2.6793601512908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190641, + "balance_loss_mlp": 1.0965606, + "epoch": 0.29915352058484035, + "flos": 499835647488.0, + "grad_norm": 0.021647779244158522, + "language_loss": 0.95223451, + "learning_rate": 0.0008217648953297277, + "loss": 0.96414095, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.93994141, + "step": 1555, + "time_per_iteration": 2.836775779724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189405, + "balance_loss_mlp": 1.09546852, + "epoch": 0.2993459022701039, + "flos": 593214741504.0, + "grad_norm": 0.03843372955580003, + "language_loss": 0.88026905, + "learning_rate": 0.0008215263718380095, + "loss": 0.89216304, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.93847656, + "step": 1556, + "time_per_iteration": 2.6840782165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192028, + "balance_loss_mlp": 1.09790027, + "epoch": 0.29953828395536747, + "flos": 573472971264.0, + "grad_norm": 0.02697506762846426, + "language_loss": 0.95771539, + "learning_rate": 0.0008212877235186833, + "loss": 0.96963573, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.94042969, + "step": 1557, + "time_per_iteration": 2.649303674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216583, + "balance_loss_mlp": 1.12350464, + "epoch": 0.299730665640631, + "flos": 1508083637760.0, + "grad_norm": 0.01733611069553414, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78954148, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.9296875, + "step": 1558, + "time_per_iteration": 4.920740365982056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191809, + "balance_loss_mlp": 1.09772909, + "epoch": 0.2999230473258946, + "flos": 514807615488.0, + "grad_norm": 0.03091345134541536, + "language_loss": 0.92723, + "learning_rate": 0.0008208100527678611, + "loss": 0.93914807, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.93994141, + "step": 1559, + "time_per_iteration": 2.628755807876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191055, + "balance_loss_mlp": 1.09692788, + "epoch": 0.3001154290111581, + "flos": 835853973504.0, + "grad_norm": 0.03027255896835194, + "language_loss": 0.86836946, + "learning_rate": 0.0008205710305218135, + "loss": 0.88028002, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.94042969, + "step": 1560, + "time_per_iteration": 3.0076475143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188346, + "balance_loss_mlp": 1.09431422, + "epoch": 0.3003078106964217, + "flos": 557945051136.0, + "grad_norm": 0.023845762720508586, + "language_loss": 0.96495396, + "learning_rate": 0.0008203318838190541, + "loss": 0.9768374, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.93945312, + "step": 1561, + "time_per_iteration": 2.7329952716827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118952, + "balance_loss_mlp": 1.09548759, + "epoch": 0.30050019238168524, + "flos": 527168461824.0, + "grad_norm": 0.030147848994798797, + "language_loss": 0.95915771, + "learning_rate": 0.0008200926127524281, + "loss": 0.97105289, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.93945312, + "step": 1562, + "time_per_iteration": 2.625941753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186113, + "balance_loss_mlp": 1.09217656, + "epoch": 0.3006925740669488, + "flos": 578936987136.0, + "grad_norm": 0.02860364820877459, + "language_loss": 0.92538679, + "learning_rate": 0.0008198532174148289, + "loss": 0.93724799, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.93847656, + "step": 1563, + "time_per_iteration": 2.725884199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207901, + "balance_loss_mlp": 1.11539459, + "epoch": 0.3008849557522124, + "flos": 1493610499584.0, + "grad_norm": 0.014785027254047896, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8189407, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.92382812, + "step": 1564, + "time_per_iteration": 4.830730438232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.10398376, + "epoch": 0.30107733743747594, + "flos": 510824077824.0, + "grad_norm": 0.03423038852538926, + "language_loss": 0.994165, + "learning_rate": 0.0008193740542985244, + "loss": 1.00614524, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.93945312, + "step": 1565, + "time_per_iteration": 2.578756809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194051, + "balance_loss_mlp": 1.10020983, + "epoch": 0.30126971912273953, + "flos": 588820970496.0, + "grad_norm": 0.027351016206119898, + "language_loss": 0.95914042, + "learning_rate": 0.0008191342867058467, + "loss": 0.97108096, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.9375, + "step": 1566, + "time_per_iteration": 2.7046890258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192822, + "balance_loss_mlp": 1.09898102, + "epoch": 0.30146210080800306, + "flos": 603220248576.0, + "grad_norm": 0.029722715632080093, + "language_loss": 0.93181753, + "learning_rate": 0.0008188943952142509, + "loss": 0.94374579, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.9375, + "step": 1567, + "time_per_iteration": 2.7784945964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189204, + "balance_loss_mlp": 1.09588659, + "epoch": 0.30165448249326665, + "flos": 919286684160.0, + "grad_norm": 0.02698998287866622, + "language_loss": 0.91980577, + "learning_rate": 0.0008186543799168711, + "loss": 0.93169785, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.93212891, + "step": 1568, + "time_per_iteration": 3.1082897186279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188766, + "balance_loss_mlp": 1.09530556, + "epoch": 0.3018468641785302, + "flos": 778630164480.0, + "grad_norm": 0.02791954193910651, + "language_loss": 0.98386627, + "learning_rate": 0.0008184142409068892, + "loss": 0.99575394, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.93359375, + "step": 1569, + "time_per_iteration": 3.0047945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187793, + "balance_loss_mlp": 1.09433293, + "epoch": 0.30203924586379377, + "flos": 523389040128.0, + "grad_norm": 0.023468489537567368, + "language_loss": 0.94207543, + "learning_rate": 0.000818173978277536, + "loss": 0.95395339, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.93359375, + "step": 1570, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119455, + "balance_loss_mlp": 1.10094678, + "epoch": 0.3022316275490573, + "flos": 525649052160.0, + "grad_norm": 0.028721303316250762, + "language_loss": 0.92132497, + "learning_rate": 0.000817933592122089, + "loss": 0.93327045, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.93505859, + "step": 1571, + "time_per_iteration": 2.683819055557251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119426, + "balance_loss_mlp": 1.10037029, + "epoch": 0.3024240092343209, + "flos": 480872684544.0, + "grad_norm": 0.028034832338571278, + "language_loss": 0.93476671, + "learning_rate": 0.0008176930825338749, + "loss": 0.94670928, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.93798828, + "step": 1572, + "time_per_iteration": 2.5472469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.09605432, + "epoch": 0.3026163909195845, + "flos": 688430879232.0, + "grad_norm": 0.025848261804373458, + "language_loss": 0.98155606, + "learning_rate": 0.0008174524496062679, + "loss": 0.9934541, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.93652344, + "step": 1573, + "time_per_iteration": 2.90840482711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.0922308, + "epoch": 0.302808772604848, + "flos": 544086262272.0, + "grad_norm": 0.023993082839652336, + "language_loss": 0.9423182, + "learning_rate": 0.0008172116934326894, + "loss": 0.95417649, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.93505859, + "step": 1574, + "time_per_iteration": 2.735853433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197529, + "balance_loss_mlp": 1.10349655, + "epoch": 0.3030011542901116, + "flos": 476051215872.0, + "grad_norm": 0.025758910941944917, + "language_loss": 0.96492219, + "learning_rate": 0.0008169708141066097, + "loss": 0.97689748, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.93945312, + "step": 1575, + "time_per_iteration": 2.5468080043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195411, + "balance_loss_mlp": 1.10123575, + "epoch": 0.30319353597537513, + "flos": 482472685056.0, + "grad_norm": 0.02368764088299644, + "language_loss": 0.97863203, + "learning_rate": 0.0008167298117215465, + "loss": 0.99058616, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.94091797, + "step": 1576, + "time_per_iteration": 2.5703070163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191699, + "balance_loss_mlp": 1.09747636, + "epoch": 0.3033859176606387, + "flos": 706112750592.0, + "grad_norm": 0.02517452757559557, + "language_loss": 0.96809077, + "learning_rate": 0.0008164886863710649, + "loss": 0.98000777, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.94140625, + "step": 1577, + "time_per_iteration": 2.9235777854919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194461, + "balance_loss_mlp": 1.09990454, + "epoch": 0.30357829934590225, + "flos": 766108862976.0, + "grad_norm": 0.022389524212240816, + "language_loss": 0.93041158, + "learning_rate": 0.0008162474381487783, + "loss": 0.94235623, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.94482422, + "step": 1578, + "time_per_iteration": 3.0875654220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198648, + "balance_loss_mlp": 1.10399556, + "epoch": 0.30377068103116583, + "flos": 533448941568.0, + "grad_norm": 0.026496061930467673, + "language_loss": 0.94202471, + "learning_rate": 0.0008160060671483475, + "loss": 0.9540112, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.94580078, + "step": 1579, + "time_per_iteration": 2.69014048576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198759, + "balance_loss_mlp": 1.10415483, + "epoch": 0.3039630627164294, + "flos": 511223577600.0, + "grad_norm": 0.03174839578716906, + "language_loss": 0.93386602, + "learning_rate": 0.0008157645734634809, + "loss": 0.94585359, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.9453125, + "step": 1580, + "time_per_iteration": 2.602752923965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221184, + "balance_loss_mlp": 1.12791443, + "epoch": 0.30415544440169295, + "flos": 1509188084736.0, + "grad_norm": 0.0221653057193215, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78117669, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.93164062, + "step": 1581, + "time_per_iteration": 4.895219802856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.10334778, + "epoch": 0.30434782608695654, + "flos": 1461787133952.0, + "grad_norm": 0.012004742936218659, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74410546, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.92578125, + "step": 1582, + "time_per_iteration": 4.860503196716309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199297, + "balance_loss_mlp": 1.10526431, + "epoch": 0.3045402077722201, + "flos": 483534197760.0, + "grad_norm": 0.030796945736395555, + "language_loss": 0.93027633, + "learning_rate": 0.000815039357240067, + "loss": 0.94226933, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.93945312, + "step": 1583, + "time_per_iteration": 2.6209895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200124, + "balance_loss_mlp": 1.10613978, + "epoch": 0.30473258945748366, + "flos": 544626751488.0, + "grad_norm": 0.03019985050023197, + "language_loss": 0.95277119, + "learning_rate": 0.0008147973737554952, + "loss": 0.9647724, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.93896484, + "step": 1584, + "time_per_iteration": 2.7421703338623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194047, + "balance_loss_mlp": 1.10039604, + "epoch": 0.3049249711427472, + "flos": 568121746944.0, + "grad_norm": 0.05356410902969654, + "language_loss": 0.96138752, + "learning_rate": 0.000814555268055744, + "loss": 0.97332799, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.93554688, + "step": 1585, + "time_per_iteration": 2.632770299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191549, + "balance_loss_mlp": 1.09804094, + "epoch": 0.3051173528280108, + "flos": 529289485824.0, + "grad_norm": 0.02648444030223836, + "language_loss": 0.96492249, + "learning_rate": 0.0008143130402348073, + "loss": 0.97683799, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.93408203, + "step": 1586, + "time_per_iteration": 2.67673659324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201208, + "balance_loss_mlp": 1.10746217, + "epoch": 0.3053097345132743, + "flos": 587599002624.0, + "grad_norm": 0.026229801397330138, + "language_loss": 0.86860031, + "learning_rate": 0.0008140706903867265, + "loss": 0.88061237, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.93652344, + "step": 1587, + "time_per_iteration": 2.800891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198977, + "balance_loss_mlp": 1.10518289, + "epoch": 0.3055021161985379, + "flos": 608200171008.0, + "grad_norm": 0.031935519152889405, + "language_loss": 1.00360334, + "learning_rate": 0.0008138282186055897, + "loss": 1.01559317, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.93701172, + "step": 1588, + "time_per_iteration": 2.735144853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119001, + "balance_loss_mlp": 1.09645426, + "epoch": 0.3056944978838015, + "flos": 574962181632.0, + "grad_norm": 0.02354328369726863, + "language_loss": 0.90634608, + "learning_rate": 0.0008135856249855331, + "loss": 0.91824615, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.93457031, + "step": 1589, + "time_per_iteration": 2.676589012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193478, + "balance_loss_mlp": 1.0996846, + "epoch": 0.305886879569065, + "flos": 635071085568.0, + "grad_norm": 0.031037281782467684, + "language_loss": 0.99387443, + "learning_rate": 0.0008133429096207398, + "loss": 1.00580931, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.93701172, + "step": 1590, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232346, + "balance_loss_mlp": 1.14117432, + "epoch": 0.3060792612543286, + "flos": 1372131065856.0, + "grad_norm": 0.03086145734446917, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76544607, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.91015625, + "step": 1591, + "time_per_iteration": 4.945107460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194266, + "balance_loss_mlp": 1.10051942, + "epoch": 0.30627164293959214, + "flos": 519618350592.0, + "grad_norm": 0.024964882972055902, + "language_loss": 0.95062864, + "learning_rate": 0.0008128571140339123, + "loss": 0.96257126, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.93652344, + "step": 1592, + "time_per_iteration": 2.6392171382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201642, + "balance_loss_mlp": 1.10780036, + "epoch": 0.3064640246248557, + "flos": 456533027328.0, + "grad_norm": 0.029487227531667784, + "language_loss": 0.98122042, + "learning_rate": 0.0008126140340004805, + "loss": 0.9932369, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.9375, + "step": 1593, + "time_per_iteration": 2.504150629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199461, + "balance_loss_mlp": 1.10561943, + "epoch": 0.30665640631011926, + "flos": 851608203264.0, + "grad_norm": 0.026956571268616787, + "language_loss": 0.91923594, + "learning_rate": 0.0008123708325995172, + "loss": 0.93123049, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.9375, + "step": 1594, + "time_per_iteration": 3.184525489807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190831, + "balance_loss_mlp": 1.09713268, + "epoch": 0.30684878799538284, + "flos": 759615535104.0, + "grad_norm": 0.022474213305982697, + "language_loss": 0.88990366, + "learning_rate": 0.0008121275099254414, + "loss": 0.90181196, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.93603516, + "step": 1595, + "time_per_iteration": 2.892902374267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200579, + "balance_loss_mlp": 1.10668933, + "epoch": 0.3070411696806464, + "flos": 518595769344.0, + "grad_norm": 0.025855927391394404, + "language_loss": 0.96650064, + "learning_rate": 0.0008118840660727194, + "loss": 0.97850645, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.93798828, + "step": 1596, + "time_per_iteration": 2.696312665939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191708, + "balance_loss_mlp": 1.09805715, + "epoch": 0.30723355136590996, + "flos": 845790349824.0, + "grad_norm": 0.023513083336694603, + "language_loss": 0.94521677, + "learning_rate": 0.0008116405011358644, + "loss": 0.95713389, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.93554688, + "step": 1597, + "time_per_iteration": 3.1500890254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118938, + "balance_loss_mlp": 1.09572959, + "epoch": 0.30742593305117355, + "flos": 467079023616.0, + "grad_norm": 0.024597056369147573, + "language_loss": 0.89059556, + "learning_rate": 0.0008113968152094369, + "loss": 0.90248942, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.93554688, + "step": 1598, + "time_per_iteration": 2.502336263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191327, + "balance_loss_mlp": 1.09781969, + "epoch": 0.3076183147364371, + "flos": 687816529920.0, + "grad_norm": 0.025330429780868927, + "language_loss": 0.90385377, + "learning_rate": 0.0008111530083880438, + "loss": 0.91576707, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.93408203, + "step": 1599, + "time_per_iteration": 2.8846051692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192126, + "balance_loss_mlp": 1.09847498, + "epoch": 0.30781069642170067, + "flos": 615179593728.0, + "grad_norm": 0.02627563558110635, + "language_loss": 0.95310938, + "learning_rate": 0.0008109090807663399, + "loss": 0.96503073, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.93554688, + "step": 1600, + "time_per_iteration": 2.8132736682891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119763, + "balance_loss_mlp": 1.10402679, + "epoch": 0.3080030781069642, + "flos": 591508680192.0, + "grad_norm": 0.027223292643472258, + "language_loss": 0.96310741, + "learning_rate": 0.0008106650324390257, + "loss": 0.97508371, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.93505859, + "step": 1601, + "time_per_iteration": 2.8477296829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188215, + "balance_loss_mlp": 1.0948981, + "epoch": 0.3081954597922278, + "flos": 563691045888.0, + "grad_norm": 0.027322987260225157, + "language_loss": 0.89918464, + "learning_rate": 0.0008104208635008493, + "loss": 0.91106677, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.93212891, + "step": 1602, + "time_per_iteration": 2.6639676094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192245, + "balance_loss_mlp": 1.09859383, + "epoch": 0.3083878414774913, + "flos": 448761335808.0, + "grad_norm": 0.031035394068971153, + "language_loss": 0.93496901, + "learning_rate": 0.0008101765740466058, + "loss": 0.94689143, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.93554688, + "step": 1603, + "time_per_iteration": 2.4892899990081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.09465039, + "epoch": 0.3085802231627549, + "flos": 494544821760.0, + "grad_norm": 0.029709960428380106, + "language_loss": 0.93853128, + "learning_rate": 0.0008099321641711364, + "loss": 0.95041513, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.93652344, + "step": 1604, + "time_per_iteration": 2.638798952102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011875, + "balance_loss_mlp": 1.09380174, + "epoch": 0.3087726048480185, + "flos": 488690038272.0, + "grad_norm": 0.02367908107469003, + "language_loss": 0.91951108, + "learning_rate": 0.0008096876339693295, + "loss": 0.93138611, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.93603516, + "step": 1605, + "time_per_iteration": 2.6115643978118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189736, + "balance_loss_mlp": 1.09603786, + "epoch": 0.308964986533282, + "flos": 731887223808.0, + "grad_norm": 0.029121548764615916, + "language_loss": 0.90058184, + "learning_rate": 0.0008094429835361206, + "loss": 0.91247922, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.93603516, + "step": 1606, + "time_per_iteration": 2.9361119270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185725, + "balance_loss_mlp": 1.09226441, + "epoch": 0.3091573682185456, + "flos": 606515576832.0, + "grad_norm": 0.024539043330914945, + "language_loss": 0.94318593, + "learning_rate": 0.0008091982129664908, + "loss": 0.95504314, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.93359375, + "step": 1607, + "time_per_iteration": 2.750641345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191863, + "balance_loss_mlp": 1.09821212, + "epoch": 0.30934974990380915, + "flos": 461306832384.0, + "grad_norm": 0.02635007664096696, + "language_loss": 0.92281848, + "learning_rate": 0.0008089533223554687, + "loss": 0.93473709, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.93554688, + "step": 1608, + "time_per_iteration": 2.733422040939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187457, + "balance_loss_mlp": 1.09380579, + "epoch": 0.30954213158907273, + "flos": 554567130624.0, + "grad_norm": 0.025571984513822792, + "language_loss": 0.94345558, + "learning_rate": 0.0008087083117981294, + "loss": 0.95533013, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.93554688, + "step": 1609, + "time_per_iteration": 2.919583797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189683, + "balance_loss_mlp": 1.09665251, + "epoch": 0.30973451327433627, + "flos": 554113236480.0, + "grad_norm": 0.028700236773969223, + "language_loss": 0.98730469, + "learning_rate": 0.0008084631813895943, + "loss": 0.99920154, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.92919922, + "step": 1610, + "time_per_iteration": 2.7721197605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192773, + "balance_loss_mlp": 1.09955156, + "epoch": 0.30992689495959985, + "flos": 566762792448.0, + "grad_norm": 0.027612542910463767, + "language_loss": 0.93469882, + "learning_rate": 0.0008082179312250315, + "loss": 0.94662654, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.93115234, + "step": 1611, + "time_per_iteration": 2.658564805984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219437, + "balance_loss_mlp": 1.12769318, + "epoch": 0.3101192766448634, + "flos": 1445560270848.0, + "grad_norm": 0.021240149379623804, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81075287, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.91601562, + "step": 1612, + "time_per_iteration": 4.8431174755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.13497162, + "epoch": 0.31031165833012697, + "flos": 1535127742464.0, + "grad_norm": 0.019393089292119553, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77856624, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.921875, + "step": 1613, + "time_per_iteration": 5.043596029281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191397, + "balance_loss_mlp": 1.09850931, + "epoch": 0.31050404001539056, + "flos": 993632409600.0, + "grad_norm": 0.029090005547288914, + "language_loss": 0.90590245, + "learning_rate": 0.0008074814631475545, + "loss": 0.91781646, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.92773438, + "step": 1614, + "time_per_iteration": 3.3308844566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011972, + "balance_loss_mlp": 1.10450339, + "epoch": 0.3106964217006541, + "flos": 446972682240.0, + "grad_norm": 0.029174032275502568, + "language_loss": 0.8959738, + "learning_rate": 0.0008072357349114907, + "loss": 0.90794587, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.92578125, + "step": 1615, + "time_per_iteration": 2.660557746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194484, + "balance_loss_mlp": 1.10169172, + "epoch": 0.3108888033859177, + "flos": 511494822912.0, + "grad_norm": 0.027617375290548026, + "language_loss": 0.9836188, + "learning_rate": 0.0008069898873959363, + "loss": 0.99556363, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.92675781, + "step": 1616, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203555, + "balance_loss_mlp": 1.11076295, + "epoch": 0.3110811850711812, + "flos": 521778306048.0, + "grad_norm": 0.027380341091067188, + "language_loss": 0.94434142, + "learning_rate": 0.0008067439206963375, + "loss": 0.95637697, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.92675781, + "step": 1617, + "time_per_iteration": 2.6584017276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120371, + "balance_loss_mlp": 1.11082232, + "epoch": 0.3112735667564448, + "flos": 687729934848.0, + "grad_norm": 0.029016410329411102, + "language_loss": 0.95023614, + "learning_rate": 0.0008064978349081873, + "loss": 0.96227324, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.92773438, + "step": 1618, + "time_per_iteration": 2.911677122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199948, + "balance_loss_mlp": 1.10720289, + "epoch": 0.31146594844170833, + "flos": 534165348864.0, + "grad_norm": 0.025439718165996668, + "language_loss": 0.95660365, + "learning_rate": 0.0008062516301270245, + "loss": 0.96860307, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.92626953, + "step": 1619, + "time_per_iteration": 2.669111490249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196196, + "balance_loss_mlp": 1.10388064, + "epoch": 0.3116583301269719, + "flos": 680841836544.0, + "grad_norm": 0.024218225399572888, + "language_loss": 0.96279341, + "learning_rate": 0.0008060053064484343, + "loss": 0.97475541, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.921875, + "step": 1620, + "time_per_iteration": 2.924476385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189886, + "balance_loss_mlp": 1.09733212, + "epoch": 0.31185071181223545, + "flos": 587329758720.0, + "grad_norm": 0.02529679167102671, + "language_loss": 0.92711556, + "learning_rate": 0.0008057588639680482, + "loss": 0.93901443, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.92431641, + "step": 1621, + "time_per_iteration": 2.74631667137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119125, + "balance_loss_mlp": 1.09817135, + "epoch": 0.31204309349749904, + "flos": 726657523200.0, + "grad_norm": 0.03522846239796161, + "language_loss": 0.93884659, + "learning_rate": 0.0008055123027815434, + "loss": 0.95075905, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.9296875, + "step": 1622, + "time_per_iteration": 2.90444016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189249, + "balance_loss_mlp": 1.09631383, + "epoch": 0.3122354751827626, + "flos": 577894940160.0, + "grad_norm": 0.026492717763192643, + "language_loss": 0.93252558, + "learning_rate": 0.0008052656229846436, + "loss": 0.94441813, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.92822266, + "step": 1623, + "time_per_iteration": 2.680220603942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09519064, + "epoch": 0.31242785686802615, + "flos": 577028811264.0, + "grad_norm": 0.026617450345468772, + "language_loss": 1.00026262, + "learning_rate": 0.0008050188246731182, + "loss": 1.01214242, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.92675781, + "step": 1624, + "time_per_iteration": 2.6526694297790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190099, + "balance_loss_mlp": 1.09711611, + "epoch": 0.31262023855328974, + "flos": 738195901440.0, + "grad_norm": 0.023806346866415393, + "language_loss": 0.9048847, + "learning_rate": 0.0008047719079427834, + "loss": 0.91678566, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.92871094, + "step": 1625, + "time_per_iteration": 3.0077152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119944, + "balance_loss_mlp": 1.108078, + "epoch": 0.3128126202385533, + "flos": 1562591539200.0, + "grad_norm": 0.020013754894949238, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.7555114, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.91210938, + "step": 1626, + "time_per_iteration": 4.793031215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194528, + "balance_loss_mlp": 1.10111523, + "epoch": 0.31300500192381686, + "flos": 515942988288.0, + "grad_norm": 0.023349922932092686, + "language_loss": 0.95821261, + "learning_rate": 0.0008042777196091757, + "loss": 0.97015792, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.93310547, + "step": 1627, + "time_per_iteration": 2.679588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196127, + "balance_loss_mlp": 1.10281038, + "epoch": 0.3131973836090804, + "flos": 527661287424.0, + "grad_norm": 0.026058472156191805, + "language_loss": 0.91163933, + "learning_rate": 0.0008040304481977643, + "loss": 0.92360055, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.93212891, + "step": 1628, + "time_per_iteration": 2.6339213848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.11335361, + "epoch": 0.313389765294344, + "flos": 824209534464.0, + "grad_norm": 0.028324849871922998, + "language_loss": 0.96729648, + "learning_rate": 0.0008037830587512649, + "loss": 0.97936368, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.93261719, + "step": 1629, + "time_per_iteration": 3.052304744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191904, + "balance_loss_mlp": 1.09896827, + "epoch": 0.31358214697960757, + "flos": 394702599168.0, + "grad_norm": 0.026724204555937114, + "language_loss": 0.89292234, + "learning_rate": 0.0008035355513657224, + "loss": 0.90484136, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.92822266, + "step": 1630, + "time_per_iteration": 2.470526695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198859, + "balance_loss_mlp": 1.1059711, + "epoch": 0.3137745286648711, + "flos": 573097666560.0, + "grad_norm": 0.025006494531642755, + "language_loss": 1.00651205, + "learning_rate": 0.0008032879261372279, + "loss": 1.01850057, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.92773438, + "step": 1631, + "time_per_iteration": 2.7967746257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194023, + "balance_loss_mlp": 1.10418701, + "epoch": 0.3139669103501347, + "flos": 1501629241344.0, + "grad_norm": 0.01894627505164378, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80829865, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.89648438, + "step": 1632, + "time_per_iteration": 5.690793991088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187718, + "balance_loss_mlp": 1.09478259, + "epoch": 0.3141592920353982, + "flos": 526358728704.0, + "grad_norm": 0.023739615719740217, + "language_loss": 0.94780874, + "learning_rate": 0.0008027923225359748, + "loss": 0.95968592, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.92822266, + "step": 1633, + "time_per_iteration": 2.619640827178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182027, + "balance_loss_mlp": 1.08894837, + "epoch": 0.3143516737206618, + "flos": 594387044352.0, + "grad_norm": 0.024020227962995952, + "language_loss": 0.97166598, + "learning_rate": 0.0008025443443556267, + "loss": 0.98348624, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.9296875, + "step": 1634, + "time_per_iteration": 2.7105367183685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187192, + "balance_loss_mlp": 1.09397042, + "epoch": 0.31454405540592534, + "flos": 649679208960.0, + "grad_norm": 0.024579905610689918, + "language_loss": 0.95561564, + "learning_rate": 0.000802296248717147, + "loss": 0.96748757, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.93115234, + "step": 1635, + "time_per_iteration": 2.954427480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189389, + "balance_loss_mlp": 1.09616756, + "epoch": 0.3147364370911889, + "flos": 644069474304.0, + "grad_norm": 0.026460377875643523, + "language_loss": 0.89723325, + "learning_rate": 0.0008020480357168554, + "loss": 0.90912724, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.93115234, + "step": 1636, + "time_per_iteration": 2.7983195781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118902, + "balance_loss_mlp": 1.09575093, + "epoch": 0.31492881877645246, + "flos": 472821015552.0, + "grad_norm": 0.024118652497695542, + "language_loss": 0.95980144, + "learning_rate": 0.0008017997054511165, + "loss": 0.97169161, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.93164062, + "step": 1637, + "time_per_iteration": 2.543381690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188761, + "balance_loss_mlp": 1.09544361, + "epoch": 0.31512120046171604, + "flos": 630629650944.0, + "grad_norm": 0.026442486928658162, + "language_loss": 0.94192296, + "learning_rate": 0.0008015512580163407, + "loss": 0.95381057, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.93212891, + "step": 1638, + "time_per_iteration": 2.8069217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189537, + "balance_loss_mlp": 1.09645832, + "epoch": 0.31531358214697963, + "flos": 705053239296.0, + "grad_norm": 0.0247809696854931, + "language_loss": 0.89687169, + "learning_rate": 0.0008013026935089838, + "loss": 0.9087671, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.9296875, + "step": 1639, + "time_per_iteration": 2.8575150966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189099, + "balance_loss_mlp": 1.09592521, + "epoch": 0.31550596383224316, + "flos": 573631425024.0, + "grad_norm": 0.026868409426578303, + "language_loss": 0.92173505, + "learning_rate": 0.0008010540120255472, + "loss": 0.93362606, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.93066406, + "step": 1640, + "time_per_iteration": 2.6781005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118909, + "balance_loss_mlp": 1.09591639, + "epoch": 0.31569834551750675, + "flos": 659512800768.0, + "grad_norm": 0.03030176261580671, + "language_loss": 0.95734656, + "learning_rate": 0.0008008052136625774, + "loss": 0.96923745, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.93066406, + "step": 1641, + "time_per_iteration": 2.8858654499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192627, + "balance_loss_mlp": 1.09950101, + "epoch": 0.3158907272027703, + "flos": 567403338240.0, + "grad_norm": 0.026165343030711524, + "language_loss": 0.94310361, + "learning_rate": 0.0008005562985166666, + "loss": 0.9550299, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.93017578, + "step": 1642, + "time_per_iteration": 2.7097506523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193912, + "balance_loss_mlp": 1.10102403, + "epoch": 0.31608310888803387, + "flos": 537972968448.0, + "grad_norm": 0.020568762002796243, + "language_loss": 0.9172346, + "learning_rate": 0.0008003072666844524, + "loss": 0.92917377, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.92773438, + "step": 1643, + "time_per_iteration": 2.6982197761535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194419, + "balance_loss_mlp": 1.10181749, + "epoch": 0.3162754905732974, + "flos": 487639259136.0, + "grad_norm": 0.02816029335024998, + "language_loss": 0.90344775, + "learning_rate": 0.0008000581182626173, + "loss": 0.91539198, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.92480469, + "step": 1644, + "time_per_iteration": 2.546762466430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193569, + "balance_loss_mlp": 1.10048997, + "epoch": 0.316467872258561, + "flos": 531095603712.0, + "grad_norm": 0.024394566764596542, + "language_loss": 0.93082815, + "learning_rate": 0.0007998088533478894, + "loss": 0.94276381, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.9296875, + "step": 1645, + "time_per_iteration": 2.6320817470550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188922, + "balance_loss_mlp": 1.09622455, + "epoch": 0.3166602539438245, + "flos": 444413227008.0, + "grad_norm": 0.029455070645316363, + "language_loss": 0.9479661, + "learning_rate": 0.000799559472037042, + "loss": 0.95985526, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.92578125, + "step": 1646, + "time_per_iteration": 2.535414457321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187182, + "balance_loss_mlp": 1.09458041, + "epoch": 0.3168526356290881, + "flos": 647102289408.0, + "grad_norm": 0.02168302123393663, + "language_loss": 0.94649625, + "learning_rate": 0.0007993099744268932, + "loss": 0.95836812, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.92480469, + "step": 1647, + "time_per_iteration": 2.912095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182437, + "balance_loss_mlp": 1.08988261, + "epoch": 0.3170450173143517, + "flos": 587257900032.0, + "grad_norm": 0.023943172344495993, + "language_loss": 0.96008313, + "learning_rate": 0.000799060360614307, + "loss": 0.97190744, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.92431641, + "step": 1648, + "time_per_iteration": 2.6763339042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.09482586, + "epoch": 0.3172373989996152, + "flos": 828573106176.0, + "grad_norm": 0.025050943971751935, + "language_loss": 0.91967106, + "learning_rate": 0.0007988106306961917, + "loss": 0.93154484, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.92431641, + "step": 1649, + "time_per_iteration": 3.1265392303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183645, + "balance_loss_mlp": 1.09151971, + "epoch": 0.3174297806848788, + "flos": 528434090496.0, + "grad_norm": 0.026893421102733506, + "language_loss": 0.92866611, + "learning_rate": 0.0007985607847695014, + "loss": 0.94050252, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.91992188, + "step": 1650, + "time_per_iteration": 2.640529155731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184032, + "balance_loss_mlp": 1.09152567, + "epoch": 0.31762216237014235, + "flos": 714481327104.0, + "grad_norm": 0.024008942139765378, + "language_loss": 0.9102264, + "learning_rate": 0.0007983108229312345, + "loss": 0.92206669, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.92382812, + "step": 1651, + "time_per_iteration": 2.890881299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183971, + "balance_loss_mlp": 1.09170341, + "epoch": 0.31781454405540593, + "flos": 484799826432.0, + "grad_norm": 0.027702532543066302, + "language_loss": 0.9509185, + "learning_rate": 0.0007980607452784351, + "loss": 0.96275818, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.92138672, + "step": 1652, + "time_per_iteration": 2.5693578720092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118418, + "balance_loss_mlp": 1.09186423, + "epoch": 0.31800692574066947, + "flos": 549804059136.0, + "grad_norm": 0.028510736103347943, + "language_loss": 0.99507928, + "learning_rate": 0.0007978105519081919, + "loss": 1.00692105, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.921875, + "step": 1653, + "time_per_iteration": 2.674062967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181597, + "balance_loss_mlp": 1.08947253, + "epoch": 0.31819930742593305, + "flos": 517916292096.0, + "grad_norm": 0.029899238666621586, + "language_loss": 0.96953475, + "learning_rate": 0.0007975602429176385, + "loss": 0.98135078, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.91992188, + "step": 1654, + "time_per_iteration": 2.595107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011812, + "balance_loss_mlp": 1.08907461, + "epoch": 0.31839168911119664, + "flos": 456969457152.0, + "grad_norm": 0.02327460697487094, + "language_loss": 0.90136862, + "learning_rate": 0.0007973098184039536, + "loss": 0.91318059, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.91992188, + "step": 1655, + "time_per_iteration": 2.654873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184047, + "balance_loss_mlp": 1.09192252, + "epoch": 0.3185840707964602, + "flos": 627295391232.0, + "grad_norm": 0.025652000789891626, + "language_loss": 0.955365, + "learning_rate": 0.0007970592784643602, + "loss": 0.96720552, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.91992188, + "step": 1656, + "time_per_iteration": 2.8485612869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183486, + "balance_loss_mlp": 1.09107482, + "epoch": 0.31877645248172376, + "flos": 568540712448.0, + "grad_norm": 0.02977939264047221, + "language_loss": 0.94253254, + "learning_rate": 0.0007968086231961272, + "loss": 0.9543674, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.92285156, + "step": 1657, + "time_per_iteration": 2.6949312686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182357, + "balance_loss_mlp": 1.09004128, + "epoch": 0.3189688341669873, + "flos": 490552551936.0, + "grad_norm": 0.03598298081414456, + "language_loss": 0.95643866, + "learning_rate": 0.0007965578526965671, + "loss": 0.96826226, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.921875, + "step": 1658, + "time_per_iteration": 2.5717341899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182583, + "balance_loss_mlp": 1.09012401, + "epoch": 0.3191612158522509, + "flos": 577380647424.0, + "grad_norm": 0.02594626841132509, + "language_loss": 0.93226576, + "learning_rate": 0.0007963069670630377, + "loss": 0.94409156, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.92333984, + "step": 1659, + "time_per_iteration": 2.7431960105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187517, + "balance_loss_mlp": 1.09486747, + "epoch": 0.3193535975375144, + "flos": 539192934912.0, + "grad_norm": 0.026552556196046555, + "language_loss": 0.97412628, + "learning_rate": 0.0007960559663929416, + "loss": 0.98600149, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.92529297, + "step": 1660, + "time_per_iteration": 2.631037473678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09382606, + "epoch": 0.319545979222778, + "flos": 735627714048.0, + "grad_norm": 0.022912970149823363, + "language_loss": 0.94840437, + "learning_rate": 0.0007958048507837259, + "loss": 0.96026772, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.92382812, + "step": 1661, + "time_per_iteration": 2.925752878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191424, + "balance_loss_mlp": 1.09872651, + "epoch": 0.31973836090804153, + "flos": 765767760384.0, + "grad_norm": 0.030797304976158044, + "language_loss": 0.98320282, + "learning_rate": 0.0007955536203328822, + "loss": 0.99511707, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.92578125, + "step": 1662, + "time_per_iteration": 2.9076955318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187513, + "balance_loss_mlp": 1.09486389, + "epoch": 0.3199307425933051, + "flos": 561741937152.0, + "grad_norm": 0.02511010738984868, + "language_loss": 0.90468192, + "learning_rate": 0.0007953022751379469, + "loss": 0.91655713, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.92529297, + "step": 1663, + "time_per_iteration": 2.7703394889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188156, + "balance_loss_mlp": 1.09564936, + "epoch": 0.3201231242785687, + "flos": 752671041024.0, + "grad_norm": 0.029121282383782986, + "language_loss": 0.92101777, + "learning_rate": 0.000795050815296501, + "loss": 0.93289936, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.92382812, + "step": 1664, + "time_per_iteration": 2.966632843017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188504, + "balance_loss_mlp": 1.0960933, + "epoch": 0.32031550596383224, + "flos": 497384254464.0, + "grad_norm": 0.02307975398987516, + "language_loss": 1.00050378, + "learning_rate": 0.0007947992409061695, + "loss": 1.01238883, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.92285156, + "step": 1665, + "time_per_iteration": 2.6264171600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193124, + "balance_loss_mlp": 1.10080826, + "epoch": 0.3205078876490958, + "flos": 732874876416.0, + "grad_norm": 0.02454331261307917, + "language_loss": 0.93550396, + "learning_rate": 0.0007945475520646226, + "loss": 0.9474352, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.921875, + "step": 1666, + "time_per_iteration": 2.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191587, + "balance_loss_mlp": 1.09941399, + "epoch": 0.32070026933435936, + "flos": 550474804224.0, + "grad_norm": 0.02796219722650757, + "language_loss": 0.9429689, + "learning_rate": 0.0007942957488695743, + "loss": 0.95488477, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.92041016, + "step": 1667, + "time_per_iteration": 2.621396780014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186724, + "balance_loss_mlp": 1.09421742, + "epoch": 0.32089265101962294, + "flos": 746684000256.0, + "grad_norm": 0.022875326013334737, + "language_loss": 0.87680244, + "learning_rate": 0.0007940438314187833, + "loss": 0.88866973, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.92382812, + "step": 1668, + "time_per_iteration": 3.0475997924804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187112, + "balance_loss_mlp": 1.0947485, + "epoch": 0.3210850327048865, + "flos": 495196101120.0, + "grad_norm": 0.03400858364934581, + "language_loss": 0.88502395, + "learning_rate": 0.0007937917998100529, + "loss": 0.89689511, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.92236328, + "step": 1669, + "time_per_iteration": 2.6158430576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188853, + "balance_loss_mlp": 1.09658515, + "epoch": 0.32127741439015006, + "flos": 531673022976.0, + "grad_norm": 0.029937804889017615, + "language_loss": 0.92354518, + "learning_rate": 0.0007935396541412302, + "loss": 0.93543375, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.92138672, + "step": 1670, + "time_per_iteration": 2.6148414611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188159, + "balance_loss_mlp": 1.09589148, + "epoch": 0.3214697960754136, + "flos": 502223187456.0, + "grad_norm": 0.027719397006423088, + "language_loss": 0.94146281, + "learning_rate": 0.0007932873945102068, + "loss": 0.95334446, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.92138672, + "step": 1671, + "time_per_iteration": 2.5756680965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189911, + "balance_loss_mlp": 1.09950256, + "epoch": 0.3216621777606772, + "flos": 1386402089472.0, + "grad_norm": 0.015471737686433536, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76951689, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.90234375, + "step": 1672, + "time_per_iteration": 4.848818778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.08975732, + "epoch": 0.32185455944594077, + "flos": 572635040256.0, + "grad_norm": 0.021338606013939526, + "language_loss": 0.94597888, + "learning_rate": 0.0007927825337533461, + "loss": 0.95779347, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.91552734, + "step": 1673, + "time_per_iteration": 2.6742517948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181114, + "balance_loss_mlp": 1.08975172, + "epoch": 0.3220469411312043, + "flos": 544936928256.0, + "grad_norm": 0.029706455848313437, + "language_loss": 0.9645716, + "learning_rate": 0.0007925299328235131, + "loss": 0.97638273, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.91210938, + "step": 1674, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182375, + "balance_loss_mlp": 1.09101272, + "epoch": 0.3222393228164679, + "flos": 492161284608.0, + "grad_norm": 0.02873592636128419, + "language_loss": 0.969607, + "learning_rate": 0.000792277218323488, + "loss": 0.98143071, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.91210938, + "step": 1675, + "time_per_iteration": 2.589118719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182718, + "balance_loss_mlp": 1.0914042, + "epoch": 0.3224317045017314, + "flos": 491362285056.0, + "grad_norm": 0.026517432951267347, + "language_loss": 0.94174361, + "learning_rate": 0.0007920243903513833, + "loss": 0.95357084, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.91162109, + "step": 1676, + "time_per_iteration": 2.5541775226593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08832622, + "epoch": 0.322624086186995, + "flos": 576870357504.0, + "grad_norm": 0.028460659829427477, + "language_loss": 0.94868386, + "learning_rate": 0.0007917714490053556, + "loss": 0.96047986, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.91113281, + "step": 1677, + "time_per_iteration": 2.685833215713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.10454535, + "epoch": 0.32281646787225854, + "flos": 630571253760.0, + "grad_norm": 0.02861547850998442, + "language_loss": 0.93624204, + "learning_rate": 0.0007915183943836055, + "loss": 0.94820398, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.91503906, + "step": 1678, + "time_per_iteration": 2.8957157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184806, + "balance_loss_mlp": 1.09363461, + "epoch": 0.3230088495575221, + "flos": 782807084544.0, + "grad_norm": 0.029736135795599906, + "language_loss": 0.92990124, + "learning_rate": 0.0007912652265843773, + "loss": 0.94174933, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.91015625, + "step": 1679, + "time_per_iteration": 3.0256145000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187663, + "balance_loss_mlp": 1.09620523, + "epoch": 0.3232012312427857, + "flos": 537200165376.0, + "grad_norm": 0.0299548546326655, + "language_loss": 0.88938797, + "learning_rate": 0.0007910119457059597, + "loss": 0.90126455, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.91308594, + "step": 1680, + "time_per_iteration": 2.7195773124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118719, + "balance_loss_mlp": 1.09601843, + "epoch": 0.32339361292804925, + "flos": 706232272896.0, + "grad_norm": 0.03079987155163935, + "language_loss": 0.89790422, + "learning_rate": 0.0007907585518466849, + "loss": 0.90977609, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.91015625, + "step": 1681, + "time_per_iteration": 2.9635961055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186411, + "balance_loss_mlp": 1.09523988, + "epoch": 0.32358599461331283, + "flos": 453257164800.0, + "grad_norm": 0.027692195030378806, + "language_loss": 0.99450397, + "learning_rate": 0.000790505045104929, + "loss": 1.00636816, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.91015625, + "step": 1682, + "time_per_iteration": 2.5084030628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186896, + "balance_loss_mlp": 1.09553456, + "epoch": 0.32377837629857636, + "flos": 602091606528.0, + "grad_norm": 0.028152445524849662, + "language_loss": 0.96712899, + "learning_rate": 0.0007902514255791125, + "loss": 0.97899795, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.91210938, + "step": 1683, + "time_per_iteration": 2.7732536792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185338, + "balance_loss_mlp": 1.09388101, + "epoch": 0.32397075798383995, + "flos": 808898465280.0, + "grad_norm": 0.02645952871958238, + "language_loss": 0.9579218, + "learning_rate": 0.0007899976933676986, + "loss": 0.9697752, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.91308594, + "step": 1684, + "time_per_iteration": 2.985987424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184012, + "balance_loss_mlp": 1.09274495, + "epoch": 0.3241631396691035, + "flos": 602792550912.0, + "grad_norm": 0.02682215462305332, + "language_loss": 0.96423018, + "learning_rate": 0.0007897438485691955, + "loss": 0.97607034, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.91113281, + "step": 1685, + "time_per_iteration": 2.673083543777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185177, + "balance_loss_mlp": 1.09386301, + "epoch": 0.32435552135436707, + "flos": 475176354816.0, + "grad_norm": 0.030260846574811467, + "language_loss": 0.93327641, + "learning_rate": 0.0007894898912821542, + "loss": 0.9451282, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.91162109, + "step": 1686, + "time_per_iteration": 2.526704788208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181419, + "balance_loss_mlp": 1.09015274, + "epoch": 0.3245479030396306, + "flos": 539219131392.0, + "grad_norm": 0.02519584895765407, + "language_loss": 0.95407552, + "learning_rate": 0.0007892358216051695, + "loss": 0.96588969, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.91113281, + "step": 1687, + "time_per_iteration": 2.718292713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186611, + "balance_loss_mlp": 1.09543955, + "epoch": 0.3247402847248942, + "flos": 548696884224.0, + "grad_norm": 0.02873183694146744, + "language_loss": 1.00761271, + "learning_rate": 0.0007889816396368803, + "loss": 1.0194788, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.91015625, + "step": 1688, + "time_per_iteration": 2.6112852096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179714, + "balance_loss_mlp": 1.08835161, + "epoch": 0.3249326664101578, + "flos": 378992030208.0, + "grad_norm": 0.0263136625306578, + "language_loss": 0.95246112, + "learning_rate": 0.0007887273454759687, + "loss": 0.96425825, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.91210938, + "step": 1689, + "time_per_iteration": 2.466093063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185248, + "balance_loss_mlp": 1.09407663, + "epoch": 0.3251250480954213, + "flos": 529122299904.0, + "grad_norm": 0.02633136368880149, + "language_loss": 0.91763788, + "learning_rate": 0.0007884729392211603, + "loss": 0.92949039, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.91015625, + "step": 1690, + "time_per_iteration": 2.633387804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182102, + "balance_loss_mlp": 1.09054887, + "epoch": 0.3253174297806849, + "flos": 450558721536.0, + "grad_norm": 0.03256384134880849, + "language_loss": 0.96271229, + "learning_rate": 0.0007882184209712245, + "loss": 0.97453332, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.9140625, + "step": 1691, + "time_per_iteration": 2.511629104614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183951, + "balance_loss_mlp": 1.09239864, + "epoch": 0.32550981146594843, + "flos": 705489669120.0, + "grad_norm": 0.02306884235196454, + "language_loss": 0.92818689, + "learning_rate": 0.000787963790824974, + "loss": 0.9400264, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.9140625, + "step": 1692, + "time_per_iteration": 2.953939914703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118506, + "balance_loss_mlp": 1.0935545, + "epoch": 0.325702193151212, + "flos": 393558494208.0, + "grad_norm": 0.026666894987577915, + "language_loss": 0.98025191, + "learning_rate": 0.0007877090488812651, + "loss": 0.9921025, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.91357422, + "step": 1693, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178009, + "balance_loss_mlp": 1.08659911, + "epoch": 0.32589457483647555, + "flos": 578583149568.0, + "grad_norm": 0.029080232987036207, + "language_loss": 0.92532402, + "learning_rate": 0.0007874541952389973, + "loss": 0.93710411, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.91259766, + "step": 1694, + "time_per_iteration": 2.660390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179003, + "balance_loss_mlp": 1.08792675, + "epoch": 0.32608695652173914, + "flos": 499329360384.0, + "grad_norm": 0.023433013698769337, + "language_loss": 0.93903476, + "learning_rate": 0.0007871992299971136, + "loss": 0.9508248, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.90917969, + "step": 1695, + "time_per_iteration": 2.5506269931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179394, + "balance_loss_mlp": 1.08822274, + "epoch": 0.32627933820700267, + "flos": 592300948992.0, + "grad_norm": 0.02355558557065364, + "language_loss": 0.91491008, + "learning_rate": 0.0007869441532546001, + "loss": 0.92670405, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.91015625, + "step": 1696, + "time_per_iteration": 2.7493326663970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177542, + "balance_loss_mlp": 1.08618009, + "epoch": 0.32647171989226625, + "flos": 610273531392.0, + "grad_norm": 0.02705729718991907, + "language_loss": 0.87004846, + "learning_rate": 0.0007866889651104867, + "loss": 0.8818239, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.91210938, + "step": 1697, + "time_per_iteration": 2.7824432849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179221, + "balance_loss_mlp": 1.08785892, + "epoch": 0.32666410157752984, + "flos": 478189704192.0, + "grad_norm": 0.028152017440838794, + "language_loss": 0.94142878, + "learning_rate": 0.000786433665663846, + "loss": 0.95322108, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.91210938, + "step": 1698, + "time_per_iteration": 2.6674411296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187877, + "balance_loss_mlp": 1.09670568, + "epoch": 0.3268564832627934, + "flos": 719693563392.0, + "grad_norm": 0.040459779361444057, + "language_loss": 0.95728016, + "learning_rate": 0.0007861782550137942, + "loss": 0.96915889, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.91015625, + "step": 1699, + "time_per_iteration": 2.923370599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187429, + "balance_loss_mlp": 1.09625793, + "epoch": 0.32704886494805696, + "flos": 770105135616.0, + "grad_norm": 0.025720199745930695, + "language_loss": 0.93479955, + "learning_rate": 0.0007859227332594901, + "loss": 0.94667387, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.91015625, + "step": 1700, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191948, + "balance_loss_mlp": 1.10120583, + "epoch": 0.3272412466333205, + "flos": 851404087296.0, + "grad_norm": 0.0329500691508657, + "language_loss": 0.94768298, + "learning_rate": 0.0007856671005001365, + "loss": 0.95960248, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.90576172, + "step": 1701, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118211, + "balance_loss_mlp": 1.09065294, + "epoch": 0.3274336283185841, + "flos": 833040737280.0, + "grad_norm": 0.029774404200988806, + "language_loss": 0.90405869, + "learning_rate": 0.0007854113568349787, + "loss": 0.91587985, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.91308594, + "step": 1702, + "time_per_iteration": 3.107083559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186026, + "balance_loss_mlp": 1.09471202, + "epoch": 0.3276260100038476, + "flos": 693252347904.0, + "grad_norm": 0.029328613393929583, + "language_loss": 0.89606428, + "learning_rate": 0.0007851555023633052, + "loss": 0.90792453, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.91162109, + "step": 1703, + "time_per_iteration": 2.8335254192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011877, + "balance_loss_mlp": 1.09643364, + "epoch": 0.3278183916891112, + "flos": 436977908736.0, + "grad_norm": 0.03479764223743197, + "language_loss": 0.91987431, + "learning_rate": 0.0007848995371844474, + "loss": 0.93175125, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.91113281, + "step": 1704, + "time_per_iteration": 2.51261043548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118827, + "balance_loss_mlp": 1.09728956, + "epoch": 0.3280107733743748, + "flos": 462016508928.0, + "grad_norm": 0.027955151013136243, + "language_loss": 0.90236068, + "learning_rate": 0.0007846434613977801, + "loss": 0.91424334, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.90820312, + "step": 1705, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185335, + "balance_loss_mlp": 1.09464061, + "epoch": 0.3282031550596383, + "flos": 680528931840.0, + "grad_norm": 0.0285448105624817, + "language_loss": 0.86403298, + "learning_rate": 0.0007843872751027203, + "loss": 0.87588632, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.90527344, + "step": 1706, + "time_per_iteration": 2.7977733612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183945, + "balance_loss_mlp": 1.0931555, + "epoch": 0.3283955367449019, + "flos": 546254949888.0, + "grad_norm": 0.024438576566567966, + "language_loss": 0.93906903, + "learning_rate": 0.0007841309783987287, + "loss": 0.95090854, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.90625, + "step": 1707, + "time_per_iteration": 2.737680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178748, + "balance_loss_mlp": 1.08757639, + "epoch": 0.32858791843016544, + "flos": 482240371200.0, + "grad_norm": 0.027193371904651382, + "language_loss": 0.97315758, + "learning_rate": 0.0007838745713853084, + "loss": 0.98494506, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.91015625, + "step": 1708, + "time_per_iteration": 2.5702459812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189964, + "balance_loss_mlp": 1.09879303, + "epoch": 0.328780300115429, + "flos": 567915629568.0, + "grad_norm": 0.029427091701823335, + "language_loss": 0.93208408, + "learning_rate": 0.0007836180541620053, + "loss": 0.94398379, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.91015625, + "step": 1709, + "time_per_iteration": 2.7365195751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189596, + "balance_loss_mlp": 1.09852052, + "epoch": 0.32897268180069256, + "flos": 476991204864.0, + "grad_norm": 0.02924752300223344, + "language_loss": 0.94609785, + "learning_rate": 0.0007833614268284082, + "loss": 0.95799387, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.90917969, + "step": 1710, + "time_per_iteration": 2.575416326522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186913, + "balance_loss_mlp": 1.09745789, + "epoch": 0.32916506348595614, + "flos": 1580450603520.0, + "grad_norm": 0.014653073497659498, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75296688, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.89257812, + "step": 1711, + "time_per_iteration": 4.8569114208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117837, + "balance_loss_mlp": 1.08681703, + "epoch": 0.3293574451712197, + "flos": 483851105280.0, + "grad_norm": 0.027096123044633498, + "language_loss": 0.8678506, + "learning_rate": 0.0007828478422289016, + "loss": 0.87963432, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.9140625, + "step": 1712, + "time_per_iteration": 2.5748305320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181971, + "balance_loss_mlp": 1.09041798, + "epoch": 0.32954982685648326, + "flos": 623724088320.0, + "grad_norm": 0.027491608740018197, + "language_loss": 0.97854888, + "learning_rate": 0.0007825908851623833, + "loss": 0.99036855, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.9140625, + "step": 1713, + "time_per_iteration": 2.7387707233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180742, + "balance_loss_mlp": 1.0893327, + "epoch": 0.32974220854174685, + "flos": 546070299648.0, + "grad_norm": 0.028986059756107307, + "language_loss": 0.93660253, + "learning_rate": 0.0007823338183843533, + "loss": 0.94840991, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.91259766, + "step": 1714, + "time_per_iteration": 2.7061285972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.10341084, + "epoch": 0.3299345902270104, + "flos": 983822286336.0, + "grad_norm": 0.02918308821255402, + "language_loss": 0.89344442, + "learning_rate": 0.0007820766419946141, + "loss": 0.90539211, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.91210938, + "step": 1715, + "time_per_iteration": 3.2698333263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119133, + "balance_loss_mlp": 1.10206604, + "epoch": 0.33012697191227397, + "flos": 1406901926400.0, + "grad_norm": 0.008988097140154246, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.8086381, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.890625, + "step": 1716, + "time_per_iteration": 4.931420564651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193588, + "balance_loss_mlp": 1.10213029, + "epoch": 0.3303193535975375, + "flos": 506169795072.0, + "grad_norm": 0.03043585823380059, + "language_loss": 0.87317824, + "learning_rate": 0.0007815619607794288, + "loss": 0.88511419, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.91308594, + "step": 1717, + "time_per_iteration": 2.611924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198413, + "balance_loss_mlp": 1.10676467, + "epoch": 0.3305117352828011, + "flos": 939484349952.0, + "grad_norm": 0.029759763631388395, + "language_loss": 0.92828202, + "learning_rate": 0.0007813044561538001, + "loss": 0.94026613, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.91503906, + "step": 1718, + "time_per_iteration": 3.188633680343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186368, + "balance_loss_mlp": 1.09495842, + "epoch": 0.3307041169680646, + "flos": 722793507840.0, + "grad_norm": 0.027827869889066197, + "language_loss": 0.97286105, + "learning_rate": 0.0007810468423160958, + "loss": 0.9847247, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.91259766, + "step": 1719, + "time_per_iteration": 2.8963494300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179653, + "balance_loss_mlp": 1.08829057, + "epoch": 0.3308964986533282, + "flos": 584815965696.0, + "grad_norm": 0.0232486528054596, + "language_loss": 0.89203978, + "learning_rate": 0.0007807891193663306, + "loss": 0.90383637, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.91210938, + "step": 1720, + "time_per_iteration": 2.784005880355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188579, + "balance_loss_mlp": 1.09712148, + "epoch": 0.33108888033859174, + "flos": 474525075456.0, + "grad_norm": 0.03234593548431852, + "language_loss": 0.92577451, + "learning_rate": 0.0007805312874045614, + "loss": 0.93766028, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.91308594, + "step": 1721, + "time_per_iteration": 2.5072579383850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187856, + "balance_loss_mlp": 1.09635103, + "epoch": 0.3312812620238553, + "flos": 386996035584.0, + "grad_norm": 0.030880666413309405, + "language_loss": 0.96009982, + "learning_rate": 0.0007802733465308874, + "loss": 0.97197837, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.91357422, + "step": 1722, + "time_per_iteration": 2.460878372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193288, + "balance_loss_mlp": 1.10173571, + "epoch": 0.3314736437091189, + "flos": 495604333056.0, + "grad_norm": 0.02871647017272099, + "language_loss": 0.9219079, + "learning_rate": 0.0007800152968454501, + "loss": 0.93384075, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.9140625, + "step": 1723, + "time_per_iteration": 2.6537680625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185112, + "balance_loss_mlp": 1.09365499, + "epoch": 0.33166602539438245, + "flos": 654930376704.0, + "grad_norm": 0.0223046700763118, + "language_loss": 0.96869862, + "learning_rate": 0.0007797571384484334, + "loss": 0.98054969, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.91308594, + "step": 1724, + "time_per_iteration": 2.8509135246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180603, + "balance_loss_mlp": 1.08909798, + "epoch": 0.33185840707964603, + "flos": 521834701824.0, + "grad_norm": 0.02731483808063424, + "language_loss": 1.00636935, + "learning_rate": 0.0007794988714400633, + "loss": 1.01817536, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.91357422, + "step": 1725, + "time_per_iteration": 2.5883586406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180377, + "balance_loss_mlp": 1.08901501, + "epoch": 0.33205078876490957, + "flos": 437898432000.0, + "grad_norm": 0.028871117282170154, + "language_loss": 0.94438303, + "learning_rate": 0.0007792404959206079, + "loss": 0.95618677, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.91210938, + "step": 1726, + "time_per_iteration": 2.522392988204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196305, + "balance_loss_mlp": 1.10499096, + "epoch": 0.33224317045017315, + "flos": 770094402048.0, + "grad_norm": 0.026417182809826974, + "language_loss": 0.89548182, + "learning_rate": 0.0007789820119903774, + "loss": 0.90744483, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.91162109, + "step": 1727, + "time_per_iteration": 3.015399217605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119368, + "balance_loss_mlp": 1.10441589, + "epoch": 0.3324355521354367, + "flos": 1469293584384.0, + "grad_norm": 0.009201187704085647, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79686344, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.890625, + "step": 1728, + "time_per_iteration": 4.849627494812012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187682, + "balance_loss_mlp": 1.09641564, + "epoch": 0.3326279338207003, + "flos": 497799217152.0, + "grad_norm": 0.02618775195690524, + "language_loss": 0.91979456, + "learning_rate": 0.0007784647192990428, + "loss": 0.93167138, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.91113281, + "step": 1729, + "time_per_iteration": 2.6944785118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178599, + "balance_loss_mlp": 1.08761811, + "epoch": 0.33282031550596386, + "flos": 637053121536.0, + "grad_norm": 0.02771760173732663, + "language_loss": 0.88792735, + "learning_rate": 0.0007782059107387696, + "loss": 0.89971334, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.90820312, + "step": 1730, + "time_per_iteration": 2.8583710193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179548, + "balance_loss_mlp": 1.0887109, + "epoch": 0.3330126971912274, + "flos": 690721090560.0, + "grad_norm": 0.027739782699759397, + "language_loss": 0.98025161, + "learning_rate": 0.0007779469941693826, + "loss": 0.99204707, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.90673828, + "step": 1731, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184359, + "balance_loss_mlp": 1.09361696, + "epoch": 0.333205078876491, + "flos": 567553059840.0, + "grad_norm": 0.03096728777448764, + "language_loss": 0.86715639, + "learning_rate": 0.0007776879696914029, + "loss": 0.87899995, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.90576172, + "step": 1732, + "time_per_iteration": 2.8331797122955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179804, + "balance_loss_mlp": 1.08906233, + "epoch": 0.3333974605617545, + "flos": 642170030592.0, + "grad_norm": 0.024377484958938406, + "language_loss": 0.95668435, + "learning_rate": 0.000777428837405392, + "loss": 0.96848238, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.90576172, + "step": 1733, + "time_per_iteration": 2.8495984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.087345, + "epoch": 0.3335898422470181, + "flos": 462778578432.0, + "grad_norm": 0.02888991438897714, + "language_loss": 0.96001673, + "learning_rate": 0.0007771695974119544, + "loss": 0.97179955, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.90771484, + "step": 1734, + "time_per_iteration": 2.581843614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193993, + "balance_loss_mlp": 1.10267842, + "epoch": 0.33378222393228163, + "flos": 854336845824.0, + "grad_norm": 0.031032438471150628, + "language_loss": 0.84453082, + "learning_rate": 0.0007769102498117359, + "loss": 0.85647076, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.91162109, + "step": 1735, + "time_per_iteration": 3.092892646789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118579, + "balance_loss_mlp": 1.09471452, + "epoch": 0.3339746056175452, + "flos": 956308824576.0, + "grad_norm": 0.02638013374987503, + "language_loss": 0.87690091, + "learning_rate": 0.000776650794705424, + "loss": 0.88875878, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.90917969, + "step": 1736, + "time_per_iteration": 3.26749587059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188294, + "balance_loss_mlp": 1.09693241, + "epoch": 0.33416698730280875, + "flos": 545894381568.0, + "grad_norm": 0.025194797458818457, + "language_loss": 0.89670336, + "learning_rate": 0.0007763912321937483, + "loss": 0.90858638, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.91210938, + "step": 1737, + "time_per_iteration": 2.680321455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.09522188, + "epoch": 0.33435936898807234, + "flos": 1015875237888.0, + "grad_norm": 0.02847992800895855, + "language_loss": 0.91932124, + "learning_rate": 0.0007761315623774799, + "loss": 0.93118894, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.9140625, + "step": 1738, + "time_per_iteration": 3.3992278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.10014248, + "epoch": 0.3345517506733359, + "flos": 616371362304.0, + "grad_norm": 0.027566762490977777, + "language_loss": 0.97487831, + "learning_rate": 0.0007758717853574313, + "loss": 0.9867962, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.91503906, + "step": 1739, + "time_per_iteration": 2.7331244945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195023, + "balance_loss_mlp": 1.10327947, + "epoch": 0.33474413235859946, + "flos": 495569404416.0, + "grad_norm": 0.027457607023843998, + "language_loss": 0.9961037, + "learning_rate": 0.0007756119012344571, + "loss": 1.00805402, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.91601562, + "step": 1740, + "time_per_iteration": 2.5305063724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189378, + "balance_loss_mlp": 1.09772944, + "epoch": 0.33493651404386304, + "flos": 629487547392.0, + "grad_norm": 0.029043894294382887, + "language_loss": 0.93616855, + "learning_rate": 0.0007753519101094535, + "loss": 0.9480623, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.91503906, + "step": 1741, + "time_per_iteration": 2.7408056259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177762, + "balance_loss_mlp": 1.08630431, + "epoch": 0.3351288957291266, + "flos": 514742487552.0, + "grad_norm": 0.027889242250670986, + "language_loss": 0.95720202, + "learning_rate": 0.0007750918120833575, + "loss": 0.96897966, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.91308594, + "step": 1742, + "time_per_iteration": 2.5787625312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08818376, + "epoch": 0.33532127741439016, + "flos": 648482711040.0, + "grad_norm": 0.029208114264274002, + "language_loss": 0.95614851, + "learning_rate": 0.0007748316072571485, + "loss": 0.96794444, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.91259766, + "step": 1743, + "time_per_iteration": 2.751394033432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178526, + "balance_loss_mlp": 1.08764088, + "epoch": 0.3355136590996537, + "flos": 769788228096.0, + "grad_norm": 0.02678280054581141, + "language_loss": 0.86505532, + "learning_rate": 0.0007745712957318467, + "loss": 0.87684047, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.90722656, + "step": 1744, + "time_per_iteration": 2.9703569412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179715, + "balance_loss_mlp": 1.088925, + "epoch": 0.3357060407849173, + "flos": 596649057792.0, + "grad_norm": 0.023433474800662903, + "language_loss": 0.94101429, + "learning_rate": 0.0007743108776085141, + "loss": 0.95281148, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.90625, + "step": 1745, + "time_per_iteration": 2.7529683113098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184954, + "balance_loss_mlp": 1.09435499, + "epoch": 0.3358984224701808, + "flos": 599801395200.0, + "grad_norm": 0.02538707782704008, + "language_loss": 0.88967884, + "learning_rate": 0.0007740503529882543, + "loss": 0.9015283, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.90429688, + "step": 1746, + "time_per_iteration": 2.79131817817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188552, + "balance_loss_mlp": 1.09780991, + "epoch": 0.3360908041554444, + "flos": 579429812736.0, + "grad_norm": 0.028485119021284356, + "language_loss": 0.99668056, + "learning_rate": 0.0007737897219722114, + "loss": 1.00856614, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.90576172, + "step": 1747, + "time_per_iteration": 2.685925006866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189008, + "balance_loss_mlp": 1.09836173, + "epoch": 0.336283185840708, + "flos": 514620963840.0, + "grad_norm": 0.027318502045144608, + "language_loss": 0.90481317, + "learning_rate": 0.0007735289846615716, + "loss": 0.91670322, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.90478516, + "step": 1748, + "time_per_iteration": 2.62443470954895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189766, + "balance_loss_mlp": 1.09902358, + "epoch": 0.3364755675259715, + "flos": 526013623296.0, + "grad_norm": 0.026723032477842582, + "language_loss": 0.90137696, + "learning_rate": 0.0007732681411575621, + "loss": 0.91327465, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.90576172, + "step": 1749, + "time_per_iteration": 2.646358013153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182694, + "balance_loss_mlp": 1.09209466, + "epoch": 0.3366679492112351, + "flos": 555973748736.0, + "grad_norm": 0.023573972968583972, + "language_loss": 0.93333745, + "learning_rate": 0.0007730071915614514, + "loss": 0.94516432, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.90429688, + "step": 1750, + "time_per_iteration": 2.6758012771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08901942, + "epoch": 0.33686033089649864, + "flos": 428164170240.0, + "grad_norm": 0.030830494146199924, + "language_loss": 0.97502697, + "learning_rate": 0.0007727461359745489, + "loss": 0.98682547, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.90673828, + "step": 1751, + "time_per_iteration": 2.4563541412353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182248, + "balance_loss_mlp": 1.09145832, + "epoch": 0.3370527125817622, + "flos": 542840099328.0, + "grad_norm": 0.023246790346845608, + "language_loss": 0.93729055, + "learning_rate": 0.0007724849744982056, + "loss": 0.94911301, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.90625, + "step": 1752, + "time_per_iteration": 2.668113946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179422, + "balance_loss_mlp": 1.08858418, + "epoch": 0.33724509426702576, + "flos": 543230866944.0, + "grad_norm": 0.02371236203418416, + "language_loss": 0.90932786, + "learning_rate": 0.0007722237072338131, + "loss": 0.92112207, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.90673828, + "step": 1753, + "time_per_iteration": 2.69787335395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.08753431, + "epoch": 0.33743747595228935, + "flos": 473752272384.0, + "grad_norm": 0.029898359882718887, + "language_loss": 0.95709926, + "learning_rate": 0.0007719623342828046, + "loss": 0.96888256, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.90625, + "step": 1754, + "time_per_iteration": 2.4994091987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183652, + "balance_loss_mlp": 1.09295714, + "epoch": 0.33762985763755293, + "flos": 470836978176.0, + "grad_norm": 0.02665869511949433, + "language_loss": 0.93777692, + "learning_rate": 0.000771700855746654, + "loss": 0.94961339, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.90527344, + "step": 1755, + "time_per_iteration": 2.58086895942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178715, + "balance_loss_mlp": 1.08792567, + "epoch": 0.33782223932281646, + "flos": 493250995200.0, + "grad_norm": 0.024252070816233498, + "language_loss": 0.95916575, + "learning_rate": 0.0007714392717268763, + "loss": 0.97095293, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.90625, + "step": 1756, + "time_per_iteration": 2.5631322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180772, + "balance_loss_mlp": 1.08988702, + "epoch": 0.33801462100808005, + "flos": 466017510912.0, + "grad_norm": 0.025388958299120416, + "language_loss": 0.95127004, + "learning_rate": 0.0007711775823250273, + "loss": 0.96307778, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.90722656, + "step": 1757, + "time_per_iteration": 2.5053045749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178431, + "balance_loss_mlp": 1.08754551, + "epoch": 0.3382070026933436, + "flos": 797067374592.0, + "grad_norm": 0.024419621343361942, + "language_loss": 0.92107689, + "learning_rate": 0.0007709157876427039, + "loss": 0.93286121, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.90722656, + "step": 1758, + "time_per_iteration": 3.1007301807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178269, + "balance_loss_mlp": 1.08738351, + "epoch": 0.33839938437860717, + "flos": 509428193280.0, + "grad_norm": 0.024832384176200758, + "language_loss": 0.94253516, + "learning_rate": 0.0007706538877815439, + "loss": 0.95431781, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.90722656, + "step": 1759, + "time_per_iteration": 2.588744640350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178646, + "balance_loss_mlp": 1.0878557, + "epoch": 0.3385917660638707, + "flos": 485273186304.0, + "grad_norm": 0.02369115174437829, + "language_loss": 0.89945841, + "learning_rate": 0.0007703918828432259, + "loss": 0.91124481, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.90625, + "step": 1760, + "time_per_iteration": 2.5859875679016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178403, + "balance_loss_mlp": 1.08770907, + "epoch": 0.3387841477491343, + "flos": 546415405056.0, + "grad_norm": 0.02534991906570622, + "language_loss": 0.96946132, + "learning_rate": 0.000770129772929469, + "loss": 0.9812454, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.90527344, + "step": 1761, + "time_per_iteration": 2.633229970932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117744, + "balance_loss_mlp": 1.08684063, + "epoch": 0.3389765294343978, + "flos": 721063251456.0, + "grad_norm": 0.027907228809642075, + "language_loss": 0.96886694, + "learning_rate": 0.0007698675581420334, + "loss": 0.98064131, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.90429688, + "step": 1762, + "time_per_iteration": 2.8309946060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190138, + "balance_loss_mlp": 1.09987259, + "epoch": 0.3391689111196614, + "flos": 701263084032.0, + "grad_norm": 0.028701846645649853, + "language_loss": 0.87853253, + "learning_rate": 0.0007696052385827199, + "loss": 0.89043397, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.90087891, + "step": 1763, + "time_per_iteration": 2.9673497676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183311, + "balance_loss_mlp": 1.09304607, + "epoch": 0.339361292804925, + "flos": 628248115200.0, + "grad_norm": 0.027144566695111814, + "language_loss": 0.85910845, + "learning_rate": 0.00076934281435337, + "loss": 0.87094158, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.90087891, + "step": 1764, + "time_per_iteration": 2.7069530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011791, + "balance_loss_mlp": 1.08869135, + "epoch": 0.33955367449018853, + "flos": 610794554880.0, + "grad_norm": 0.025973604998757366, + "language_loss": 0.94002628, + "learning_rate": 0.0007690802855558658, + "loss": 0.95181727, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.90234375, + "step": 1765, + "time_per_iteration": 2.8596885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198868, + "balance_loss_mlp": 1.11151123, + "epoch": 0.3397460561754521, + "flos": 1456586357760.0, + "grad_norm": 0.018873382807181687, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77573818, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.87109375, + "step": 1766, + "time_per_iteration": 4.900039434432983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183458, + "balance_loss_mlp": 1.09304976, + "epoch": 0.33993843786071565, + "flos": 488290538496.0, + "grad_norm": 0.033631077459875626, + "language_loss": 1.00266671, + "learning_rate": 0.0007685549146641262, + "loss": 1.01450121, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.90234375, + "step": 1767, + "time_per_iteration": 2.521587610244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176512, + "balance_loss_mlp": 1.08557928, + "epoch": 0.34013081954597923, + "flos": 418232523264.0, + "grad_norm": 0.024531175575557927, + "language_loss": 0.95696396, + "learning_rate": 0.0007682920727738579, + "loss": 0.96872908, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.90771484, + "step": 1768, + "time_per_iteration": 2.4606878757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177177, + "balance_loss_mlp": 1.08614898, + "epoch": 0.34032320123124277, + "flos": 438430189056.0, + "grad_norm": 0.027457130501572214, + "language_loss": 0.93990809, + "learning_rate": 0.000768029126723369, + "loss": 0.95167989, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.90869141, + "step": 1769, + "time_per_iteration": 2.494699478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.09077609, + "epoch": 0.34051558291650635, + "flos": 458543261184.0, + "grad_norm": 0.027949795017340132, + "language_loss": 0.90377855, + "learning_rate": 0.0007677660766147447, + "loss": 0.91559708, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.90917969, + "step": 1770, + "time_per_iteration": 2.5302748680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183578, + "balance_loss_mlp": 1.09469604, + "epoch": 0.3407079646017699, + "flos": 1562137645056.0, + "grad_norm": 0.011444512115251876, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73654521, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.88671875, + "step": 1771, + "time_per_iteration": 4.913311004638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188847, + "balance_loss_mlp": 1.09758055, + "epoch": 0.3409003462870335, + "flos": 493530972672.0, + "grad_norm": 0.032062498304007335, + "language_loss": 0.91194993, + "learning_rate": 0.0007672396646316306, + "loss": 0.92383844, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.91113281, + "step": 1772, + "time_per_iteration": 2.539181709289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.08885825, + "epoch": 0.34109272797229706, + "flos": 809820989952.0, + "grad_norm": 0.028470010979029077, + "language_loss": 0.88439053, + "learning_rate": 0.000766976302961512, + "loss": 0.89618981, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.90917969, + "step": 1773, + "time_per_iteration": 3.006547212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181829, + "balance_loss_mlp": 1.09094357, + "epoch": 0.3412851096575606, + "flos": 471099491328.0, + "grad_norm": 0.02901021255147234, + "language_loss": 0.91066158, + "learning_rate": 0.0007667128376420003, + "loss": 0.92247993, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.90722656, + "step": 1774, + "time_per_iteration": 2.534266233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118318, + "balance_loss_mlp": 1.09253371, + "epoch": 0.3414774913428242, + "flos": 596770581504.0, + "grad_norm": 0.02876896591079206, + "language_loss": 0.92739397, + "learning_rate": 0.0007664492687753817, + "loss": 0.93922579, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.90478516, + "step": 1775, + "time_per_iteration": 2.671475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181574, + "balance_loss_mlp": 1.09102285, + "epoch": 0.3416698730280877, + "flos": 528507950592.0, + "grad_norm": 0.025483549401886952, + "language_loss": 0.89018893, + "learning_rate": 0.000766185596463983, + "loss": 0.90200466, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.90380859, + "step": 1776, + "time_per_iteration": 2.6099884510040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177935, + "balance_loss_mlp": 1.08719325, + "epoch": 0.3418622547133513, + "flos": 876117047808.0, + "grad_norm": 0.026020404961979337, + "language_loss": 0.84743214, + "learning_rate": 0.0007659218208101706, + "loss": 0.8592115, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.90576172, + "step": 1777, + "time_per_iteration": 3.1272366046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118093, + "balance_loss_mlp": 1.08994997, + "epoch": 0.34205463639861483, + "flos": 604876644864.0, + "grad_norm": 0.024068405360429687, + "language_loss": 0.91582745, + "learning_rate": 0.0007656579419163515, + "loss": 0.92763674, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.90820312, + "step": 1778, + "time_per_iteration": 2.7243831157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.0894556, + "epoch": 0.3422470180838784, + "flos": 464714952192.0, + "grad_norm": 0.02739040164484414, + "language_loss": 0.86445272, + "learning_rate": 0.0007653939598849724, + "loss": 0.87625706, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.90820312, + "step": 1779, + "time_per_iteration": 2.4913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180695, + "balance_loss_mlp": 1.09143066, + "epoch": 0.34243939976914195, + "flos": 1589816291328.0, + "grad_norm": 0.01051605552964957, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84060901, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.890625, + "step": 1780, + "time_per_iteration": 4.891184091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176554, + "balance_loss_mlp": 1.085621, + "epoch": 0.34263178145440554, + "flos": 874443187200.0, + "grad_norm": 0.026322112436007235, + "language_loss": 0.88782489, + "learning_rate": 0.000764865686819522, + "loss": 0.89959043, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.90771484, + "step": 1781, + "time_per_iteration": 3.048123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176352, + "balance_loss_mlp": 1.08551466, + "epoch": 0.3428241631396691, + "flos": 507873854976.0, + "grad_norm": 0.024622696081698998, + "language_loss": 0.93515933, + "learning_rate": 0.0007646013959905449, + "loss": 0.94692284, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.90673828, + "step": 1782, + "time_per_iteration": 2.565661907196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176257, + "balance_loss_mlp": 1.08565772, + "epoch": 0.34301654482493266, + "flos": 881524667904.0, + "grad_norm": 0.0252118274748732, + "language_loss": 0.880337, + "learning_rate": 0.0007643370024341949, + "loss": 0.89209956, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.90429688, + "step": 1783, + "time_per_iteration": 3.0695888996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180625, + "balance_loss_mlp": 1.08959711, + "epoch": 0.34320892651019624, + "flos": 432668731392.0, + "grad_norm": 0.024350173092139916, + "language_loss": 0.89407057, + "learning_rate": 0.0007640725062531195, + "loss": 0.90587682, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.90869141, + "step": 1784, + "time_per_iteration": 2.5120832920074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184023, + "balance_loss_mlp": 1.09294736, + "epoch": 0.3434013081954598, + "flos": 464593428480.0, + "grad_norm": 0.02877111448667641, + "language_loss": 0.95969987, + "learning_rate": 0.0007638079075500047, + "loss": 0.97154009, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.90917969, + "step": 1785, + "time_per_iteration": 2.5176198482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194351, + "balance_loss_mlp": 1.10546875, + "epoch": 0.34359368988072336, + "flos": 1560674631168.0, + "grad_norm": 0.01088995253456435, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.7637502, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.88671875, + "step": 1786, + "time_per_iteration": 5.021549463272095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183341, + "balance_loss_mlp": 1.09278917, + "epoch": 0.3437860715659869, + "flos": 496572519936.0, + "grad_norm": 0.024204144242014246, + "language_loss": 0.90540475, + "learning_rate": 0.0007632784029886026, + "loss": 0.91723818, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.90380859, + "step": 1787, + "time_per_iteration": 2.6350793838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178894, + "balance_loss_mlp": 1.08791375, + "epoch": 0.3439784532512505, + "flos": 719608969728.0, + "grad_norm": 0.025958683961259412, + "language_loss": 0.93068433, + "learning_rate": 0.0007630134973358873, + "loss": 0.94247323, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.90820312, + "step": 1788, + "time_per_iteration": 2.93084454536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178793, + "balance_loss_mlp": 1.08785999, + "epoch": 0.34417083493651407, + "flos": 566921246208.0, + "grad_norm": 0.025032512144454056, + "language_loss": 0.92506206, + "learning_rate": 0.0007627484895722763, + "loss": 0.93685007, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.90771484, + "step": 1789, + "time_per_iteration": 2.649689197540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177857, + "balance_loss_mlp": 1.08706772, + "epoch": 0.3443632166217776, + "flos": 797701189632.0, + "grad_norm": 0.027302991531117576, + "language_loss": 0.89870507, + "learning_rate": 0.0007624833798006552, + "loss": 0.9104836, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.90625, + "step": 1790, + "time_per_iteration": 3.0469179153442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117862, + "balance_loss_mlp": 1.08811665, + "epoch": 0.3445555983070412, + "flos": 570392492544.0, + "grad_norm": 0.0288389056738737, + "language_loss": 0.92729777, + "learning_rate": 0.0007622181681239483, + "loss": 0.93908393, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.90332031, + "step": 1791, + "time_per_iteration": 2.6440184116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178949, + "balance_loss_mlp": 1.08849263, + "epoch": 0.3447479799923047, + "flos": 569980257792.0, + "grad_norm": 0.022982775931836206, + "language_loss": 0.91584516, + "learning_rate": 0.0007619528546451202, + "loss": 0.9276346, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.90283203, + "step": 1792, + "time_per_iteration": 2.797133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177091, + "balance_loss_mlp": 1.08673048, + "epoch": 0.3449403616775683, + "flos": 969331683840.0, + "grad_norm": 0.02628926210615307, + "language_loss": 0.90923131, + "learning_rate": 0.0007616874394671745, + "loss": 0.92100227, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.90185547, + "step": 1793, + "time_per_iteration": 3.3191378116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178301, + "balance_loss_mlp": 1.08784556, + "epoch": 0.34513274336283184, + "flos": 569676085248.0, + "grad_norm": 0.03267712320672132, + "language_loss": 0.9558928, + "learning_rate": 0.0007614219226931547, + "loss": 0.96767581, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.90283203, + "step": 1794, + "time_per_iteration": 2.677525043487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178051, + "balance_loss_mlp": 1.0875473, + "epoch": 0.3453251250480954, + "flos": 461858055168.0, + "grad_norm": 0.024689469906648515, + "language_loss": 0.92397773, + "learning_rate": 0.0007611563044261435, + "loss": 0.93575823, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.90332031, + "step": 1795, + "time_per_iteration": 2.5183908939361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178812, + "balance_loss_mlp": 1.08835602, + "epoch": 0.34551750673335896, + "flos": 416519731200.0, + "grad_norm": 0.027710199676415265, + "language_loss": 0.96473086, + "learning_rate": 0.0007608905847692631, + "loss": 0.97651899, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.90283203, + "step": 1796, + "time_per_iteration": 2.4600772857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182482, + "balance_loss_mlp": 1.09212101, + "epoch": 0.34570988841862255, + "flos": 589114409472.0, + "grad_norm": 0.023363368939277738, + "language_loss": 0.92555124, + "learning_rate": 0.0007606247638256749, + "loss": 0.93737608, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.90185547, + "step": 1797, + "time_per_iteration": 2.8326525688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183395, + "balance_loss_mlp": 1.09565735, + "epoch": 0.34590227010388613, + "flos": 1571142764544.0, + "grad_norm": 0.009651567236440416, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79353684, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.875, + "step": 1798, + "time_per_iteration": 4.921091794967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.09259033, + "epoch": 0.34609465178914967, + "flos": 1540928131584.0, + "grad_norm": 0.004186018133500934, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.8050791, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.87890625, + "step": 1799, + "time_per_iteration": 4.76463508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177428, + "balance_loss_mlp": 1.08692396, + "epoch": 0.34628703347441325, + "flos": 610516578816.0, + "grad_norm": 0.027319297321258894, + "language_loss": 0.94778776, + "learning_rate": 0.0007598266943068686, + "loss": 0.95956194, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.90332031, + "step": 1800, + "time_per_iteration": 2.741830348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180421, + "balance_loss_mlp": 1.0898217, + "epoch": 0.3464794151596768, + "flos": 474264563712.0, + "grad_norm": 0.0268607754896097, + "language_loss": 0.91417915, + "learning_rate": 0.0007595604692488507, + "loss": 0.92598337, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.90429688, + "step": 1801, + "time_per_iteration": 2.5253777503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117756, + "balance_loss_mlp": 1.08719921, + "epoch": 0.34667179684494037, + "flos": 606821750784.0, + "grad_norm": 0.0251267071243342, + "language_loss": 0.907076, + "learning_rate": 0.0007592941434205215, + "loss": 0.91885161, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.90185547, + "step": 1802, + "time_per_iteration": 2.7729735374450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175873, + "balance_loss_mlp": 1.0877533, + "epoch": 0.3468641785302039, + "flos": 1568359727616.0, + "grad_norm": 0.004114808875680539, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74746931, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.87890625, + "step": 1803, + "time_per_iteration": 5.036771774291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178076, + "balance_loss_mlp": 1.08776271, + "epoch": 0.3470565602154675, + "flos": 908723223552.0, + "grad_norm": 0.03174792037748739, + "language_loss": 0.90712535, + "learning_rate": 0.0007587611898665566, + "loss": 0.91890609, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.90136719, + "step": 1804, + "time_per_iteration": 3.0725910663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177414, + "balance_loss_mlp": 1.08719671, + "epoch": 0.347248941900731, + "flos": 640059740160.0, + "grad_norm": 0.023310551488003612, + "language_loss": 0.90306699, + "learning_rate": 0.0007584945623478315, + "loss": 0.91484118, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.90039062, + "step": 1805, + "time_per_iteration": 2.8080646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176916, + "balance_loss_mlp": 1.08655512, + "epoch": 0.3474413235859946, + "flos": 848781505536.0, + "grad_norm": 0.027596494202169034, + "language_loss": 0.90514499, + "learning_rate": 0.000758227834472617, + "loss": 0.91691411, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.90185547, + "step": 1806, + "time_per_iteration": 3.0443291664123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179899, + "balance_loss_mlp": 1.08972931, + "epoch": 0.3476337052712582, + "flos": 516696325632.0, + "grad_norm": 0.02724510251762829, + "language_loss": 0.86438924, + "learning_rate": 0.0007579610063444664, + "loss": 0.87618828, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.89990234, + "step": 1807, + "time_per_iteration": 2.716522455215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177066, + "balance_loss_mlp": 1.08694386, + "epoch": 0.34782608695652173, + "flos": 915114493440.0, + "grad_norm": 0.02927822844999151, + "language_loss": 0.96424794, + "learning_rate": 0.0007576940780669712, + "loss": 0.97601861, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.89941406, + "step": 1808, + "time_per_iteration": 3.21464204788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08734941, + "epoch": 0.3480184686417853, + "flos": 775083056640.0, + "grad_norm": 0.026376675364870938, + "language_loss": 0.91835052, + "learning_rate": 0.0007574270497437624, + "loss": 0.93012476, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.89892578, + "step": 1809, + "time_per_iteration": 2.965306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177298, + "balance_loss_mlp": 1.0874145, + "epoch": 0.34821085032704885, + "flos": 578003728896.0, + "grad_norm": 0.024336980271772477, + "language_loss": 0.95592844, + "learning_rate": 0.000757159921478509, + "loss": 0.96770144, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.89697266, + "step": 1810, + "time_per_iteration": 2.781496047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177093, + "balance_loss_mlp": 1.088974, + "epoch": 0.34840323201231244, + "flos": 1528039531008.0, + "grad_norm": 0.007178450494277746, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75627732, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.87890625, + "step": 1811, + "time_per_iteration": 4.719515562057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176704, + "balance_loss_mlp": 1.08691561, + "epoch": 0.34859561369757597, + "flos": 510181530624.0, + "grad_norm": 0.02648580139398905, + "language_loss": 0.96071857, + "learning_rate": 0.0007566253655367423, + "loss": 0.97248554, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.89599609, + "step": 1812, + "time_per_iteration": 2.5699198246002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177921, + "balance_loss_mlp": 1.08822834, + "epoch": 0.34878799538283956, + "flos": 549756395520.0, + "grad_norm": 0.036663453377328174, + "language_loss": 0.96810794, + "learning_rate": 0.000756357938067762, + "loss": 0.97988713, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.89501953, + "step": 1813, + "time_per_iteration": 2.6622092723846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179077, + "balance_loss_mlp": 1.08885992, + "epoch": 0.34898037706810314, + "flos": 985193975808.0, + "grad_norm": 0.026013801782247825, + "language_loss": 0.90032709, + "learning_rate": 0.0007560904110718033, + "loss": 0.91211784, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.90039062, + "step": 1814, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.08639514, + "epoch": 0.3491727587533667, + "flos": 682836607488.0, + "grad_norm": 0.025025787643359835, + "language_loss": 0.91824377, + "learning_rate": 0.0007558227846527297, + "loss": 0.93000984, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.90039062, + "step": 1815, + "time_per_iteration": 2.870858907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176853, + "balance_loss_mlp": 1.08673084, + "epoch": 0.34936514043863026, + "flos": 394889250816.0, + "grad_norm": 0.0291076708707547, + "language_loss": 0.91979998, + "learning_rate": 0.0007555550589144429, + "loss": 0.9315685, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.89941406, + "step": 1816, + "time_per_iteration": 2.4363009929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08739722, + "epoch": 0.3495575221238938, + "flos": 462340147200.0, + "grad_norm": 0.02440335273431038, + "language_loss": 0.92281306, + "learning_rate": 0.000755287233960883, + "loss": 0.9345873, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.8984375, + "step": 1817, + "time_per_iteration": 2.538250207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117706, + "balance_loss_mlp": 1.08693826, + "epoch": 0.3497499038091574, + "flos": 725428824576.0, + "grad_norm": 0.028430093115180927, + "language_loss": 0.88002723, + "learning_rate": 0.0007550193098960292, + "loss": 0.89179784, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.89941406, + "step": 1818, + "time_per_iteration": 2.8685545921325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08411181, + "epoch": 0.3499422854944209, + "flos": 829196187648.0, + "grad_norm": 0.021653398091314287, + "language_loss": 0.92103571, + "learning_rate": 0.0007547512868238988, + "loss": 0.93277991, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.90136719, + "step": 1819, + "time_per_iteration": 3.115814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.092013, + "epoch": 0.3501346671796845, + "flos": 494542820352.0, + "grad_norm": 0.026515438979626053, + "language_loss": 0.9198699, + "learning_rate": 0.0007544831648485473, + "loss": 0.93169028, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.8984375, + "step": 1820, + "time_per_iteration": 2.6666150093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178247, + "balance_loss_mlp": 1.08783865, + "epoch": 0.35032704886494803, + "flos": 579848778240.0, + "grad_norm": 0.026574936148936048, + "language_loss": 0.89372301, + "learning_rate": 0.0007542149440740694, + "loss": 0.90550542, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.90234375, + "step": 1821, + "time_per_iteration": 2.6776442527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178869, + "balance_loss_mlp": 1.08841276, + "epoch": 0.3505194305502116, + "flos": 585831816192.0, + "grad_norm": 0.02674162112947977, + "language_loss": 0.9602831, + "learning_rate": 0.000753946624604597, + "loss": 0.97207189, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.90283203, + "step": 1822, + "time_per_iteration": 2.746363639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175368, + "balance_loss_mlp": 1.08491182, + "epoch": 0.3507118122354752, + "flos": 527978194944.0, + "grad_norm": 0.02703682960411951, + "language_loss": 0.95658362, + "learning_rate": 0.0007536782065443015, + "loss": 0.9683373, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.90283203, + "step": 1823, + "time_per_iteration": 2.5945184230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175188, + "balance_loss_mlp": 1.08458936, + "epoch": 0.35090419392073874, + "flos": 512545602048.0, + "grad_norm": 0.03278557538641046, + "language_loss": 0.86822712, + "learning_rate": 0.0007534096899973919, + "loss": 0.87997901, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.90429688, + "step": 1824, + "time_per_iteration": 2.56933331489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184456, + "balance_loss_mlp": 1.0944289, + "epoch": 0.3510965756060023, + "flos": 565195719168.0, + "grad_norm": 0.023191753507183704, + "language_loss": 0.89392567, + "learning_rate": 0.0007531410750681154, + "loss": 0.90577018, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.8984375, + "step": 1825, + "time_per_iteration": 2.7223169803619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186327, + "balance_loss_mlp": 1.09630024, + "epoch": 0.35128895729126586, + "flos": 1022253046272.0, + "grad_norm": 0.026424599574572643, + "language_loss": 0.93470478, + "learning_rate": 0.0007528723618607575, + "loss": 0.94656801, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.8984375, + "step": 1826, + "time_per_iteration": 3.404395580291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182394, + "balance_loss_mlp": 1.09236717, + "epoch": 0.35148133897652944, + "flos": 589424586240.0, + "grad_norm": 0.02767542011563751, + "language_loss": 0.89242589, + "learning_rate": 0.0007526035504796422, + "loss": 0.90424991, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.8984375, + "step": 1827, + "time_per_iteration": 2.820510149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117853, + "balance_loss_mlp": 1.08850324, + "epoch": 0.351673720661793, + "flos": 496285811712.0, + "grad_norm": 0.02845608163714707, + "language_loss": 0.94670665, + "learning_rate": 0.0007523346410291312, + "loss": 0.95849192, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.8984375, + "step": 1828, + "time_per_iteration": 2.763277053833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177518, + "balance_loss_mlp": 1.08753836, + "epoch": 0.35186610234705656, + "flos": 763998572544.0, + "grad_norm": 0.028566964886064136, + "language_loss": 0.91855693, + "learning_rate": 0.0007520656336136245, + "loss": 0.93033206, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.89794922, + "step": 1829, + "time_per_iteration": 2.9501917362213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179113, + "balance_loss_mlp": 1.08908641, + "epoch": 0.3520584840323201, + "flos": 627388717056.0, + "grad_norm": 0.0235814228834027, + "language_loss": 0.94624627, + "learning_rate": 0.0007517965283375599, + "loss": 0.95803738, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.8984375, + "step": 1830, + "time_per_iteration": 2.8197402954101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08992577, + "epoch": 0.3522508657175837, + "flos": 538448329728.0, + "grad_norm": 0.025024391475303026, + "language_loss": 0.97205818, + "learning_rate": 0.0007515273253054132, + "loss": 0.9838568, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.89746094, + "step": 1831, + "time_per_iteration": 2.6376330852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191124, + "balance_loss_mlp": 1.10109711, + "epoch": 0.35244324740284727, + "flos": 568501780992.0, + "grad_norm": 0.029882616882314406, + "language_loss": 0.9266001, + "learning_rate": 0.0007512580246216988, + "loss": 0.93851131, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.8984375, + "step": 1832, + "time_per_iteration": 2.708432912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179716, + "balance_loss_mlp": 1.08964145, + "epoch": 0.3526356290881108, + "flos": 514054278144.0, + "grad_norm": 0.030813246422457925, + "language_loss": 0.91671479, + "learning_rate": 0.000750988626390968, + "loss": 0.92851192, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.89892578, + "step": 1833, + "time_per_iteration": 2.592047929763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179987, + "balance_loss_mlp": 1.09010315, + "epoch": 0.3528280107733744, + "flos": 596972696064.0, + "grad_norm": 0.024705197674389605, + "language_loss": 0.91622353, + "learning_rate": 0.0007507191307178108, + "loss": 0.9280234, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.89697266, + "step": 1834, + "time_per_iteration": 2.7884535789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176506, + "balance_loss_mlp": 1.08652651, + "epoch": 0.3530203924586379, + "flos": 552298386432.0, + "grad_norm": 0.0302975798262418, + "language_loss": 0.83893424, + "learning_rate": 0.0007504495377068543, + "loss": 0.85069931, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.89794922, + "step": 1835, + "time_per_iteration": 2.7751786708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175764, + "balance_loss_mlp": 1.08573675, + "epoch": 0.3532127741439015, + "flos": 654305293824.0, + "grad_norm": 0.027517554164180617, + "language_loss": 0.90655488, + "learning_rate": 0.0007501798474627642, + "loss": 0.91831255, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.8984375, + "step": 1836, + "time_per_iteration": 2.9638845920562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179149, + "balance_loss_mlp": 1.08926523, + "epoch": 0.35340515582916504, + "flos": 724150460928.0, + "grad_norm": 0.024568481275515953, + "language_loss": 0.91140759, + "learning_rate": 0.0007499100600902433, + "loss": 0.92319906, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.89697266, + "step": 1837, + "time_per_iteration": 2.9948322772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184038, + "balance_loss_mlp": 1.09396327, + "epoch": 0.35359753751442863, + "flos": 595997778432.0, + "grad_norm": 0.031821297821065, + "language_loss": 0.92654896, + "learning_rate": 0.0007496401756940324, + "loss": 0.9383893, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.89892578, + "step": 1838, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176486, + "balance_loss_mlp": 1.08665001, + "epoch": 0.3537899191996922, + "flos": 633805456896.0, + "grad_norm": 0.02718368250353396, + "language_loss": 0.91091663, + "learning_rate": 0.0007493701943789098, + "loss": 0.92268145, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.89648438, + "step": 1839, + "time_per_iteration": 2.779574155807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175825, + "balance_loss_mlp": 1.08608413, + "epoch": 0.35398230088495575, + "flos": 507352831488.0, + "grad_norm": 0.028671493841357993, + "language_loss": 0.91863656, + "learning_rate": 0.000749100116249692, + "loss": 0.93039483, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.89550781, + "step": 1840, + "time_per_iteration": 2.607614755630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189406, + "balance_loss_mlp": 1.09980869, + "epoch": 0.35417468257021933, + "flos": 509046157824.0, + "grad_norm": 0.03229862826848899, + "language_loss": 0.95953786, + "learning_rate": 0.0007488299414112321, + "loss": 0.97143197, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.89404297, + "step": 1841, + "time_per_iteration": 2.566596746444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181321, + "balance_loss_mlp": 1.09210455, + "epoch": 0.35436706425548287, + "flos": 657659019264.0, + "grad_norm": 0.02732135002339032, + "language_loss": 0.86453879, + "learning_rate": 0.0007485596699684215, + "loss": 0.87635195, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.89013672, + "step": 1842, + "time_per_iteration": 2.8111371994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185021, + "balance_loss_mlp": 1.09575689, + "epoch": 0.35455944594074645, + "flos": 653888329728.0, + "grad_norm": 0.026686949506238997, + "language_loss": 0.92940086, + "learning_rate": 0.000748289302026189, + "loss": 0.94125104, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.890625, + "step": 1843, + "time_per_iteration": 2.8244054317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187203, + "balance_loss_mlp": 1.09793901, + "epoch": 0.35475182762601, + "flos": 850010204160.0, + "grad_norm": 0.02649701564047654, + "language_loss": 0.9307664, + "learning_rate": 0.0007480188376895004, + "loss": 0.94263846, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.890625, + "step": 1844, + "time_per_iteration": 3.041001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187935, + "balance_loss_mlp": 1.10115051, + "epoch": 0.3549442093112736, + "flos": 1524775128576.0, + "grad_norm": 0.01173136965559212, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74999273, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.86914062, + "step": 1845, + "time_per_iteration": 4.865761756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183261, + "balance_loss_mlp": 1.09390223, + "epoch": 0.3551365909965371, + "flos": 652714025472.0, + "grad_norm": 0.028658093872898062, + "language_loss": 0.85614175, + "learning_rate": 0.0007474776202528074, + "loss": 0.8679744, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.89160156, + "step": 1846, + "time_per_iteration": 2.9342904090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184977, + "balance_loss_mlp": 1.0954746, + "epoch": 0.3553289726818007, + "flos": 898921832448.0, + "grad_norm": 0.03609141350995601, + "language_loss": 0.89849555, + "learning_rate": 0.000747206867362922, + "loss": 0.91034532, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.89306641, + "step": 1847, + "time_per_iteration": 3.1089484691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185041, + "balance_loss_mlp": 1.09553862, + "epoch": 0.3555213543670643, + "flos": 689733437952.0, + "grad_norm": 0.0286779566522822, + "language_loss": 0.9096849, + "learning_rate": 0.0007469360184988194, + "loss": 0.92153525, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.89306641, + "step": 1848, + "time_per_iteration": 2.820265293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183493, + "balance_loss_mlp": 1.09399033, + "epoch": 0.3557137360523278, + "flos": 539603168256.0, + "grad_norm": 0.02648998316664428, + "language_loss": 0.93967247, + "learning_rate": 0.0007466650737656518, + "loss": 0.95150745, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.89306641, + "step": 1849, + "time_per_iteration": 2.596639394760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183541, + "balance_loss_mlp": 1.09427702, + "epoch": 0.3559061177375914, + "flos": 403153767936.0, + "grad_norm": 0.02765421607491624, + "language_loss": 0.97574586, + "learning_rate": 0.0007463940332686098, + "loss": 0.98758125, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.890625, + "step": 1850, + "time_per_iteration": 2.478158473968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177245, + "balance_loss_mlp": 1.08764756, + "epoch": 0.35609849942285493, + "flos": 697893895680.0, + "grad_norm": 0.023379973164811964, + "language_loss": 0.90857208, + "learning_rate": 0.0007461228971129205, + "loss": 0.92034447, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.89404297, + "step": 1851, + "time_per_iteration": 2.9202487468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179211, + "balance_loss_mlp": 1.08966124, + "epoch": 0.3562908811081185, + "flos": 570001724928.0, + "grad_norm": 0.028863121832353986, + "language_loss": 0.92692959, + "learning_rate": 0.0007458516654038483, + "loss": 0.93872178, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.89355469, + "step": 1852, + "time_per_iteration": 2.658867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179202, + "balance_loss_mlp": 1.08936572, + "epoch": 0.35648326279338205, + "flos": 683609410560.0, + "grad_norm": 0.028040747176241956, + "language_loss": 0.94642723, + "learning_rate": 0.0007455803382466946, + "loss": 0.95821923, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.89648438, + "step": 1853, + "time_per_iteration": 2.86330509185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183408, + "balance_loss_mlp": 1.09376252, + "epoch": 0.35667564447864564, + "flos": 630340941312.0, + "grad_norm": 0.02553826751691769, + "language_loss": 0.94946796, + "learning_rate": 0.0007453089157467979, + "loss": 0.96130198, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.89453125, + "step": 1854, + "time_per_iteration": 2.792577028274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180437, + "balance_loss_mlp": 1.09093451, + "epoch": 0.35686802616390917, + "flos": 815504584704.0, + "grad_norm": 0.02468703395074296, + "language_loss": 0.8986901, + "learning_rate": 0.0007450373980095341, + "loss": 0.91049451, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.89306641, + "step": 1855, + "time_per_iteration": 3.0555014610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182657, + "balance_loss_mlp": 1.09334552, + "epoch": 0.35706040784917276, + "flos": 527205391872.0, + "grad_norm": 0.02890256158864057, + "language_loss": 0.93639445, + "learning_rate": 0.0007447657851403155, + "loss": 0.94822103, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.89111328, + "step": 1856, + "time_per_iteration": 2.589708089828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182935, + "balance_loss_mlp": 1.09367096, + "epoch": 0.35725278953443634, + "flos": 513064624128.0, + "grad_norm": 0.032008561774258475, + "language_loss": 0.88987339, + "learning_rate": 0.0007444940772445915, + "loss": 0.9017027, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.890625, + "step": 1857, + "time_per_iteration": 2.7185556888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180668, + "balance_loss_mlp": 1.09169042, + "epoch": 0.3574451712196999, + "flos": 488492653056.0, + "grad_norm": 0.02708223160327311, + "language_loss": 0.88387084, + "learning_rate": 0.0007442222744278484, + "loss": 0.89567751, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.88769531, + "step": 1858, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182567, + "balance_loss_mlp": 1.09339869, + "epoch": 0.35763755290496346, + "flos": 551821023744.0, + "grad_norm": 0.023402609147138306, + "language_loss": 0.90506786, + "learning_rate": 0.0007439503767956099, + "loss": 0.91689354, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.88964844, + "step": 1859, + "time_per_iteration": 2.7072699069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180801, + "balance_loss_mlp": 1.09249115, + "epoch": 0.357829934590227, + "flos": 1507225514496.0, + "grad_norm": 0.010565166743096084, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80852401, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.88085938, + "step": 1860, + "time_per_iteration": 4.9006147384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177337, + "balance_loss_mlp": 1.08835948, + "epoch": 0.3580223162754906, + "flos": 569841269760.0, + "grad_norm": 0.022894220472823423, + "language_loss": 0.92520916, + "learning_rate": 0.000743406297506922, + "loss": 0.93698251, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.88769531, + "step": 1861, + "time_per_iteration": 2.7065579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09741747, + "epoch": 0.3582146979607541, + "flos": 627760018944.0, + "grad_norm": 0.02759787968542248, + "language_loss": 0.91638815, + "learning_rate": 0.0007431341160617031, + "loss": 0.92825067, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.88623047, + "step": 1862, + "time_per_iteration": 2.9316203594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179684, + "balance_loss_mlp": 1.09089661, + "epoch": 0.3584070796460177, + "flos": 508319016960.0, + "grad_norm": 0.024526236298265516, + "language_loss": 0.95309365, + "learning_rate": 0.0007428618402234491, + "loss": 0.96489048, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.88574219, + "step": 1863, + "time_per_iteration": 2.648061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179939, + "balance_loss_mlp": 1.09129453, + "epoch": 0.3585994613312813, + "flos": 607640216064.0, + "grad_norm": 0.026400757424935653, + "language_loss": 0.88735509, + "learning_rate": 0.0007425894700978668, + "loss": 0.89915442, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.88427734, + "step": 1864, + "time_per_iteration": 2.7512128353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178956, + "balance_loss_mlp": 1.0905509, + "epoch": 0.3587918430165448, + "flos": 1415087675904.0, + "grad_norm": 0.025937088976099313, + "language_loss": 0.86489892, + "learning_rate": 0.0007423170057906996, + "loss": 0.87668848, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.88183594, + "step": 1865, + "time_per_iteration": 3.8491222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181386, + "balance_loss_mlp": 1.0926944, + "epoch": 0.3589842247018084, + "flos": 479513730048.0, + "grad_norm": 0.0296684402619103, + "language_loss": 0.94328964, + "learning_rate": 0.0007420444474077275, + "loss": 0.95510352, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.88476562, + "step": 1866, + "time_per_iteration": 2.5396502017974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.09458029, + "epoch": 0.35917660638707194, + "flos": 505705167360.0, + "grad_norm": 0.030930075238968464, + "language_loss": 0.98337018, + "learning_rate": 0.0007417717950547671, + "loss": 0.99520147, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.88330078, + "step": 1867, + "time_per_iteration": 2.562638759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182945, + "balance_loss_mlp": 1.09654236, + "epoch": 0.3593689880723355, + "flos": 1495481745408.0, + "grad_norm": 0.008554058370081398, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77179551, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.86523438, + "step": 1868, + "time_per_iteration": 4.885401487350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184482, + "balance_loss_mlp": 1.09583843, + "epoch": 0.35956136975759906, + "flos": 529671521280.0, + "grad_norm": 0.02257875970711003, + "language_loss": 0.91369003, + "learning_rate": 0.0007412262088623299, + "loss": 0.92553484, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.88427734, + "step": 1869, + "time_per_iteration": 2.755620241165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184303, + "balance_loss_mlp": 1.09584975, + "epoch": 0.35975375144286265, + "flos": 535999664640.0, + "grad_norm": 0.02945163599469251, + "language_loss": 0.8810817, + "learning_rate": 0.0007409532752346684, + "loss": 0.89292467, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.88232422, + "step": 1870, + "time_per_iteration": 2.6426498889923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09860992, + "epoch": 0.3599461331281262, + "flos": 505928749056.0, + "grad_norm": 0.025692069404306732, + "language_loss": 0.95194697, + "learning_rate": 0.0007406802480606491, + "loss": 0.96382141, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.88623047, + "step": 1871, + "time_per_iteration": 2.6156716346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180117, + "balance_loss_mlp": 1.09123456, + "epoch": 0.36013851481338977, + "flos": 512536869888.0, + "grad_norm": 0.029138864413584674, + "language_loss": 0.9874596, + "learning_rate": 0.0007404071274462707, + "loss": 0.99926078, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.88671875, + "step": 1872, + "time_per_iteration": 2.5790889263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179425, + "balance_loss_mlp": 1.09054244, + "epoch": 0.36033089649865335, + "flos": 548631756288.0, + "grad_norm": 0.029675252163234106, + "language_loss": 0.91584998, + "learning_rate": 0.0007401339134975682, + "loss": 0.92764425, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.88671875, + "step": 1873, + "time_per_iteration": 2.6279983520507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185016, + "balance_loss_mlp": 1.09613371, + "epoch": 0.3605232781839169, + "flos": 459613506048.0, + "grad_norm": 0.030657976300352024, + "language_loss": 0.92556155, + "learning_rate": 0.0007398606063206122, + "loss": 0.93741173, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.88671875, + "step": 1874, + "time_per_iteration": 2.5750958919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178477, + "balance_loss_mlp": 1.0895946, + "epoch": 0.36071565986918047, + "flos": 510563566080.0, + "grad_norm": 0.029863822651947862, + "language_loss": 0.87000763, + "learning_rate": 0.0007395872060215101, + "loss": 0.88179243, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.88671875, + "step": 1875, + "time_per_iteration": 2.599595546722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180043, + "balance_loss_mlp": 1.09101713, + "epoch": 0.360908041554444, + "flos": 560256729600.0, + "grad_norm": 0.02914010843617622, + "language_loss": 0.95866597, + "learning_rate": 0.0007393137127064056, + "loss": 0.97046638, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.88818359, + "step": 1876, + "time_per_iteration": 2.629855155944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179718, + "balance_loss_mlp": 1.09064531, + "epoch": 0.3611004232397076, + "flos": 524878250496.0, + "grad_norm": 0.029199641876594032, + "language_loss": 0.93452048, + "learning_rate": 0.0007390401264814779, + "loss": 0.94631773, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.88867188, + "step": 1877, + "time_per_iteration": 2.6057403087615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182123, + "balance_loss_mlp": 1.0932405, + "epoch": 0.3612928049249711, + "flos": 542032367616.0, + "grad_norm": 0.029384759310162312, + "language_loss": 0.93887711, + "learning_rate": 0.0007387664474529427, + "loss": 0.95069838, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.88671875, + "step": 1878, + "time_per_iteration": 2.612924814224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181149, + "balance_loss_mlp": 1.09207559, + "epoch": 0.3614851866102347, + "flos": 553629143040.0, + "grad_norm": 0.028847856052759763, + "language_loss": 0.99400896, + "learning_rate": 0.0007384926757270518, + "loss": 1.00582051, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.88867188, + "step": 1879, + "time_per_iteration": 2.631417751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183007, + "balance_loss_mlp": 1.09364784, + "epoch": 0.36167756829549824, + "flos": 773426660352.0, + "grad_norm": 0.027790454764264987, + "language_loss": 0.87101346, + "learning_rate": 0.0007382188114100924, + "loss": 0.88284349, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.89160156, + "step": 1880, + "time_per_iteration": 3.0146212577819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182663, + "balance_loss_mlp": 1.09330404, + "epoch": 0.36186994998076183, + "flos": 713187500544.0, + "grad_norm": 0.025874200926848077, + "language_loss": 0.89437282, + "learning_rate": 0.0007379448546083884, + "loss": 0.90619946, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.89160156, + "step": 1881, + "time_per_iteration": 2.9882314205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182414, + "balance_loss_mlp": 1.09305489, + "epoch": 0.3620623316660254, + "flos": 748900351488.0, + "grad_norm": 0.028120122690860328, + "language_loss": 0.95218164, + "learning_rate": 0.0007376708054282992, + "loss": 0.96400583, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.89160156, + "step": 1882, + "time_per_iteration": 2.937251329421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185259, + "balance_loss_mlp": 1.09609008, + "epoch": 0.36225471335128895, + "flos": 483534197760.0, + "grad_norm": 0.025051425069896712, + "language_loss": 0.90089262, + "learning_rate": 0.0007373966639762201, + "loss": 0.91274524, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.88964844, + "step": 1883, + "time_per_iteration": 2.5956366062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189104, + "balance_loss_mlp": 1.09964943, + "epoch": 0.36244709503655254, + "flos": 507910785024.0, + "grad_norm": 0.028814908336841725, + "language_loss": 0.97620124, + "learning_rate": 0.0007371224303585822, + "loss": 0.9880923, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.89257812, + "step": 1884, + "time_per_iteration": 2.5689563751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188454, + "balance_loss_mlp": 1.10205078, + "epoch": 0.36263947672181607, + "flos": 1397052145152.0, + "grad_norm": 0.012535477100621303, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8154552, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.86523438, + "step": 1885, + "time_per_iteration": 4.708393573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184768, + "balance_loss_mlp": 1.09531295, + "epoch": 0.36283185840707965, + "flos": 654522144768.0, + "grad_norm": 0.026882878095346403, + "language_loss": 0.90798199, + "learning_rate": 0.0007365736870525335, + "loss": 0.91982961, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.89257812, + "step": 1886, + "time_per_iteration": 2.8096718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188121, + "balance_loss_mlp": 1.09842801, + "epoch": 0.3630242400923432, + "flos": 489844876800.0, + "grad_norm": 0.028488669634490066, + "language_loss": 0.90766525, + "learning_rate": 0.000736299177577164, + "loss": 0.91954637, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.89501953, + "step": 1887, + "time_per_iteration": 2.5731940269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184527, + "balance_loss_mlp": 1.09488153, + "epoch": 0.3632166217776068, + "flos": 518231198208.0, + "grad_norm": 0.0291282657352475, + "language_loss": 0.90900671, + "learning_rate": 0.0007360245763623174, + "loss": 0.92085195, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.89453125, + "step": 1888, + "time_per_iteration": 2.6255550384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184122, + "balance_loss_mlp": 1.09457171, + "epoch": 0.36340900346287036, + "flos": 647347338240.0, + "grad_norm": 0.024297388169127104, + "language_loss": 0.96519047, + "learning_rate": 0.0007357498835146039, + "loss": 0.97703171, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.89355469, + "step": 1889, + "time_per_iteration": 2.8253488540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183322, + "balance_loss_mlp": 1.09386766, + "epoch": 0.3636013851481339, + "flos": 554410678272.0, + "grad_norm": 0.02538543495771105, + "language_loss": 0.93937147, + "learning_rate": 0.0007354750991406684, + "loss": 0.95120472, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.89257812, + "step": 1890, + "time_per_iteration": 2.692335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182823, + "balance_loss_mlp": 1.09336889, + "epoch": 0.3637937668333975, + "flos": 547691767296.0, + "grad_norm": 0.028084450652072174, + "language_loss": 0.88223994, + "learning_rate": 0.0007352002233471919, + "loss": 0.89406812, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.89257812, + "step": 1891, + "time_per_iteration": 2.620753765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181153, + "balance_loss_mlp": 1.09212756, + "epoch": 0.363986148518661, + "flos": 539210399232.0, + "grad_norm": 0.027970426809957948, + "language_loss": 0.87592262, + "learning_rate": 0.0007349252562408906, + "loss": 0.88773412, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.88818359, + "step": 1892, + "time_per_iteration": 2.6963558197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186893, + "balance_loss_mlp": 1.09762907, + "epoch": 0.3641785302039246, + "flos": 661510299648.0, + "grad_norm": 0.026164868426956554, + "language_loss": 0.89186442, + "learning_rate": 0.0007346501979285158, + "loss": 0.90373337, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.890625, + "step": 1893, + "time_per_iteration": 2.880326747894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_mlp": 1.10150909, + "epoch": 0.36437091188918813, + "flos": 1472082077184.0, + "grad_norm": 0.013556454199407954, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81727207, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.8671875, + "step": 1894, + "time_per_iteration": 4.7823100090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189424, + "balance_loss_mlp": 1.10011292, + "epoch": 0.3645632935744517, + "flos": 598444442112.0, + "grad_norm": 0.028411509484180794, + "language_loss": 0.93676329, + "learning_rate": 0.0007340998081127308, + "loss": 0.94865751, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.89111328, + "step": 1895, + "time_per_iteration": 2.7800211906433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179101, + "balance_loss_mlp": 1.08998048, + "epoch": 0.36475567525971525, + "flos": 600695721984.0, + "grad_norm": 0.025932670803143428, + "language_loss": 0.98669052, + "learning_rate": 0.0007338244768230007, + "loss": 0.99848151, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.88916016, + "step": 1896, + "time_per_iteration": 2.7945594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180722, + "balance_loss_mlp": 1.09169638, + "epoch": 0.36494805694497884, + "flos": 799830945792.0, + "grad_norm": 0.022772977260465788, + "language_loss": 0.94548512, + "learning_rate": 0.0007335490547545578, + "loss": 0.95729244, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.88818359, + "step": 1897, + "time_per_iteration": 3.031527280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182826, + "balance_loss_mlp": 1.09389579, + "epoch": 0.3651404386302424, + "flos": 638477203968.0, + "grad_norm": 0.024439781626348547, + "language_loss": 0.90189934, + "learning_rate": 0.0007332735420143308, + "loss": 0.91372758, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.88720703, + "step": 1898, + "time_per_iteration": 2.743051767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118252, + "balance_loss_mlp": 1.09363747, + "epoch": 0.36533282031550596, + "flos": 492562785792.0, + "grad_norm": 0.03052059755540218, + "language_loss": 0.95941794, + "learning_rate": 0.0007329979387092826, + "loss": 0.97124314, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.88671875, + "step": 1899, + "time_per_iteration": 2.5555779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181449, + "balance_loss_mlp": 1.09247124, + "epoch": 0.36552520200076954, + "flos": 857508648960.0, + "grad_norm": 0.02266050351879182, + "language_loss": 0.89947438, + "learning_rate": 0.0007327222449464124, + "loss": 0.91128886, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.88769531, + "step": 1900, + "time_per_iteration": 3.2362029552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181183, + "balance_loss_mlp": 1.09206235, + "epoch": 0.3657175836860331, + "flos": 484715232768.0, + "grad_norm": 0.026374750280255838, + "language_loss": 0.95288622, + "learning_rate": 0.0007324464608327538, + "loss": 0.96469808, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.88916016, + "step": 1901, + "time_per_iteration": 2.5933730602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179798, + "balance_loss_mlp": 1.09058213, + "epoch": 0.36590996537129666, + "flos": 435721012224.0, + "grad_norm": 0.02685373461110618, + "language_loss": 0.96213037, + "learning_rate": 0.0007321705864753758, + "loss": 0.97392833, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.89013672, + "step": 1902, + "time_per_iteration": 2.6981201171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180605, + "balance_loss_mlp": 1.09124577, + "epoch": 0.3661023470565602, + "flos": 713513140224.0, + "grad_norm": 0.022756571637903334, + "language_loss": 0.91225153, + "learning_rate": 0.0007318946219813823, + "loss": 0.9240576, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.89160156, + "step": 1903, + "time_per_iteration": 2.992624044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183651, + "balance_loss_mlp": 1.09443474, + "epoch": 0.3662947287418238, + "flos": 565822803456.0, + "grad_norm": 0.027935940535232063, + "language_loss": 0.96619356, + "learning_rate": 0.000731618567457912, + "loss": 0.97803003, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.89013672, + "step": 1904, + "time_per_iteration": 2.685476064682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183785, + "balance_loss_mlp": 1.09433067, + "epoch": 0.3664871104270873, + "flos": 791201857536.0, + "grad_norm": 0.029459392082425068, + "language_loss": 0.95166355, + "learning_rate": 0.000731342423012139, + "loss": 0.96350139, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.89257812, + "step": 1905, + "time_per_iteration": 3.0574183464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184501, + "balance_loss_mlp": 1.09480846, + "epoch": 0.3666794921123509, + "flos": 753980330496.0, + "grad_norm": 0.028631588758117728, + "language_loss": 0.89661896, + "learning_rate": 0.0007310661887512722, + "loss": 0.90846401, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.89501953, + "step": 1906, + "time_per_iteration": 3.024423122406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183077, + "balance_loss_mlp": 1.09343171, + "epoch": 0.3668718737976145, + "flos": 524607005184.0, + "grad_norm": 0.02900954708937733, + "language_loss": 0.89823443, + "learning_rate": 0.0007307898647825549, + "loss": 0.91006529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.89453125, + "step": 1907, + "time_per_iteration": 2.6485068798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182186, + "balance_loss_mlp": 1.09277892, + "epoch": 0.367064255482878, + "flos": 573045273600.0, + "grad_norm": 0.031417651983294596, + "language_loss": 0.98967636, + "learning_rate": 0.0007305134512132659, + "loss": 1.00149822, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.89208984, + "step": 1908, + "time_per_iteration": 2.646838903427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180724, + "balance_loss_mlp": 1.09107888, + "epoch": 0.3672566371681416, + "flos": 448053660672.0, + "grad_norm": 0.03289649974011927, + "language_loss": 0.93253779, + "learning_rate": 0.0007302369481507183, + "loss": 0.94434512, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.89453125, + "step": 1909, + "time_per_iteration": 2.562856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_mlp": 1.10011292, + "epoch": 0.36744901885340514, + "flos": 1543364061696.0, + "grad_norm": 0.010877058892954462, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81150377, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.8828125, + "step": 1910, + "time_per_iteration": 4.90735387802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011789, + "balance_loss_mlp": 1.08949292, + "epoch": 0.36764140053866873, + "flos": 564761290752.0, + "grad_norm": 0.024499581587470617, + "language_loss": 0.92626876, + "learning_rate": 0.000729683673975274, + "loss": 0.93805778, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.89208984, + "step": 1911, + "time_per_iteration": 2.6646595001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182116, + "balance_loss_mlp": 1.09285223, + "epoch": 0.36783378222393226, + "flos": 1218650895360.0, + "grad_norm": 0.021973130552363645, + "language_loss": 0.89050859, + "learning_rate": 0.0007294069030771774, + "loss": 0.90232974, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.890625, + "step": 1912, + "time_per_iteration": 3.6834843158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189865, + "balance_loss_mlp": 1.10021913, + "epoch": 0.36802616390919585, + "flos": 499720128000.0, + "grad_norm": 0.028676866730684987, + "language_loss": 0.97328013, + "learning_rate": 0.0007291300431154224, + "loss": 0.98517883, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.89453125, + "step": 1913, + "time_per_iteration": 2.587052822113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195931, + "balance_loss_mlp": 1.10838318, + "epoch": 0.36821854559445943, + "flos": 1585615902720.0, + "grad_norm": 0.013013835157786544, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71585667, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.87695312, + "step": 1914, + "time_per_iteration": 4.952203989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185283, + "balance_loss_mlp": 1.09582841, + "epoch": 0.36841092727972297, + "flos": 837089402880.0, + "grad_norm": 0.02834339080565921, + "language_loss": 0.8768307, + "learning_rate": 0.0007285760564309179, + "loss": 0.88868356, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.89257812, + "step": 1915, + "time_per_iteration": 3.100893974304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185476, + "balance_loss_mlp": 1.09602106, + "epoch": 0.36860330896498655, + "flos": 691209913344.0, + "grad_norm": 0.028423235038061073, + "language_loss": 0.92041719, + "learning_rate": 0.0007282989299232448, + "loss": 0.93227196, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.89257812, + "step": 1916, + "time_per_iteration": 3.0683393478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.10048962, + "epoch": 0.3687956906502501, + "flos": 555239877120.0, + "grad_norm": 0.03332088686108748, + "language_loss": 0.92434603, + "learning_rate": 0.0007280217147820668, + "loss": 0.93624407, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.89111328, + "step": 1917, + "time_per_iteration": 2.635451078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188211, + "balance_loss_mlp": 1.09894717, + "epoch": 0.3689880723355137, + "flos": 577819078656.0, + "grad_norm": 0.027623597033391085, + "language_loss": 0.8697632, + "learning_rate": 0.0007277444111150079, + "loss": 0.88164532, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.890625, + "step": 1918, + "time_per_iteration": 2.810635805130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184664, + "balance_loss_mlp": 1.09540033, + "epoch": 0.3691804540207772, + "flos": 529886370816.0, + "grad_norm": 0.029489830132381867, + "language_loss": 0.91299617, + "learning_rate": 0.0007274670190297272, + "loss": 0.92484283, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.890625, + "step": 1919, + "time_per_iteration": 2.615386486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118238, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3693728357060408, + "flos": 562180368384.0, + "grad_norm": 0.025570373781710027, + "language_loss": 0.90037912, + "learning_rate": 0.0007271895386339179, + "loss": 0.91220295, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.88476562, + "step": 1920, + "time_per_iteration": 2.7868921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192586, + "balance_loss_mlp": 1.10375118, + "epoch": 0.3695652173913043, + "flos": 580899557376.0, + "grad_norm": 0.02893533685872539, + "language_loss": 0.90819347, + "learning_rate": 0.0007269119700353073, + "loss": 0.92011935, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.88623047, + "step": 1921, + "time_per_iteration": 2.7836573123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178636, + "balance_loss_mlp": 1.09023082, + "epoch": 0.3697575990765679, + "flos": 514059007488.0, + "grad_norm": 0.024390447267758214, + "language_loss": 0.90977228, + "learning_rate": 0.0007266343133416571, + "loss": 0.92155862, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.8828125, + "step": 1922, + "time_per_iteration": 2.800387382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173615, + "balance_loss_mlp": 1.08816528, + "epoch": 0.3699499807618315, + "flos": 1573903607808.0, + "grad_norm": 0.0066311072211368925, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78290522, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.85546875, + "step": 1923, + "time_per_iteration": 4.845300912857056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176045, + "balance_loss_mlp": 1.08844995, + "epoch": 0.37014236244709503, + "flos": 498324243456.0, + "grad_norm": 0.031949393340513096, + "language_loss": 0.9351213, + "learning_rate": 0.0007260787361004556, + "loss": 0.94688171, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.87744141, + "step": 1924, + "time_per_iteration": 2.5984597206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175598, + "balance_loss_mlp": 1.0905304, + "epoch": 0.3703347441323586, + "flos": 1447605433344.0, + "grad_norm": 0.008500773473990196, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74937099, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.8515625, + "step": 1925, + "time_per_iteration": 4.886027097702026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197031, + "balance_loss_mlp": 1.10862505, + "epoch": 0.37052712581762215, + "flos": 564713627136.0, + "grad_norm": 0.03178088368953176, + "language_loss": 0.94516188, + "learning_rate": 0.0007255228077730903, + "loss": 0.95713222, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.88183594, + "step": 1926, + "time_per_iteration": 2.6847593784332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185383, + "balance_loss_mlp": 1.09731126, + "epoch": 0.37071950750288574, + "flos": 927570667008.0, + "grad_norm": 0.029564625514678724, + "language_loss": 0.89603549, + "learning_rate": 0.0007252447122218632, + "loss": 0.90788931, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.88037109, + "step": 1927, + "time_per_iteration": 3.106748342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179784, + "balance_loss_mlp": 1.0919987, + "epoch": 0.37091188918814927, + "flos": 419200710144.0, + "grad_norm": 0.03402230349378661, + "language_loss": 0.98334146, + "learning_rate": 0.0007249665292228834, + "loss": 0.99513936, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.87939453, + "step": 1928, + "time_per_iteration": 2.5786120891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186321, + "balance_loss_mlp": 1.09801054, + "epoch": 0.37110427087341286, + "flos": 464146265088.0, + "grad_norm": 0.029271450765855984, + "language_loss": 0.9102214, + "learning_rate": 0.000724688258884151, + "loss": 0.92208457, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.88183594, + "step": 1929, + "time_per_iteration": 2.5388894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185686, + "balance_loss_mlp": 1.09780467, + "epoch": 0.3712966525586764, + "flos": 851080449024.0, + "grad_norm": 0.02435916983518334, + "language_loss": 0.9136247, + "learning_rate": 0.0007244099013137002, + "loss": 0.92548156, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.88037109, + "step": 1930, + "time_per_iteration": 3.0708000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.09159458, + "epoch": 0.37148903424394, + "flos": 927557932032.0, + "grad_norm": 0.024720397528266293, + "language_loss": 0.95256186, + "learning_rate": 0.0007241314566195993, + "loss": 0.96435952, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.88232422, + "step": 1931, + "time_per_iteration": 3.2293543815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179876, + "balance_loss_mlp": 1.09180403, + "epoch": 0.37168141592920356, + "flos": 520820852736.0, + "grad_norm": 0.029266961451931986, + "language_loss": 0.92750597, + "learning_rate": 0.0007238529249099496, + "loss": 0.93930471, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.88232422, + "step": 1932, + "time_per_iteration": 2.6091582775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.10263062, + "epoch": 0.3718737976144671, + "flos": 1449059715072.0, + "grad_norm": 0.015165360012205364, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79045337, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.859375, + "step": 1933, + "time_per_iteration": 4.854676246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184357, + "balance_loss_mlp": 1.09614182, + "epoch": 0.3720661792997307, + "flos": 760953022464.0, + "grad_norm": 0.028795817149727888, + "language_loss": 0.88381398, + "learning_rate": 0.000723295600876581, + "loss": 0.89565754, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.8828125, + "step": 1934, + "time_per_iteration": 2.9830405712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118189, + "balance_loss_mlp": 1.09396136, + "epoch": 0.3722585609849942, + "flos": 518044546560.0, + "grad_norm": 0.028690096062057496, + "language_loss": 0.95446575, + "learning_rate": 0.0007230168087692344, + "loss": 0.96628463, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.88085938, + "step": 1935, + "time_per_iteration": 2.651982307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181923, + "balance_loss_mlp": 1.09404159, + "epoch": 0.3724509426702578, + "flos": 783868597248.0, + "grad_norm": 0.02900654324264667, + "language_loss": 0.88952625, + "learning_rate": 0.0007227379300790839, + "loss": 0.90134549, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.88037109, + "step": 1936, + "time_per_iteration": 3.0127265453338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177948, + "balance_loss_mlp": 1.09006691, + "epoch": 0.37264332435552133, + "flos": 392599039488.0, + "grad_norm": 0.02836050450865214, + "language_loss": 0.94049299, + "learning_rate": 0.0007224589649143997, + "loss": 0.95227242, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.88037109, + "step": 1937, + "time_per_iteration": 2.5600061416625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178201, + "balance_loss_mlp": 1.09074926, + "epoch": 0.3728357060407849, + "flos": 543912345600.0, + "grad_norm": 0.027673862011078548, + "language_loss": 0.89373219, + "learning_rate": 0.0007221799133834861, + "loss": 0.90551418, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.87597656, + "step": 1938, + "time_per_iteration": 2.646632671356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011797, + "balance_loss_mlp": 1.0919621, + "epoch": 0.3730280877260485, + "flos": 434483581440.0, + "grad_norm": 0.03019004471989451, + "language_loss": 0.90666437, + "learning_rate": 0.00072190077559468, + "loss": 0.91846132, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.87890625, + "step": 1939, + "time_per_iteration": 2.5193679332733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118304, + "balance_loss_mlp": 1.0957315, + "epoch": 0.37322046941131204, + "flos": 532510953984.0, + "grad_norm": 0.02812892901872328, + "language_loss": 0.95514065, + "learning_rate": 0.0007216215516563527, + "loss": 0.96697104, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.87451172, + "step": 1940, + "time_per_iteration": 2.6975200176239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184025, + "balance_loss_mlp": 1.09666896, + "epoch": 0.3734128510965756, + "flos": 532576081920.0, + "grad_norm": 0.028733495674926814, + "language_loss": 0.91960251, + "learning_rate": 0.0007213422416769083, + "loss": 0.93144274, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.875, + "step": 1941, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183262, + "balance_loss_mlp": 1.09561944, + "epoch": 0.37360523278183916, + "flos": 501432920064.0, + "grad_norm": 0.028111058318233337, + "language_loss": 0.83044219, + "learning_rate": 0.0007210628457647849, + "loss": 0.84227479, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.87792969, + "step": 1942, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182498, + "balance_loss_mlp": 1.09475958, + "epoch": 0.37379761446710275, + "flos": 549111846912.0, + "grad_norm": 0.03172951338735415, + "language_loss": 0.86608446, + "learning_rate": 0.000720783364028453, + "loss": 0.87790942, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.87890625, + "step": 1943, + "time_per_iteration": 2.7782797813415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176645, + "balance_loss_mlp": 1.08909822, + "epoch": 0.3739899961523663, + "flos": 476739425280.0, + "grad_norm": 0.0265564263320471, + "language_loss": 0.94348681, + "learning_rate": 0.0007205037965764177, + "loss": 0.95525324, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.87695312, + "step": 1944, + "time_per_iteration": 2.5670034885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198539, + "balance_loss_mlp": 1.11003804, + "epoch": 0.37418237783762986, + "flos": 613076034048.0, + "grad_norm": 0.032068934234115415, + "language_loss": 0.94037992, + "learning_rate": 0.0007202241435172161, + "loss": 0.95236534, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.8828125, + "step": 1945, + "time_per_iteration": 2.7505762577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119095, + "balance_loss_mlp": 1.10283065, + "epoch": 0.3743747595228934, + "flos": 767628272640.0, + "grad_norm": 0.02891432689626354, + "language_loss": 0.95249915, + "learning_rate": 0.0007199444049594198, + "loss": 0.9644087, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.88085938, + "step": 1946, + "time_per_iteration": 2.9690663814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179721, + "balance_loss_mlp": 1.09188759, + "epoch": 0.374567141208157, + "flos": 525490598400.0, + "grad_norm": 0.029648083740235674, + "language_loss": 0.90769064, + "learning_rate": 0.0007196645810116322, + "loss": 0.91948783, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.87988281, + "step": 1947, + "time_per_iteration": 2.690214157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178535, + "balance_loss_mlp": 1.09065437, + "epoch": 0.37475952289342057, + "flos": 682613025792.0, + "grad_norm": 0.029716110952303924, + "language_loss": 0.91939867, + "learning_rate": 0.0007193846717824912, + "loss": 0.93118405, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.88037109, + "step": 1948, + "time_per_iteration": 2.9668121337890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179187, + "balance_loss_mlp": 1.09140122, + "epoch": 0.3749519045786841, + "flos": 461215507968.0, + "grad_norm": 0.032662314662123194, + "language_loss": 0.97396064, + "learning_rate": 0.0007191046773806669, + "loss": 0.98575246, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.87939453, + "step": 1949, + "time_per_iteration": 2.5580427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189402, + "balance_loss_mlp": 1.10166442, + "epoch": 0.3751442862639477, + "flos": 956386687488.0, + "grad_norm": 0.03764484603893814, + "language_loss": 0.94282359, + "learning_rate": 0.0007188245979148631, + "loss": 0.95471758, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.87890625, + "step": 1950, + "time_per_iteration": 3.1307644844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185097, + "balance_loss_mlp": 1.09678674, + "epoch": 0.3753366679492112, + "flos": 528805392384.0, + "grad_norm": 0.0321726971318772, + "language_loss": 0.95554888, + "learning_rate": 0.0007185444334938157, + "loss": 0.96739984, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.8828125, + "step": 1951, + "time_per_iteration": 2.7235019207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181124, + "balance_loss_mlp": 1.09324276, + "epoch": 0.3755290496344748, + "flos": 522848550912.0, + "grad_norm": 0.029170285322497422, + "language_loss": 0.91979843, + "learning_rate": 0.0007182641842262947, + "loss": 0.93160963, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.88037109, + "step": 1952, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179821, + "balance_loss_mlp": 1.09193957, + "epoch": 0.37572143131973834, + "flos": 622371864576.0, + "grad_norm": 0.029206332986401715, + "language_loss": 0.85116351, + "learning_rate": 0.0007179838502211022, + "loss": 0.86296165, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.88037109, + "step": 1953, + "time_per_iteration": 2.8308520317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185603, + "balance_loss_mlp": 1.0973407, + "epoch": 0.37591381300500193, + "flos": 772273823232.0, + "grad_norm": 0.030259488278154622, + "language_loss": 0.94510454, + "learning_rate": 0.0007177034315870738, + "loss": 0.9569605, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.88232422, + "step": 1954, + "time_per_iteration": 2.966627359390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09908688, + "epoch": 0.37610619469026546, + "flos": 521480864256.0, + "grad_norm": 0.02960656624392615, + "language_loss": 0.99060822, + "learning_rate": 0.0007174229284330773, + "loss": 1.00248265, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.88330078, + "step": 1955, + "time_per_iteration": 2.642186403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182076, + "balance_loss_mlp": 1.09338391, + "epoch": 0.37629857637552905, + "flos": 599970582528.0, + "grad_norm": 0.025408092842649905, + "language_loss": 0.92700577, + "learning_rate": 0.0007171423408680141, + "loss": 0.93882644, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.88671875, + "step": 1956, + "time_per_iteration": 2.8501906394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180409, + "balance_loss_mlp": 1.09138381, + "epoch": 0.37649095806079264, + "flos": 566018187264.0, + "grad_norm": 0.027446848492574977, + "language_loss": 0.96095192, + "learning_rate": 0.0007168616690008176, + "loss": 0.97275609, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.88818359, + "step": 1957, + "time_per_iteration": 2.658282995223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183288, + "balance_loss_mlp": 1.09440601, + "epoch": 0.37668333974605617, + "flos": 593568579072.0, + "grad_norm": 0.029268558303355535, + "language_loss": 0.93381131, + "learning_rate": 0.0007165809129404545, + "loss": 0.9456442, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.88671875, + "step": 1958, + "time_per_iteration": 2.738896608352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185047, + "balance_loss_mlp": 1.09621239, + "epoch": 0.37687572143131975, + "flos": 420364280832.0, + "grad_norm": 0.028940223287944336, + "language_loss": 0.94791234, + "learning_rate": 0.0007163000727959239, + "loss": 0.95976275, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.88623047, + "step": 1959, + "time_per_iteration": 2.5175514221191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122541, + "balance_loss_mlp": 1.14034271, + "epoch": 0.3770681031165833, + "flos": 1360384568832.0, + "grad_norm": 0.031863979933265396, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79184484, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.8515625, + "step": 1960, + "time_per_iteration": 4.834294557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187625, + "balance_loss_mlp": 1.0985992, + "epoch": 0.3772604848018469, + "flos": 646153568256.0, + "grad_norm": 0.027699188267120346, + "language_loss": 0.9236567, + "learning_rate": 0.00071573814069052, + "loss": 0.93553299, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.88818359, + "step": 1961, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195985, + "balance_loss_mlp": 1.10681665, + "epoch": 0.3774528664871104, + "flos": 903200810496.0, + "grad_norm": 0.025601029742712816, + "language_loss": 0.93588847, + "learning_rate": 0.0007154570489478081, + "loss": 0.94784832, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.88964844, + "step": 1962, + "time_per_iteration": 3.2312510013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198663, + "balance_loss_mlp": 1.1095897, + "epoch": 0.377645248172374, + "flos": 789462868992.0, + "grad_norm": 0.028157211525065163, + "language_loss": 0.92405236, + "learning_rate": 0.0007151758735572514, + "loss": 0.93603897, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.88867188, + "step": 1963, + "time_per_iteration": 3.0338857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192995, + "balance_loss_mlp": 1.10396981, + "epoch": 0.3778376298576376, + "flos": 587924642304.0, + "grad_norm": 0.030822839560022956, + "language_loss": 0.89740217, + "learning_rate": 0.0007148946146280119, + "loss": 0.90933216, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.88818359, + "step": 1964, + "time_per_iteration": 2.795830488204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193161, + "balance_loss_mlp": 1.10656738, + "epoch": 0.3780300115429011, + "flos": 1399669997568.0, + "grad_norm": 0.013238700163895742, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.7338531, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.8671875, + "step": 1965, + "time_per_iteration": 4.866962909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120089, + "balance_loss_mlp": 1.11372375, + "epoch": 0.3782223932281647, + "flos": 1360631619072.0, + "grad_norm": 0.015556792607008025, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76542836, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.87304688, + "step": 1966, + "time_per_iteration": 4.942438364028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179172, + "balance_loss_mlp": 1.09114802, + "epoch": 0.37841477491342823, + "flos": 705515865600.0, + "grad_norm": 0.024767419651172896, + "language_loss": 0.90831983, + "learning_rate": 0.0007140503377003022, + "loss": 0.92011154, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.88183594, + "step": 1967, + "time_per_iteration": 2.9852232933044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118121, + "balance_loss_mlp": 1.09318614, + "epoch": 0.3786071565986918, + "flos": 530155614720.0, + "grad_norm": 0.02676934241732637, + "language_loss": 0.92451024, + "learning_rate": 0.000713768745708599, + "loss": 0.93632239, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.88183594, + "step": 1968, + "time_per_iteration": 2.6276321411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180899, + "balance_loss_mlp": 1.09311283, + "epoch": 0.37879953828395535, + "flos": 994900039680.0, + "grad_norm": 0.026029915049846697, + "language_loss": 0.85207623, + "learning_rate": 0.0007134870707245085, + "loss": 0.86388516, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.87939453, + "step": 1969, + "time_per_iteration": 3.2757370471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118867, + "balance_loss_mlp": 1.10074103, + "epoch": 0.37899191996921894, + "flos": 627792219648.0, + "grad_norm": 0.029282968357198087, + "language_loss": 0.91297084, + "learning_rate": 0.0007132053128573864, + "loss": 0.92485756, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.88085938, + "step": 1970, + "time_per_iteration": 2.713987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184407, + "balance_loss_mlp": 1.09633517, + "epoch": 0.37918430165448247, + "flos": 687519088128.0, + "grad_norm": 0.026716081838251738, + "language_loss": 0.91701669, + "learning_rate": 0.0007129234722166211, + "loss": 0.92886078, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.88232422, + "step": 1971, + "time_per_iteration": 2.830312728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178089, + "balance_loss_mlp": 1.09025514, + "epoch": 0.37937668333974606, + "flos": 476617901568.0, + "grad_norm": 0.023390773702336033, + "language_loss": 0.97041333, + "learning_rate": 0.0007126415489116328, + "loss": 0.98219419, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.87988281, + "step": 1972, + "time_per_iteration": 2.6577088832855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186585, + "balance_loss_mlp": 1.09903812, + "epoch": 0.37956906502500964, + "flos": 708823928832.0, + "grad_norm": 0.02822522227358307, + "language_loss": 0.89341533, + "learning_rate": 0.0007123595430518736, + "loss": 0.90528119, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.87695312, + "step": 1973, + "time_per_iteration": 2.8803040981292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187247, + "balance_loss_mlp": 1.09974778, + "epoch": 0.3797614467102732, + "flos": 427558553088.0, + "grad_norm": 0.030455517002935972, + "language_loss": 0.93240166, + "learning_rate": 0.0007120774547468282, + "loss": 0.94427419, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.87646484, + "step": 1974, + "time_per_iteration": 2.5190658569335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185963, + "balance_loss_mlp": 1.09836841, + "epoch": 0.37995382839553676, + "flos": 482880916992.0, + "grad_norm": 0.028219754054602288, + "language_loss": 0.89357984, + "learning_rate": 0.0007117952841060128, + "loss": 0.9054395, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.87744141, + "step": 1975, + "time_per_iteration": 2.6428894996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184241, + "balance_loss_mlp": 1.09631252, + "epoch": 0.3801462100808003, + "flos": 561670078464.0, + "grad_norm": 0.02907805968320273, + "language_loss": 0.90876186, + "learning_rate": 0.0007115130312389756, + "loss": 0.92060423, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.88085938, + "step": 1976, + "time_per_iteration": 2.669287919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.10066783, + "epoch": 0.3803385917660639, + "flos": 465887255040.0, + "grad_norm": 0.031138982719559682, + "language_loss": 0.88565898, + "learning_rate": 0.0007112306962552973, + "loss": 0.89754546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.88134766, + "step": 1977, + "time_per_iteration": 2.617105007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188488, + "balance_loss_mlp": 1.10055935, + "epoch": 0.3805309734513274, + "flos": 522904946688.0, + "grad_norm": 0.027881475391737562, + "language_loss": 0.92461807, + "learning_rate": 0.0007109482792645896, + "loss": 0.93650293, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.88085938, + "step": 1978, + "time_per_iteration": 2.7350404262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191644, + "balance_loss_mlp": 1.10352468, + "epoch": 0.380723355136591, + "flos": 592552728576.0, + "grad_norm": 0.03010131618310245, + "language_loss": 0.91373634, + "learning_rate": 0.0007106657803764969, + "loss": 0.92565274, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.88183594, + "step": 1979, + "time_per_iteration": 2.7113609313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188099, + "balance_loss_mlp": 1.10007489, + "epoch": 0.38091573682185453, + "flos": 623854344192.0, + "grad_norm": 0.03122566409921124, + "language_loss": 0.90192807, + "learning_rate": 0.0007103831997006948, + "loss": 0.91380906, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.88183594, + "step": 1980, + "time_per_iteration": 2.7460203170776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183293, + "balance_loss_mlp": 1.09507859, + "epoch": 0.3811081185071181, + "flos": 570175641600.0, + "grad_norm": 0.027157726640451497, + "language_loss": 0.92157245, + "learning_rate": 0.0007101005373468908, + "loss": 0.9334054, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.8828125, + "step": 1981, + "time_per_iteration": 2.869722604751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176795, + "balance_loss_mlp": 1.08891392, + "epoch": 0.3813005001923817, + "flos": 585990269952.0, + "grad_norm": 0.026054611177121254, + "language_loss": 0.92786968, + "learning_rate": 0.0007098177934248242, + "loss": 0.9396376, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.88037109, + "step": 1982, + "time_per_iteration": 2.7341668605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179814, + "balance_loss_mlp": 1.09188521, + "epoch": 0.38149288187764524, + "flos": 622810295808.0, + "grad_norm": 0.03120804506271422, + "language_loss": 0.94404829, + "learning_rate": 0.0007095349680442661, + "loss": 0.95584643, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.88085938, + "step": 1983, + "time_per_iteration": 2.845836639404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182966, + "balance_loss_mlp": 1.09522831, + "epoch": 0.3816852635629088, + "flos": 571797109248.0, + "grad_norm": 0.027372063240090748, + "language_loss": 0.86448967, + "learning_rate": 0.0007092520613150188, + "loss": 0.87631935, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.87890625, + "step": 1984, + "time_per_iteration": 2.6740176677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178711, + "balance_loss_mlp": 1.09106863, + "epoch": 0.38187764524817236, + "flos": 566678198784.0, + "grad_norm": 0.03160695384354602, + "language_loss": 0.87573516, + "learning_rate": 0.0007089690733469165, + "loss": 0.88752234, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.87792969, + "step": 1985, + "time_per_iteration": 2.717921733856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178571, + "balance_loss_mlp": 1.09073794, + "epoch": 0.38207002693343595, + "flos": 632398838784.0, + "grad_norm": 0.031031403109496963, + "language_loss": 0.90504575, + "learning_rate": 0.000708686004249825, + "loss": 0.91683149, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.87988281, + "step": 1986, + "time_per_iteration": 2.758554697036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179432, + "balance_loss_mlp": 1.09164619, + "epoch": 0.3822624086186995, + "flos": 549840989184.0, + "grad_norm": 0.025201133141653974, + "language_loss": 0.97533029, + "learning_rate": 0.0007084028541336413, + "loss": 0.98712462, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.87939453, + "step": 1987, + "time_per_iteration": 2.6981115341186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187219, + "balance_loss_mlp": 1.09909916, + "epoch": 0.38245479030396307, + "flos": 615066802176.0, + "grad_norm": 0.02853553744793089, + "language_loss": 0.9291808, + "learning_rate": 0.0007081196231082942, + "loss": 0.94105303, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.8828125, + "step": 1988, + "time_per_iteration": 2.7912278175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.09851646, + "epoch": 0.38264717198922665, + "flos": 669303458304.0, + "grad_norm": 0.029318681320032423, + "language_loss": 0.88455558, + "learning_rate": 0.0007078363112837436, + "loss": 0.89642197, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.8828125, + "step": 1989, + "time_per_iteration": 2.8133885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187352, + "balance_loss_mlp": 1.09927964, + "epoch": 0.3828395536744902, + "flos": 455686364160.0, + "grad_norm": 0.029265262626364436, + "language_loss": 0.9249233, + "learning_rate": 0.000707552918769981, + "loss": 0.93679678, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.88232422, + "step": 1990, + "time_per_iteration": 2.538587808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180802, + "balance_loss_mlp": 1.09277809, + "epoch": 0.3830319353597538, + "flos": 500482197504.0, + "grad_norm": 0.02588536582900798, + "language_loss": 0.91112638, + "learning_rate": 0.000707269445677029, + "loss": 0.92293441, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.88183594, + "step": 1991, + "time_per_iteration": 2.7578041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183391, + "balance_loss_mlp": 1.09536684, + "epoch": 0.3832243170450173, + "flos": 745466035200.0, + "grad_norm": 0.02707218781991338, + "language_loss": 0.91718936, + "learning_rate": 0.0007069858921149416, + "loss": 0.92902327, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.88183594, + "step": 1992, + "time_per_iteration": 2.948418617248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184259, + "balance_loss_mlp": 1.09613955, + "epoch": 0.3834166987302809, + "flos": 579345219072.0, + "grad_norm": 0.02587271093699699, + "language_loss": 0.92343616, + "learning_rate": 0.0007067022581938043, + "loss": 0.93527877, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.8828125, + "step": 1993, + "time_per_iteration": 2.881967782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09965289, + "epoch": 0.3836090804155444, + "flos": 537608397312.0, + "grad_norm": 0.029882536442049617, + "language_loss": 0.91833031, + "learning_rate": 0.0007064185440237334, + "loss": 0.9302085, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.88330078, + "step": 1994, + "time_per_iteration": 2.7481510639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.10189474, + "epoch": 0.383801462100808, + "flos": 603051061248.0, + "grad_norm": 0.027232179622410133, + "language_loss": 0.91516536, + "learning_rate": 0.0007061347497148764, + "loss": 0.92706549, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.8828125, + "step": 1995, + "time_per_iteration": 2.762807846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191619, + "balance_loss_mlp": 1.10321367, + "epoch": 0.38399384378607154, + "flos": 573798610944.0, + "grad_norm": 0.03191203592253993, + "language_loss": 0.9478448, + "learning_rate": 0.0007058508753774122, + "loss": 0.95976096, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.88476562, + "step": 1996, + "time_per_iteration": 2.7208473682403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185202, + "balance_loss_mlp": 1.09708297, + "epoch": 0.38418622547133513, + "flos": 537779586048.0, + "grad_norm": 0.03234926235653744, + "language_loss": 0.93760306, + "learning_rate": 0.0007055669211215505, + "loss": 0.94945514, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.8828125, + "step": 1997, + "time_per_iteration": 2.6605474948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182194, + "balance_loss_mlp": 1.09397876, + "epoch": 0.3843786071565987, + "flos": 574013460480.0, + "grad_norm": 0.03558568539094479, + "language_loss": 0.86620909, + "learning_rate": 0.0007052828870575322, + "loss": 0.87803102, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.88378906, + "step": 1998, + "time_per_iteration": 2.6478962898254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179215, + "balance_loss_mlp": 1.09100008, + "epoch": 0.38457098884186225, + "flos": 730079104512.0, + "grad_norm": 0.027610192556292087, + "language_loss": 0.94167769, + "learning_rate": 0.0007049987732956291, + "loss": 0.95346981, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.88378906, + "step": 1999, + "time_per_iteration": 2.9643850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190926, + "balance_loss_mlp": 1.10199583, + "epoch": 0.38476337052712584, + "flos": 584620581888.0, + "grad_norm": 0.023866575274933036, + "language_loss": 0.8787694, + "learning_rate": 0.0007047145799461439, + "loss": 0.89067864, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.88720703, + "step": 2000, + "time_per_iteration": 2.8542819023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191076, + "balance_loss_mlp": 1.10200322, + "epoch": 0.38495575221238937, + "flos": 554158898688.0, + "grad_norm": 0.025960095413567152, + "language_loss": 0.89154112, + "learning_rate": 0.00070443030711941, + "loss": 0.90345186, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.88867188, + "step": 2001, + "time_per_iteration": 2.770023822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189246, + "balance_loss_mlp": 1.10084057, + "epoch": 0.38514813389765296, + "flos": 655676983296.0, + "grad_norm": 0.026490656569535233, + "language_loss": 0.88696259, + "learning_rate": 0.0007041459549257924, + "loss": 0.89885509, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.88476562, + "step": 2002, + "time_per_iteration": 4.357714414596558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_mlp": 1.09392142, + "epoch": 0.3853405155829165, + "flos": 869645913600.0, + "grad_norm": 0.03138294802585753, + "language_loss": 0.86704218, + "learning_rate": 0.0007038615234756859, + "loss": 0.87886453, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.88476562, + "step": 2003, + "time_per_iteration": 3.154315233230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09135854, + "epoch": 0.3855328972681801, + "flos": 547468185600.0, + "grad_norm": 0.030993794918127784, + "language_loss": 0.91032863, + "learning_rate": 0.000703577012879517, + "loss": 0.92212439, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.88378906, + "step": 2004, + "time_per_iteration": 2.6320230960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184907, + "balance_loss_mlp": 1.09673953, + "epoch": 0.3857252789534436, + "flos": 535098607104.0, + "grad_norm": 0.029525133384240967, + "language_loss": 0.9687134, + "learning_rate": 0.0007032924232477423, + "loss": 0.98056245, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.88330078, + "step": 2005, + "time_per_iteration": 2.650982618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184324, + "balance_loss_mlp": 1.09630013, + "epoch": 0.3859176606387072, + "flos": 492766901760.0, + "grad_norm": 0.029334702789067958, + "language_loss": 0.8823278, + "learning_rate": 0.0007030077546908493, + "loss": 0.89417106, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.88183594, + "step": 2006, + "time_per_iteration": 2.642333745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203979, + "balance_loss_mlp": 1.11700439, + "epoch": 0.3861100423239708, + "flos": 1490155991040.0, + "grad_norm": 0.02217822259323008, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84268641, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.87109375, + "step": 2007, + "time_per_iteration": 4.759521961212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184336, + "balance_loss_mlp": 1.09635913, + "epoch": 0.3863024240092343, + "flos": 474692261376.0, + "grad_norm": 0.030825589148035897, + "language_loss": 0.87378025, + "learning_rate": 0.0007024381812438117, + "loss": 0.88562357, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.88134766, + "step": 2008, + "time_per_iteration": 2.5227372646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184691, + "balance_loss_mlp": 1.09728634, + "epoch": 0.3864948056944979, + "flos": 717978769920.0, + "grad_norm": 0.032935981886219476, + "language_loss": 0.91112518, + "learning_rate": 0.0007021532765747951, + "loss": 0.92297208, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.87548828, + "step": 2009, + "time_per_iteration": 2.963550567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182737, + "balance_loss_mlp": 1.0952853, + "epoch": 0.38668718737976143, + "flos": 728954465280.0, + "grad_norm": 0.030267959416106823, + "language_loss": 0.86631739, + "learning_rate": 0.0007018682934229162, + "loss": 0.87814474, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.87597656, + "step": 2010, + "time_per_iteration": 2.955132246017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179617, + "balance_loss_mlp": 1.09235525, + "epoch": 0.386879569065025, + "flos": 526488984576.0, + "grad_norm": 0.02588052645359636, + "language_loss": 0.89375025, + "learning_rate": 0.0007015832318988152, + "loss": 0.90554643, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.87402344, + "step": 2011, + "time_per_iteration": 2.612443208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117942, + "balance_loss_mlp": 1.09454346, + "epoch": 0.38707195075028855, + "flos": 1530724512768.0, + "grad_norm": 0.010241364382771095, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.75069499, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.84960938, + "step": 2012, + "time_per_iteration": 4.952507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187813, + "balance_loss_mlp": 1.10040927, + "epoch": 0.38726433243555214, + "flos": 558385483776.0, + "grad_norm": 0.026729103388188073, + "language_loss": 0.89776802, + "learning_rate": 0.0007010128741766604, + "loss": 0.90964615, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.87548828, + "step": 2013, + "time_per_iteration": 2.759916067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184734, + "balance_loss_mlp": 1.09756815, + "epoch": 0.38745671412081567, + "flos": 554755783680.0, + "grad_norm": 0.0314384592840016, + "language_loss": 0.91517645, + "learning_rate": 0.0007007275782000391, + "loss": 0.92702377, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.87304688, + "step": 2014, + "time_per_iteration": 2.6659133434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181864, + "balance_loss_mlp": 1.09469819, + "epoch": 0.38764909580607926, + "flos": 459344262144.0, + "grad_norm": 0.028810992523736655, + "language_loss": 0.92611015, + "learning_rate": 0.0007004422042940605, + "loss": 0.9379288, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.87304688, + "step": 2015, + "time_per_iteration": 2.4901411533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180932, + "balance_loss_mlp": 1.09376657, + "epoch": 0.38784147749134285, + "flos": 523258784256.0, + "grad_norm": 0.030339968140386194, + "language_loss": 0.98432136, + "learning_rate": 0.0007001567525695169, + "loss": 0.99613065, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.87304688, + "step": 2016, + "time_per_iteration": 2.605134963989258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182969, + "balance_loss_mlp": 1.09575546, + "epoch": 0.3880338591766064, + "flos": 667400011776.0, + "grad_norm": 0.023304348995526428, + "language_loss": 0.90603948, + "learning_rate": 0.0006998712231372303, + "loss": 0.91786909, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.87353516, + "step": 2017, + "time_per_iteration": 2.9866511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187647, + "balance_loss_mlp": 1.10024321, + "epoch": 0.38822624086186996, + "flos": 595175310336.0, + "grad_norm": 0.027834044235160192, + "language_loss": 0.92810535, + "learning_rate": 0.0006995856161080532, + "loss": 0.93998176, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.87548828, + "step": 2018, + "time_per_iteration": 2.8917806148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181908, + "balance_loss_mlp": 1.09426534, + "epoch": 0.3884186225471335, + "flos": 613681651200.0, + "grad_norm": 0.030912624722110756, + "language_loss": 0.90135586, + "learning_rate": 0.0006992999315928679, + "loss": 0.91317499, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.87792969, + "step": 2019, + "time_per_iteration": 2.821570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179846, + "balance_loss_mlp": 1.0924896, + "epoch": 0.3886110042323971, + "flos": 608243831808.0, + "grad_norm": 0.025167723735071885, + "language_loss": 0.91748118, + "learning_rate": 0.0006990141697025871, + "loss": 0.92927969, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.875, + "step": 2020, + "time_per_iteration": 2.774073600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181915, + "balance_loss_mlp": 1.09684753, + "epoch": 0.3888033859176606, + "flos": 1531193869824.0, + "grad_norm": 0.011544022481713089, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77541554, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.8515625, + "step": 2021, + "time_per_iteration": 4.741650581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174887, + "balance_loss_mlp": 1.08734, + "epoch": 0.3889957676029242, + "flos": 693671313408.0, + "grad_norm": 0.03334226176751645, + "language_loss": 0.90383756, + "learning_rate": 0.0006984424142405392, + "loss": 0.91558647, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.87695312, + "step": 2022, + "time_per_iteration": 2.839838981628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174992, + "balance_loss_mlp": 1.08734977, + "epoch": 0.3891881492881878, + "flos": 516194767872.0, + "grad_norm": 0.031660307701904165, + "language_loss": 0.90829813, + "learning_rate": 0.0006981564208907474, + "loss": 0.92004812, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.87792969, + "step": 2023, + "time_per_iteration": 2.6160523891448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179623, + "balance_loss_mlp": 1.09178972, + "epoch": 0.3893805309734513, + "flos": 630175756800.0, + "grad_norm": 0.02822603249283798, + "language_loss": 0.96692258, + "learning_rate": 0.0006978703506098102, + "loss": 0.97871882, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.87988281, + "step": 2024, + "time_per_iteration": 2.770775556564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177682, + "balance_loss_mlp": 1.08994389, + "epoch": 0.3895729126587149, + "flos": 545206172160.0, + "grad_norm": 0.026225366557941037, + "language_loss": 0.95314252, + "learning_rate": 0.00069758420350879, + "loss": 0.96491939, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.87890625, + "step": 2025, + "time_per_iteration": 2.615687608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179844, + "balance_loss_mlp": 1.09201062, + "epoch": 0.38976529434397844, + "flos": 619406178816.0, + "grad_norm": 0.03181269468531491, + "language_loss": 0.9379099, + "learning_rate": 0.000697297979698779, + "loss": 0.94970834, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.87988281, + "step": 2026, + "time_per_iteration": 2.723860740661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187768, + "balance_loss_mlp": 1.10007727, + "epoch": 0.38995767602924203, + "flos": 836344797696.0, + "grad_norm": 0.025703512313876988, + "language_loss": 0.89683533, + "learning_rate": 0.0006970116792908992, + "loss": 0.90871298, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.87841797, + "step": 2027, + "time_per_iteration": 3.0871434211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117977, + "balance_loss_mlp": 1.09203207, + "epoch": 0.39015005771450556, + "flos": 542646716928.0, + "grad_norm": 0.03022946762166595, + "language_loss": 0.88945854, + "learning_rate": 0.000696725302396302, + "loss": 0.9012562, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.87890625, + "step": 2028, + "time_per_iteration": 2.632178783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174959, + "balance_loss_mlp": 1.0871253, + "epoch": 0.39034243939976915, + "flos": 1009140864000.0, + "grad_norm": 0.026055335602768993, + "language_loss": 0.92111158, + "learning_rate": 0.0006964388491261692, + "loss": 0.93286121, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.87988281, + "step": 2029, + "time_per_iteration": 3.2683680057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174119, + "balance_loss_mlp": 1.08633304, + "epoch": 0.3905348210850327, + "flos": 680240222208.0, + "grad_norm": 0.029787695509808892, + "language_loss": 0.96251416, + "learning_rate": 0.0006961523195917114, + "loss": 0.97425532, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.87939453, + "step": 2030, + "time_per_iteration": 2.807161331176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182527, + "balance_loss_mlp": 1.09459865, + "epoch": 0.39072720277029627, + "flos": 549988709376.0, + "grad_norm": 0.03099080969443711, + "language_loss": 0.86433041, + "learning_rate": 0.0006958657139041696, + "loss": 0.87615567, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.88085938, + "step": 2031, + "time_per_iteration": 2.728208065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119693, + "balance_loss_mlp": 1.11052704, + "epoch": 0.39091958445555985, + "flos": 1551051159552.0, + "grad_norm": 0.01789751173127641, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77909899, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.86523438, + "step": 2032, + "time_per_iteration": 4.911708354949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09193051, + "epoch": 0.3911119661408234, + "flos": 505051886592.0, + "grad_norm": 0.03095157096826047, + "language_loss": 0.85940099, + "learning_rate": 0.0006952922745149434, + "loss": 0.87119675, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.87792969, + "step": 2033, + "time_per_iteration": 2.649538040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_mlp": 1.08903146, + "epoch": 0.391304347826087, + "flos": 558329088000.0, + "grad_norm": 0.028319463440814277, + "language_loss": 0.94666743, + "learning_rate": 0.000695005441035888, + "loss": 0.95843232, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.87597656, + "step": 2034, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178574, + "balance_loss_mlp": 1.09293365, + "epoch": 0.3914967295113505, + "flos": 1502941807104.0, + "grad_norm": 0.0063133772361172544, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7490201, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.85742188, + "step": 2035, + "time_per_iteration": 4.863725423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180506, + "balance_loss_mlp": 1.09338748, + "epoch": 0.3916891111966141, + "flos": 708329101824.0, + "grad_norm": 0.025753563122139746, + "language_loss": 0.86980474, + "learning_rate": 0.0006944315470656863, + "loss": 0.88160974, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.87255859, + "step": 2036, + "time_per_iteration": 2.936588764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188418, + "balance_loss_mlp": 1.10110939, + "epoch": 0.3918814928818776, + "flos": 557408564736.0, + "grad_norm": 0.031943380680049066, + "language_loss": 0.99613088, + "learning_rate": 0.000694144486797345, + "loss": 1.00801504, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.87451172, + "step": 2037, + "time_per_iteration": 2.676107883453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193756, + "balance_loss_mlp": 1.10868835, + "epoch": 0.3920738745671412, + "flos": 1541685471744.0, + "grad_norm": 0.012882287356254449, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8071419, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.8515625, + "step": 2038, + "time_per_iteration": 4.63246750831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178826, + "balance_loss_mlp": 1.0916127, + "epoch": 0.39226625625240474, + "flos": 499804721664.0, + "grad_norm": 0.027391930017631044, + "language_loss": 0.96627682, + "learning_rate": 0.0006935701402514156, + "loss": 0.97806513, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.87353516, + "step": 2039, + "time_per_iteration": 2.5613086223602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177521, + "balance_loss_mlp": 1.092453, + "epoch": 0.39245863793766833, + "flos": 1350450920448.0, + "grad_norm": 0.011737641894846437, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74212414, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.8515625, + "step": 2040, + "time_per_iteration": 4.902123689651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176176, + "balance_loss_mlp": 1.08881962, + "epoch": 0.3926510196229319, + "flos": 1348114142208.0, + "grad_norm": 0.028665962134257456, + "language_loss": 0.92107272, + "learning_rate": 0.0006929954931031422, + "loss": 0.93283451, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.875, + "step": 2041, + "time_per_iteration": 3.7387020587921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.08902013, + "epoch": 0.39284340130819545, + "flos": 500603721216.0, + "grad_norm": 0.024641039111334598, + "language_loss": 0.95021844, + "learning_rate": 0.0006927080570819805, + "loss": 0.96198076, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.87353516, + "step": 2042, + "time_per_iteration": 2.5837514400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117531, + "balance_loss_mlp": 1.08814418, + "epoch": 0.39303578299345904, + "flos": 521341876224.0, + "grad_norm": 0.03605238478740547, + "language_loss": 0.89998531, + "learning_rate": 0.0006924205462449161, + "loss": 0.9117384, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.87304688, + "step": 2043, + "time_per_iteration": 2.560842514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.08664155, + "epoch": 0.39322816467872257, + "flos": 909537686016.0, + "grad_norm": 0.029197625514705252, + "language_loss": 0.89668262, + "learning_rate": 0.0006921329607035702, + "loss": 0.90841925, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.87158203, + "step": 2044, + "time_per_iteration": 3.2215418815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185916, + "balance_loss_mlp": 1.09860718, + "epoch": 0.39342054636398616, + "flos": 518641431552.0, + "grad_norm": 0.026194219642157263, + "language_loss": 0.94294739, + "learning_rate": 0.0006918453005695938, + "loss": 0.95480657, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.87451172, + "step": 2045, + "time_per_iteration": 2.637197732925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183114, + "balance_loss_mlp": 1.09594774, + "epoch": 0.3936129280492497, + "flos": 549011790336.0, + "grad_norm": 0.026944227420126074, + "language_loss": 0.91576457, + "learning_rate": 0.0006915575659546662, + "loss": 0.92759573, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.87304688, + "step": 2046, + "time_per_iteration": 2.7570858001708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185485, + "balance_loss_mlp": 1.098176, + "epoch": 0.3938053097345133, + "flos": 527140263936.0, + "grad_norm": 0.02948359624940754, + "language_loss": 0.88347399, + "learning_rate": 0.0006912697569704959, + "loss": 0.89532876, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.87451172, + "step": 2047, + "time_per_iteration": 2.635467290878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09899104, + "epoch": 0.39399769141977686, + "flos": 472588701696.0, + "grad_norm": 0.02995196024762557, + "language_loss": 0.93503523, + "learning_rate": 0.0006909818737288205, + "loss": 0.94689775, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.87402344, + "step": 2048, + "time_per_iteration": 2.558013916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181668, + "balance_loss_mlp": 1.09488404, + "epoch": 0.3941900731050404, + "flos": 502726746624.0, + "grad_norm": 0.02878603575662113, + "language_loss": 0.88763595, + "learning_rate": 0.000690693916341406, + "loss": 0.89945263, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.86914062, + "step": 2049, + "time_per_iteration": 2.5820720195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178505, + "balance_loss_mlp": 1.09152949, + "epoch": 0.394382454790304, + "flos": 582006732288.0, + "grad_norm": 0.024885306311727563, + "language_loss": 0.90003175, + "learning_rate": 0.0006904058849200475, + "loss": 0.91181684, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.87109375, + "step": 2050, + "time_per_iteration": 2.7304697036743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118427, + "balance_loss_mlp": 1.09700906, + "epoch": 0.3945748364755675, + "flos": 514844545536.0, + "grad_norm": 0.02745844528377672, + "language_loss": 0.91741204, + "learning_rate": 0.0006901177795765683, + "loss": 0.92925465, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.87402344, + "step": 2051, + "time_per_iteration": 2.610621213912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180664, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3947672181608311, + "flos": 595057789440.0, + "grad_norm": 0.03028158635704326, + "language_loss": 0.89240891, + "learning_rate": 0.0006898296004228213, + "loss": 0.90421557, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.87109375, + "step": 2052, + "time_per_iteration": 2.747377395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119046, + "balance_loss_mlp": 1.10634613, + "epoch": 0.39495959984609463, + "flos": 1551049158144.0, + "grad_norm": 0.018267218432335405, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.793172, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.84179688, + "step": 2053, + "time_per_iteration": 4.871596336364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117553, + "balance_loss_mlp": 1.08845937, + "epoch": 0.3951519815313582, + "flos": 497523242496.0, + "grad_norm": 0.028876315996474663, + "language_loss": 0.87133646, + "learning_rate": 0.0006892530211320763, + "loss": 0.88309175, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.87207031, + "step": 2054, + "time_per_iteration": 2.696796417236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117541, + "balance_loss_mlp": 1.08824456, + "epoch": 0.39534436321662175, + "flos": 532222244352.0, + "grad_norm": 0.031248767008087052, + "language_loss": 0.9121244, + "learning_rate": 0.000688964621218926, + "loss": 0.92387855, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.87304688, + "step": 2055, + "time_per_iteration": 2.6398446559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176401, + "balance_loss_mlp": 1.08899677, + "epoch": 0.39553674490188534, + "flos": 703724484096.0, + "grad_norm": 0.031024749515969993, + "language_loss": 0.88066703, + "learning_rate": 0.0006886761479432037, + "loss": 0.89243108, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.87548828, + "step": 2056, + "time_per_iteration": 2.896899700164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184707, + "balance_loss_mlp": 1.09720743, + "epoch": 0.3957291265871489, + "flos": 410656215552.0, + "grad_norm": 0.031805347037857014, + "language_loss": 0.92354834, + "learning_rate": 0.0006883876014169045, + "loss": 0.93539548, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.87646484, + "step": 2057, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118858, + "balance_loss_mlp": 1.10108006, + "epoch": 0.39592150827241246, + "flos": 619638492672.0, + "grad_norm": 0.03245947566344542, + "language_loss": 0.97519982, + "learning_rate": 0.000688098981752052, + "loss": 0.98708564, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.87646484, + "step": 2058, + "time_per_iteration": 2.7079999446868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183973, + "balance_loss_mlp": 1.09642518, + "epoch": 0.39611388995767605, + "flos": 822720324096.0, + "grad_norm": 0.029593298786174956, + "language_loss": 0.88381338, + "learning_rate": 0.0006878102890606982, + "loss": 0.89565313, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.87695312, + "step": 2059, + "time_per_iteration": 3.089268922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182646, + "balance_loss_mlp": 1.09524131, + "epoch": 0.3963062716429396, + "flos": 493214065152.0, + "grad_norm": 0.03350279358204369, + "language_loss": 0.88991904, + "learning_rate": 0.0006875215234549239, + "loss": 0.9017455, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.87548828, + "step": 2060, + "time_per_iteration": 2.538806200027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182648, + "balance_loss_mlp": 1.09533882, + "epoch": 0.39649865332820317, + "flos": 585833817600.0, + "grad_norm": 0.030947291001002426, + "language_loss": 0.93147129, + "learning_rate": 0.0006872326850468376, + "loss": 0.9432978, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.87451172, + "step": 2061, + "time_per_iteration": 2.6593003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179357, + "balance_loss_mlp": 1.09214342, + "epoch": 0.3966910350134667, + "flos": 459511448064.0, + "grad_norm": 0.03264577108022065, + "language_loss": 0.89072591, + "learning_rate": 0.0006869437739485762, + "loss": 0.90251946, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.87353516, + "step": 2062, + "time_per_iteration": 2.605191230773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180604, + "balance_loss_mlp": 1.0932951, + "epoch": 0.3968834166987303, + "flos": 509614844928.0, + "grad_norm": 0.02743430972643364, + "language_loss": 0.9889155, + "learning_rate": 0.0006866547902723053, + "loss": 1.00072145, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.87451172, + "step": 2063, + "time_per_iteration": 2.6466383934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178614, + "balance_loss_mlp": 1.09116209, + "epoch": 0.3970757983839938, + "flos": 573742215168.0, + "grad_norm": 0.030016333454088624, + "language_loss": 0.87640852, + "learning_rate": 0.000686365734130218, + "loss": 0.88819462, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.87597656, + "step": 2064, + "time_per_iteration": 2.6795899868011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178875, + "balance_loss_mlp": 1.09161353, + "epoch": 0.3972681800692574, + "flos": 482585476608.0, + "grad_norm": 0.03115409384976, + "language_loss": 0.90479839, + "learning_rate": 0.000686076605634536, + "loss": 0.91658711, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.87402344, + "step": 2065, + "time_per_iteration": 2.6956639289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176026, + "balance_loss_mlp": 1.0887177, + "epoch": 0.397460561754521, + "flos": 488904887808.0, + "grad_norm": 0.028660372999824147, + "language_loss": 0.91924292, + "learning_rate": 0.0006857874048975088, + "loss": 0.93100321, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.87451172, + "step": 2066, + "time_per_iteration": 2.541707992553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182319, + "balance_loss_mlp": 1.09515274, + "epoch": 0.3976529434397845, + "flos": 422895538176.0, + "grad_norm": 0.03007540042591745, + "language_loss": 0.93814421, + "learning_rate": 0.0006854981320314142, + "loss": 0.94996738, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.87304688, + "step": 2067, + "time_per_iteration": 2.455916166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118284, + "balance_loss_mlp": 1.09586513, + "epoch": 0.3978453251250481, + "flos": 546621522432.0, + "grad_norm": 0.0330596148196893, + "language_loss": 0.94973123, + "learning_rate": 0.0006852087871485579, + "loss": 0.96155965, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.87109375, + "step": 2068, + "time_per_iteration": 2.609492063522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175372, + "balance_loss_mlp": 1.08801544, + "epoch": 0.39803770681031164, + "flos": 652001620992.0, + "grad_norm": 0.0336676185790188, + "language_loss": 0.8912071, + "learning_rate": 0.0006849193703612735, + "loss": 0.90296078, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.875, + "step": 2069, + "time_per_iteration": 2.816309690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.09071827, + "epoch": 0.39823008849557523, + "flos": 741426101760.0, + "grad_norm": 0.026625397702565265, + "language_loss": 0.84925234, + "learning_rate": 0.0006846298817819225, + "loss": 0.86102879, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.87060547, + "step": 2070, + "time_per_iteration": 2.9875504970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175555, + "balance_loss_mlp": 1.088485, + "epoch": 0.39842247018083876, + "flos": 385888860672.0, + "grad_norm": 0.03226539532166374, + "language_loss": 0.89664173, + "learning_rate": 0.0006843403215228945, + "loss": 0.90839732, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.87207031, + "step": 2071, + "time_per_iteration": 2.4326088428497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173604, + "balance_loss_mlp": 1.08648539, + "epoch": 0.39861485186610235, + "flos": 534762233856.0, + "grad_norm": 0.028550920618746804, + "language_loss": 0.88238078, + "learning_rate": 0.0006840506896966065, + "loss": 0.89411676, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.87255859, + "step": 2072, + "time_per_iteration": 2.6961326599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_mlp": 1.09084272, + "epoch": 0.39880723355136594, + "flos": 644412578304.0, + "grad_norm": 0.03366874484709253, + "language_loss": 0.90951228, + "learning_rate": 0.0006837609864155038, + "loss": 0.9212895, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.87011719, + "step": 2073, + "time_per_iteration": 2.8584561347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119321, + "balance_loss_mlp": 1.10623515, + "epoch": 0.39899961523662947, + "flos": 516891709440.0, + "grad_norm": 0.031985803275243696, + "language_loss": 0.90341693, + "learning_rate": 0.0006834712117920592, + "loss": 0.91534901, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.87109375, + "step": 2074, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186501, + "balance_loss_mlp": 1.09933496, + "epoch": 0.39919199692189306, + "flos": 465338033664.0, + "grad_norm": 0.0320663192521817, + "language_loss": 0.92968071, + "learning_rate": 0.0006831813659387729, + "loss": 0.94154572, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.87304688, + "step": 2075, + "time_per_iteration": 2.5216238498687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184926, + "balance_loss_mlp": 1.09785569, + "epoch": 0.3993843786071566, + "flos": 532678139904.0, + "grad_norm": 0.03441409861038799, + "language_loss": 0.91210699, + "learning_rate": 0.0006828914489681733, + "loss": 0.92395616, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.87207031, + "step": 2076, + "time_per_iteration": 2.686810255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186966, + "balance_loss_mlp": 1.10008633, + "epoch": 0.3995767602924202, + "flos": 505023688704.0, + "grad_norm": 0.02837279486305722, + "language_loss": 0.91445708, + "learning_rate": 0.0006826014609928162, + "loss": 0.92632675, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.87011719, + "step": 2077, + "time_per_iteration": 2.6775381565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225517, + "balance_loss_mlp": 1.13892365, + "epoch": 0.3997691419776837, + "flos": 1457471225856.0, + "grad_norm": 0.023004253676312834, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84424907, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.8671875, + "step": 2078, + "time_per_iteration": 4.87092661857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117794, + "balance_loss_mlp": 1.09134626, + "epoch": 0.3999615236629473, + "flos": 531755615232.0, + "grad_norm": 0.028989200184594895, + "language_loss": 0.86860782, + "learning_rate": 0.0006820212724781896, + "loss": 0.88038719, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.8671875, + "step": 2079, + "time_per_iteration": 2.6908116340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176834, + "balance_loss_mlp": 1.09033561, + "epoch": 0.4001539053482108, + "flos": 696361024512.0, + "grad_norm": 0.02837619494351951, + "language_loss": 0.90808308, + "learning_rate": 0.0006817310721641694, + "loss": 0.91985142, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.86621094, + "step": 2080, + "time_per_iteration": 2.8117949962615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190878, + "balance_loss_mlp": 1.10437989, + "epoch": 0.4003462870334744, + "flos": 521378806272.0, + "grad_norm": 0.0346474179870518, + "language_loss": 0.91806537, + "learning_rate": 0.00068144080129589, + "loss": 0.9299742, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.86621094, + "step": 2081, + "time_per_iteration": 2.596397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190824, + "balance_loss_mlp": 1.10422993, + "epoch": 0.400538668718738, + "flos": 493502774784.0, + "grad_norm": 0.03225854359639043, + "language_loss": 0.90241659, + "learning_rate": 0.0006811504599860441, + "loss": 0.91432476, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.8671875, + "step": 2082, + "time_per_iteration": 2.5100014209747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187111, + "balance_loss_mlp": 1.10075557, + "epoch": 0.40073105040400153, + "flos": 491451608064.0, + "grad_norm": 0.02371927790759806, + "language_loss": 0.91368544, + "learning_rate": 0.0006808600483473526, + "loss": 0.92555654, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.86474609, + "step": 2083, + "time_per_iteration": 2.9103221893310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178586, + "balance_loss_mlp": 1.0923264, + "epoch": 0.4009234320892651, + "flos": 563539322880.0, + "grad_norm": 0.025152017879447597, + "language_loss": 0.9285866, + "learning_rate": 0.0006805695664925629, + "loss": 0.94037247, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.86376953, + "step": 2084, + "time_per_iteration": 2.804859161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170802, + "balance_loss_mlp": 1.08444667, + "epoch": 0.40111581377452865, + "flos": 426852879360.0, + "grad_norm": 0.029415551527707178, + "language_loss": 0.90934992, + "learning_rate": 0.0006802790145344506, + "loss": 0.92105794, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.86474609, + "step": 2085, + "time_per_iteration": 2.476952075958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117314, + "balance_loss_mlp": 1.0870235, + "epoch": 0.40130819545979224, + "flos": 613642719744.0, + "grad_norm": 0.028611036161279673, + "language_loss": 0.93620002, + "learning_rate": 0.0006799883925858176, + "loss": 0.94793141, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.86230469, + "step": 2086, + "time_per_iteration": 2.8800101280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.10112, + "epoch": 0.40150057714505577, + "flos": 524450552832.0, + "grad_norm": 0.02956813955479834, + "language_loss": 0.92602348, + "learning_rate": 0.0006796977007594933, + "loss": 0.93789732, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.86376953, + "step": 2087, + "time_per_iteration": 2.6013576984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191969, + "balance_loss_mlp": 1.10537529, + "epoch": 0.40169295883031936, + "flos": 562553671680.0, + "grad_norm": 0.03319927890150985, + "language_loss": 0.92797327, + "learning_rate": 0.0006794069391683345, + "loss": 0.93989295, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.8671875, + "step": 2088, + "time_per_iteration": 2.7359838485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177019, + "balance_loss_mlp": 1.09095037, + "epoch": 0.4018853405155829, + "flos": 520019851776.0, + "grad_norm": 0.03157379152927814, + "language_loss": 0.87612534, + "learning_rate": 0.0006791161079252248, + "loss": 0.88789552, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.86181641, + "step": 2089, + "time_per_iteration": 2.596851348876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118277, + "balance_loss_mlp": 1.09655797, + "epoch": 0.4020777222008465, + "flos": 527287984128.0, + "grad_norm": 0.02654740933555753, + "language_loss": 0.89437628, + "learning_rate": 0.0006788252071430747, + "loss": 0.90620387, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.86328125, + "step": 2090, + "time_per_iteration": 2.8311312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184846, + "balance_loss_mlp": 1.09853876, + "epoch": 0.40227010388611006, + "flos": 526840820736.0, + "grad_norm": 0.026844852664274194, + "language_loss": 0.92195117, + "learning_rate": 0.0006785342369348222, + "loss": 0.93379962, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.86425781, + "step": 2091, + "time_per_iteration": 2.7458736896514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191242, + "balance_loss_mlp": 1.10488725, + "epoch": 0.4024624855713736, + "flos": 433226684928.0, + "grad_norm": 0.031284534475277, + "language_loss": 0.86698365, + "learning_rate": 0.0006782431974134316, + "loss": 0.87889606, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.86474609, + "step": 2092, + "time_per_iteration": 2.607151985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176082, + "balance_loss_mlp": 1.08996522, + "epoch": 0.4026548672566372, + "flos": 768090898944.0, + "grad_norm": 0.02657615147076362, + "language_loss": 0.96284211, + "learning_rate": 0.0006779520886918949, + "loss": 0.97460294, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.86230469, + "step": 2093, + "time_per_iteration": 3.03474760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173331, + "balance_loss_mlp": 1.08711922, + "epoch": 0.4028472489419007, + "flos": 644117137920.0, + "grad_norm": 0.02625373299959776, + "language_loss": 0.87827718, + "learning_rate": 0.0006776609108832301, + "loss": 0.89001048, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.86328125, + "step": 2094, + "time_per_iteration": 2.7667970657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171496, + "balance_loss_mlp": 1.08537877, + "epoch": 0.4030396306271643, + "flos": 492823297536.0, + "grad_norm": 0.02676539061642846, + "language_loss": 0.91710174, + "learning_rate": 0.0006773696641004828, + "loss": 0.92881668, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.86230469, + "step": 2095, + "time_per_iteration": 2.6013715267181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177786, + "balance_loss_mlp": 1.09119189, + "epoch": 0.40323201231242783, + "flos": 903194079744.0, + "grad_norm": 0.03019422222161545, + "language_loss": 0.84170926, + "learning_rate": 0.0006770783484567247, + "loss": 0.85348713, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.8671875, + "step": 2096, + "time_per_iteration": 3.1032629013061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180554, + "balance_loss_mlp": 1.09405565, + "epoch": 0.4034243939976914, + "flos": 571729979904.0, + "grad_norm": 0.026575026001379017, + "language_loss": 0.91571426, + "learning_rate": 0.000676786964065055, + "loss": 0.9275198, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.86621094, + "step": 2097, + "time_per_iteration": 2.8030343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179089, + "balance_loss_mlp": 1.09254348, + "epoch": 0.403616775682955, + "flos": 508460006400.0, + "grad_norm": 0.029415731928054877, + "language_loss": 0.85702783, + "learning_rate": 0.0006764955110385986, + "loss": 0.86881876, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.86669922, + "step": 2098, + "time_per_iteration": 2.7224180698394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175119, + "balance_loss_mlp": 1.08857322, + "epoch": 0.40380915736821854, + "flos": 520410619392.0, + "grad_norm": 0.02850929110585318, + "language_loss": 0.87608683, + "learning_rate": 0.0006762039894905083, + "loss": 0.88783801, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.86669922, + "step": 2099, + "time_per_iteration": 2.5972354412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08313072, + "epoch": 0.40400153905348213, + "flos": 442887086592.0, + "grad_norm": 0.05130464738927161, + "language_loss": 0.88512945, + "learning_rate": 0.000675912399533962, + "loss": 0.89682674, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.8671875, + "step": 2100, + "time_per_iteration": 2.502772808074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168649, + "balance_loss_mlp": 1.08210301, + "epoch": 0.40419392073874566, + "flos": 773704636416.0, + "grad_norm": 0.02210637201548751, + "language_loss": 0.90372586, + "learning_rate": 0.0006756207412821656, + "loss": 0.91541237, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.86669922, + "step": 2101, + "time_per_iteration": 2.991191864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169884, + "balance_loss_mlp": 1.08319497, + "epoch": 0.40438630242400925, + "flos": 767988840960.0, + "grad_norm": 0.03154624750871164, + "language_loss": 0.88513219, + "learning_rate": 0.0006753290148483505, + "loss": 0.89683104, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.86816406, + "step": 2102, + "time_per_iteration": 3.005350112915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166151, + "balance_loss_mlp": 1.07950926, + "epoch": 0.4045786841092728, + "flos": 416128963584.0, + "grad_norm": 0.026413403572192035, + "language_loss": 0.86387646, + "learning_rate": 0.0006750372203457752, + "loss": 0.87553799, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.86767578, + "step": 2103, + "time_per_iteration": 2.4381816387176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168631, + "balance_loss_mlp": 1.08203721, + "epoch": 0.40477106579453637, + "flos": 540308841984.0, + "grad_norm": 0.025857351914300337, + "language_loss": 0.93101668, + "learning_rate": 0.0006747453578877242, + "loss": 0.94270301, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.8671875, + "step": 2104, + "time_per_iteration": 2.7268197536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169336, + "balance_loss_mlp": 1.08269489, + "epoch": 0.4049634474797999, + "flos": 828091014144.0, + "grad_norm": 0.03225143111931073, + "language_loss": 0.91022515, + "learning_rate": 0.0006744534275875085, + "loss": 0.92191851, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.86767578, + "step": 2105, + "time_per_iteration": 3.0087900161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176017, + "balance_loss_mlp": 1.08970928, + "epoch": 0.4051558291650635, + "flos": 573752948736.0, + "grad_norm": 0.02821186929772288, + "language_loss": 0.92500931, + "learning_rate": 0.0006741614295584657, + "loss": 0.93676949, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.86425781, + "step": 2106, + "time_per_iteration": 2.666135787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183174, + "balance_loss_mlp": 1.09691453, + "epoch": 0.4053482108503271, + "flos": 733244176896.0, + "grad_norm": 0.04647201706044112, + "language_loss": 0.85025966, + "learning_rate": 0.0006738693639139595, + "loss": 0.86209136, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.86376953, + "step": 2107, + "time_per_iteration": 2.9633677005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177787, + "balance_loss_mlp": 1.09100294, + "epoch": 0.4055405925355906, + "flos": 1214949336576.0, + "grad_norm": 0.0302025425082437, + "language_loss": 0.85097325, + "learning_rate": 0.0006735772307673796, + "loss": 0.86275113, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.86914062, + "step": 2108, + "time_per_iteration": 3.5333871841430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177556, + "balance_loss_mlp": 1.09105742, + "epoch": 0.4057329742208542, + "flos": 717107911680.0, + "grad_norm": 0.026166055652869804, + "language_loss": 0.8899157, + "learning_rate": 0.0006732850302321421, + "loss": 0.90169132, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.86621094, + "step": 2109, + "time_per_iteration": 2.8610079288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170548, + "balance_loss_mlp": 1.0842886, + "epoch": 0.4059253559061177, + "flos": 565953059328.0, + "grad_norm": 0.026405563608612303, + "language_loss": 0.90377712, + "learning_rate": 0.00067299276242169, + "loss": 0.91548264, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.86376953, + "step": 2110, + "time_per_iteration": 2.709127426147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197311, + "balance_loss_mlp": 1.11319733, + "epoch": 0.4061177375913813, + "flos": 1597186481664.0, + "grad_norm": 0.02594110918583908, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75579476, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.84179688, + "step": 2111, + "time_per_iteration": 4.906593322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117304, + "balance_loss_mlp": 1.08654153, + "epoch": 0.40631011927664484, + "flos": 616621140480.0, + "grad_norm": 0.028870166263774127, + "language_loss": 0.85570323, + "learning_rate": 0.0006724080254290395, + "loss": 0.86743361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.86621094, + "step": 2112, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168033, + "balance_loss_mlp": 1.08134389, + "epoch": 0.40650250096190843, + "flos": 558748053504.0, + "grad_norm": 0.030551496532206422, + "language_loss": 0.96733952, + "learning_rate": 0.0006721155564738566, + "loss": 0.97901982, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.86816406, + "step": 2113, + "time_per_iteration": 2.6917896270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174904, + "balance_loss_mlp": 1.08964539, + "epoch": 0.40669488264717196, + "flos": 1583542542336.0, + "grad_norm": 0.010618058744132962, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79797542, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.85351562, + "step": 2114, + "time_per_iteration": 4.959328651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.08476496, + "epoch": 0.40688726433243555, + "flos": 508655390208.0, + "grad_norm": 0.033503716654157654, + "language_loss": 0.93188733, + "learning_rate": 0.0006715304182135078, + "loss": 0.9436028, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.86914062, + "step": 2115, + "time_per_iteration": 2.6056840419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172073, + "balance_loss_mlp": 1.08528888, + "epoch": 0.40707964601769914, + "flos": 590351840256.0, + "grad_norm": 0.028307470802153102, + "language_loss": 0.95287716, + "learning_rate": 0.0006712377491355127, + "loss": 0.96459788, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.86914062, + "step": 2116, + "time_per_iteration": 2.8985562324523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177825, + "balance_loss_mlp": 1.09146965, + "epoch": 0.40727202770296267, + "flos": 581650893312.0, + "grad_norm": 0.026081347286493965, + "language_loss": 0.86969304, + "learning_rate": 0.0006709450135771274, + "loss": 0.88147128, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.86474609, + "step": 2117, + "time_per_iteration": 2.938913345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116718, + "balance_loss_mlp": 1.08058655, + "epoch": 0.40746440938822626, + "flos": 505108282368.0, + "grad_norm": 0.02500723808493834, + "language_loss": 0.92501736, + "learning_rate": 0.0006706522116520023, + "loss": 0.93668914, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.8671875, + "step": 2118, + "time_per_iteration": 2.6295557022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169158, + "balance_loss_mlp": 1.08246934, + "epoch": 0.4076567910734898, + "flos": 606710960640.0, + "grad_norm": 0.031046149511695622, + "language_loss": 0.91392642, + "learning_rate": 0.0006703593434738127, + "loss": 0.92561805, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.86816406, + "step": 2119, + "time_per_iteration": 2.6925787925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170168, + "balance_loss_mlp": 1.08371782, + "epoch": 0.4078491727587534, + "flos": 480518846976.0, + "grad_norm": 0.026436329156680958, + "language_loss": 0.85361552, + "learning_rate": 0.0006700664091562604, + "loss": 0.86531723, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.86572266, + "step": 2120, + "time_per_iteration": 2.567094087600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177249, + "balance_loss_mlp": 1.09065557, + "epoch": 0.4080415544440169, + "flos": 511418961408.0, + "grad_norm": 0.02549175858454111, + "language_loss": 0.92328954, + "learning_rate": 0.0006697734088130725, + "loss": 0.93506193, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.8671875, + "step": 2121, + "time_per_iteration": 2.618701934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175348, + "balance_loss_mlp": 1.0889926, + "epoch": 0.4082339361292805, + "flos": 735927157248.0, + "grad_norm": 0.030272250235271202, + "language_loss": 0.93378723, + "learning_rate": 0.0006694803425580018, + "loss": 0.94554067, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.86474609, + "step": 2122, + "time_per_iteration": 2.983313798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174826, + "balance_loss_mlp": 1.08851826, + "epoch": 0.4084263178145441, + "flos": 458404273152.0, + "grad_norm": 0.031322708915370194, + "language_loss": 0.925843, + "learning_rate": 0.0006691872105048268, + "loss": 0.93759131, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.86425781, + "step": 2123, + "time_per_iteration": 2.570157766342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171971, + "balance_loss_mlp": 1.08566332, + "epoch": 0.4086186994998076, + "flos": 564025417728.0, + "grad_norm": 0.026602974246623758, + "language_loss": 0.91457534, + "learning_rate": 0.0006688940127673513, + "loss": 0.92629504, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.86425781, + "step": 2124, + "time_per_iteration": 2.6775970458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172213, + "balance_loss_mlp": 1.08609629, + "epoch": 0.4088110811850712, + "flos": 574893050880.0, + "grad_norm": 0.023493992507127005, + "language_loss": 0.90594321, + "learning_rate": 0.0006686007494594049, + "loss": 0.91766536, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.86230469, + "step": 2125, + "time_per_iteration": 2.8212904930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166923, + "balance_loss_mlp": 1.08028209, + "epoch": 0.40900346287033473, + "flos": 457846319616.0, + "grad_norm": 0.03600016157180187, + "language_loss": 0.89846623, + "learning_rate": 0.0006683074206948425, + "loss": 0.91013545, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.86767578, + "step": 2126, + "time_per_iteration": 2.4914121627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165958, + "balance_loss_mlp": 1.07926905, + "epoch": 0.4091958445555983, + "flos": 618594444288.0, + "grad_norm": 0.027616550174826966, + "language_loss": 0.88032037, + "learning_rate": 0.0006680140265875443, + "loss": 0.89197993, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.86816406, + "step": 2127, + "time_per_iteration": 2.8309690952301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164825, + "balance_loss_mlp": 1.07846975, + "epoch": 0.40938822624086185, + "flos": 473370236928.0, + "grad_norm": 0.02755246393115647, + "language_loss": 1.01638341, + "learning_rate": 0.0006677205672514162, + "loss": 1.02803159, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.86474609, + "step": 2128, + "time_per_iteration": 2.716601610183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170358, + "balance_loss_mlp": 1.08395457, + "epoch": 0.40958060792612544, + "flos": 571117632000.0, + "grad_norm": 0.024298637355030545, + "language_loss": 0.93714547, + "learning_rate": 0.000667427042800389, + "loss": 0.94884908, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.86523438, + "step": 2129, + "time_per_iteration": 2.7863857746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181584, + "balance_loss_mlp": 1.09499085, + "epoch": 0.40977298961138897, + "flos": 610470916608.0, + "grad_norm": 0.027297656005279614, + "language_loss": 0.89951032, + "learning_rate": 0.0006671334533484192, + "loss": 0.91132617, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.8671875, + "step": 2130, + "time_per_iteration": 2.7272608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177813, + "balance_loss_mlp": 1.09160113, + "epoch": 0.40996537129665256, + "flos": 582872861184.0, + "grad_norm": 0.02438545141207517, + "language_loss": 0.89143705, + "learning_rate": 0.0006668397990094881, + "loss": 0.90321517, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.86328125, + "step": 2131, + "time_per_iteration": 2.74776554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173755, + "balance_loss_mlp": 1.08739984, + "epoch": 0.41015775298191615, + "flos": 517553722368.0, + "grad_norm": 0.026155362463659675, + "language_loss": 0.91776133, + "learning_rate": 0.0006665460798976027, + "loss": 0.92949885, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.86474609, + "step": 2132, + "time_per_iteration": 2.728180170059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172912, + "balance_loss_mlp": 1.08679533, + "epoch": 0.4103501346671797, + "flos": 511445157888.0, + "grad_norm": 0.02671704384652658, + "language_loss": 0.87880147, + "learning_rate": 0.0006662522961267947, + "loss": 0.89053059, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.86230469, + "step": 2133, + "time_per_iteration": 2.6707494258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172576, + "balance_loss_mlp": 1.08636391, + "epoch": 0.41054251635244327, + "flos": 550926696960.0, + "grad_norm": 0.02310158230225749, + "language_loss": 0.93120432, + "learning_rate": 0.0006659584478111211, + "loss": 0.9429301, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.86328125, + "step": 2134, + "time_per_iteration": 2.7634923458099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167834, + "balance_loss_mlp": 1.08162224, + "epoch": 0.4107348980377068, + "flos": 841298523648.0, + "grad_norm": 0.0323112144897684, + "language_loss": 0.91370595, + "learning_rate": 0.000665664535064664, + "loss": 0.9253844, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.86328125, + "step": 2135, + "time_per_iteration": 3.028343677520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170594, + "balance_loss_mlp": 1.08447671, + "epoch": 0.4109272797229704, + "flos": 504763176960.0, + "grad_norm": 0.026958983372987907, + "language_loss": 0.8977797, + "learning_rate": 0.0006653705580015303, + "loss": 0.90948564, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.86230469, + "step": 2136, + "time_per_iteration": 2.6786246299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173433, + "balance_loss_mlp": 1.08731592, + "epoch": 0.4111196614082339, + "flos": 612023253504.0, + "grad_norm": 0.02687154551301225, + "language_loss": 0.92936879, + "learning_rate": 0.0006650765167358523, + "loss": 0.9411031, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.86230469, + "step": 2137, + "time_per_iteration": 2.765503168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170304, + "balance_loss_mlp": 1.08409154, + "epoch": 0.4113120430934975, + "flos": 454103827968.0, + "grad_norm": 0.029691236683527498, + "language_loss": 0.97143424, + "learning_rate": 0.0006647824113817864, + "loss": 0.98313725, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.86328125, + "step": 2138, + "time_per_iteration": 2.490111827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179698, + "balance_loss_mlp": 1.09329462, + "epoch": 0.41150442477876104, + "flos": 542709843456.0, + "grad_norm": 0.027637209651618533, + "language_loss": 0.88423729, + "learning_rate": 0.000664488242053515, + "loss": 0.89603424, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.86523438, + "step": 2139, + "time_per_iteration": 2.7109243869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193887, + "balance_loss_mlp": 1.10748434, + "epoch": 0.4116968064640246, + "flos": 577391380992.0, + "grad_norm": 0.026757188222196804, + "language_loss": 0.8939023, + "learning_rate": 0.0006641940088652445, + "loss": 0.90584123, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.86523438, + "step": 2140, + "time_per_iteration": 2.7461891174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186164, + "balance_loss_mlp": 1.09952235, + "epoch": 0.4118891881492882, + "flos": 497149939200.0, + "grad_norm": 0.030186458882164903, + "language_loss": 0.90177953, + "learning_rate": 0.0006638997119312065, + "loss": 0.91364121, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.86767578, + "step": 2141, + "time_per_iteration": 2.7632482051849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206482, + "balance_loss_mlp": 1.11969757, + "epoch": 0.41208156983455174, + "flos": 1541570678784.0, + "grad_norm": 0.01865751049600735, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76269788, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.86914062, + "step": 2142, + "time_per_iteration": 4.916187286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117268, + "balance_loss_mlp": 1.0864203, + "epoch": 0.41227395151981533, + "flos": 586057399296.0, + "grad_norm": 0.03006664462158482, + "language_loss": 0.91539335, + "learning_rate": 0.000663310927282877, + "loss": 0.92712009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.86376953, + "step": 2143, + "time_per_iteration": 2.783862829208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178317, + "balance_loss_mlp": 1.09220016, + "epoch": 0.41246633320507886, + "flos": 443892203520.0, + "grad_norm": 0.03021664461702893, + "language_loss": 0.92787349, + "learning_rate": 0.000663016439797172, + "loss": 0.93965667, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.86230469, + "step": 2144, + "time_per_iteration": 2.617626428604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177938, + "balance_loss_mlp": 1.09177303, + "epoch": 0.41265871489034245, + "flos": 581094941184.0, + "grad_norm": 0.031114344129188405, + "language_loss": 0.87895894, + "learning_rate": 0.0006627218890228724, + "loss": 0.89073837, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.86279297, + "step": 2145, + "time_per_iteration": 2.823136329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172469, + "balance_loss_mlp": 1.08611357, + "epoch": 0.412851096575606, + "flos": 762528827904.0, + "grad_norm": 0.03009040753958223, + "language_loss": 0.9065426, + "learning_rate": 0.0006624272750743326, + "loss": 0.91826725, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.86474609, + "step": 2146, + "time_per_iteration": 3.009969472885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172508, + "balance_loss_mlp": 1.08615267, + "epoch": 0.41304347826086957, + "flos": 556520968704.0, + "grad_norm": 0.023356325653820006, + "language_loss": 0.88529593, + "learning_rate": 0.0006621325980659322, + "loss": 0.89702094, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.86474609, + "step": 2147, + "time_per_iteration": 2.7459471225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.09083641, + "epoch": 0.41323585994613315, + "flos": 666893724672.0, + "grad_norm": 0.029406479855093332, + "language_loss": 0.8760705, + "learning_rate": 0.000661837858112075, + "loss": 0.88783997, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.86230469, + "step": 2148, + "time_per_iteration": 2.816408634185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173515, + "balance_loss_mlp": 1.08763647, + "epoch": 0.4134282416313967, + "flos": 549784593408.0, + "grad_norm": 0.02816234486414791, + "language_loss": 0.9661653, + "learning_rate": 0.0006615430553271888, + "loss": 0.97790039, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.85986328, + "step": 2149, + "time_per_iteration": 2.7518115043640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08859468, + "epoch": 0.4136206233166603, + "flos": 647512522752.0, + "grad_norm": 0.025697121170903614, + "language_loss": 0.9133321, + "learning_rate": 0.0006612481898257264, + "loss": 0.92507643, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.859375, + "step": 2150, + "time_per_iteration": 2.841632127761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179143, + "balance_loss_mlp": 1.09364581, + "epoch": 0.4138130050019238, + "flos": 518363455488.0, + "grad_norm": 0.029278566016903075, + "language_loss": 0.9170779, + "learning_rate": 0.000660953261722165, + "loss": 0.92886931, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.85595703, + "step": 2151, + "time_per_iteration": 2.6203365325927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178641, + "balance_loss_mlp": 1.09309638, + "epoch": 0.4140053866871874, + "flos": 610368858624.0, + "grad_norm": 0.02858072061503926, + "language_loss": 0.90138143, + "learning_rate": 0.0006606582711310055, + "loss": 0.91316783, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.85644531, + "step": 2152, + "time_per_iteration": 2.71352481842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167147, + "balance_loss_mlp": 1.08103001, + "epoch": 0.4141977683724509, + "flos": 580845163008.0, + "grad_norm": 0.02998636441804494, + "language_loss": 0.9075436, + "learning_rate": 0.0006603632181667736, + "loss": 0.91921502, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.86230469, + "step": 2153, + "time_per_iteration": 2.766855478286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175224, + "balance_loss_mlp": 1.09034729, + "epoch": 0.4143901500577145, + "flos": 1310176386048.0, + "grad_norm": 0.007725969282803628, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80118549, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.84960938, + "step": 2154, + "time_per_iteration": 4.895019292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175162, + "balance_loss_mlp": 1.08890247, + "epoch": 0.41458253174297804, + "flos": 461122182144.0, + "grad_norm": 0.032062709167589486, + "language_loss": 0.89760709, + "learning_rate": 0.0006597729255773153, + "loss": 0.90935868, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.86376953, + "step": 2155, + "time_per_iteration": 2.5811779499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170487, + "balance_loss_mlp": 1.08413148, + "epoch": 0.41477491342824163, + "flos": 554438876160.0, + "grad_norm": 0.02646748417883587, + "language_loss": 0.88947552, + "learning_rate": 0.0006594776861812608, + "loss": 0.90118033, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.86474609, + "step": 2156, + "time_per_iteration": 2.6486780643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174434, + "balance_loss_mlp": 1.08803129, + "epoch": 0.4149672951135052, + "flos": 699085664256.0, + "grad_norm": 0.02893226937169889, + "language_loss": 0.92862517, + "learning_rate": 0.0006591823848704776, + "loss": 0.94036949, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.86523438, + "step": 2157, + "time_per_iteration": 2.9617741107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175673, + "balance_loss_mlp": 1.08946109, + "epoch": 0.41515967679876875, + "flos": 566836652544.0, + "grad_norm": 0.025963915394380376, + "language_loss": 0.87666786, + "learning_rate": 0.0006588870217596117, + "loss": 0.88842458, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.86328125, + "step": 2158, + "time_per_iteration": 2.7438344955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175578, + "balance_loss_mlp": 1.08927035, + "epoch": 0.41535205848403234, + "flos": 502177525248.0, + "grad_norm": 0.03336248103115958, + "language_loss": 0.93542749, + "learning_rate": 0.0006585915969633334, + "loss": 0.94718325, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.86425781, + "step": 2159, + "time_per_iteration": 2.5621583461761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170555, + "balance_loss_mlp": 1.08429492, + "epoch": 0.41554444016929587, + "flos": 608701728768.0, + "grad_norm": 0.03070944646834424, + "language_loss": 0.95915914, + "learning_rate": 0.0006582961105963366, + "loss": 0.97086465, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.86376953, + "step": 2160, + "time_per_iteration": 2.798051118850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171192, + "balance_loss_mlp": 1.08498013, + "epoch": 0.41573682185455946, + "flos": 530155614720.0, + "grad_norm": 0.02743693152360054, + "language_loss": 0.85023397, + "learning_rate": 0.0006580005627733395, + "loss": 0.86194587, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.86328125, + "step": 2161, + "time_per_iteration": 2.6954233646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168175, + "balance_loss_mlp": 1.08234429, + "epoch": 0.415929203539823, + "flos": 506037537792.0, + "grad_norm": 0.027357224978205523, + "language_loss": 0.88365781, + "learning_rate": 0.0006577049536090838, + "loss": 0.89533949, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.859375, + "step": 2162, + "time_per_iteration": 2.6762402057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167145, + "balance_loss_mlp": 1.08140957, + "epoch": 0.4161215852250866, + "flos": 583823583744.0, + "grad_norm": 0.02816159229600616, + "language_loss": 0.92433643, + "learning_rate": 0.000657409283218335, + "loss": 0.93600792, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.85839844, + "step": 2163, + "time_per_iteration": 2.708815574645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116891, + "balance_loss_mlp": 1.0833174, + "epoch": 0.4163139669103501, + "flos": 491759783424.0, + "grad_norm": 0.02622965675004396, + "language_loss": 0.87195617, + "learning_rate": 0.0006571135517158829, + "loss": 0.8836453, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.85693359, + "step": 2164, + "time_per_iteration": 2.7412045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177162, + "balance_loss_mlp": 1.0930481, + "epoch": 0.4165063485956137, + "flos": 1291020767232.0, + "grad_norm": 0.0113690904759025, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77941221, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.84179688, + "step": 2165, + "time_per_iteration": 4.793722867965698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172447, + "balance_loss_mlp": 1.08680665, + "epoch": 0.4166987302808773, + "flos": 496257613824.0, + "grad_norm": 0.031372404533623194, + "language_loss": 0.90335643, + "learning_rate": 0.0006565219058351444, + "loss": 0.9150809, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.85742188, + "step": 2166, + "time_per_iteration": 2.5605039596557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169955, + "balance_loss_mlp": 1.08412397, + "epoch": 0.4168911119661408, + "flos": 465066788352.0, + "grad_norm": 0.02745374217966413, + "language_loss": 0.89900762, + "learning_rate": 0.0006562259916865553, + "loss": 0.91070712, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.859375, + "step": 2167, + "time_per_iteration": 2.5815963745117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011761, + "balance_loss_mlp": 1.09055507, + "epoch": 0.4170834936514044, + "flos": 537942769152.0, + "grad_norm": 0.0279390150832869, + "language_loss": 0.86569649, + "learning_rate": 0.0006559300168856573, + "loss": 0.8774575, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.85644531, + "step": 2168, + "time_per_iteration": 2.7917275428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181119, + "balance_loss_mlp": 1.09547901, + "epoch": 0.41727587533666793, + "flos": 551749165056.0, + "grad_norm": 0.026888463962073755, + "language_loss": 0.92254919, + "learning_rate": 0.0006556339815473577, + "loss": 0.93436038, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.85742188, + "step": 2169, + "time_per_iteration": 2.640456438064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170658, + "balance_loss_mlp": 1.08492219, + "epoch": 0.4174682570219315, + "flos": 632377371648.0, + "grad_norm": 0.027558904728032622, + "language_loss": 0.91870886, + "learning_rate": 0.000655337885786588, + "loss": 0.93041539, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.85839844, + "step": 2170, + "time_per_iteration": 2.885754108428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170686, + "balance_loss_mlp": 1.08485556, + "epoch": 0.41766063870719505, + "flos": 520755724800.0, + "grad_norm": 0.031037248087189308, + "language_loss": 0.9245193, + "learning_rate": 0.0006550417297183025, + "loss": 0.93622619, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.859375, + "step": 2171, + "time_per_iteration": 2.607590436935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175474, + "balance_loss_mlp": 1.08945298, + "epoch": 0.41785302039245864, + "flos": 559054227456.0, + "grad_norm": 0.02737354340834092, + "language_loss": 0.87721866, + "learning_rate": 0.0006547455134574793, + "loss": 0.88897336, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.86132812, + "step": 2172, + "time_per_iteration": 2.7324562072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184116, + "balance_loss_mlp": 1.09833348, + "epoch": 0.41804540207772223, + "flos": 790027553280.0, + "grad_norm": 0.06230752646239431, + "language_loss": 0.90406793, + "learning_rate": 0.0006544492371191198, + "loss": 0.91590911, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.85888672, + "step": 2173, + "time_per_iteration": 3.1248764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186676, + "balance_loss_mlp": 1.10089302, + "epoch": 0.41823778376298576, + "flos": 905890521600.0, + "grad_norm": 0.03053935653615099, + "language_loss": 0.9052453, + "learning_rate": 0.0006541529008182485, + "loss": 0.91711211, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.85888672, + "step": 2174, + "time_per_iteration": 3.2052760124206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169526, + "balance_loss_mlp": 1.08383834, + "epoch": 0.41843016544824935, + "flos": 512573799936.0, + "grad_norm": 0.02722476190126499, + "language_loss": 0.93815506, + "learning_rate": 0.0006538565046699136, + "loss": 0.94985026, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.85791016, + "step": 2175, + "time_per_iteration": 2.578150987625122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167473, + "balance_loss_mlp": 1.08183265, + "epoch": 0.4186225471335129, + "flos": 654289830912.0, + "grad_norm": 0.03154991846739093, + "language_loss": 0.89587617, + "learning_rate": 0.0006535600487891862, + "loss": 0.90755087, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.85742188, + "step": 2176, + "time_per_iteration": 2.8699960708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167918, + "balance_loss_mlp": 1.08218253, + "epoch": 0.41881492881877647, + "flos": 570225306624.0, + "grad_norm": 0.027441287945076498, + "language_loss": 0.94665354, + "learning_rate": 0.0006532635332911603, + "loss": 0.95833272, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.85839844, + "step": 2177, + "time_per_iteration": 2.695180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168273, + "balance_loss_mlp": 1.08239508, + "epoch": 0.41900731050404, + "flos": 913484293632.0, + "grad_norm": 0.030353783790969455, + "language_loss": 0.86808872, + "learning_rate": 0.0006529669582909541, + "loss": 0.87977153, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.85986328, + "step": 2178, + "time_per_iteration": 3.2746284008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116623, + "balance_loss_mlp": 1.08073354, + "epoch": 0.4191996921893036, + "flos": 536783201280.0, + "grad_norm": 0.031775111638151596, + "language_loss": 0.93350971, + "learning_rate": 0.0006526703239037077, + "loss": 0.94517195, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.85595703, + "step": 2179, + "time_per_iteration": 2.6485140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167238, + "balance_loss_mlp": 1.08159792, + "epoch": 0.4193920738745671, + "flos": 583730257920.0, + "grad_norm": 0.027399178820930566, + "language_loss": 0.92623031, + "learning_rate": 0.0006523736302445851, + "loss": 0.93790269, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.85742188, + "step": 2180, + "time_per_iteration": 2.8337948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.08149683, + "epoch": 0.4195844555598307, + "flos": 1337800459776.0, + "grad_norm": 0.031235958835637387, + "language_loss": 0.83915186, + "learning_rate": 0.0006520768774287728, + "loss": 0.85082471, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.85888672, + "step": 2181, + "time_per_iteration": 3.725524663925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170743, + "balance_loss_mlp": 1.08505547, + "epoch": 0.4197768372450943, + "flos": 599996779008.0, + "grad_norm": 0.025797087070179033, + "language_loss": 0.91158509, + "learning_rate": 0.0006517800655714806, + "loss": 0.92329252, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.85791016, + "step": 2182, + "time_per_iteration": 2.8207623958587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172108, + "balance_loss_mlp": 1.08646846, + "epoch": 0.4199692189303578, + "flos": 736595900928.0, + "grad_norm": 0.0300192342725077, + "language_loss": 0.91644537, + "learning_rate": 0.0006514831947879407, + "loss": 0.92816639, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.85742188, + "step": 2183, + "time_per_iteration": 2.9593582153320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170186, + "balance_loss_mlp": 1.08454573, + "epoch": 0.4201616006156214, + "flos": 751661921280.0, + "grad_norm": 0.02826942186100045, + "language_loss": 0.84773123, + "learning_rate": 0.0006511862651934091, + "loss": 0.85943305, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.85742188, + "step": 2184, + "time_per_iteration": 3.1170709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168703, + "balance_loss_mlp": 1.08301497, + "epoch": 0.42035398230088494, + "flos": 548091267072.0, + "grad_norm": 0.027950639773315498, + "language_loss": 0.89124084, + "learning_rate": 0.0006508892769031638, + "loss": 0.90292788, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.85791016, + "step": 2185, + "time_per_iteration": 2.6419410705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116924, + "balance_loss_mlp": 1.08379054, + "epoch": 0.42054636398614853, + "flos": 618047224320.0, + "grad_norm": 0.03133969262582121, + "language_loss": 0.94198585, + "learning_rate": 0.000650592230032506, + "loss": 0.95367819, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.85546875, + "step": 2186, + "time_per_iteration": 2.7254862785339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175, + "balance_loss_mlp": 1.08935976, + "epoch": 0.42073874567141206, + "flos": 641666471424.0, + "grad_norm": 0.02942747497692904, + "language_loss": 0.9171921, + "learning_rate": 0.0006502951246967595, + "loss": 0.92894208, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.85742188, + "step": 2187, + "time_per_iteration": 2.8912041187286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174567, + "balance_loss_mlp": 1.08897436, + "epoch": 0.42093112735667565, + "flos": 494822797824.0, + "grad_norm": 0.02515329577356359, + "language_loss": 0.92510098, + "learning_rate": 0.0006499979610112706, + "loss": 0.93684661, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.85693359, + "step": 2188, + "time_per_iteration": 2.710610866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119078, + "balance_loss_mlp": 1.1055218, + "epoch": 0.4211235090419392, + "flos": 543436984320.0, + "grad_norm": 0.027549100686041793, + "language_loss": 0.89267701, + "learning_rate": 0.000649700739091409, + "loss": 0.90458483, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.85351562, + "step": 2189, + "time_per_iteration": 2.770158290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.09321594, + "epoch": 0.42131589072720277, + "flos": 1535388254208.0, + "grad_norm": 0.007480893247264192, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.75013411, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.83984375, + "step": 2190, + "time_per_iteration": 4.826355218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168739, + "balance_loss_mlp": 1.08381474, + "epoch": 0.42150827241246636, + "flos": 567935095296.0, + "grad_norm": 0.025807507169531153, + "language_loss": 0.91430855, + "learning_rate": 0.0006491061210101557, + "loss": 0.92599595, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.85009766, + "step": 2191, + "time_per_iteration": 2.6813712120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170756, + "balance_loss_mlp": 1.08568799, + "epoch": 0.4217006540977299, + "flos": 708841393152.0, + "grad_norm": 0.02710796189326301, + "language_loss": 0.90667284, + "learning_rate": 0.0006488087250796157, + "loss": 0.91838038, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.8515625, + "step": 2192, + "time_per_iteration": 2.8864076137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117035, + "balance_loss_mlp": 1.08528221, + "epoch": 0.4218930357829935, + "flos": 628561019904.0, + "grad_norm": 0.0271709214243351, + "language_loss": 0.87769991, + "learning_rate": 0.0006485112713764049, + "loss": 0.8894034, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.8515625, + "step": 2193, + "time_per_iteration": 2.9007742404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170953, + "balance_loss_mlp": 1.08578944, + "epoch": 0.422085417468257, + "flos": 461289368064.0, + "grad_norm": 0.026123872435626132, + "language_loss": 0.89901912, + "learning_rate": 0.0006482137600160051, + "loss": 0.91072869, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.85253906, + "step": 2194, + "time_per_iteration": 2.4960973262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170401, + "balance_loss_mlp": 1.08533287, + "epoch": 0.4222777991535206, + "flos": 474980971008.0, + "grad_norm": 0.02685495955741856, + "language_loss": 0.90204549, + "learning_rate": 0.0006479161911139206, + "loss": 0.91374946, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.8515625, + "step": 2195, + "time_per_iteration": 2.574496030807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170408, + "balance_loss_mlp": 1.08534062, + "epoch": 0.4224701808387841, + "flos": 471844096512.0, + "grad_norm": 0.03212817551635824, + "language_loss": 0.93686366, + "learning_rate": 0.0006476185647856778, + "loss": 0.94856775, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.8515625, + "step": 2196, + "time_per_iteration": 2.558581829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169081, + "balance_loss_mlp": 1.08401346, + "epoch": 0.4226625625240477, + "flos": 678822870528.0, + "grad_norm": 0.034209207392335836, + "language_loss": 0.88652933, + "learning_rate": 0.0006473208811468255, + "loss": 0.89822018, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.8515625, + "step": 2197, + "time_per_iteration": 2.8745005130767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169989, + "balance_loss_mlp": 1.08487344, + "epoch": 0.4228549442093113, + "flos": 504559060992.0, + "grad_norm": 0.02694559660877684, + "language_loss": 0.9045344, + "learning_rate": 0.0006470231403129347, + "loss": 0.91623431, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.85205078, + "step": 2198, + "time_per_iteration": 2.6385552883148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.08594668, + "epoch": 0.42304732589457483, + "flos": 613074032640.0, + "grad_norm": 0.02362792419875934, + "language_loss": 0.86769903, + "learning_rate": 0.0006467253423995988, + "loss": 0.87941062, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.85302734, + "step": 2199, + "time_per_iteration": 2.8800480365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169589, + "balance_loss_mlp": 1.08418751, + "epoch": 0.4232397075798384, + "flos": 516648662016.0, + "grad_norm": 0.0345778065938135, + "language_loss": 0.86613309, + "learning_rate": 0.000646427487522433, + "loss": 0.87782902, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.85498047, + "step": 2200, + "time_per_iteration": 2.658045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170112, + "balance_loss_mlp": 1.08451986, + "epoch": 0.42343208926510195, + "flos": 590933262336.0, + "grad_norm": 0.02424061904629306, + "language_loss": 0.89308071, + "learning_rate": 0.0006461295757970749, + "loss": 0.90478176, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.85693359, + "step": 2201, + "time_per_iteration": 2.8574764728546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170293, + "balance_loss_mlp": 1.08465314, + "epoch": 0.42362447095036554, + "flos": 641818194432.0, + "grad_norm": 0.03053594684877434, + "language_loss": 0.89224029, + "learning_rate": 0.0006458316073391839, + "loss": 0.90394318, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.85742188, + "step": 2202, + "time_per_iteration": 2.932666063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168878, + "balance_loss_mlp": 1.08318996, + "epoch": 0.42381685263562907, + "flos": 513717904896.0, + "grad_norm": 0.025745877239568934, + "language_loss": 0.93694568, + "learning_rate": 0.0006455335822644422, + "loss": 0.94863445, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.85791016, + "step": 2203, + "time_per_iteration": 2.6537110805511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169969, + "balance_loss_mlp": 1.0842818, + "epoch": 0.42400923432089266, + "flos": 547822023168.0, + "grad_norm": 0.028367329203477194, + "language_loss": 0.84440267, + "learning_rate": 0.0006452355006885527, + "loss": 0.85610235, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.85791016, + "step": 2204, + "time_per_iteration": 2.639218330383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169105, + "balance_loss_mlp": 1.08346462, + "epoch": 0.4242016160061562, + "flos": 623287658496.0, + "grad_norm": 0.03537327431533643, + "language_loss": 0.96295106, + "learning_rate": 0.0006449373627272412, + "loss": 0.9746421, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.85742188, + "step": 2205, + "time_per_iteration": 2.728724956512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168771, + "balance_loss_mlp": 1.08317852, + "epoch": 0.4243939976914198, + "flos": 572971413504.0, + "grad_norm": 0.029625174738980242, + "language_loss": 0.88551587, + "learning_rate": 0.0006446391684962553, + "loss": 0.89720356, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.85693359, + "step": 2206, + "time_per_iteration": 2.6687116622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167518, + "balance_loss_mlp": 1.08192575, + "epoch": 0.42458637937668336, + "flos": 449664394752.0, + "grad_norm": 0.02816858253159587, + "language_loss": 0.89565998, + "learning_rate": 0.000644340918111364, + "loss": 0.90733516, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.85693359, + "step": 2207, + "time_per_iteration": 2.620295763015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167512, + "balance_loss_mlp": 1.08206332, + "epoch": 0.4247787610619469, + "flos": 436335361536.0, + "grad_norm": 0.0303416400904182, + "language_loss": 0.92792743, + "learning_rate": 0.0006440426116883585, + "loss": 0.93960261, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.85546875, + "step": 2208, + "time_per_iteration": 2.5411367416381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171139, + "balance_loss_mlp": 1.08602309, + "epoch": 0.4249711427472105, + "flos": 497121741312.0, + "grad_norm": 0.025596497409994177, + "language_loss": 0.92383361, + "learning_rate": 0.0006437442493430519, + "loss": 0.93554503, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.85205078, + "step": 2209, + "time_per_iteration": 2.6431679725646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172012, + "balance_loss_mlp": 1.08694398, + "epoch": 0.425163524432474, + "flos": 657107796480.0, + "grad_norm": 0.030657116246539617, + "language_loss": 0.93065524, + "learning_rate": 0.000643445831191278, + "loss": 0.94237542, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.8515625, + "step": 2210, + "time_per_iteration": 2.9031519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117009, + "balance_loss_mlp": 1.08502185, + "epoch": 0.4253559061177376, + "flos": 651778039296.0, + "grad_norm": 0.031032190975230387, + "language_loss": 0.88729775, + "learning_rate": 0.0006431473573488937, + "loss": 0.89899862, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.8515625, + "step": 2211, + "time_per_iteration": 2.745398759841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170674, + "balance_loss_mlp": 1.08560598, + "epoch": 0.42554828780300114, + "flos": 555202947072.0, + "grad_norm": 0.03338022114707726, + "language_loss": 0.92210639, + "learning_rate": 0.0006428488279317765, + "loss": 0.93381315, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.8515625, + "step": 2212, + "time_per_iteration": 2.6822004318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172615, + "balance_loss_mlp": 1.08797669, + "epoch": 0.4257406694882647, + "flos": 515421964800.0, + "grad_norm": 0.02921339084637532, + "language_loss": 0.9444955, + "learning_rate": 0.0006425502430558259, + "loss": 0.95622164, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.84716797, + "step": 2213, + "time_per_iteration": 2.6147451400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173123, + "balance_loss_mlp": 1.08824575, + "epoch": 0.42593305117352825, + "flos": 516705057792.0, + "grad_norm": 0.028975617453248656, + "language_loss": 0.90705556, + "learning_rate": 0.0006422516028369628, + "loss": 0.91878676, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.84960938, + "step": 2214, + "time_per_iteration": 2.634315013885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169159, + "balance_loss_mlp": 1.08423436, + "epoch": 0.42612543285879184, + "flos": 589237934592.0, + "grad_norm": 0.02737510916321625, + "language_loss": 0.88997841, + "learning_rate": 0.0006419529073911296, + "loss": 0.90166998, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.85009766, + "step": 2215, + "time_per_iteration": 2.934429168701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168143, + "balance_loss_mlp": 1.08321857, + "epoch": 0.42631781454405543, + "flos": 636751676928.0, + "grad_norm": 0.02841677319990709, + "language_loss": 0.91541028, + "learning_rate": 0.0006416541568342901, + "loss": 0.92709166, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.85009766, + "step": 2216, + "time_per_iteration": 2.924881935119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167669, + "balance_loss_mlp": 1.08269632, + "epoch": 0.42651019622931896, + "flos": 542245215744.0, + "grad_norm": 0.024048936266806608, + "language_loss": 0.89849669, + "learning_rate": 0.0006413553512824297, + "loss": 0.91017342, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.85058594, + "step": 2217, + "time_per_iteration": 2.7312259674072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166506, + "balance_loss_mlp": 1.08096182, + "epoch": 0.42670257791458255, + "flos": 559223414784.0, + "grad_norm": 0.030670266673020908, + "language_loss": 0.90927672, + "learning_rate": 0.0006410564908515549, + "loss": 0.92094177, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.85644531, + "step": 2218, + "time_per_iteration": 2.646705389022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165047, + "balance_loss_mlp": 1.07964516, + "epoch": 0.4268949595998461, + "flos": 622449727488.0, + "grad_norm": 0.03126891192332862, + "language_loss": 0.92295194, + "learning_rate": 0.0006407575756576935, + "loss": 0.93460238, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.85498047, + "step": 2219, + "time_per_iteration": 2.750229597091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163243, + "balance_loss_mlp": 1.07769799, + "epoch": 0.42708734128510967, + "flos": 539015015424.0, + "grad_norm": 0.029393225010211587, + "language_loss": 0.93690813, + "learning_rate": 0.0006404586058168951, + "loss": 0.94854057, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.85644531, + "step": 2220, + "time_per_iteration": 2.75992488861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166043, + "balance_loss_mlp": 1.08049834, + "epoch": 0.4272797229703732, + "flos": 503862119424.0, + "grad_norm": 0.0277791101580606, + "language_loss": 0.93672097, + "learning_rate": 0.0006401595814452296, + "loss": 0.94838136, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.85644531, + "step": 2221, + "time_per_iteration": 2.6034135818481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166502, + "balance_loss_mlp": 1.08081436, + "epoch": 0.4274721046556368, + "flos": 493437646848.0, + "grad_norm": 0.028798228067485887, + "language_loss": 0.8755163, + "learning_rate": 0.000639860502658789, + "loss": 0.88718128, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.85791016, + "step": 2222, + "time_per_iteration": 2.6364476680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168114, + "balance_loss_mlp": 1.08242607, + "epoch": 0.4276644863409004, + "flos": 569461235712.0, + "grad_norm": 0.025058965600795662, + "language_loss": 0.90727627, + "learning_rate": 0.0006395613695736853, + "loss": 0.91895741, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.85791016, + "step": 2223, + "time_per_iteration": 2.7128536701202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170105, + "balance_loss_mlp": 1.08432245, + "epoch": 0.4278568680261639, + "flos": 608562740736.0, + "grad_norm": 0.029982203504376047, + "language_loss": 0.88910139, + "learning_rate": 0.0006392621823060529, + "loss": 0.90080237, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.85888672, + "step": 2224, + "time_per_iteration": 2.7404489517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167658, + "balance_loss_mlp": 1.08177996, + "epoch": 0.4280492497114275, + "flos": 561578754048.0, + "grad_norm": 0.03210591854722722, + "language_loss": 0.92597878, + "learning_rate": 0.0006389629409720465, + "loss": 0.93765533, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.85986328, + "step": 2225, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170504, + "balance_loss_mlp": 1.08467305, + "epoch": 0.428241631396691, + "flos": 721901182464.0, + "grad_norm": 0.03010502161811575, + "language_loss": 0.95236158, + "learning_rate": 0.0006386636456878417, + "loss": 0.96406662, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.859375, + "step": 2226, + "time_per_iteration": 2.866391897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168906, + "balance_loss_mlp": 1.08307493, + "epoch": 0.4284340130819546, + "flos": 430369787904.0, + "grad_norm": 0.032531705768225685, + "language_loss": 0.99370027, + "learning_rate": 0.0006383642965696353, + "loss": 1.00538921, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.859375, + "step": 2227, + "time_per_iteration": 2.4586703777313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169599, + "balance_loss_mlp": 1.08376861, + "epoch": 0.42862639476721814, + "flos": 526159342080.0, + "grad_norm": 0.030010487503704626, + "language_loss": 0.90640998, + "learning_rate": 0.000638064893733645, + "loss": 0.91810596, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.859375, + "step": 2228, + "time_per_iteration": 2.71899676322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168473, + "balance_loss_mlp": 1.08269, + "epoch": 0.42881877645248173, + "flos": 466378079232.0, + "grad_norm": 0.029133853286813928, + "language_loss": 0.95973945, + "learning_rate": 0.000637765437296109, + "loss": 0.97142416, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.85888672, + "step": 2229, + "time_per_iteration": 2.6824750900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166344, + "balance_loss_mlp": 1.08075178, + "epoch": 0.42901115813774526, + "flos": 561355172352.0, + "grad_norm": 0.028234307189641095, + "language_loss": 0.92378092, + "learning_rate": 0.000637465927373287, + "loss": 0.93544424, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.85693359, + "step": 2230, + "time_per_iteration": 2.65869402885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166629, + "balance_loss_mlp": 1.08137035, + "epoch": 0.42920353982300885, + "flos": 562527475200.0, + "grad_norm": 0.03139177124565146, + "language_loss": 0.86247277, + "learning_rate": 0.000637166364081459, + "loss": 0.87413907, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.85351562, + "step": 2231, + "time_per_iteration": 2.7071642875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165657, + "balance_loss_mlp": 1.080446, + "epoch": 0.42939592150827244, + "flos": 557315238912.0, + "grad_norm": 0.03049902562345181, + "language_loss": 0.89974546, + "learning_rate": 0.0006368667475369256, + "loss": 0.91140211, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.85302734, + "step": 2232, + "time_per_iteration": 2.74843168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166412, + "balance_loss_mlp": 1.08363342, + "epoch": 0.42958830319353597, + "flos": 1524942314496.0, + "grad_norm": 0.009964168253272706, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79694188, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.828125, + "step": 2233, + "time_per_iteration": 4.862222909927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.08236694, + "epoch": 0.42978068487879956, + "flos": 1498869672960.0, + "grad_norm": 0.007691227120989337, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80060571, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.828125, + "step": 2234, + "time_per_iteration": 4.816195011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167111, + "balance_loss_mlp": 1.08242488, + "epoch": 0.4299730665640631, + "flos": 548063069184.0, + "grad_norm": 0.02593969644103988, + "language_loss": 0.92186785, + "learning_rate": 0.0006359675795504112, + "loss": 0.93353903, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.84765625, + "step": 2235, + "time_per_iteration": 2.6802918910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167601, + "balance_loss_mlp": 1.08300984, + "epoch": 0.4301654482493267, + "flos": 1131115124736.0, + "grad_norm": 0.035304816631346984, + "language_loss": 0.82753956, + "learning_rate": 0.0006356677511584775, + "loss": 0.83921564, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.84667969, + "step": 2236, + "time_per_iteration": 3.444307327270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08522856, + "epoch": 0.4303578299345902, + "flos": 496741707264.0, + "grad_norm": 0.0313639268125667, + "language_loss": 0.9209317, + "learning_rate": 0.0006353678700956511, + "loss": 0.93262899, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.84570312, + "step": 2237, + "time_per_iteration": 2.5677876472473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164152, + "balance_loss_mlp": 1.07965648, + "epoch": 0.4305502116198538, + "flos": 616929315840.0, + "grad_norm": 0.02814766917627989, + "language_loss": 0.90743506, + "learning_rate": 0.0006350679364783569, + "loss": 0.91907656, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.84570312, + "step": 2238, + "time_per_iteration": 2.7363951206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175266, + "balance_loss_mlp": 1.09081805, + "epoch": 0.4307425933051173, + "flos": 560321857536.0, + "grad_norm": 0.032687311784007, + "language_loss": 0.92748511, + "learning_rate": 0.0006347679504230393, + "loss": 0.93923771, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.84521484, + "step": 2239, + "time_per_iteration": 2.6805875301361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172185, + "balance_loss_mlp": 1.08749855, + "epoch": 0.4309349749903809, + "flos": 973816779264.0, + "grad_norm": 0.03249158230487725, + "language_loss": 0.83304834, + "learning_rate": 0.0006344679120461632, + "loss": 0.84477019, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.84765625, + "step": 2240, + "time_per_iteration": 3.4101555347442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166292, + "balance_loss_mlp": 1.08146274, + "epoch": 0.4311273566756445, + "flos": 542972356608.0, + "grad_norm": 0.03524791345855764, + "language_loss": 0.87825459, + "learning_rate": 0.0006341678214642134, + "loss": 0.88991749, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.84912109, + "step": 2241, + "time_per_iteration": 2.625896692276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165486, + "balance_loss_mlp": 1.08041823, + "epoch": 0.43131973836090803, + "flos": 763110976512.0, + "grad_norm": 0.027424867307564667, + "language_loss": 0.89878041, + "learning_rate": 0.0006338676787936963, + "loss": 0.91043526, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.8515625, + "step": 2242, + "time_per_iteration": 3.063455820083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167252, + "balance_loss_mlp": 1.08199346, + "epoch": 0.4315121200461716, + "flos": 555602446848.0, + "grad_norm": 0.031429355894507384, + "language_loss": 0.916659, + "learning_rate": 0.0006335674841511367, + "loss": 0.92833149, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.85351562, + "step": 2243, + "time_per_iteration": 2.666233777999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192352, + "balance_loss_mlp": 1.10804749, + "epoch": 0.43170450173143515, + "flos": 1488686972928.0, + "grad_norm": 0.015912473948710273, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80373514, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.84375, + "step": 2244, + "time_per_iteration": 4.980380535125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183075, + "balance_loss_mlp": 1.09877014, + "epoch": 0.43189688341669874, + "flos": 1476907548672.0, + "grad_norm": 0.014137336443723746, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78548628, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.84375, + "step": 2245, + "time_per_iteration": 4.896914005279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011678, + "balance_loss_mlp": 1.08254158, + "epoch": 0.43208926510196227, + "flos": 493984866816.0, + "grad_norm": 0.02893589890767333, + "language_loss": 0.89212227, + "learning_rate": 0.0006326665895567652, + "loss": 0.90380025, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.85351562, + "step": 2246, + "time_per_iteration": 2.6488964557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169613, + "balance_loss_mlp": 1.08430731, + "epoch": 0.43228164678722586, + "flos": 521302944768.0, + "grad_norm": 0.0351368535627373, + "language_loss": 0.94705987, + "learning_rate": 0.0006323661881916976, + "loss": 0.95875597, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.85400391, + "step": 2247, + "time_per_iteration": 2.7094948291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170289, + "balance_loss_mlp": 1.08522093, + "epoch": 0.4324740284724894, + "flos": 797395015680.0, + "grad_norm": 0.0300569180656374, + "language_loss": 0.88277382, + "learning_rate": 0.0006320657354375179, + "loss": 0.89447677, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.8515625, + "step": 2248, + "time_per_iteration": 2.942108154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166997, + "balance_loss_mlp": 1.08188176, + "epoch": 0.432666410157753, + "flos": 483097767936.0, + "grad_norm": 0.027676603795042543, + "language_loss": 0.93945193, + "learning_rate": 0.0006317652314108726, + "loss": 0.95112193, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.85205078, + "step": 2249, + "time_per_iteration": 2.559255838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167847, + "balance_loss_mlp": 1.08268416, + "epoch": 0.43285879184301657, + "flos": 501209338368.0, + "grad_norm": 0.028764721331973258, + "language_loss": 0.98109567, + "learning_rate": 0.0006314646762284277, + "loss": 0.99277413, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.85253906, + "step": 2250, + "time_per_iteration": 2.6713576316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188225, + "balance_loss_mlp": 1.10582733, + "epoch": 0.4330511735282801, + "flos": 1513790701056.0, + "grad_norm": 0.02095115440391329, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76614058, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.82421875, + "step": 2251, + "time_per_iteration": 4.936391592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170203, + "balance_loss_mlp": 1.08518302, + "epoch": 0.4332435552135437, + "flos": 700837387776.0, + "grad_norm": 0.037779543880407794, + "language_loss": 0.84241956, + "learning_rate": 0.0006308634128629022, + "loss": 0.85412163, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.85107422, + "step": 2252, + "time_per_iteration": 2.890848398208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168176, + "balance_loss_mlp": 1.0830133, + "epoch": 0.4334359368988072, + "flos": 593481984000.0, + "grad_norm": 0.0295787243575072, + "language_loss": 0.93934762, + "learning_rate": 0.0006305627049132531, + "loss": 0.95102942, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.85253906, + "step": 2253, + "time_per_iteration": 2.7571680545806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167414, + "balance_loss_mlp": 1.08220303, + "epoch": 0.4336283185840708, + "flos": 844274942976.0, + "grad_norm": 0.0242542623992157, + "language_loss": 0.90322375, + "learning_rate": 0.0006302619462746662, + "loss": 0.91489786, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.85302734, + "step": 2254, + "time_per_iteration": 3.1296751499176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167279, + "balance_loss_mlp": 1.0821631, + "epoch": 0.43382070026933434, + "flos": 627401452032.0, + "grad_norm": 0.02849659363202695, + "language_loss": 0.96522522, + "learning_rate": 0.0006299611370639069, + "loss": 0.97689807, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.85205078, + "step": 2255, + "time_per_iteration": 2.7125463485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167069, + "balance_loss_mlp": 1.08181024, + "epoch": 0.4340130819545979, + "flos": 592209624576.0, + "grad_norm": 0.029264792527705672, + "language_loss": 0.85361564, + "learning_rate": 0.0006296602773977593, + "loss": 0.86528635, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.85351562, + "step": 2256, + "time_per_iteration": 2.692830801010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166353, + "balance_loss_mlp": 1.0810945, + "epoch": 0.4342054636398615, + "flos": 491955167232.0, + "grad_norm": 0.02531800088280138, + "language_loss": 0.92533612, + "learning_rate": 0.0006293593673930277, + "loss": 0.93699974, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.85351562, + "step": 2257, + "time_per_iteration": 2.6522371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118061, + "balance_loss_mlp": 1.09568477, + "epoch": 0.43439784532512504, + "flos": 700259968512.0, + "grad_norm": 0.028144633410819173, + "language_loss": 0.84340745, + "learning_rate": 0.0006290584071665358, + "loss": 0.85521352, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.85009766, + "step": 2258, + "time_per_iteration": 2.878753662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179592, + "balance_loss_mlp": 1.09452426, + "epoch": 0.43459022701038863, + "flos": 486801328128.0, + "grad_norm": 0.028951325004384125, + "language_loss": 0.88270766, + "learning_rate": 0.0006287573968351266, + "loss": 0.89450359, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.8515625, + "step": 2259, + "time_per_iteration": 2.55161190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173139, + "balance_loss_mlp": 1.08830976, + "epoch": 0.43478260869565216, + "flos": 644266859520.0, + "grad_norm": 0.030714073024811012, + "language_loss": 0.91379642, + "learning_rate": 0.0006284563365156626, + "loss": 0.92552781, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.84912109, + "step": 2260, + "time_per_iteration": 2.778975009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177671, + "balance_loss_mlp": 1.09274662, + "epoch": 0.43497499038091575, + "flos": 427009331712.0, + "grad_norm": 0.03207934204379992, + "language_loss": 0.94470251, + "learning_rate": 0.0006281552263250261, + "loss": 0.95647919, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.85009766, + "step": 2261, + "time_per_iteration": 2.540102005004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175407, + "balance_loss_mlp": 1.09281921, + "epoch": 0.4351673720661793, + "flos": 1541525016576.0, + "grad_norm": 0.010664027023399645, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81866938, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.82617188, + "step": 2262, + "time_per_iteration": 4.828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167291, + "balance_loss_mlp": 1.08260465, + "epoch": 0.43535975375144287, + "flos": 750465423360.0, + "grad_norm": 0.02969029135984414, + "language_loss": 0.88281786, + "learning_rate": 0.0006275528567978593, + "loss": 0.89449072, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.84765625, + "step": 2263, + "time_per_iteration": 2.9683096408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167048, + "balance_loss_mlp": 1.08193278, + "epoch": 0.4355521354367064, + "flos": 862751084544.0, + "grad_norm": 0.03226302104273745, + "language_loss": 0.89985508, + "learning_rate": 0.0006272515976951898, + "loss": 0.91152549, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.85205078, + "step": 2264, + "time_per_iteration": 4.429616689682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166942, + "balance_loss_mlp": 1.08182704, + "epoch": 0.43574451712197, + "flos": 735842563584.0, + "grad_norm": 0.02499576623287147, + "language_loss": 0.84365284, + "learning_rate": 0.0006269502891890687, + "loss": 0.8553223, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.85205078, + "step": 2265, + "time_per_iteration": 3.0444254875183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166782, + "balance_loss_mlp": 1.08214331, + "epoch": 0.4359368988072336, + "flos": 571712515584.0, + "grad_norm": 0.02707186340155289, + "language_loss": 0.93191004, + "learning_rate": 0.0006266489313964743, + "loss": 0.94357783, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.84716797, + "step": 2266, + "time_per_iteration": 2.7227466106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164913, + "balance_loss_mlp": 1.0802747, + "epoch": 0.4361292804924971, + "flos": 556670690304.0, + "grad_norm": 0.03376827968070452, + "language_loss": 0.92200565, + "learning_rate": 0.0006263475244344041, + "loss": 0.93365479, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.84716797, + "step": 2267, + "time_per_iteration": 2.845227003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167657, + "balance_loss_mlp": 1.08335233, + "epoch": 0.4363216621777607, + "flos": 558348553728.0, + "grad_norm": 0.031080273211388402, + "language_loss": 0.91650617, + "learning_rate": 0.0006260460684198746, + "loss": 0.92818272, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.84375, + "step": 2268, + "time_per_iteration": 2.652310371398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165668, + "balance_loss_mlp": 1.08141088, + "epoch": 0.4365140438630242, + "flos": 479196822528.0, + "grad_norm": 0.029843008840560653, + "language_loss": 0.92140841, + "learning_rate": 0.0006257445634699213, + "loss": 0.93306512, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.84326172, + "step": 2269, + "time_per_iteration": 2.5779240131378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164543, + "balance_loss_mlp": 1.08042932, + "epoch": 0.4367064255482878, + "flos": 580007232000.0, + "grad_norm": 0.028296510675920098, + "language_loss": 0.89645165, + "learning_rate": 0.0006254430097015993, + "loss": 0.90809709, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.84179688, + "step": 2270, + "time_per_iteration": 2.6566953659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172028, + "balance_loss_mlp": 1.08963013, + "epoch": 0.43689880723355135, + "flos": 1462271953920.0, + "grad_norm": 0.010844604855090543, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77651119, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.82421875, + "step": 2271, + "time_per_iteration": 4.794802904129028 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170244, + "balance_loss_mlp": 1.08593976, + "epoch": 0.43709118891881493, + "flos": 668873759232.0, + "grad_norm": 0.024959132899117664, + "language_loss": 0.91526961, + "learning_rate": 0.0006248397561781609, + "loss": 0.92697203, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.84375, + "step": 2272, + "time_per_iteration": 2.8676164150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170164, + "balance_loss_mlp": 1.08562064, + "epoch": 0.43728357060407846, + "flos": 545913847296.0, + "grad_norm": 0.033809863548240594, + "language_loss": 0.93834352, + "learning_rate": 0.0006245380566572482, + "loss": 0.95004517, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.84619141, + "step": 2273, + "time_per_iteration": 2.6419596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169646, + "balance_loss_mlp": 1.08519816, + "epoch": 0.43747595228934205, + "flos": 748183944192.0, + "grad_norm": 0.02624268387252208, + "language_loss": 0.83012575, + "learning_rate": 0.0006242363087863744, + "loss": 0.84182227, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.84521484, + "step": 2274, + "time_per_iteration": 2.9927828311920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165646, + "balance_loss_mlp": 1.08057845, + "epoch": 0.43766833397460564, + "flos": 632529094656.0, + "grad_norm": 0.025411969041571628, + "language_loss": 0.92234564, + "learning_rate": 0.0006239345126826878, + "loss": 0.9340021, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.8515625, + "step": 2275, + "time_per_iteration": 2.8180527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164237, + "balance_loss_mlp": 1.07931209, + "epoch": 0.43786071565986917, + "flos": 532098719232.0, + "grad_norm": 0.028730665522240066, + "language_loss": 0.90992379, + "learning_rate": 0.0006236326684633561, + "loss": 0.92156613, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.85009766, + "step": 2276, + "time_per_iteration": 2.828425168991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163177, + "balance_loss_mlp": 1.07810962, + "epoch": 0.43805309734513276, + "flos": 539557506048.0, + "grad_norm": 0.03648062799061939, + "language_loss": 0.82486773, + "learning_rate": 0.0006233307762455658, + "loss": 0.83649945, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.8515625, + "step": 2277, + "time_per_iteration": 2.608886957168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.07909381, + "epoch": 0.4382454790303963, + "flos": 865963820544.0, + "grad_norm": 0.025903790262040906, + "language_loss": 0.90223956, + "learning_rate": 0.0006230288361465216, + "loss": 0.91388112, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.8515625, + "step": 2278, + "time_per_iteration": 3.036163568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171688, + "balance_loss_mlp": 1.08638203, + "epoch": 0.4384378607156599, + "flos": 766801075200.0, + "grad_norm": 0.03187081568607536, + "language_loss": 0.92773926, + "learning_rate": 0.0006227268482834473, + "loss": 0.93945611, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.85400391, + "step": 2279, + "time_per_iteration": 2.9320731163024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176636, + "balance_loss_mlp": 1.09137762, + "epoch": 0.4386302424009234, + "flos": 669796283904.0, + "grad_norm": 0.028047353495827182, + "language_loss": 0.9305023, + "learning_rate": 0.000622424812773585, + "loss": 0.94226873, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.85351562, + "step": 2280, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.08901083, + "epoch": 0.438822624086187, + "flos": 486150048768.0, + "grad_norm": 0.03276492690852342, + "language_loss": 0.87875438, + "learning_rate": 0.000622122729734195, + "loss": 0.89049757, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.85400391, + "step": 2281, + "time_per_iteration": 2.5878114700317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.09008515, + "epoch": 0.4390150057714506, + "flos": 500258615808.0, + "grad_norm": 0.02649151217717187, + "language_loss": 0.92922705, + "learning_rate": 0.0006218205992825566, + "loss": 0.94098091, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.85400391, + "step": 2282, + "time_per_iteration": 2.6129069328308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.08652771, + "epoch": 0.4392073874567141, + "flos": 559351669248.0, + "grad_norm": 0.029077625047839704, + "language_loss": 0.88682199, + "learning_rate": 0.0006215184215359671, + "loss": 0.89853978, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.85351562, + "step": 2283, + "time_per_iteration": 2.7397634983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011712, + "balance_loss_mlp": 1.08594131, + "epoch": 0.4393997691419777, + "flos": 606422251008.0, + "grad_norm": 0.030174398524898192, + "language_loss": 0.92242193, + "learning_rate": 0.0006212161966117425, + "loss": 0.93413389, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.85351562, + "step": 2284, + "time_per_iteration": 2.710947275161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168513, + "balance_loss_mlp": 1.08349264, + "epoch": 0.43959215082724123, + "flos": 805483614720.0, + "grad_norm": 0.03159683391584848, + "language_loss": 0.8931039, + "learning_rate": 0.0006209139246272164, + "loss": 0.90478909, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.85107422, + "step": 2285, + "time_per_iteration": 2.9573750495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167409, + "balance_loss_mlp": 1.08229375, + "epoch": 0.4397845325125048, + "flos": 488607446016.0, + "grad_norm": 0.033192711624055064, + "language_loss": 0.89631027, + "learning_rate": 0.0006206116056997421, + "loss": 0.90798426, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.85205078, + "step": 2286, + "time_per_iteration": 2.5915918350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168495, + "balance_loss_mlp": 1.08380854, + "epoch": 0.43997691419776835, + "flos": 481784475648.0, + "grad_norm": 0.02920198010279229, + "language_loss": 0.88986552, + "learning_rate": 0.0006203092399466892, + "loss": 0.90155041, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.84765625, + "step": 2287, + "time_per_iteration": 2.6179182529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167372, + "balance_loss_mlp": 1.08282888, + "epoch": 0.44016929588303194, + "flos": 484129081344.0, + "grad_norm": 0.024305807708132735, + "language_loss": 0.91028094, + "learning_rate": 0.0006200068274854473, + "loss": 0.92195475, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.84619141, + "step": 2288, + "time_per_iteration": 2.6643898487091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168421, + "balance_loss_mlp": 1.08387816, + "epoch": 0.4403616775682955, + "flos": 573023806464.0, + "grad_norm": 0.025110382343061666, + "language_loss": 0.90969157, + "learning_rate": 0.0006197043684334229, + "loss": 0.92137575, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.84619141, + "step": 2289, + "time_per_iteration": 2.7810122966766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169529, + "balance_loss_mlp": 1.08503318, + "epoch": 0.44055405925355906, + "flos": 631999339008.0, + "grad_norm": 0.03160389670817918, + "language_loss": 0.85855997, + "learning_rate": 0.0006194018629080411, + "loss": 0.87025523, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.84570312, + "step": 2290, + "time_per_iteration": 2.7407448291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165877, + "balance_loss_mlp": 1.08147717, + "epoch": 0.44074644093882265, + "flos": 537825248256.0, + "grad_norm": 0.027939915930863316, + "language_loss": 0.87505877, + "learning_rate": 0.0006190993110267451, + "loss": 0.88671762, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.84472656, + "step": 2291, + "time_per_iteration": 2.7158915996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167062, + "balance_loss_mlp": 1.08280444, + "epoch": 0.4409388226240862, + "flos": 464165730816.0, + "grad_norm": 0.03127864863359821, + "language_loss": 0.91365832, + "learning_rate": 0.0006187967129069958, + "loss": 0.92532897, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.84326172, + "step": 2292, + "time_per_iteration": 2.506866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167494, + "balance_loss_mlp": 1.08337986, + "epoch": 0.44113120430934977, + "flos": 567160290816.0, + "grad_norm": 0.024295125434261364, + "language_loss": 0.92081046, + "learning_rate": 0.0006184940686662722, + "loss": 0.93248534, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.84179688, + "step": 2293, + "time_per_iteration": 2.7406985759735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168054, + "balance_loss_mlp": 1.084131, + "epoch": 0.4413235859946133, + "flos": 544674415104.0, + "grad_norm": 0.02998433601693185, + "language_loss": 0.95718068, + "learning_rate": 0.0006181913784220714, + "loss": 0.96886122, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.83984375, + "step": 2294, + "time_per_iteration": 2.7276971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186783, + "balance_loss_mlp": 1.1034317, + "epoch": 0.4415159676798769, + "flos": 1573302720000.0, + "grad_norm": 0.012177255736314117, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.8174057, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.83398438, + "step": 2295, + "time_per_iteration": 4.898420333862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174829, + "balance_loss_mlp": 1.0908581, + "epoch": 0.4417083493651404, + "flos": 660012357120.0, + "grad_norm": 0.02926637357686751, + "language_loss": 0.86549121, + "learning_rate": 0.0006175858603933146, + "loss": 0.87723947, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.84033203, + "step": 2296, + "time_per_iteration": 2.866745710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166372, + "balance_loss_mlp": 1.08225799, + "epoch": 0.441900731050404, + "flos": 741816869376.0, + "grad_norm": 0.028401827027787777, + "language_loss": 0.8638438, + "learning_rate": 0.0006172830328438416, + "loss": 0.87550759, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.84179688, + "step": 2297, + "time_per_iteration": 2.9731123447418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165335, + "balance_loss_mlp": 1.08088684, + "epoch": 0.44209311273566754, + "flos": 540595550208.0, + "grad_norm": 0.030114194292861593, + "language_loss": 0.93111193, + "learning_rate": 0.0006169801597610572, + "loss": 0.94276524, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.84521484, + "step": 2298, + "time_per_iteration": 2.777326822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163943, + "balance_loss_mlp": 1.07959104, + "epoch": 0.4422854944209311, + "flos": 622729704960.0, + "grad_norm": 0.030043302620551878, + "language_loss": 0.96779996, + "learning_rate": 0.0006166772412625469, + "loss": 0.97943938, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.84423828, + "step": 2299, + "time_per_iteration": 2.8143997192382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164367, + "balance_loss_mlp": 1.08006215, + "epoch": 0.4424778761061947, + "flos": 660060020736.0, + "grad_norm": 0.031086205360051855, + "language_loss": 0.88609374, + "learning_rate": 0.0006163742774659141, + "loss": 0.89773744, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.84375, + "step": 2300, + "time_per_iteration": 2.8234009742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116513, + "balance_loss_mlp": 1.08087325, + "epoch": 0.44267025779145824, + "flos": 569702281728.0, + "grad_norm": 0.02554920530971592, + "language_loss": 0.92150819, + "learning_rate": 0.0006160712684887801, + "loss": 0.93315947, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.84326172, + "step": 2301, + "time_per_iteration": 2.733370542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170443, + "balance_loss_mlp": 1.08623374, + "epoch": 0.44286263947672183, + "flos": 497818682880.0, + "grad_norm": 0.02788747598953172, + "language_loss": 0.88145387, + "learning_rate": 0.0006157682144487832, + "loss": 0.89315832, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.84277344, + "step": 2302, + "time_per_iteration": 2.766334295272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171189, + "balance_loss_mlp": 1.08697963, + "epoch": 0.44305502116198536, + "flos": 610607903232.0, + "grad_norm": 0.028872273370365097, + "language_loss": 0.89961743, + "learning_rate": 0.0006154651154635793, + "loss": 0.91132939, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.84277344, + "step": 2303, + "time_per_iteration": 2.844402313232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08776116, + "epoch": 0.44324740284724895, + "flos": 471742038528.0, + "grad_norm": 0.028372285588360545, + "language_loss": 0.91810459, + "learning_rate": 0.0006151619716508421, + "loss": 0.92982763, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.84619141, + "step": 2304, + "time_per_iteration": 2.545243263244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166666, + "balance_loss_mlp": 1.08197927, + "epoch": 0.4434397845325125, + "flos": 579811848192.0, + "grad_norm": 0.029138508250266412, + "language_loss": 0.93279153, + "learning_rate": 0.0006148587831282625, + "loss": 0.94445825, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.84765625, + "step": 2305, + "time_per_iteration": 2.6743574142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179131, + "balance_loss_mlp": 1.09654236, + "epoch": 0.44363216621777607, + "flos": 1499995038720.0, + "grad_norm": 0.011431210063158581, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80355197, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.82617188, + "step": 2306, + "time_per_iteration": 4.870469570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177668, + "balance_loss_mlp": 1.09298158, + "epoch": 0.44382454790303966, + "flos": 478285031424.0, + "grad_norm": 0.03377230518223979, + "language_loss": 0.94630158, + "learning_rate": 0.0006142522724244255, + "loss": 0.95807827, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.84765625, + "step": 2307, + "time_per_iteration": 2.5165300369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181709, + "balance_loss_mlp": 1.09912109, + "epoch": 0.4440169295883032, + "flos": 1547303938560.0, + "grad_norm": 0.010354849447395944, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77666426, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.82617188, + "step": 2308, + "time_per_iteration": 4.86593222618103 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168989, + "balance_loss_mlp": 1.0843029, + "epoch": 0.4442093112735668, + "flos": 592290215424.0, + "grad_norm": 0.030546908540126056, + "language_loss": 0.84313834, + "learning_rate": 0.000613645584293942, + "loss": 0.85482824, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.84765625, + "step": 2309, + "time_per_iteration": 2.9245197772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179296, + "balance_loss_mlp": 1.09465766, + "epoch": 0.4444016929588303, + "flos": 531327917568.0, + "grad_norm": 0.02954341623225009, + "language_loss": 0.89990199, + "learning_rate": 0.0006133421739881185, + "loss": 0.91169494, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.84716797, + "step": 2310, + "time_per_iteration": 2.6806466579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173935, + "balance_loss_mlp": 1.08958304, + "epoch": 0.4445940746440939, + "flos": 621388214784.0, + "grad_norm": 0.03132503362752706, + "language_loss": 0.89829159, + "learning_rate": 0.0006130387196789605, + "loss": 0.91003096, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.84423828, + "step": 2311, + "time_per_iteration": 2.7674410343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171768, + "balance_loss_mlp": 1.08751106, + "epoch": 0.4447864563293574, + "flos": 630375869952.0, + "grad_norm": 0.024389617188914626, + "language_loss": 0.89820284, + "learning_rate": 0.0006127352214842795, + "loss": 0.90992051, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.84326172, + "step": 2312, + "time_per_iteration": 3.0181000232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170174, + "balance_loss_mlp": 1.08591735, + "epoch": 0.444978838014621, + "flos": 652001620992.0, + "grad_norm": 0.03266392614581568, + "language_loss": 0.92178452, + "learning_rate": 0.0006124316795219041, + "loss": 0.93348622, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.84326172, + "step": 2313, + "time_per_iteration": 2.7772133350372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172939, + "balance_loss_mlp": 1.08911133, + "epoch": 0.44517121969988455, + "flos": 613588325376.0, + "grad_norm": 0.026148577301855224, + "language_loss": 0.88032007, + "learning_rate": 0.0006121280939096794, + "loss": 0.89204955, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.83886719, + "step": 2314, + "time_per_iteration": 2.7472517490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.09010756, + "epoch": 0.44536360138514813, + "flos": 489714620928.0, + "grad_norm": 0.031365562822013526, + "language_loss": 0.94548678, + "learning_rate": 0.000611824464765468, + "loss": 0.95722377, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.83642578, + "step": 2315, + "time_per_iteration": 2.5471882820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188843, + "balance_loss_mlp": 1.10758972, + "epoch": 0.4455559830704117, + "flos": 1519053877248.0, + "grad_norm": 0.020817362108823283, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79783785, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.8125, + "step": 2316, + "time_per_iteration": 4.660900831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.08663106, + "epoch": 0.44574836475567525, + "flos": 616816524288.0, + "grad_norm": 0.03088300803415325, + "language_loss": 0.9123913, + "learning_rate": 0.000611217076352619, + "loss": 0.92409492, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.83789062, + "step": 2317, + "time_per_iteration": 2.7556822299957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171317, + "balance_loss_mlp": 1.08772719, + "epoch": 0.44594074644093884, + "flos": 507433422336.0, + "grad_norm": 0.026331926721779163, + "language_loss": 0.8931551, + "learning_rate": 0.0006109133173197905, + "loss": 0.90486825, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.83642578, + "step": 2318, + "time_per_iteration": 2.720372200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172625, + "balance_loss_mlp": 1.08908355, + "epoch": 0.44613312812620237, + "flos": 728311918080.0, + "grad_norm": 0.030991917971638312, + "language_loss": 0.91262019, + "learning_rate": 0.0006106095152265935, + "loss": 0.92434645, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.8359375, + "step": 2319, + "time_per_iteration": 2.8956825733184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171779, + "balance_loss_mlp": 1.08776009, + "epoch": 0.44632550981146596, + "flos": 637057850880.0, + "grad_norm": 0.02763281666385245, + "language_loss": 0.90440875, + "learning_rate": 0.0006103056701909739, + "loss": 0.91612655, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.84082031, + "step": 2320, + "time_per_iteration": 2.9104726314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175182, + "balance_loss_mlp": 1.09116352, + "epoch": 0.4465178914967295, + "flos": 828616766976.0, + "grad_norm": 0.02413420043376393, + "language_loss": 0.88773656, + "learning_rate": 0.0006100017823308956, + "loss": 0.89948833, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.84082031, + "step": 2321, + "time_per_iteration": 3.1638107299804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176807, + "balance_loss_mlp": 1.0927887, + "epoch": 0.4467102731819931, + "flos": 667032712704.0, + "grad_norm": 0.03201581013716374, + "language_loss": 0.87315178, + "learning_rate": 0.0006096978517643377, + "loss": 0.88491988, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.84082031, + "step": 2322, + "time_per_iteration": 2.7875144481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182039, + "balance_loss_mlp": 1.09792459, + "epoch": 0.4469026548672566, + "flos": 513969684480.0, + "grad_norm": 0.032089815412588485, + "language_loss": 0.90642822, + "learning_rate": 0.0006093938786092968, + "loss": 0.91824853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.84179688, + "step": 2323, + "time_per_iteration": 2.6789090633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181311, + "balance_loss_mlp": 1.097054, + "epoch": 0.4470950365525202, + "flos": 685285272576.0, + "grad_norm": 0.032095192334159584, + "language_loss": 0.95970643, + "learning_rate": 0.0006090898629837857, + "loss": 0.97151959, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.84326172, + "step": 2324, + "time_per_iteration": 2.842829704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174335, + "balance_loss_mlp": 1.08993506, + "epoch": 0.4472874182377838, + "flos": 628534823424.0, + "grad_norm": 0.02542366781046337, + "language_loss": 0.93390518, + "learning_rate": 0.0006087858050058337, + "loss": 0.94564855, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.84472656, + "step": 2325, + "time_per_iteration": 2.798461675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173301, + "balance_loss_mlp": 1.08899629, + "epoch": 0.4474797999230473, + "flos": 548240988672.0, + "grad_norm": 0.026872235695321916, + "language_loss": 0.8790192, + "learning_rate": 0.0006084817047934866, + "loss": 0.8907522, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.84375, + "step": 2326, + "time_per_iteration": 2.6333069801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170357, + "balance_loss_mlp": 1.08552742, + "epoch": 0.4476721816083109, + "flos": 456756609024.0, + "grad_norm": 0.03263470786125086, + "language_loss": 0.9605242, + "learning_rate": 0.0006081775624648066, + "loss": 0.97222769, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.84912109, + "step": 2327, + "time_per_iteration": 2.506568431854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171196, + "balance_loss_mlp": 1.08660555, + "epoch": 0.44786456329357444, + "flos": 482500882944.0, + "grad_norm": 0.030530219610100114, + "language_loss": 0.89424241, + "learning_rate": 0.0006078733781378721, + "loss": 0.90595436, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.84667969, + "step": 2328, + "time_per_iteration": 2.5324759483337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174464, + "balance_loss_mlp": 1.09006357, + "epoch": 0.448056944978838, + "flos": 553236374016.0, + "grad_norm": 0.028423200188041658, + "language_loss": 0.87742424, + "learning_rate": 0.0006075691519307781, + "loss": 0.88916886, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.84472656, + "step": 2329, + "time_per_iteration": 2.8329951763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169966, + "balance_loss_mlp": 1.08580375, + "epoch": 0.44824932666410156, + "flos": 551916350976.0, + "grad_norm": 0.030957218182316032, + "language_loss": 0.88990253, + "learning_rate": 0.0006072648839616356, + "loss": 0.90160215, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.84228516, + "step": 2330, + "time_per_iteration": 2.6367061138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169901, + "balance_loss_mlp": 1.08612072, + "epoch": 0.44844170834936514, + "flos": 990271953408.0, + "grad_norm": 0.02484019388371453, + "language_loss": 0.87772298, + "learning_rate": 0.0006069605743485718, + "loss": 0.88942194, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.83837891, + "step": 2331, + "time_per_iteration": 3.3425865173339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177153, + "balance_loss_mlp": 1.09356356, + "epoch": 0.44863409003462873, + "flos": 592450670592.0, + "grad_norm": 0.02816420707323987, + "language_loss": 0.89319122, + "learning_rate": 0.0006066562232097303, + "loss": 0.90496272, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.83642578, + "step": 2332, + "time_per_iteration": 2.7754669189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.09473884, + "epoch": 0.44882647171989226, + "flos": 725984776704.0, + "grad_norm": 0.02840681089712515, + "language_loss": 0.91798162, + "learning_rate": 0.0006063518306632708, + "loss": 0.92976487, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.83642578, + "step": 2333, + "time_per_iteration": 2.9270272254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174339, + "balance_loss_mlp": 1.09065437, + "epoch": 0.44901885340515585, + "flos": 535990932480.0, + "grad_norm": 0.029373675588589353, + "language_loss": 0.88265771, + "learning_rate": 0.0006060473968273688, + "loss": 0.89440107, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.83740234, + "step": 2334, + "time_per_iteration": 2.6593613624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199905, + "balance_loss_mlp": 1.11693573, + "epoch": 0.4492112350904194, + "flos": 1558690593792.0, + "grad_norm": 0.016875691883268894, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79079443, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.83007812, + "step": 2335, + "time_per_iteration": 4.868390321731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182762, + "balance_loss_mlp": 1.10017395, + "epoch": 0.44940361677568297, + "flos": 1526700768768.0, + "grad_norm": 0.009982769528938305, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82187974, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.82617188, + "step": 2336, + "time_per_iteration": 4.8639936447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176453, + "balance_loss_mlp": 1.09286392, + "epoch": 0.4495959984609465, + "flos": 383320673280.0, + "grad_norm": 0.04017386378382665, + "language_loss": 0.95653474, + "learning_rate": 0.0006051338487650047, + "loss": 0.96829921, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.83642578, + "step": 2337, + "time_per_iteration": 2.451195240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177188, + "balance_loss_mlp": 1.09364605, + "epoch": 0.4497883801462101, + "flos": 498882196992.0, + "grad_norm": 0.03424215683733749, + "language_loss": 0.88682485, + "learning_rate": 0.0006048292509534095, + "loss": 0.89859676, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.8359375, + "step": 2338, + "time_per_iteration": 2.5799245834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174646, + "balance_loss_mlp": 1.09139061, + "epoch": 0.4499807618314736, + "flos": 615589827072.0, + "grad_norm": 0.03300851417215051, + "language_loss": 0.85045063, + "learning_rate": 0.0006045246124434895, + "loss": 0.86219716, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.83300781, + "step": 2339, + "time_per_iteration": 2.732715368270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170546, + "balance_loss_mlp": 1.08738542, + "epoch": 0.4501731435167372, + "flos": 1007067503616.0, + "grad_norm": 0.0319502465029259, + "language_loss": 0.92538428, + "learning_rate": 0.0006042199333535162, + "loss": 0.9370898, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.83203125, + "step": 2340, + "time_per_iteration": 3.3100435733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170678, + "balance_loss_mlp": 1.08742249, + "epoch": 0.4503655252020008, + "flos": 822327555072.0, + "grad_norm": 0.024782286149646622, + "language_loss": 0.88794839, + "learning_rate": 0.0006039152138017763, + "loss": 0.89965516, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.83300781, + "step": 2341, + "time_per_iteration": 3.0845420360565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117382, + "balance_loss_mlp": 1.09027839, + "epoch": 0.4505579068872643, + "flos": 487413676032.0, + "grad_norm": 0.028274686754151398, + "language_loss": 0.8912791, + "learning_rate": 0.0006036104539065726, + "loss": 0.90301728, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.8359375, + "step": 2342, + "time_per_iteration": 2.704869270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170482, + "balance_loss_mlp": 1.08679724, + "epoch": 0.4507502885725279, + "flos": 886335403008.0, + "grad_norm": 0.02767032513042878, + "language_loss": 0.89237905, + "learning_rate": 0.000603305653786223, + "loss": 0.90408385, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.83740234, + "step": 2343, + "time_per_iteration": 3.143308162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169453, + "balance_loss_mlp": 1.08576834, + "epoch": 0.45094267025779144, + "flos": 579421080576.0, + "grad_norm": 0.028420960086658186, + "language_loss": 0.90634954, + "learning_rate": 0.0006030008135590622, + "loss": 0.91804409, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.83740234, + "step": 2344, + "time_per_iteration": 2.7383973598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177198, + "balance_loss_mlp": 1.09332275, + "epoch": 0.45113505194305503, + "flos": 526441320960.0, + "grad_norm": 0.025225422820390885, + "language_loss": 0.85642457, + "learning_rate": 0.0006026959333434387, + "loss": 0.86819655, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.83935547, + "step": 2345, + "time_per_iteration": 2.7594330310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177326, + "balance_loss_mlp": 1.09316456, + "epoch": 0.45132743362831856, + "flos": 503115512832.0, + "grad_norm": 0.026356266791679354, + "language_loss": 0.83258432, + "learning_rate": 0.0006023910132577181, + "loss": 0.84435755, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.84228516, + "step": 2346, + "time_per_iteration": 2.6426072120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174296, + "balance_loss_mlp": 1.09051549, + "epoch": 0.45151981531358215, + "flos": 432835917312.0, + "grad_norm": 0.03747446326611767, + "language_loss": 0.91464496, + "learning_rate": 0.0006020860534202806, + "loss": 0.92638797, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.83837891, + "step": 2347, + "time_per_iteration": 2.5375916957855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08799899, + "epoch": 0.4517121969988457, + "flos": 713493674496.0, + "grad_norm": 0.026159040948808, + "language_loss": 0.86486131, + "learning_rate": 0.0006017810539495224, + "loss": 0.87658435, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.84375, + "step": 2348, + "time_per_iteration": 2.935776472091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172944, + "balance_loss_mlp": 1.0886873, + "epoch": 0.45190457868410927, + "flos": 580556453376.0, + "grad_norm": 0.02859512200307389, + "language_loss": 0.8919422, + "learning_rate": 0.0006014760149638547, + "loss": 0.90367162, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.84326172, + "step": 2349, + "time_per_iteration": 4.1359429359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117423, + "balance_loss_mlp": 1.08982956, + "epoch": 0.45209696036937286, + "flos": 483627523584.0, + "grad_norm": 0.04225699722465749, + "language_loss": 0.94155228, + "learning_rate": 0.000601170936581704, + "loss": 0.95329458, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.84472656, + "step": 2350, + "time_per_iteration": 2.551886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171893, + "balance_loss_mlp": 1.08739793, + "epoch": 0.4522893420546364, + "flos": 541259564544.0, + "grad_norm": 0.03047412078786442, + "language_loss": 0.90869355, + "learning_rate": 0.0006008658189215121, + "loss": 0.92041242, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.84570312, + "step": 2351, + "time_per_iteration": 2.6196951866149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176582, + "balance_loss_mlp": 1.09175217, + "epoch": 0.4524817237399, + "flos": 497690428416.0, + "grad_norm": 0.03573709607194862, + "language_loss": 0.8682127, + "learning_rate": 0.0006005606621017366, + "loss": 0.87997848, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.84912109, + "step": 2352, + "time_per_iteration": 2.5675714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174694, + "balance_loss_mlp": 1.09024608, + "epoch": 0.4526741054251635, + "flos": 653840666112.0, + "grad_norm": 0.027536817578414453, + "language_loss": 0.86718237, + "learning_rate": 0.0006002554662408496, + "loss": 0.87892926, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.84521484, + "step": 2353, + "time_per_iteration": 2.887061595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182285, + "balance_loss_mlp": 1.09774196, + "epoch": 0.4528664871104271, + "flos": 572003226624.0, + "grad_norm": 0.03098083736113463, + "language_loss": 0.96988797, + "learning_rate": 0.0005999502314573388, + "loss": 0.98171079, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.84619141, + "step": 2354, + "time_per_iteration": 2.6700878143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184968, + "balance_loss_mlp": 1.1005199, + "epoch": 0.45305886879569063, + "flos": 459678633984.0, + "grad_norm": 0.034884925425697356, + "language_loss": 0.93055832, + "learning_rate": 0.0005996449578697066, + "loss": 0.94240803, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.84521484, + "step": 2355, + "time_per_iteration": 2.6873598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180832, + "balance_loss_mlp": 1.09647942, + "epoch": 0.4532512504809542, + "flos": 506206725120.0, + "grad_norm": 0.028006133853455534, + "language_loss": 0.87364781, + "learning_rate": 0.0005993396455964709, + "loss": 0.88545609, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.84423828, + "step": 2356, + "time_per_iteration": 2.672428607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179518, + "balance_loss_mlp": 1.09545124, + "epoch": 0.4534436321662178, + "flos": 583311292416.0, + "grad_norm": 0.033764708533666976, + "language_loss": 0.88888013, + "learning_rate": 0.0005990342947561647, + "loss": 0.90067536, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.84130859, + "step": 2357, + "time_per_iteration": 2.7101337909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179265, + "balance_loss_mlp": 1.09529436, + "epoch": 0.45363601385148133, + "flos": 550772246016.0, + "grad_norm": 0.03168807299418994, + "language_loss": 0.84871709, + "learning_rate": 0.0005987289054673351, + "loss": 0.86050975, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.84033203, + "step": 2358, + "time_per_iteration": 2.6033973693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122184, + "balance_loss_mlp": 1.14096832, + "epoch": 0.4538283955367449, + "flos": 1477791141888.0, + "grad_norm": 0.02971290012878958, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.7779758, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.80859375, + "step": 2359, + "time_per_iteration": 4.841644525527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172875, + "balance_loss_mlp": 1.0889039, + "epoch": 0.45402077722200845, + "flos": 585796887552.0, + "grad_norm": 0.03208897744410929, + "language_loss": 0.98243296, + "learning_rate": 0.0005981180120183722, + "loss": 0.99416173, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.84033203, + "step": 2360, + "time_per_iteration": 2.76943302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183781, + "balance_loss_mlp": 1.09957135, + "epoch": 0.45421315890727204, + "flos": 532888986624.0, + "grad_norm": 0.026822351719262807, + "language_loss": 0.89930874, + "learning_rate": 0.0005978125080954089, + "loss": 0.91114652, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.84277344, + "step": 2361, + "time_per_iteration": 2.822767972946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180091, + "balance_loss_mlp": 1.09597707, + "epoch": 0.4544055405925356, + "flos": 786551577600.0, + "grad_norm": 0.034773976616178995, + "language_loss": 0.84516251, + "learning_rate": 0.000597506966198262, + "loss": 0.85696352, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.84179688, + "step": 2362, + "time_per_iteration": 2.952383518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177, + "balance_loss_mlp": 1.09288561, + "epoch": 0.45459792227779916, + "flos": 519201386496.0, + "grad_norm": 0.03664720273497137, + "language_loss": 0.91360861, + "learning_rate": 0.0005972013864455536, + "loss": 0.92537856, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.84179688, + "step": 2363, + "time_per_iteration": 2.6317927837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178521, + "balance_loss_mlp": 1.09450209, + "epoch": 0.4547903039630627, + "flos": 538598051328.0, + "grad_norm": 0.028772208334572696, + "language_loss": 0.91273308, + "learning_rate": 0.0005968957689559203, + "loss": 0.92451829, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.84082031, + "step": 2364, + "time_per_iteration": 2.6589906215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173596, + "balance_loss_mlp": 1.0895294, + "epoch": 0.4549826856483263, + "flos": 529690987008.0, + "grad_norm": 0.029727340486193105, + "language_loss": 0.95477283, + "learning_rate": 0.0005965901138480131, + "loss": 0.96650875, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.84130859, + "step": 2365, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171355, + "balance_loss_mlp": 1.08700228, + "epoch": 0.45517506733358987, + "flos": 521982422016.0, + "grad_norm": 0.030829958952989886, + "language_loss": 0.94295681, + "learning_rate": 0.0005962844212404982, + "loss": 0.95467031, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.84423828, + "step": 2366, + "time_per_iteration": 2.662235736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177016, + "balance_loss_mlp": 1.09271073, + "epoch": 0.4553674490188534, + "flos": 452009000448.0, + "grad_norm": 0.02436634770305822, + "language_loss": 0.92783928, + "learning_rate": 0.0005959786912520558, + "loss": 0.93960941, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.84375, + "step": 2367, + "time_per_iteration": 2.573124408721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117318, + "balance_loss_mlp": 1.08906567, + "epoch": 0.455559830704117, + "flos": 547744160256.0, + "grad_norm": 0.037205613753220755, + "language_loss": 0.90209919, + "learning_rate": 0.0005956729240013806, + "loss": 0.913831, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.84179688, + "step": 2368, + "time_per_iteration": 2.772557020187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173597, + "balance_loss_mlp": 1.08943486, + "epoch": 0.4557522123893805, + "flos": 584865630720.0, + "grad_norm": 0.026144628796570656, + "language_loss": 0.97770655, + "learning_rate": 0.0005953671196071824, + "loss": 0.98944247, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.84228516, + "step": 2369, + "time_per_iteration": 2.7082910537719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.08819652, + "epoch": 0.4559445940746441, + "flos": 527483367936.0, + "grad_norm": 0.0309922218143565, + "language_loss": 0.8751142, + "learning_rate": 0.0005950612781881846, + "loss": 0.8868373, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.84179688, + "step": 2370, + "time_per_iteration": 2.7258613109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172868, + "balance_loss_mlp": 1.08913577, + "epoch": 0.45613697575990764, + "flos": 653367306240.0, + "grad_norm": 0.03125586624235708, + "language_loss": 0.84058654, + "learning_rate": 0.0005947553998631259, + "loss": 0.85231519, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.83789062, + "step": 2371, + "time_per_iteration": 2.8463094234466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169614, + "balance_loss_mlp": 1.08626282, + "epoch": 0.4563293574451712, + "flos": 868623332352.0, + "grad_norm": 0.025158843177806284, + "language_loss": 0.84537494, + "learning_rate": 0.000594449484750758, + "loss": 0.85707104, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.83398438, + "step": 2372, + "time_per_iteration": 3.1793160438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165382, + "balance_loss_mlp": 1.08193552, + "epoch": 0.45652173913043476, + "flos": 499131975168.0, + "grad_norm": 0.03016735007152292, + "language_loss": 0.8953886, + "learning_rate": 0.0005941435329698484, + "loss": 0.90704238, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.83496094, + "step": 2373, + "time_per_iteration": 2.6885011196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168274, + "balance_loss_mlp": 1.08458936, + "epoch": 0.45671412081569834, + "flos": 561958788096.0, + "grad_norm": 0.029049495784182693, + "language_loss": 0.89830238, + "learning_rate": 0.0005938375446391778, + "loss": 0.90998513, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.83740234, + "step": 2374, + "time_per_iteration": 2.7694103717803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169785, + "balance_loss_mlp": 1.08605206, + "epoch": 0.45690650250096193, + "flos": 504122631168.0, + "grad_norm": 0.032895841438659715, + "language_loss": 0.95283711, + "learning_rate": 0.0005935315198775415, + "loss": 0.96453488, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.83789062, + "step": 2375, + "time_per_iteration": 2.6797261238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117336, + "balance_loss_mlp": 1.08967507, + "epoch": 0.45709888418622546, + "flos": 431598486528.0, + "grad_norm": 0.029217874962507603, + "language_loss": 0.93084061, + "learning_rate": 0.0005932254588037486, + "loss": 0.94257426, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.83740234, + "step": 2376, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170171, + "balance_loss_mlp": 1.08634305, + "epoch": 0.45729126587148905, + "flos": 526693100544.0, + "grad_norm": 0.033600967739372, + "language_loss": 0.91914618, + "learning_rate": 0.000592919361536623, + "loss": 0.93084788, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.83886719, + "step": 2377, + "time_per_iteration": 2.627753734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.08861363, + "epoch": 0.4574836475567526, + "flos": 639147949056.0, + "grad_norm": 0.02676395696709272, + "language_loss": 0.95213675, + "learning_rate": 0.0005926132281950017, + "loss": 0.9638592, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.83691406, + "step": 2378, + "time_per_iteration": 2.7404637336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171278, + "balance_loss_mlp": 1.08754539, + "epoch": 0.45767602924201617, + "flos": 650790386688.0, + "grad_norm": 0.03076010987013328, + "language_loss": 0.92175043, + "learning_rate": 0.0005923070588977367, + "loss": 0.93346316, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.83789062, + "step": 2379, + "time_per_iteration": 2.7948412895202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173225, + "balance_loss_mlp": 1.08944476, + "epoch": 0.4578684109272797, + "flos": 747962363904.0, + "grad_norm": 0.027484014603145524, + "language_loss": 0.92339164, + "learning_rate": 0.0005920008537636931, + "loss": 0.93512392, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.83837891, + "step": 2380, + "time_per_iteration": 2.903837203979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.08972311, + "epoch": 0.4580607926125433, + "flos": 642727984128.0, + "grad_norm": 0.029077527756171735, + "language_loss": 0.92490625, + "learning_rate": 0.0005916946129117504, + "loss": 0.93664026, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.83740234, + "step": 2381, + "time_per_iteration": 2.902449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169328, + "balance_loss_mlp": 1.08569121, + "epoch": 0.4582531742978069, + "flos": 803239065600.0, + "grad_norm": 0.02842187637415346, + "language_loss": 0.86509985, + "learning_rate": 0.0005913883364608017, + "loss": 0.87679315, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.83691406, + "step": 2382, + "time_per_iteration": 3.0474140644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171424, + "balance_loss_mlp": 1.0876435, + "epoch": 0.4584455559830704, + "flos": 685517586432.0, + "grad_norm": 0.02678099894990505, + "language_loss": 0.94194049, + "learning_rate": 0.0005910820245297542, + "loss": 0.95365477, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.83837891, + "step": 2383, + "time_per_iteration": 2.879652261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171015, + "balance_loss_mlp": 1.08718669, + "epoch": 0.458637937668334, + "flos": 519281977344.0, + "grad_norm": 0.03033035418174317, + "language_loss": 0.87193358, + "learning_rate": 0.000590775677237529, + "loss": 0.88364375, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.83886719, + "step": 2384, + "time_per_iteration": 2.718327045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116823, + "balance_loss_mlp": 1.08478332, + "epoch": 0.4588303193535975, + "flos": 506532364800.0, + "grad_norm": 0.028303891516217768, + "language_loss": 0.87188554, + "learning_rate": 0.0005904692947030601, + "loss": 0.88356787, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.83496094, + "step": 2385, + "time_per_iteration": 2.5850000381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166672, + "balance_loss_mlp": 1.08303475, + "epoch": 0.4590227010388611, + "flos": 496908893184.0, + "grad_norm": 0.031451346934425, + "language_loss": 0.9665041, + "learning_rate": 0.0005901628770452963, + "loss": 0.97817081, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.83691406, + "step": 2386, + "time_per_iteration": 2.5478482246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172991, + "balance_loss_mlp": 1.08964002, + "epoch": 0.45921508272412465, + "flos": 494601217536.0, + "grad_norm": 0.030858044337890404, + "language_loss": 0.93199378, + "learning_rate": 0.000589856424383199, + "loss": 0.94372368, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.83398438, + "step": 2387, + "time_per_iteration": 2.6889121532440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170845, + "balance_loss_mlp": 1.08744633, + "epoch": 0.45940746440938823, + "flos": 692592336384.0, + "grad_norm": 0.02985924743030105, + "language_loss": 0.89320701, + "learning_rate": 0.000589549936835744, + "loss": 0.90491545, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.83447266, + "step": 2388, + "time_per_iteration": 2.929584264755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167353, + "balance_loss_mlp": 1.08390617, + "epoch": 0.45959984609465176, + "flos": 504736980480.0, + "grad_norm": 0.026272627268038303, + "language_loss": 0.85652947, + "learning_rate": 0.0005892434145219202, + "loss": 0.86820304, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.83496094, + "step": 2389, + "time_per_iteration": 2.6049258708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.08593321, + "epoch": 0.45979222777991535, + "flos": 677839220736.0, + "grad_norm": 0.032142260667283734, + "language_loss": 0.89047158, + "learning_rate": 0.0005889368575607303, + "loss": 0.90216345, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.83300781, + "step": 2390, + "time_per_iteration": 2.8630926609039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170087, + "balance_loss_mlp": 1.08673584, + "epoch": 0.45998460946517894, + "flos": 779038396416.0, + "grad_norm": 0.02948026619685868, + "language_loss": 0.84149277, + "learning_rate": 0.00058863026607119, + "loss": 0.85319364, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.83398438, + "step": 2391, + "time_per_iteration": 3.0889787673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.08709574, + "epoch": 0.46017699115044247, + "flos": 853021552128.0, + "grad_norm": 0.028406278062058678, + "language_loss": 0.85429174, + "learning_rate": 0.0005883236401723287, + "loss": 0.8659972, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.83496094, + "step": 2392, + "time_per_iteration": 3.1613874435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167478, + "balance_loss_mlp": 1.08403194, + "epoch": 0.46036937283570606, + "flos": 576963683328.0, + "grad_norm": 0.029157836827012555, + "language_loss": 0.90157199, + "learning_rate": 0.0005880169799831893, + "loss": 0.91324675, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.83496094, + "step": 2393, + "time_per_iteration": 2.6974027156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117291, + "balance_loss_mlp": 1.08955884, + "epoch": 0.4605617545209696, + "flos": 613119694848.0, + "grad_norm": 0.028584885066092792, + "language_loss": 0.87511885, + "learning_rate": 0.0005877102856228278, + "loss": 0.88684797, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.83398438, + "step": 2394, + "time_per_iteration": 2.862462043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169553, + "balance_loss_mlp": 1.08591628, + "epoch": 0.4607541362062332, + "flos": 534158618112.0, + "grad_norm": 0.03156913659667245, + "language_loss": 0.91444194, + "learning_rate": 0.0005874035572103133, + "loss": 0.92613751, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.83691406, + "step": 2395, + "time_per_iteration": 2.66796612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171842, + "balance_loss_mlp": 1.08830035, + "epoch": 0.4609465178914967, + "flos": 648473978880.0, + "grad_norm": 0.039315545211924735, + "language_loss": 0.89278555, + "learning_rate": 0.0005870967948647288, + "loss": 0.90450394, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.8359375, + "step": 2396, + "time_per_iteration": 2.7669596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209076, + "balance_loss_mlp": 1.12553406, + "epoch": 0.4611388995767603, + "flos": 1469498426880.0, + "grad_norm": 0.015424486797259693, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.7551738, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.8359375, + "step": 2397, + "time_per_iteration": 5.5382936000823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.09377611, + "epoch": 0.46133128126202383, + "flos": 724476100608.0, + "grad_norm": 0.029375695907885992, + "language_loss": 0.91919947, + "learning_rate": 0.0005864831688507443, + "loss": 0.93097073, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.83398438, + "step": 2398, + "time_per_iteration": 2.95526123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171581, + "balance_loss_mlp": 1.08846855, + "epoch": 0.4615236629472874, + "flos": 549113848320.0, + "grad_norm": 0.030696537047505416, + "language_loss": 0.82409662, + "learning_rate": 0.0005861763054205754, + "loss": 0.83581245, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.83154297, + "step": 2399, + "time_per_iteration": 2.767615795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172709, + "balance_loss_mlp": 1.08973968, + "epoch": 0.461716044632551, + "flos": 603459293184.0, + "grad_norm": 0.02737063612292851, + "language_loss": 0.84976828, + "learning_rate": 0.0005858694085337976, + "loss": 0.86149538, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.83007812, + "step": 2400, + "time_per_iteration": 2.7964670658111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011724, + "balance_loss_mlp": 1.08966899, + "epoch": 0.46190842631781454, + "flos": 475436866560.0, + "grad_norm": 0.03229000781534058, + "language_loss": 0.9094255, + "learning_rate": 0.0005855624783095589, + "loss": 0.92114949, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.82763672, + "step": 2401, + "time_per_iteration": 2.534349203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170734, + "balance_loss_mlp": 1.08814597, + "epoch": 0.4621008080030781, + "flos": 438401991168.0, + "grad_norm": 0.027555285929390542, + "language_loss": 0.90607065, + "learning_rate": 0.00058525551486702, + "loss": 0.91777802, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.82617188, + "step": 2402, + "time_per_iteration": 2.5021228790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172463, + "balance_loss_mlp": 1.08987451, + "epoch": 0.46229318968834165, + "flos": 526497716736.0, + "grad_norm": 0.03262891309156314, + "language_loss": 0.88400978, + "learning_rate": 0.0005849485183253548, + "loss": 0.89573443, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.82617188, + "step": 2403, + "time_per_iteration": 2.6212213039398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165506, + "balance_loss_mlp": 1.08291745, + "epoch": 0.46248557137360524, + "flos": 440533748736.0, + "grad_norm": 0.02845192827842058, + "language_loss": 0.92361593, + "learning_rate": 0.0005846414888037501, + "loss": 0.93527102, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.82617188, + "step": 2404, + "time_per_iteration": 2.482285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166688, + "balance_loss_mlp": 1.08409953, + "epoch": 0.4626779530588688, + "flos": 618772363776.0, + "grad_norm": 0.03074329225106782, + "language_loss": 0.881423, + "learning_rate": 0.0005843344264214049, + "loss": 0.89308989, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.82617188, + "step": 2405, + "time_per_iteration": 2.746795415878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170811, + "balance_loss_mlp": 1.08803225, + "epoch": 0.46287033474413236, + "flos": 671359354368.0, + "grad_norm": 0.02816556419491645, + "language_loss": 0.904742, + "learning_rate": 0.0005840273312975317, + "loss": 0.91645014, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.828125, + "step": 2406, + "time_per_iteration": 2.866894483566284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168834, + "balance_loss_mlp": 1.08572149, + "epoch": 0.46306271642939595, + "flos": 481198324224.0, + "grad_norm": 0.027370741977369897, + "language_loss": 0.96141434, + "learning_rate": 0.0005837202035513555, + "loss": 0.97310269, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.83154297, + "step": 2407, + "time_per_iteration": 2.589233636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168547, + "balance_loss_mlp": 1.08562469, + "epoch": 0.4632550981146595, + "flos": 581857010688.0, + "grad_norm": 0.028787881065009197, + "language_loss": 0.87249482, + "learning_rate": 0.0005834130433021136, + "loss": 0.88418025, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.82958984, + "step": 2408, + "time_per_iteration": 2.77109432220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176276, + "balance_loss_mlp": 1.09311593, + "epoch": 0.46344747979992307, + "flos": 525017238528.0, + "grad_norm": 0.03139748973768327, + "language_loss": 0.79860151, + "learning_rate": 0.0005831058506690563, + "loss": 0.81036425, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.83203125, + "step": 2409, + "time_per_iteration": 2.6422629356384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175968, + "balance_loss_mlp": 1.0931412, + "epoch": 0.4636398614851866, + "flos": 747812642304.0, + "grad_norm": 0.02712568041794283, + "language_loss": 0.9122293, + "learning_rate": 0.0005827986257714464, + "loss": 0.92398894, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.82861328, + "step": 2410, + "time_per_iteration": 2.915513515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175895, + "balance_loss_mlp": 1.09254348, + "epoch": 0.4638322431704502, + "flos": 597645442560.0, + "grad_norm": 0.03337742182336422, + "language_loss": 0.94969916, + "learning_rate": 0.0005824913687285591, + "loss": 0.96145809, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.83398438, + "step": 2411, + "time_per_iteration": 2.7729153633117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174985, + "balance_loss_mlp": 1.09168148, + "epoch": 0.4640246248557137, + "flos": 540532423680.0, + "grad_norm": 0.028926449520475586, + "language_loss": 0.87762833, + "learning_rate": 0.0005821840796596821, + "loss": 0.88937813, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.83349609, + "step": 2412, + "time_per_iteration": 2.7454707622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174854, + "balance_loss_mlp": 1.09155095, + "epoch": 0.4642170065409773, + "flos": 563808566784.0, + "grad_norm": 0.027243427778446835, + "language_loss": 0.85983133, + "learning_rate": 0.0005818767586841158, + "loss": 0.87157989, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.83349609, + "step": 2413, + "time_per_iteration": 2.7634999752044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174726, + "balance_loss_mlp": 1.09161353, + "epoch": 0.46440938822624084, + "flos": 532061789184.0, + "grad_norm": 0.026139841130999073, + "language_loss": 0.91185576, + "learning_rate": 0.0005815694059211726, + "loss": 0.923603, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.83154297, + "step": 2414, + "time_per_iteration": 2.6814608573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193306, + "balance_loss_mlp": 1.11109924, + "epoch": 0.4646017699115044, + "flos": 1529624795136.0, + "grad_norm": 0.015412108289742382, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82066941, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.82226562, + "step": 2415, + "time_per_iteration": 4.867271184921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183273, + "balance_loss_mlp": 1.10163879, + "epoch": 0.464794151596768, + "flos": 1544171793408.0, + "grad_norm": 0.012751682226462524, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78128332, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.81640625, + "step": 2416, + "time_per_iteration": 5.0150392055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166548, + "balance_loss_mlp": 1.08391249, + "epoch": 0.46498653328203154, + "flos": 502538093568.0, + "grad_norm": 0.028765151082888876, + "language_loss": 0.92239797, + "learning_rate": 0.0005806471581013931, + "loss": 0.93406343, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.82666016, + "step": 2417, + "time_per_iteration": 2.6913554668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165917, + "balance_loss_mlp": 1.08332872, + "epoch": 0.46517891496729513, + "flos": 677300732928.0, + "grad_norm": 0.03431254801555697, + "language_loss": 0.85110676, + "learning_rate": 0.0005803396793823146, + "loss": 0.86276597, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.82617188, + "step": 2418, + "time_per_iteration": 2.8245232105255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169421, + "balance_loss_mlp": 1.08702314, + "epoch": 0.46537129665255866, + "flos": 586511293440.0, + "grad_norm": 0.03532488466841911, + "language_loss": 0.93255758, + "learning_rate": 0.0005800321694726065, + "loss": 0.94425178, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.82421875, + "step": 2419, + "time_per_iteration": 2.74255108833313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117097, + "balance_loss_mlp": 1.08866799, + "epoch": 0.46556367833782225, + "flos": 588820970496.0, + "grad_norm": 0.031254530654890866, + "language_loss": 0.92505676, + "learning_rate": 0.0005797246284916545, + "loss": 0.93676651, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.82324219, + "step": 2420, + "time_per_iteration": 2.6942667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182114, + "balance_loss_mlp": 1.10238647, + "epoch": 0.4657560600230858, + "flos": 1488582187008.0, + "grad_norm": 0.01896402624903705, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78687304, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.796875, + "step": 2421, + "time_per_iteration": 4.965069532394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179806, + "balance_loss_mlp": 1.09740829, + "epoch": 0.46594844170834937, + "flos": 581392382976.0, + "grad_norm": 0.035008146137172264, + "language_loss": 0.92618293, + "learning_rate": 0.0005791094537936233, + "loss": 0.93798101, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.82421875, + "step": 2422, + "time_per_iteration": 2.7509443759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116805, + "balance_loss_mlp": 1.08555722, + "epoch": 0.4661408233936129, + "flos": 513570184704.0, + "grad_norm": 0.03182837491947037, + "language_loss": 0.88539767, + "learning_rate": 0.0005788018203153762, + "loss": 0.89707822, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.82519531, + "step": 2423, + "time_per_iteration": 2.6291344165802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163454, + "balance_loss_mlp": 1.08038855, + "epoch": 0.4663332050788765, + "flos": 492033030144.0, + "grad_norm": 0.03147692461991822, + "language_loss": 0.92034245, + "learning_rate": 0.000578494156243549, + "loss": 0.93197691, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.83105469, + "step": 2424, + "time_per_iteration": 2.5616393089294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167158, + "balance_loss_mlp": 1.08390224, + "epoch": 0.4665255867641401, + "flos": 513707171328.0, + "grad_norm": 0.028174773974589257, + "language_loss": 0.94988501, + "learning_rate": 0.0005781864616975878, + "loss": 0.96155655, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.83300781, + "step": 2425, + "time_per_iteration": 2.67893648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178777, + "balance_loss_mlp": 1.09552157, + "epoch": 0.4667179684494036, + "flos": 425706772992.0, + "grad_norm": 0.03381525890081808, + "language_loss": 0.91298926, + "learning_rate": 0.0005778787367969502, + "loss": 0.92477703, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.83300781, + "step": 2426, + "time_per_iteration": 2.5708863735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180968, + "balance_loss_mlp": 1.09790349, + "epoch": 0.4669103501346672, + "flos": 709223428608.0, + "grad_norm": 0.031023375068471706, + "language_loss": 0.86979687, + "learning_rate": 0.0005775709816611053, + "loss": 0.88160658, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.83105469, + "step": 2427, + "time_per_iteration": 2.9488039016723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178737, + "balance_loss_mlp": 1.09543312, + "epoch": 0.4671027318199307, + "flos": 555945550848.0, + "grad_norm": 0.0268683026146142, + "language_loss": 0.8862977, + "learning_rate": 0.0005772631964095346, + "loss": 0.89808506, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.83349609, + "step": 2428, + "time_per_iteration": 2.6830828189849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176571, + "balance_loss_mlp": 1.09321952, + "epoch": 0.4672951135051943, + "flos": 568195607040.0, + "grad_norm": 0.029193722689313813, + "language_loss": 0.92024446, + "learning_rate": 0.000576955381161731, + "loss": 0.93201017, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.83398438, + "step": 2429, + "time_per_iteration": 2.7286531925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172919, + "balance_loss_mlp": 1.08956802, + "epoch": 0.46748749519045785, + "flos": 425418063360.0, + "grad_norm": 0.030194965591673555, + "language_loss": 0.93541706, + "learning_rate": 0.0005766475360371985, + "loss": 0.94714624, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.83398438, + "step": 2430, + "time_per_iteration": 2.5866243839263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171537, + "balance_loss_mlp": 1.08809078, + "epoch": 0.46767987687572143, + "flos": 539370854400.0, + "grad_norm": 0.031323302876694416, + "language_loss": 0.91645998, + "learning_rate": 0.0005763396611554536, + "loss": 0.92817533, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.83496094, + "step": 2431, + "time_per_iteration": 2.644538402557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169389, + "balance_loss_mlp": 1.08622885, + "epoch": 0.467872258560985, + "flos": 825075663360.0, + "grad_norm": 0.035112660876247544, + "language_loss": 0.8720994, + "learning_rate": 0.0005760317566360237, + "loss": 0.88379329, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.83203125, + "step": 2432, + "time_per_iteration": 2.9847497940063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169145, + "balance_loss_mlp": 1.08598459, + "epoch": 0.46806464024624855, + "flos": 662853791232.0, + "grad_norm": 0.03130586605287321, + "language_loss": 0.92657965, + "learning_rate": 0.000575723822598448, + "loss": 0.93827116, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.83203125, + "step": 2433, + "time_per_iteration": 2.7757930755615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166037, + "balance_loss_mlp": 1.08325768, + "epoch": 0.46825702193151214, + "flos": 757054078464.0, + "grad_norm": 0.025972857143736858, + "language_loss": 0.87588978, + "learning_rate": 0.0005754158591622773, + "loss": 0.88755012, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.828125, + "step": 2434, + "time_per_iteration": 2.9586892127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167751, + "balance_loss_mlp": 1.08482957, + "epoch": 0.4684494036167757, + "flos": 440310167040.0, + "grad_norm": 0.03095385887839679, + "language_loss": 0.89792037, + "learning_rate": 0.0005751078664470732, + "loss": 0.90959787, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.82958984, + "step": 2435, + "time_per_iteration": 2.5508580207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167106, + "balance_loss_mlp": 1.08446991, + "epoch": 0.46864178530203926, + "flos": 533748384768.0, + "grad_norm": 0.02784458934890301, + "language_loss": 0.91441107, + "learning_rate": 0.0005747998445724094, + "loss": 0.92608213, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.82666016, + "step": 2436, + "time_per_iteration": 2.6264078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166893, + "balance_loss_mlp": 1.08435297, + "epoch": 0.4688341669873028, + "flos": 577825809408.0, + "grad_norm": 0.028098929039846225, + "language_loss": 0.94501269, + "learning_rate": 0.0005744917936578707, + "loss": 0.95668173, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.82568359, + "step": 2437, + "time_per_iteration": 2.7923285961151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163054, + "balance_loss_mlp": 1.0805608, + "epoch": 0.4690265486725664, + "flos": 540717073920.0, + "grad_norm": 0.02510139841230761, + "language_loss": 0.88352144, + "learning_rate": 0.0005741837138230526, + "loss": 0.89515197, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.82519531, + "step": 2438, + "time_per_iteration": 2.720592737197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117104, + "balance_loss_mlp": 1.08849919, + "epoch": 0.4692189303578299, + "flos": 771881054208.0, + "grad_norm": 0.031043213179005578, + "language_loss": 0.91746414, + "learning_rate": 0.0005738756051875627, + "loss": 0.92917454, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.82568359, + "step": 2439, + "time_per_iteration": 3.0688676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179368, + "balance_loss_mlp": 1.09697056, + "epoch": 0.4694113120430935, + "flos": 572513516544.0, + "grad_norm": 0.031224617656339514, + "language_loss": 0.8895998, + "learning_rate": 0.0005735674678710192, + "loss": 0.90139341, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.82421875, + "step": 2440, + "time_per_iteration": 2.6647889614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180255, + "balance_loss_mlp": 1.09814322, + "epoch": 0.4696036937283571, + "flos": 750094121472.0, + "grad_norm": 0.03673041295896698, + "language_loss": 0.88509989, + "learning_rate": 0.0005732593019930517, + "loss": 0.89690244, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.82128906, + "step": 2441, + "time_per_iteration": 2.9219651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177833, + "balance_loss_mlp": 1.09553087, + "epoch": 0.4697960754136206, + "flos": 494442763776.0, + "grad_norm": 0.03186685029176949, + "language_loss": 0.93415046, + "learning_rate": 0.0005729511076733008, + "loss": 0.94592881, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.82324219, + "step": 2442, + "time_per_iteration": 2.6268982887268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163524, + "balance_loss_mlp": 1.08088803, + "epoch": 0.4699884570988842, + "flos": 726360081408.0, + "grad_norm": 0.03313850577325225, + "language_loss": 0.91418898, + "learning_rate": 0.000572642885031418, + "loss": 0.92582428, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.82666016, + "step": 2443, + "time_per_iteration": 2.847228527069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165965, + "balance_loss_mlp": 1.08337641, + "epoch": 0.47018083878414774, + "flos": 556577364480.0, + "grad_norm": 0.031620033102277616, + "language_loss": 0.86240256, + "learning_rate": 0.0005723346341870662, + "loss": 0.87406218, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.82617188, + "step": 2444, + "time_per_iteration": 2.7060024738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171889, + "balance_loss_mlp": 1.08944428, + "epoch": 0.4703732204694113, + "flos": 424962167808.0, + "grad_norm": 0.03469194433982127, + "language_loss": 0.92819834, + "learning_rate": 0.0005720263552599188, + "loss": 0.93991721, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.82470703, + "step": 2445, + "time_per_iteration": 2.486546754837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175277, + "balance_loss_mlp": 1.09307039, + "epoch": 0.47056560215467486, + "flos": 704755797504.0, + "grad_norm": 0.03273224664010927, + "language_loss": 0.86175644, + "learning_rate": 0.0005717180483696604, + "loss": 0.87350929, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.82226562, + "step": 2446, + "time_per_iteration": 2.8490843772888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173534, + "balance_loss_mlp": 1.09123182, + "epoch": 0.47075798383993844, + "flos": 556012680192.0, + "grad_norm": 0.030967943008195494, + "language_loss": 0.88733399, + "learning_rate": 0.0005714097136359862, + "loss": 0.89906937, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.82324219, + "step": 2447, + "time_per_iteration": 2.6790409088134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172662, + "balance_loss_mlp": 1.09035945, + "epoch": 0.470950365525202, + "flos": 565493160960.0, + "grad_norm": 0.028459673893144737, + "language_loss": 0.91199988, + "learning_rate": 0.0005711013511786027, + "loss": 0.92372644, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.82324219, + "step": 2448, + "time_per_iteration": 2.871711492538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169516, + "balance_loss_mlp": 1.08745217, + "epoch": 0.47114274721046556, + "flos": 535498106880.0, + "grad_norm": 0.02665313173872239, + "language_loss": 0.88226557, + "learning_rate": 0.0005707929611172263, + "loss": 0.89396071, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.82080078, + "step": 2449, + "time_per_iteration": 2.69319748878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166092, + "balance_loss_mlp": 1.08402824, + "epoch": 0.47133512889572915, + "flos": 474077912064.0, + "grad_norm": 0.0332447507442279, + "language_loss": 0.90459168, + "learning_rate": 0.000570484543571585, + "loss": 0.91625261, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.82080078, + "step": 2450, + "time_per_iteration": 2.5612680912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164305, + "balance_loss_mlp": 1.08228934, + "epoch": 0.4715275105809927, + "flos": 459967343616.0, + "grad_norm": 0.03392229050190778, + "language_loss": 0.90577096, + "learning_rate": 0.0005701760986614171, + "loss": 0.91741407, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.8203125, + "step": 2451, + "time_per_iteration": 2.5571579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166326, + "balance_loss_mlp": 1.08435798, + "epoch": 0.47171989226625627, + "flos": 422886806016.0, + "grad_norm": 0.028518751420243762, + "language_loss": 0.93793362, + "learning_rate": 0.0005698676265064714, + "loss": 0.94959688, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.81982422, + "step": 2452, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169062, + "balance_loss_mlp": 1.08680761, + "epoch": 0.4719122739515198, + "flos": 458376075264.0, + "grad_norm": 0.03301356479716476, + "language_loss": 0.95592558, + "learning_rate": 0.0005695591272265074, + "loss": 0.9676162, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.82275391, + "step": 2453, + "time_per_iteration": 2.512503147125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169417, + "balance_loss_mlp": 1.08730555, + "epoch": 0.4721046556367834, + "flos": 516016848384.0, + "grad_norm": 0.02961212180136774, + "language_loss": 0.87225032, + "learning_rate": 0.0005692506009412954, + "loss": 0.88394439, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.82128906, + "step": 2454, + "time_per_iteration": 2.673123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187157, + "balance_loss_mlp": 1.10609436, + "epoch": 0.4722970373220469, + "flos": 1575703721472.0, + "grad_norm": 0.017157731663316397, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78738415, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.81054688, + "step": 2455, + "time_per_iteration": 4.97356915473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164137, + "balance_loss_mlp": 1.08216834, + "epoch": 0.4724894190073105, + "flos": 587394886656.0, + "grad_norm": 0.02627427755104431, + "language_loss": 0.95142597, + "learning_rate": 0.0005686334678342593, + "loss": 0.96306741, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.81982422, + "step": 2456, + "time_per_iteration": 2.867849588394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165061, + "balance_loss_mlp": 1.08304489, + "epoch": 0.4726818006925741, + "flos": 869072497152.0, + "grad_norm": 0.03086214810478132, + "language_loss": 0.87917793, + "learning_rate": 0.0005683248612520274, + "loss": 0.89082849, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.8203125, + "step": 2457, + "time_per_iteration": 3.078068733215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08206928, + "epoch": 0.4728741823778376, + "flos": 754227380736.0, + "grad_norm": 0.03352301766800045, + "language_loss": 0.88896751, + "learning_rate": 0.0005680162281437321, + "loss": 0.90060842, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.8203125, + "step": 2458, + "time_per_iteration": 2.9237887859344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116424, + "balance_loss_mlp": 1.08260512, + "epoch": 0.4730665640631012, + "flos": 539657562624.0, + "grad_norm": 0.027635752733509208, + "language_loss": 0.89953935, + "learning_rate": 0.000567707568629195, + "loss": 0.91118181, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.81640625, + "step": 2459, + "time_per_iteration": 2.719519853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166645, + "balance_loss_mlp": 1.08505821, + "epoch": 0.47325894574836475, + "flos": 492682308096.0, + "grad_norm": 0.027667404433321316, + "language_loss": 0.88089126, + "learning_rate": 0.0005673988828282486, + "loss": 0.89255774, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.81591797, + "step": 2460, + "time_per_iteration": 2.71736216545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165583, + "balance_loss_mlp": 1.0839963, + "epoch": 0.47345132743362833, + "flos": 765830886912.0, + "grad_norm": 0.028127891455978875, + "language_loss": 0.87479305, + "learning_rate": 0.0005670901708607352, + "loss": 0.88644892, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.81591797, + "step": 2461, + "time_per_iteration": 2.9727017879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165349, + "balance_loss_mlp": 1.08371425, + "epoch": 0.47364370911889186, + "flos": 541168240128.0, + "grad_norm": 0.03987357596495419, + "language_loss": 0.90376979, + "learning_rate": 0.0005667814328465076, + "loss": 0.91542327, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.81640625, + "step": 2462, + "time_per_iteration": 2.632636547088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163463, + "balance_loss_mlp": 1.0815897, + "epoch": 0.47383609080415545, + "flos": 407091643392.0, + "grad_norm": 0.03654753942721471, + "language_loss": 0.88796914, + "learning_rate": 0.0005664726689054285, + "loss": 0.89960378, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.81884766, + "step": 2463, + "time_per_iteration": 2.466054916381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170123, + "balance_loss_mlp": 1.08867884, + "epoch": 0.474028472489419, + "flos": 454438199808.0, + "grad_norm": 0.03923165930345575, + "language_loss": 0.8627066, + "learning_rate": 0.0005661638791573704, + "loss": 0.87440789, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.81445312, + "step": 2464, + "time_per_iteration": 2.7042744159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166183, + "balance_loss_mlp": 1.08450055, + "epoch": 0.47422085417468257, + "flos": 493194599424.0, + "grad_norm": 0.026684931914484025, + "language_loss": 0.92592585, + "learning_rate": 0.0005658550637222164, + "loss": 0.93758774, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.81689453, + "step": 2465, + "time_per_iteration": 2.6058290004730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168127, + "balance_loss_mlp": 1.08611059, + "epoch": 0.47441323585994616, + "flos": 740125544448.0, + "grad_norm": 0.026202374072225774, + "language_loss": 0.87139833, + "learning_rate": 0.0005655462227198592, + "loss": 0.88307959, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.8203125, + "step": 2466, + "time_per_iteration": 2.8945796489715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167919, + "balance_loss_mlp": 1.08590269, + "epoch": 0.4746056175452097, + "flos": 485674687488.0, + "grad_norm": 0.02746668082221095, + "language_loss": 0.89712787, + "learning_rate": 0.0005652373562702016, + "loss": 0.90880704, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.8203125, + "step": 2467, + "time_per_iteration": 2.576364278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166795, + "balance_loss_mlp": 1.08463609, + "epoch": 0.4747979992304733, + "flos": 462005775360.0, + "grad_norm": 0.03040478239716322, + "language_loss": 0.95003092, + "learning_rate": 0.000564928464493156, + "loss": 0.96169889, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.82177734, + "step": 2468, + "time_per_iteration": 2.5468242168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168306, + "balance_loss_mlp": 1.08624196, + "epoch": 0.4749903809157368, + "flos": 865879226880.0, + "grad_norm": 0.029413898751956376, + "language_loss": 0.88262731, + "learning_rate": 0.000564619547508645, + "loss": 0.89431041, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.82080078, + "step": 2469, + "time_per_iteration": 3.042994260787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116966, + "balance_loss_mlp": 1.08764374, + "epoch": 0.4751827626010004, + "flos": 506551830528.0, + "grad_norm": 0.035426943126194606, + "language_loss": 0.90271819, + "learning_rate": 0.0005643106054366008, + "loss": 0.91441476, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.8203125, + "step": 2470, + "time_per_iteration": 2.5660367012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168432, + "balance_loss_mlp": 1.0863688, + "epoch": 0.47537514428626393, + "flos": 560452113408.0, + "grad_norm": 0.029652672624791387, + "language_loss": 0.85815179, + "learning_rate": 0.000564001638396965, + "loss": 0.86983615, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.82080078, + "step": 2471, + "time_per_iteration": 2.7345728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167677, + "balance_loss_mlp": 1.08566117, + "epoch": 0.4755675259715275, + "flos": 835676054016.0, + "grad_norm": 0.029111814859825738, + "language_loss": 0.87706691, + "learning_rate": 0.0005636926465096897, + "loss": 0.8887437, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.8203125, + "step": 2472, + "time_per_iteration": 3.0570740699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166306, + "balance_loss_mlp": 1.08424211, + "epoch": 0.47575990765679105, + "flos": 509232809472.0, + "grad_norm": 0.030849533450069865, + "language_loss": 0.93407679, + "learning_rate": 0.0005633836298947363, + "loss": 0.94573981, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.82080078, + "step": 2473, + "time_per_iteration": 2.6804757118225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167624, + "balance_loss_mlp": 1.08570302, + "epoch": 0.47595228934205464, + "flos": 592962961920.0, + "grad_norm": 0.0319092637225127, + "language_loss": 0.77122205, + "learning_rate": 0.000563074588672075, + "loss": 0.78289831, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.81933594, + "step": 2474, + "time_per_iteration": 2.7190651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166922, + "balance_loss_mlp": 1.08500123, + "epoch": 0.4761446710273182, + "flos": 581683094016.0, + "grad_norm": 0.028375010801601097, + "language_loss": 0.91505527, + "learning_rate": 0.0005627655229616868, + "loss": 0.92672449, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.81933594, + "step": 2475, + "time_per_iteration": 2.689652919769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164128, + "balance_loss_mlp": 1.08235061, + "epoch": 0.47633705271258175, + "flos": 674079264768.0, + "grad_norm": 0.024988633596495675, + "language_loss": 0.94898891, + "learning_rate": 0.0005624564328835616, + "loss": 0.96063018, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.81787109, + "step": 2476, + "time_per_iteration": 2.8038489818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169163, + "balance_loss_mlp": 1.08728969, + "epoch": 0.47652943439784534, + "flos": 542970355200.0, + "grad_norm": 0.0285977430554916, + "language_loss": 0.89680123, + "learning_rate": 0.0005621473185576986, + "loss": 0.90849286, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.81884766, + "step": 2477, + "time_per_iteration": 2.7568743228912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165779, + "balance_loss_mlp": 1.08433557, + "epoch": 0.4767218160831089, + "flos": 525846437376.0, + "grad_norm": 0.0316668482667046, + "language_loss": 0.93167424, + "learning_rate": 0.0005618381801041068, + "loss": 0.94333208, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.81445312, + "step": 2478, + "time_per_iteration": 2.612211227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167228, + "balance_loss_mlp": 1.08545041, + "epoch": 0.47691419776837246, + "flos": 569126863872.0, + "grad_norm": 0.03238452738028376, + "language_loss": 0.88936818, + "learning_rate": 0.0005615290176428044, + "loss": 0.90104043, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.81787109, + "step": 2479, + "time_per_iteration": 2.649019241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168128, + "balance_loss_mlp": 1.08668435, + "epoch": 0.477106579453636, + "flos": 532024859136.0, + "grad_norm": 0.027888492093205767, + "language_loss": 0.91917288, + "learning_rate": 0.0005612198312938187, + "loss": 0.93085408, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.81445312, + "step": 2480, + "time_per_iteration": 2.739767551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08839524, + "epoch": 0.4772989611388996, + "flos": 595500950016.0, + "grad_norm": 0.027931665483744535, + "language_loss": 0.84935582, + "learning_rate": 0.0005609106211771868, + "loss": 0.86105514, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.81542969, + "step": 2481, + "time_per_iteration": 2.850339651107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169841, + "balance_loss_mlp": 1.08835006, + "epoch": 0.4774913428241631, + "flos": 545707729920.0, + "grad_norm": 0.027660076347337716, + "language_loss": 0.94426548, + "learning_rate": 0.0005606013874129543, + "loss": 0.95596385, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.81494141, + "step": 2482, + "time_per_iteration": 2.7403533458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08829987, + "epoch": 0.4776837245094267, + "flos": 541129308672.0, + "grad_norm": 0.02810737401227857, + "language_loss": 0.86136961, + "learning_rate": 0.0005602921301211768, + "loss": 0.87306893, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.81640625, + "step": 2483, + "time_per_iteration": 2.6941261291503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171891, + "balance_loss_mlp": 1.09016109, + "epoch": 0.4778761061946903, + "flos": 472755887616.0, + "grad_norm": 0.029011275825861695, + "language_loss": 0.8832168, + "learning_rate": 0.0005599828494219185, + "loss": 0.89493567, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.81738281, + "step": 2484, + "time_per_iteration": 2.5801451206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116562, + "balance_loss_mlp": 1.08355606, + "epoch": 0.4780684878799538, + "flos": 727337000448.0, + "grad_norm": 0.03126301150284597, + "language_loss": 0.95766234, + "learning_rate": 0.0005596735454352527, + "loss": 0.96931851, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.82080078, + "step": 2485, + "time_per_iteration": 2.866809368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165282, + "balance_loss_mlp": 1.0832181, + "epoch": 0.4782608695652174, + "flos": 549953780736.0, + "grad_norm": 0.032811891631208345, + "language_loss": 0.91780031, + "learning_rate": 0.0005593642182812619, + "loss": 0.92945307, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.82080078, + "step": 2486, + "time_per_iteration": 2.6762824058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166087, + "balance_loss_mlp": 1.08388078, + "epoch": 0.47845325125048094, + "flos": 831401805312.0, + "grad_norm": 0.03291122574992765, + "language_loss": 0.91604954, + "learning_rate": 0.0005590548680800378, + "loss": 0.92771041, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.82226562, + "step": 2487, + "time_per_iteration": 3.1848442554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159859, + "balance_loss_mlp": 1.07765198, + "epoch": 0.4786456329357445, + "flos": 515270241792.0, + "grad_norm": 0.02977291399963519, + "language_loss": 0.8241533, + "learning_rate": 0.0005587454949516804, + "loss": 0.83575195, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.82226562, + "step": 2488, + "time_per_iteration": 2.728825330734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163121, + "balance_loss_mlp": 1.08077133, + "epoch": 0.47883801462100806, + "flos": 565729477632.0, + "grad_norm": 0.034122039627151275, + "language_loss": 0.9412536, + "learning_rate": 0.0005584360990162993, + "loss": 0.95288485, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.82373047, + "step": 2489, + "time_per_iteration": 2.65055251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162976, + "balance_loss_mlp": 1.08076906, + "epoch": 0.47903039630627164, + "flos": 580704173568.0, + "grad_norm": 0.025976014522421025, + "language_loss": 0.89770818, + "learning_rate": 0.0005581266803940124, + "loss": 0.90933788, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.82226562, + "step": 2490, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164709, + "balance_loss_mlp": 1.08250248, + "epoch": 0.47922277799153523, + "flos": 620085656064.0, + "grad_norm": 0.030357385002024635, + "language_loss": 0.93398184, + "learning_rate": 0.0005578172392049471, + "loss": 0.94562888, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.82226562, + "step": 2491, + "time_per_iteration": 2.7492756843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.08214724, + "epoch": 0.47941515967679876, + "flos": 640858739712.0, + "grad_norm": 0.03220406636162171, + "language_loss": 0.9124878, + "learning_rate": 0.0005575077755692386, + "loss": 0.92413139, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.82226562, + "step": 2492, + "time_per_iteration": 2.8061015605926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166388, + "balance_loss_mlp": 1.08437181, + "epoch": 0.47960754136206235, + "flos": 520875247104.0, + "grad_norm": 0.02527329704122564, + "language_loss": 0.91187584, + "learning_rate": 0.0005571982896070316, + "loss": 0.92353964, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.8203125, + "step": 2493, + "time_per_iteration": 4.094395160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116615, + "balance_loss_mlp": 1.08399141, + "epoch": 0.4797999230473259, + "flos": 476031750144.0, + "grad_norm": 0.03303640593992076, + "language_loss": 0.95932508, + "learning_rate": 0.0005568887814384792, + "loss": 0.97098666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.82177734, + "step": 2494, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011645, + "balance_loss_mlp": 1.08229315, + "epoch": 0.47999230473258947, + "flos": 533068907520.0, + "grad_norm": 0.028664161711311382, + "language_loss": 0.92573094, + "learning_rate": 0.000556579251183743, + "loss": 0.93737602, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.82226562, + "step": 2495, + "time_per_iteration": 2.6538801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162424, + "balance_loss_mlp": 1.08036053, + "epoch": 0.480184686417853, + "flos": 602605899264.0, + "grad_norm": 0.03331899292815792, + "language_loss": 0.86056805, + "learning_rate": 0.0005562696989629936, + "loss": 0.87219226, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.82080078, + "step": 2496, + "time_per_iteration": 2.687903881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162213, + "balance_loss_mlp": 1.08019686, + "epoch": 0.4803770681031166, + "flos": 529261287936.0, + "grad_norm": 0.02923998603568501, + "language_loss": 0.88484073, + "learning_rate": 0.0005559601248964095, + "loss": 0.89646292, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.8203125, + "step": 2497, + "time_per_iteration": 2.6282827854156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161296, + "balance_loss_mlp": 1.07918417, + "epoch": 0.4805694497883801, + "flos": 512228694528.0, + "grad_norm": 0.02922528152793709, + "language_loss": 0.91127884, + "learning_rate": 0.0005556505291041783, + "loss": 0.92289186, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.82128906, + "step": 2498, + "time_per_iteration": 2.662783622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161007, + "balance_loss_mlp": 1.07899094, + "epoch": 0.4807618314736437, + "flos": 601605511680.0, + "grad_norm": 0.02724196548061384, + "language_loss": 0.8966158, + "learning_rate": 0.0005553409117064954, + "loss": 0.90822583, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.8203125, + "step": 2499, + "time_per_iteration": 2.898850917816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164849, + "balance_loss_mlp": 1.08245122, + "epoch": 0.4809542131589073, + "flos": 570029922816.0, + "grad_norm": 0.028349491645904, + "language_loss": 0.91357303, + "learning_rate": 0.0005550312728235654, + "loss": 0.92522144, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.82421875, + "step": 2500, + "time_per_iteration": 2.754187822341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164619, + "balance_loss_mlp": 1.08217347, + "epoch": 0.4811465948441708, + "flos": 577165797888.0, + "grad_norm": 0.034664680835738745, + "language_loss": 0.91214681, + "learning_rate": 0.0005547216125756003, + "loss": 0.92379302, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.82470703, + "step": 2501, + "time_per_iteration": 2.778639078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164023, + "balance_loss_mlp": 1.08143485, + "epoch": 0.4813389765294344, + "flos": 825297243648.0, + "grad_norm": 0.028167486861350455, + "language_loss": 0.87736559, + "learning_rate": 0.0005544119310828211, + "loss": 0.88900584, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.82617188, + "step": 2502, + "time_per_iteration": 3.0756351947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164093, + "balance_loss_mlp": 1.08174348, + "epoch": 0.48153135821469795, + "flos": 636699283968.0, + "grad_norm": 0.030410217991048386, + "language_loss": 0.91046345, + "learning_rate": 0.0005541022284654568, + "loss": 0.92210436, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.82373047, + "step": 2503, + "time_per_iteration": 2.892679214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163382, + "balance_loss_mlp": 1.08103192, + "epoch": 0.48172373989996153, + "flos": 504708782592.0, + "grad_norm": 0.02826951852510112, + "language_loss": 0.89667141, + "learning_rate": 0.0005537925048437446, + "loss": 0.90830529, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.82373047, + "step": 2504, + "time_per_iteration": 2.5750081539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179108, + "balance_loss_mlp": 1.09918976, + "epoch": 0.48191612158522507, + "flos": 1535566173696.0, + "grad_norm": 0.017261305400491866, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76930583, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.79882812, + "step": 2505, + "time_per_iteration": 4.912463426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162522, + "balance_loss_mlp": 1.07988608, + "epoch": 0.48210850327048865, + "flos": 703811805696.0, + "grad_norm": 0.027104005826713556, + "language_loss": 0.93955028, + "learning_rate": 0.0005531729950682664, + "loss": 0.95117545, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.82666016, + "step": 2506, + "time_per_iteration": 3.000925064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162538, + "balance_loss_mlp": 1.07999802, + "epoch": 0.4823008849557522, + "flos": 440700934656.0, + "grad_norm": 0.03451729562062639, + "language_loss": 0.91777337, + "learning_rate": 0.000552863209155015, + "loss": 0.92939872, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.82568359, + "step": 2507, + "time_per_iteration": 2.478809118270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159773, + "balance_loss_mlp": 1.07737529, + "epoch": 0.48249326664101577, + "flos": 472812283392.0, + "grad_norm": 0.02691149649688828, + "language_loss": 0.87363136, + "learning_rate": 0.0005525534027184461, + "loss": 0.88522899, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.82421875, + "step": 2508, + "time_per_iteration": 2.54645037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161526, + "balance_loss_mlp": 1.07951045, + "epoch": 0.48268564832627936, + "flos": 564314127360.0, + "grad_norm": 0.023137570540037285, + "language_loss": 0.88137501, + "learning_rate": 0.0005522435758788365, + "loss": 0.89299035, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.8203125, + "step": 2509, + "time_per_iteration": 2.700540542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160536, + "balance_loss_mlp": 1.07842445, + "epoch": 0.4828780300115429, + "flos": 630842499072.0, + "grad_norm": 0.03372990027790351, + "language_loss": 0.86188895, + "learning_rate": 0.0005519337287564721, + "loss": 0.87349427, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.82128906, + "step": 2510, + "time_per_iteration": 2.8127758502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161519, + "balance_loss_mlp": 1.07945526, + "epoch": 0.4830704116968065, + "flos": 633004455936.0, + "grad_norm": 0.029001937113396697, + "language_loss": 0.88535267, + "learning_rate": 0.000551623861471646, + "loss": 0.89696789, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.82080078, + "step": 2511, + "time_per_iteration": 2.7925469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166962, + "balance_loss_mlp": 1.08647156, + "epoch": 0.48326279338207, + "flos": 1572616512000.0, + "grad_norm": 0.009161484988790693, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79985785, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.8046875, + "step": 2512, + "time_per_iteration": 4.850747108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159851, + "balance_loss_mlp": 1.07783449, + "epoch": 0.4834551750673336, + "flos": 510237926400.0, + "grad_norm": 0.028933780257729795, + "language_loss": 0.92768925, + "learning_rate": 0.0005510040668958211, + "loss": 0.93928778, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.8203125, + "step": 2513, + "time_per_iteration": 2.56387996673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165955, + "balance_loss_mlp": 1.08546448, + "epoch": 0.48364755675259713, + "flos": 1531825683456.0, + "grad_norm": 0.007133010503999018, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78926539, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.8046875, + "step": 2514, + "time_per_iteration": 4.836379289627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160977, + "balance_loss_mlp": 1.07938981, + "epoch": 0.4838399384378607, + "flos": 566046385152.0, + "grad_norm": 0.029153045334521625, + "language_loss": 0.89274001, + "learning_rate": 0.0005503841931138645, + "loss": 0.9043498, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.81591797, + "step": 2515, + "time_per_iteration": 2.6633048057556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160148, + "balance_loss_mlp": 1.07846582, + "epoch": 0.4840323201231243, + "flos": 388541641728.0, + "grad_norm": 0.03187042626689644, + "language_loss": 0.88861662, + "learning_rate": 0.0005500742268214025, + "loss": 0.90021807, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.81689453, + "step": 2516, + "time_per_iteration": 2.4762659072875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160045, + "balance_loss_mlp": 1.07845843, + "epoch": 0.48422470180838784, + "flos": 632175257088.0, + "grad_norm": 0.026732605532440536, + "language_loss": 0.9007901, + "learning_rate": 0.0005497642410884014, + "loss": 0.91239059, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.81591797, + "step": 2517, + "time_per_iteration": 2.7693819999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164478, + "balance_loss_mlp": 1.08246255, + "epoch": 0.4844170834936514, + "flos": 500313010176.0, + "grad_norm": 0.028128961210665323, + "language_loss": 0.90248644, + "learning_rate": 0.0005494542360352085, + "loss": 0.91413122, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.8203125, + "step": 2518, + "time_per_iteration": 2.6704978942871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163589, + "balance_loss_mlp": 1.08152497, + "epoch": 0.48460946517891496, + "flos": 552194327040.0, + "grad_norm": 0.02893400906180164, + "language_loss": 0.92442286, + "learning_rate": 0.0005491442117821783, + "loss": 0.93605876, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.82080078, + "step": 2519, + "time_per_iteration": 2.691898822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167118, + "balance_loss_mlp": 1.08491123, + "epoch": 0.48480184686417854, + "flos": 530461788672.0, + "grad_norm": 0.03488173137086134, + "language_loss": 0.937814, + "learning_rate": 0.0005488341684496732, + "loss": 0.94948518, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.82226562, + "step": 2520, + "time_per_iteration": 2.6527535915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165597, + "balance_loss_mlp": 1.08343804, + "epoch": 0.4849942285494421, + "flos": 533047440384.0, + "grad_norm": 0.028537304261499467, + "language_loss": 0.97065389, + "learning_rate": 0.0005485241061580624, + "loss": 0.98230994, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.82177734, + "step": 2521, + "time_per_iteration": 2.7213969230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166792, + "balance_loss_mlp": 1.08463287, + "epoch": 0.48518661023470566, + "flos": 723972541440.0, + "grad_norm": 0.02938300657957885, + "language_loss": 0.90224278, + "learning_rate": 0.0005482140250277228, + "loss": 0.91391075, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.82177734, + "step": 2522, + "time_per_iteration": 2.9924206733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08592129, + "epoch": 0.4853789919199692, + "flos": 507155446272.0, + "grad_norm": 0.030604201389603965, + "language_loss": 0.93692237, + "learning_rate": 0.0005479039251790387, + "loss": 0.94860315, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.82177734, + "step": 2523, + "time_per_iteration": 2.7099061012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167569, + "balance_loss_mlp": 1.08541012, + "epoch": 0.4855713736052328, + "flos": 661698952704.0, + "grad_norm": 0.03222198223164457, + "language_loss": 0.90574634, + "learning_rate": 0.0005475938067324014, + "loss": 0.917422, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.82177734, + "step": 2524, + "time_per_iteration": 2.8379342555999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117016, + "balance_loss_mlp": 1.08823884, + "epoch": 0.48576375529049637, + "flos": 437889699840.0, + "grad_norm": 0.03297241328571355, + "language_loss": 0.89402866, + "learning_rate": 0.0005472836698082098, + "loss": 0.90573025, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.81933594, + "step": 2525, + "time_per_iteration": 2.5135462284088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165117, + "balance_loss_mlp": 1.08300531, + "epoch": 0.4859561369757599, + "flos": 582844663296.0, + "grad_norm": 0.028434138704400515, + "language_loss": 0.88848263, + "learning_rate": 0.0005469735145268694, + "loss": 0.90013373, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.82128906, + "step": 2526, + "time_per_iteration": 2.7137279510498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162635, + "balance_loss_mlp": 1.08066678, + "epoch": 0.4861485186610235, + "flos": 488933085696.0, + "grad_norm": 0.028544121185286958, + "language_loss": 0.86922419, + "learning_rate": 0.0005466633410087933, + "loss": 0.88085049, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.81982422, + "step": 2527, + "time_per_iteration": 2.7106595039367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116584, + "balance_loss_mlp": 1.08554077, + "epoch": 0.486340900346287, + "flos": 1561111060992.0, + "grad_norm": 0.005447093154513016, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78426665, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.80273438, + "step": 2528, + "time_per_iteration": 4.841828346252441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162423, + "balance_loss_mlp": 1.08069348, + "epoch": 0.4865332820315506, + "flos": 483990093312.0, + "grad_norm": 0.026581719305211308, + "language_loss": 0.93869209, + "learning_rate": 0.0005460429397441214, + "loss": 0.95031631, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.81738281, + "step": 2529, + "time_per_iteration": 2.553438425064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.08296263, + "epoch": 0.48672566371681414, + "flos": 536857061376.0, + "grad_norm": 0.02943507577689114, + "language_loss": 0.92893845, + "learning_rate": 0.0005457327122383866, + "loss": 0.94058347, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.81542969, + "step": 2530, + "time_per_iteration": 2.628859043121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167305, + "balance_loss_mlp": 1.08795929, + "epoch": 0.4869180454020777, + "flos": 1415830457856.0, + "grad_norm": 0.01207374103656724, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75803792, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.79296875, + "step": 2531, + "time_per_iteration": 4.798464775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163612, + "balance_loss_mlp": 1.08212042, + "epoch": 0.48711042708734126, + "flos": 574226308608.0, + "grad_norm": 0.027593185975689192, + "language_loss": 0.81384307, + "learning_rate": 0.0005451122040823244, + "loss": 0.82547921, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.81494141, + "step": 2532, + "time_per_iteration": 2.7749013900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116272, + "balance_loss_mlp": 1.08118057, + "epoch": 0.48730280877260485, + "flos": 627816414720.0, + "grad_norm": 0.02591805781842408, + "language_loss": 0.82129884, + "learning_rate": 0.0005448019236728997, + "loss": 0.83292603, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.81542969, + "step": 2533, + "time_per_iteration": 2.865239381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164787, + "balance_loss_mlp": 1.08315206, + "epoch": 0.48749519045786843, + "flos": 513468126720.0, + "grad_norm": 0.03027053938911928, + "language_loss": 0.91336226, + "learning_rate": 0.0005444916258698255, + "loss": 0.92501009, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.81640625, + "step": 2534, + "time_per_iteration": 2.5986597537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08259368, + "epoch": 0.48768757214313196, + "flos": 526478251008.0, + "grad_norm": 0.02699578070604874, + "language_loss": 0.90958095, + "learning_rate": 0.0005441813107935704, + "loss": 0.92122173, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.81494141, + "step": 2535, + "time_per_iteration": 2.685478925704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162911, + "balance_loss_mlp": 1.08137167, + "epoch": 0.48787995382839555, + "flos": 506030807040.0, + "grad_norm": 0.02902824988643181, + "language_loss": 0.91504169, + "learning_rate": 0.0005438709785646091, + "loss": 0.92667079, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.81542969, + "step": 2536, + "time_per_iteration": 2.563302755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164758, + "balance_loss_mlp": 1.08302808, + "epoch": 0.4880723355136591, + "flos": 576247276032.0, + "grad_norm": 0.028837521239882914, + "language_loss": 0.92468232, + "learning_rate": 0.0005435606293034234, + "loss": 0.93632984, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.81738281, + "step": 2537, + "time_per_iteration": 2.6447930335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117327, + "balance_loss_mlp": 1.09163582, + "epoch": 0.48826471719892267, + "flos": 562536207360.0, + "grad_norm": 0.0312247117460979, + "language_loss": 0.90714639, + "learning_rate": 0.0005432502631305016, + "loss": 0.91887903, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.81640625, + "step": 2538, + "time_per_iteration": 2.6652588844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173314, + "balance_loss_mlp": 1.09163225, + "epoch": 0.4884570988841862, + "flos": 727547847168.0, + "grad_norm": 0.027646073497336384, + "language_loss": 0.88003767, + "learning_rate": 0.0005429398801663386, + "loss": 0.89177084, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.81689453, + "step": 2539, + "time_per_iteration": 2.9378042221069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163435, + "balance_loss_mlp": 1.08180094, + "epoch": 0.4886494805694498, + "flos": 431924126208.0, + "grad_norm": 0.03488087397138866, + "language_loss": 0.90234458, + "learning_rate": 0.0005426294805314355, + "loss": 0.91397893, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.81640625, + "step": 2540, + "time_per_iteration": 2.538275718688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161042, + "balance_loss_mlp": 1.07935977, + "epoch": 0.4888418622547134, + "flos": 674344505856.0, + "grad_norm": 0.02710942555690322, + "language_loss": 0.8497895, + "learning_rate": 0.0005423190643463003, + "loss": 0.86139989, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.81689453, + "step": 2541, + "time_per_iteration": 2.9786784648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163064, + "balance_loss_mlp": 1.08133411, + "epoch": 0.4890342439399769, + "flos": 542935426560.0, + "grad_norm": 0.02908053911836938, + "language_loss": 0.88889569, + "learning_rate": 0.0005420086317314473, + "loss": 0.90052634, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.81738281, + "step": 2542, + "time_per_iteration": 2.650505781173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163957, + "balance_loss_mlp": 1.08198881, + "epoch": 0.4892266256252405, + "flos": 591862517760.0, + "grad_norm": 0.032456825889771945, + "language_loss": 0.86421382, + "learning_rate": 0.0005416981828073971, + "loss": 0.87585342, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.81982422, + "step": 2543, + "time_per_iteration": 2.756906032562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167862, + "balance_loss_mlp": 1.08718109, + "epoch": 0.48941900731050403, + "flos": 1519654216704.0, + "grad_norm": 0.009398242691954228, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78282875, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.80664062, + "step": 2544, + "time_per_iteration": 4.826622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163984, + "balance_loss_mlp": 1.08225381, + "epoch": 0.4896113889957676, + "flos": 471518456832.0, + "grad_norm": 0.03564931489131084, + "language_loss": 0.92759442, + "learning_rate": 0.000541077236513819, + "loss": 0.93923426, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.81738281, + "step": 2545, + "time_per_iteration": 2.5047078132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169885, + "balance_loss_mlp": 1.08848882, + "epoch": 0.48980377068103115, + "flos": 497551440384.0, + "grad_norm": 0.02644804149278648, + "language_loss": 0.87771875, + "learning_rate": 0.0005407667393853638, + "loss": 0.88941759, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.81396484, + "step": 2546, + "time_per_iteration": 2.615182876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172802, + "balance_loss_mlp": 1.09116721, + "epoch": 0.48999615236629473, + "flos": 694107743232.0, + "grad_norm": 0.032384144791382644, + "language_loss": 0.89844877, + "learning_rate": 0.0005404562264298569, + "loss": 0.91017681, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.81640625, + "step": 2547, + "time_per_iteration": 2.8694136142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164836, + "balance_loss_mlp": 1.08310628, + "epoch": 0.49018853405155827, + "flos": 542748774912.0, + "grad_norm": 0.02932030725962162, + "language_loss": 0.90206313, + "learning_rate": 0.0005401456977678498, + "loss": 0.91371155, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.81738281, + "step": 2548, + "time_per_iteration": 2.644604444503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158708, + "balance_loss_mlp": 1.07702553, + "epoch": 0.49038091573682185, + "flos": 697108357632.0, + "grad_norm": 0.0348486432591887, + "language_loss": 0.83939159, + "learning_rate": 0.0005398351535199008, + "loss": 0.85097861, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.81689453, + "step": 2549, + "time_per_iteration": 3.064962863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158664, + "balance_loss_mlp": 1.07693398, + "epoch": 0.49057329742208544, + "flos": 598062406656.0, + "grad_norm": 0.028343941430048352, + "language_loss": 0.89488542, + "learning_rate": 0.0005395245938065735, + "loss": 0.90647209, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.81738281, + "step": 2550, + "time_per_iteration": 2.8023993968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162826, + "balance_loss_mlp": 1.08119094, + "epoch": 0.490765679107349, + "flos": 514416847872.0, + "grad_norm": 0.036438353865587, + "language_loss": 0.8920716, + "learning_rate": 0.0005392140187484379, + "loss": 0.90369982, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.81640625, + "step": 2551, + "time_per_iteration": 2.5544004440307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160822, + "balance_loss_mlp": 1.07928288, + "epoch": 0.49095806079261256, + "flos": 630842499072.0, + "grad_norm": 0.02833803159801528, + "language_loss": 0.95730108, + "learning_rate": 0.0005389034284660701, + "loss": 0.96890926, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.81542969, + "step": 2552, + "time_per_iteration": 2.787997245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156735, + "balance_loss_mlp": 1.07524312, + "epoch": 0.4911504424778761, + "flos": 916792356864.0, + "grad_norm": 0.03441290589053542, + "language_loss": 0.8892417, + "learning_rate": 0.000538592823080052, + "loss": 0.90080899, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.81494141, + "step": 2553, + "time_per_iteration": 3.1353423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159599, + "balance_loss_mlp": 1.07858455, + "epoch": 0.4913428241631397, + "flos": 439854271488.0, + "grad_norm": 0.03215354145178159, + "language_loss": 0.91146123, + "learning_rate": 0.000538282202710971, + "loss": 0.9230572, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.81005859, + "step": 2554, + "time_per_iteration": 2.524106025695801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158745, + "balance_loss_mlp": 1.0776825, + "epoch": 0.4915352058484032, + "flos": 637239773184.0, + "grad_norm": 0.03412299335020121, + "language_loss": 0.8861627, + "learning_rate": 0.000537971567479421, + "loss": 0.8977502, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.81054688, + "step": 2555, + "time_per_iteration": 2.750051736831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162188, + "balance_loss_mlp": 1.08107841, + "epoch": 0.4917275875336668, + "flos": 505509783552.0, + "grad_norm": 0.03289434989172404, + "language_loss": 0.93214262, + "learning_rate": 0.0005376609175060011, + "loss": 0.94376451, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.81103516, + "step": 2556, + "time_per_iteration": 2.588437557220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160453, + "balance_loss_mlp": 1.07924759, + "epoch": 0.49191996921893033, + "flos": 655733379072.0, + "grad_norm": 0.02731850736189593, + "language_loss": 0.86463559, + "learning_rate": 0.0005373502529113162, + "loss": 0.87624013, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.81201172, + "step": 2557, + "time_per_iteration": 2.775529146194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160279, + "balance_loss_mlp": 1.07897866, + "epoch": 0.4921123509041939, + "flos": 493398715392.0, + "grad_norm": 0.02896728411720768, + "language_loss": 0.88084292, + "learning_rate": 0.0005370395738159773, + "loss": 0.8924458, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.81298828, + "step": 2558, + "time_per_iteration": 2.638489007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162432, + "balance_loss_mlp": 1.08084488, + "epoch": 0.4923047325894575, + "flos": 547207673856.0, + "grad_norm": 0.030679841284503157, + "language_loss": 0.90182674, + "learning_rate": 0.0005367288803406003, + "loss": 0.91345102, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.81591797, + "step": 2559, + "time_per_iteration": 2.655319929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166456, + "balance_loss_mlp": 1.08477354, + "epoch": 0.49249711427472104, + "flos": 597589046784.0, + "grad_norm": 0.03258957792314928, + "language_loss": 0.88157088, + "learning_rate": 0.0005364181726058073, + "loss": 0.89323545, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.81689453, + "step": 2560, + "time_per_iteration": 2.7416017055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116275, + "balance_loss_mlp": 1.08111596, + "epoch": 0.4926894959599846, + "flos": 498808336896.0, + "grad_norm": 0.03132101057916933, + "language_loss": 0.88768357, + "learning_rate": 0.0005361074507322261, + "loss": 0.89931107, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.81640625, + "step": 2561, + "time_per_iteration": 2.6130712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165857, + "balance_loss_mlp": 1.08446133, + "epoch": 0.49288187764524816, + "flos": 537182701056.0, + "grad_norm": 0.03057631912079697, + "language_loss": 0.88031554, + "learning_rate": 0.000535796714840489, + "loss": 0.89197409, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.81396484, + "step": 2562, + "time_per_iteration": 2.6463782787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167526, + "balance_loss_mlp": 1.08584368, + "epoch": 0.49307425933051174, + "flos": 642712521216.0, + "grad_norm": 0.037191189532270505, + "language_loss": 0.90339726, + "learning_rate": 0.0005354859650512348, + "loss": 0.91507256, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.81689453, + "step": 2563, + "time_per_iteration": 2.807185649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08831811, + "epoch": 0.4932666410157753, + "flos": 517265012736.0, + "grad_norm": 0.033499096438589164, + "language_loss": 0.92994809, + "learning_rate": 0.0005351752014851074, + "loss": 0.94164765, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.81640625, + "step": 2564, + "time_per_iteration": 2.574969530105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164544, + "balance_loss_mlp": 1.08310056, + "epoch": 0.49345902270103886, + "flos": 602651561472.0, + "grad_norm": 0.03279756121209128, + "language_loss": 0.89816988, + "learning_rate": 0.0005348644242627553, + "loss": 0.90981531, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.81445312, + "step": 2565, + "time_per_iteration": 2.718763828277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170387, + "balance_loss_mlp": 1.0912323, + "epoch": 0.49365140438630245, + "flos": 1496981689344.0, + "grad_norm": 0.010263800536892794, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76457012, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.79101562, + "step": 2566, + "time_per_iteration": 4.933185815811157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116695, + "balance_loss_mlp": 1.08588743, + "epoch": 0.493843786071566, + "flos": 630788104704.0, + "grad_norm": 0.030129730382445888, + "language_loss": 0.87054515, + "learning_rate": 0.0005342428293320013, + "loss": 0.88221461, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.81054688, + "step": 2567, + "time_per_iteration": 2.7435762882232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167603, + "balance_loss_mlp": 1.08635032, + "epoch": 0.49403616775682957, + "flos": 618689771520.0, + "grad_norm": 0.03756496493147188, + "language_loss": 0.89032316, + "learning_rate": 0.0005339320118649238, + "loss": 0.90199912, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.8125, + "step": 2568, + "time_per_iteration": 2.732135057449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162688, + "balance_loss_mlp": 1.08148313, + "epoch": 0.4942285494420931, + "flos": 578813462016.0, + "grad_norm": 0.027001968550623295, + "language_loss": 0.91260755, + "learning_rate": 0.000533621181224271, + "loss": 0.92423451, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.81201172, + "step": 2569, + "time_per_iteration": 2.79868483543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164198, + "balance_loss_mlp": 1.08304083, + "epoch": 0.4944209311273567, + "flos": 631465580544.0, + "grad_norm": 0.0320565630919746, + "language_loss": 0.86978823, + "learning_rate": 0.0005333103375307182, + "loss": 0.88143021, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.81152344, + "step": 2570, + "time_per_iteration": 2.850125551223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159265, + "balance_loss_mlp": 1.07825053, + "epoch": 0.4946133128126202, + "flos": 588718912512.0, + "grad_norm": 0.030887982554767154, + "language_loss": 0.91666126, + "learning_rate": 0.0005329994809049451, + "loss": 0.92825389, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.81005859, + "step": 2571, + "time_per_iteration": 2.716823101043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115557, + "balance_loss_mlp": 1.07460296, + "epoch": 0.4948056944978838, + "flos": 584846164992.0, + "grad_norm": 0.031743542415023744, + "language_loss": 0.93336749, + "learning_rate": 0.0005326886114676375, + "loss": 0.94492316, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.80957031, + "step": 2572, + "time_per_iteration": 2.7895162105560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160915, + "balance_loss_mlp": 1.08004355, + "epoch": 0.49499807618314734, + "flos": 482780860416.0, + "grad_norm": 0.03097072525481985, + "language_loss": 0.93359911, + "learning_rate": 0.0005323777293394854, + "loss": 0.94520825, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.80859375, + "step": 2573, + "time_per_iteration": 2.5428624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161628, + "balance_loss_mlp": 1.08089912, + "epoch": 0.4951904578684109, + "flos": 520037316096.0, + "grad_norm": 0.029847836155631635, + "language_loss": 0.87235224, + "learning_rate": 0.000532066834641184, + "loss": 0.88396853, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.80712891, + "step": 2574, + "time_per_iteration": 2.666405439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116292, + "balance_loss_mlp": 1.08195353, + "epoch": 0.4953828395536745, + "flos": 536577083904.0, + "grad_norm": 0.029607666498307577, + "language_loss": 0.91085738, + "learning_rate": 0.0005317559274934334, + "loss": 0.92248654, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.80957031, + "step": 2575, + "time_per_iteration": 2.694953441619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161488, + "balance_loss_mlp": 1.08056831, + "epoch": 0.49557522123893805, + "flos": 529606393344.0, + "grad_norm": 0.03416750639658743, + "language_loss": 0.87365144, + "learning_rate": 0.0005314450080169382, + "loss": 0.8852663, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.80908203, + "step": 2576, + "time_per_iteration": 2.6648805141448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160753, + "balance_loss_mlp": 1.07973826, + "epoch": 0.49576760292420163, + "flos": 428917507584.0, + "grad_norm": 0.028909192983869472, + "language_loss": 0.86833698, + "learning_rate": 0.0005311340763324083, + "loss": 0.87994456, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.81005859, + "step": 2577, + "time_per_iteration": 2.563143014907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.07945204, + "epoch": 0.49595998460946517, + "flos": 566315629056.0, + "grad_norm": 0.02703431344264104, + "language_loss": 0.87897325, + "learning_rate": 0.0005308231325605578, + "loss": 0.8905803, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.8125, + "step": 2578, + "time_per_iteration": 2.690247058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159003, + "balance_loss_mlp": 1.07746387, + "epoch": 0.49615236629472875, + "flos": 703813807104.0, + "grad_norm": 0.02447176932933424, + "language_loss": 0.81124884, + "learning_rate": 0.0005305121768221061, + "loss": 0.8228389, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.81542969, + "step": 2579, + "time_per_iteration": 3.1026089191436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011698, + "balance_loss_mlp": 1.08969116, + "epoch": 0.4963447479799923, + "flos": 1444752539136.0, + "grad_norm": 0.010536082657862093, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76208121, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.80078125, + "step": 2580, + "time_per_iteration": 4.814293146133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160566, + "balance_loss_mlp": 1.07912242, + "epoch": 0.49653712966525587, + "flos": 538663179264.0, + "grad_norm": 0.027995208065503225, + "language_loss": 0.97084171, + "learning_rate": 0.0005298902299282984, + "loss": 0.98244739, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.81445312, + "step": 2581, + "time_per_iteration": 2.6197092533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115749, + "balance_loss_mlp": 1.07609439, + "epoch": 0.4967295113505194, + "flos": 608395554816.0, + "grad_norm": 0.029727926282221828, + "language_loss": 0.90264994, + "learning_rate": 0.0005295792390144033, + "loss": 0.91422486, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.81396484, + "step": 2582, + "time_per_iteration": 2.6830005645751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156586, + "balance_loss_mlp": 1.07528532, + "epoch": 0.496921893035783, + "flos": 475530192384.0, + "grad_norm": 0.034235181262718475, + "language_loss": 0.90576661, + "learning_rate": 0.0005292682366168294, + "loss": 0.91733253, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.81298828, + "step": 2583, + "time_per_iteration": 2.5291895866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158052, + "balance_loss_mlp": 1.07694244, + "epoch": 0.4971142747210466, + "flos": 598602895872.0, + "grad_norm": 0.029240794220739816, + "language_loss": 0.86485231, + "learning_rate": 0.0005289572228563181, + "loss": 0.8764329, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.81103516, + "step": 2584, + "time_per_iteration": 2.777571678161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159994, + "balance_loss_mlp": 1.0788368, + "epoch": 0.4973066564063101, + "flos": 600734653440.0, + "grad_norm": 0.030481884249605188, + "language_loss": 0.889974, + "learning_rate": 0.000528646197853616, + "loss": 0.90157396, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.81152344, + "step": 2585, + "time_per_iteration": 2.767935276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.08162796, + "epoch": 0.4974990380915737, + "flos": 650768919552.0, + "grad_norm": 0.027212373173769577, + "language_loss": 0.90572929, + "learning_rate": 0.0005283351617294735, + "loss": 0.91735625, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.81054688, + "step": 2586, + "time_per_iteration": 2.890571117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167969, + "balance_loss_mlp": 1.08862305, + "epoch": 0.49769141977683723, + "flos": 1532440032768.0, + "grad_norm": 0.00993779830792852, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77804637, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.79296875, + "step": 2587, + "time_per_iteration": 4.995927095413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116898, + "balance_loss_mlp": 1.08791721, + "epoch": 0.4978838014621008, + "flos": 537397550592.0, + "grad_norm": 0.03215658272946184, + "language_loss": 0.92911154, + "learning_rate": 0.0005277130565998916, + "loss": 0.94080132, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.81054688, + "step": 2588, + "time_per_iteration": 2.717165946960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162431, + "balance_loss_mlp": 1.08122599, + "epoch": 0.49807618314736435, + "flos": 540745271808.0, + "grad_norm": 0.02720148099542, + "language_loss": 0.86777204, + "learning_rate": 0.0005274019878359748, + "loss": 0.87939632, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.81201172, + "step": 2589, + "time_per_iteration": 2.71560001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162135, + "balance_loss_mlp": 1.08088183, + "epoch": 0.49826856483262794, + "flos": 543521577984.0, + "grad_norm": 0.03624054616449923, + "language_loss": 0.92995536, + "learning_rate": 0.0005270909084336628, + "loss": 0.94157672, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.8125, + "step": 2590, + "time_per_iteration": 2.6439368724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165371, + "balance_loss_mlp": 1.08435619, + "epoch": 0.4984609465178915, + "flos": 523360842240.0, + "grad_norm": 0.02994333023587166, + "language_loss": 0.94466031, + "learning_rate": 0.0005267798185137276, + "loss": 0.95631397, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.81005859, + "step": 2591, + "time_per_iteration": 2.6229867935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159677, + "balance_loss_mlp": 1.07851899, + "epoch": 0.49865332820315506, + "flos": 575704785408.0, + "grad_norm": 0.030323117469882623, + "language_loss": 0.94773531, + "learning_rate": 0.0005264687181969444, + "loss": 0.95933211, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.81152344, + "step": 2592, + "time_per_iteration": 2.7226686477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164286, + "balance_loss_mlp": 1.08303344, + "epoch": 0.49884570988841864, + "flos": 1015210497024.0, + "grad_norm": 0.0376584975450282, + "language_loss": 0.82159829, + "learning_rate": 0.0005261576076040937, + "loss": 0.83324111, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.8125, + "step": 2593, + "time_per_iteration": 3.2477946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169307, + "balance_loss_mlp": 1.08843529, + "epoch": 0.4990380915736822, + "flos": 560647497216.0, + "grad_norm": 0.03227625840551658, + "language_loss": 0.90092522, + "learning_rate": 0.0005258464868559591, + "loss": 0.91261828, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.80859375, + "step": 2594, + "time_per_iteration": 2.650367259979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167043, + "balance_loss_mlp": 1.08588493, + "epoch": 0.49923047325894576, + "flos": 499943709696.0, + "grad_norm": 0.030210069947970843, + "language_loss": 0.94528484, + "learning_rate": 0.0005255353560733284, + "loss": 0.95695531, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.81152344, + "step": 2595, + "time_per_iteration": 2.6242079734802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174149, + "balance_loss_mlp": 1.09518433, + "epoch": 0.4994228549442093, + "flos": 1499788194816.0, + "grad_norm": 0.015118012466641684, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76752794, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.7890625, + "step": 2596, + "time_per_iteration": 4.820875883102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116628, + "balance_loss_mlp": 1.08521724, + "epoch": 0.4996152366294729, + "flos": 558513738240.0, + "grad_norm": 0.031441861478263874, + "language_loss": 0.89123356, + "learning_rate": 0.0005249130648877492, + "loss": 0.9028964, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.81054688, + "step": 2597, + "time_per_iteration": 2.71932053565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158102, + "balance_loss_mlp": 1.07699203, + "epoch": 0.4998076183147364, + "flos": 416482801152.0, + "grad_norm": 0.03314289919132309, + "language_loss": 0.90550959, + "learning_rate": 0.0005246019047263953, + "loss": 0.91709059, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.81103516, + "step": 2598, + "time_per_iteration": 2.4899134635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158453, + "balance_loss_mlp": 1.07739091, + "epoch": 0.5, + "flos": 468325186560.0, + "grad_norm": 0.03341299307449988, + "language_loss": 0.88387024, + "learning_rate": 0.0005242907350137353, + "loss": 0.89545476, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.81054688, + "step": 2599, + "time_per_iteration": 2.553997039794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164809, + "balance_loss_mlp": 1.08369899, + "epoch": 0.5001923816852636, + "flos": 483755778048.0, + "grad_norm": 0.03321709561705903, + "language_loss": 0.85543942, + "learning_rate": 0.0005239795558705754, + "loss": 0.86708754, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.81103516, + "step": 2600, + "time_per_iteration": 2.6166868209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164506, + "balance_loss_mlp": 1.08339632, + "epoch": 0.5003847633705272, + "flos": 534855559680.0, + "grad_norm": 0.030012173683065246, + "language_loss": 0.95093107, + "learning_rate": 0.0005236683674177264, + "loss": 0.96257615, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.81103516, + "step": 2601, + "time_per_iteration": 2.6404433250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162684, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5005771450557907, + "flos": 739055299584.0, + "grad_norm": 0.032030290781944436, + "language_loss": 0.88311857, + "learning_rate": 0.0005233571697760021, + "loss": 0.89474535, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.81103516, + "step": 2602, + "time_per_iteration": 2.8534095287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160577, + "balance_loss_mlp": 1.07937133, + "epoch": 0.5007695267410542, + "flos": 780306026496.0, + "grad_norm": 0.036141348793487994, + "language_loss": 0.90016913, + "learning_rate": 0.0005230459630662203, + "loss": 0.91177493, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.81201172, + "step": 2603, + "time_per_iteration": 2.952563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162299, + "balance_loss_mlp": 1.0812366, + "epoch": 0.5009619084263178, + "flos": 624618415104.0, + "grad_norm": 0.03600647163377571, + "language_loss": 0.88813984, + "learning_rate": 0.0005227347474092022, + "loss": 0.89976281, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.81054688, + "step": 2604, + "time_per_iteration": 2.70975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166549, + "balance_loss_mlp": 1.08543897, + "epoch": 0.5011542901115814, + "flos": 532192045056.0, + "grad_norm": 0.023202845192485378, + "language_loss": 0.88172328, + "learning_rate": 0.0005224235229257724, + "loss": 0.89338881, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.81103516, + "step": 2605, + "time_per_iteration": 2.6811788082122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165049, + "balance_loss_mlp": 1.08393872, + "epoch": 0.5013466717968449, + "flos": 528627472896.0, + "grad_norm": 0.02710312658737552, + "language_loss": 0.91735983, + "learning_rate": 0.0005221122897367589, + "loss": 0.92901027, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.81103516, + "step": 2606, + "time_per_iteration": 2.7866344451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115755, + "balance_loss_mlp": 1.07644022, + "epoch": 0.5015390534821085, + "flos": 567088432128.0, + "grad_norm": 0.035852557706828735, + "language_loss": 0.88253903, + "learning_rate": 0.0005218010479629932, + "loss": 0.89411449, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.81103516, + "step": 2607, + "time_per_iteration": 2.7290749549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.07594728, + "epoch": 0.5017314351673721, + "flos": 567767909376.0, + "grad_norm": 0.03266328125205783, + "language_loss": 0.88539654, + "learning_rate": 0.0005214897977253102, + "loss": 0.89696807, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.81201172, + "step": 2608, + "time_per_iteration": 2.695686101913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158723, + "balance_loss_mlp": 1.07751739, + "epoch": 0.5019238168526357, + "flos": 523387038720.0, + "grad_norm": 0.02584859781626205, + "language_loss": 0.88962579, + "learning_rate": 0.0005211785391445473, + "loss": 0.90121305, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.81201172, + "step": 2609, + "time_per_iteration": 2.7320780754089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.07674336, + "epoch": 0.5021161985378992, + "flos": 642636659712.0, + "grad_norm": 0.03213074952610081, + "language_loss": 0.85809815, + "learning_rate": 0.0005208672723415467, + "loss": 0.86967611, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.81054688, + "step": 2610, + "time_per_iteration": 2.8137152194976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115836, + "balance_loss_mlp": 1.07729781, + "epoch": 0.5023085802231627, + "flos": 592422472704.0, + "grad_norm": 0.03276582898634011, + "language_loss": 0.85898113, + "learning_rate": 0.0005205559974371525, + "loss": 0.8705647, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.81054688, + "step": 2611, + "time_per_iteration": 2.7611584663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158271, + "balance_loss_mlp": 1.07720828, + "epoch": 0.5025009619084263, + "flos": 473333306880.0, + "grad_norm": 0.02842666355233711, + "language_loss": 0.86990851, + "learning_rate": 0.0005202447145522123, + "loss": 0.88149118, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.81054688, + "step": 2612, + "time_per_iteration": 2.6646487712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161741, + "balance_loss_mlp": 1.08067882, + "epoch": 0.5026933435936899, + "flos": 456077131776.0, + "grad_norm": 0.031223796902704184, + "language_loss": 0.84174728, + "learning_rate": 0.0005199334238075769, + "loss": 0.85336471, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.81054688, + "step": 2613, + "time_per_iteration": 2.567990779876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163025, + "balance_loss_mlp": 1.08229649, + "epoch": 0.5028857252789535, + "flos": 492721239552.0, + "grad_norm": 0.02841040015147714, + "language_loss": 0.97840261, + "learning_rate": 0.0005196221253241, + "loss": 0.99003285, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.80712891, + "step": 2614, + "time_per_iteration": 2.5584659576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160421, + "balance_loss_mlp": 1.07988286, + "epoch": 0.503078106964217, + "flos": 626730706944.0, + "grad_norm": 0.03241817920698289, + "language_loss": 0.88891315, + "learning_rate": 0.0005193108192226383, + "loss": 0.90051734, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.80517578, + "step": 2615, + "time_per_iteration": 2.7840871810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164078, + "balance_loss_mlp": 1.0830152, + "epoch": 0.5032704886494805, + "flos": 580137487872.0, + "grad_norm": 0.02867464613296787, + "language_loss": 0.91759968, + "learning_rate": 0.000518999505624052, + "loss": 0.92924047, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.81054688, + "step": 2616, + "time_per_iteration": 2.6807193756103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161331, + "balance_loss_mlp": 1.08017337, + "epoch": 0.5034628703347441, + "flos": 472845210624.0, + "grad_norm": 0.027070743385767714, + "language_loss": 0.8816672, + "learning_rate": 0.000518688184649203, + "loss": 0.89328051, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.81152344, + "step": 2617, + "time_per_iteration": 2.7943994998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159886, + "balance_loss_mlp": 1.07877576, + "epoch": 0.5036552520200077, + "flos": 490813063680.0, + "grad_norm": 0.03074056287258418, + "language_loss": 0.88926733, + "learning_rate": 0.0005183768564189577, + "loss": 0.90086615, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.81103516, + "step": 2618, + "time_per_iteration": 2.549255609512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.07860434, + "epoch": 0.5038476337052713, + "flos": 495215566848.0, + "grad_norm": 0.030783318052010424, + "language_loss": 0.87459326, + "learning_rate": 0.0005180655210541838, + "loss": 0.88619089, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.81152344, + "step": 2619, + "time_per_iteration": 2.5555741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157127, + "balance_loss_mlp": 1.0759213, + "epoch": 0.5040400153905348, + "flos": 601739770368.0, + "grad_norm": 0.036447475930772646, + "language_loss": 0.89893603, + "learning_rate": 0.0005177541786757527, + "loss": 0.91050732, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.81201172, + "step": 2620, + "time_per_iteration": 2.75068998336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157658, + "balance_loss_mlp": 1.07621455, + "epoch": 0.5042323970757984, + "flos": 812918932992.0, + "grad_norm": 0.03476449221513998, + "language_loss": 0.90274507, + "learning_rate": 0.000517442829404538, + "loss": 0.91432166, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.81445312, + "step": 2621, + "time_per_iteration": 2.981661558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.07854116, + "epoch": 0.504424778761062, + "flos": 628606682112.0, + "grad_norm": 0.030074963346690586, + "language_loss": 0.92839754, + "learning_rate": 0.0005171314733614166, + "loss": 0.93999791, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.81494141, + "step": 2622, + "time_per_iteration": 2.942354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160933, + "balance_loss_mlp": 1.07934618, + "epoch": 0.5046171604463255, + "flos": 516956837376.0, + "grad_norm": 0.029806335990833818, + "language_loss": 0.84097135, + "learning_rate": 0.0005168201106672671, + "loss": 0.85258067, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.81591797, + "step": 2623, + "time_per_iteration": 2.7703733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160048, + "balance_loss_mlp": 1.07841325, + "epoch": 0.504809542131589, + "flos": 528853056000.0, + "grad_norm": 0.03248441490058616, + "language_loss": 0.91679412, + "learning_rate": 0.0005165087414429717, + "loss": 0.92839456, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.81640625, + "step": 2624, + "time_per_iteration": 2.620872974395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116106, + "balance_loss_mlp": 1.07937741, + "epoch": 0.5050019238168526, + "flos": 555174749184.0, + "grad_norm": 0.03119977790816051, + "language_loss": 0.88980711, + "learning_rate": 0.0005161973658094144, + "loss": 0.90141767, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.81689453, + "step": 2625, + "time_per_iteration": 2.640408754348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161049, + "balance_loss_mlp": 1.07955778, + "epoch": 0.5051943055021162, + "flos": 575928367104.0, + "grad_norm": 0.024986408688213266, + "language_loss": 0.88551366, + "learning_rate": 0.000515885983887482, + "loss": 0.89712417, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.81494141, + "step": 2626, + "time_per_iteration": 2.7737276554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161913, + "balance_loss_mlp": 1.08066046, + "epoch": 0.5053866871873798, + "flos": 497681696256.0, + "grad_norm": 0.03126501141119064, + "language_loss": 0.91551393, + "learning_rate": 0.0005155745957980636, + "loss": 0.92713308, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.8125, + "step": 2627, + "time_per_iteration": 2.5588245391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159267, + "balance_loss_mlp": 1.07801354, + "epoch": 0.5055790688726434, + "flos": 503219572224.0, + "grad_norm": 0.028407663328603422, + "language_loss": 0.94095421, + "learning_rate": 0.000515263201662051, + "loss": 0.95254695, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.8125, + "step": 2628, + "time_per_iteration": 2.6333348751068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115977, + "balance_loss_mlp": 1.07851708, + "epoch": 0.5057714505579068, + "flos": 846767268864.0, + "grad_norm": 0.025627158908879104, + "language_loss": 0.8802768, + "learning_rate": 0.0005149518016003378, + "loss": 0.89187449, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.8125, + "step": 2629, + "time_per_iteration": 3.159515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115843, + "balance_loss_mlp": 1.07722509, + "epoch": 0.5059638322431704, + "flos": 498808336896.0, + "grad_norm": 0.032654832965012745, + "language_loss": 0.88445461, + "learning_rate": 0.0005146403957338206, + "loss": 0.89603889, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.81201172, + "step": 2630, + "time_per_iteration": 2.569671154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166774, + "balance_loss_mlp": 1.08571208, + "epoch": 0.506156213928434, + "flos": 619113466368.0, + "grad_norm": 0.027165343024338446, + "language_loss": 0.86742038, + "learning_rate": 0.0005143289841833975, + "loss": 0.8790881, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.81054688, + "step": 2631, + "time_per_iteration": 2.8505327701568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169752, + "balance_loss_mlp": 1.08911932, + "epoch": 0.5063485956136976, + "flos": 425789365248.0, + "grad_norm": 0.03495904047465476, + "language_loss": 0.89354646, + "learning_rate": 0.0005140175670699696, + "loss": 0.90524399, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.80615234, + "step": 2632, + "time_per_iteration": 2.5920779705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174002, + "balance_loss_mlp": 1.09341669, + "epoch": 0.5065409772989612, + "flos": 571069968384.0, + "grad_norm": 0.02494402323857881, + "language_loss": 0.86924809, + "learning_rate": 0.0005137061445144395, + "loss": 0.88098812, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.80566406, + "step": 2633, + "time_per_iteration": 2.8890433311462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172992, + "balance_loss_mlp": 1.09250152, + "epoch": 0.5067333589842247, + "flos": 629969639424.0, + "grad_norm": 0.03395805639170181, + "language_loss": 0.93242514, + "learning_rate": 0.000513394716637712, + "loss": 0.94415504, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.8046875, + "step": 2634, + "time_per_iteration": 2.7772305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171906, + "balance_loss_mlp": 1.09217834, + "epoch": 0.5069257406694883, + "flos": 1451096145408.0, + "grad_norm": 0.011960900894201355, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80363613, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.796875, + "step": 2635, + "time_per_iteration": 4.93586802482605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116392, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5071181223547518, + "flos": 640057738752.0, + "grad_norm": 0.03273720191955115, + "language_loss": 0.86367166, + "learning_rate": 0.0005127718454042958, + "loss": 0.87531078, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.80664062, + "step": 2636, + "time_per_iteration": 2.8407700061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115771, + "balance_loss_mlp": 1.07683849, + "epoch": 0.5073105040400154, + "flos": 714872094720.0, + "grad_norm": 0.03167408399625075, + "language_loss": 0.89809334, + "learning_rate": 0.0005124604022894269, + "loss": 0.90967047, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.80859375, + "step": 2637, + "time_per_iteration": 2.9438648223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.08575439, + "epoch": 0.5075028857252789, + "flos": 1439612161536.0, + "grad_norm": 0.009234713476178756, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78353328, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.79296875, + "step": 2638, + "time_per_iteration": 4.855467319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170042, + "balance_loss_mlp": 1.08950412, + "epoch": 0.5076952674105425, + "flos": 572307399168.0, + "grad_norm": 0.033371281415520225, + "language_loss": 0.89923447, + "learning_rate": 0.0005118375016679325, + "loss": 0.91093493, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.80517578, + "step": 2639, + "time_per_iteration": 2.7761123180389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168126, + "balance_loss_mlp": 1.08735013, + "epoch": 0.5078876490958061, + "flos": 517712176128.0, + "grad_norm": 0.04218063889538898, + "language_loss": 0.87796986, + "learning_rate": 0.0005115260444031382, + "loss": 0.88965112, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.80761719, + "step": 2640, + "time_per_iteration": 2.5914742946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164741, + "balance_loss_mlp": 1.08596802, + "epoch": 0.5080800307810697, + "flos": 1587619405824.0, + "grad_norm": 0.012463066852979446, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79896557, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.78710938, + "step": 2641, + "time_per_iteration": 4.9428391456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164884, + "balance_loss_mlp": 1.08420289, + "epoch": 0.5082724124663333, + "flos": 486186978816.0, + "grad_norm": 0.039006057605032056, + "language_loss": 0.93060952, + "learning_rate": 0.0005109031165700483, + "loss": 0.94225836, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.80664062, + "step": 2642, + "time_per_iteration": 2.5630409717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164249, + "balance_loss_mlp": 1.08318675, + "epoch": 0.5084647941515967, + "flos": 683442224640.0, + "grad_norm": 0.03324563219825503, + "language_loss": 0.88873887, + "learning_rate": 0.0005105916462435945, + "loss": 0.90038145, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.81054688, + "step": 2643, + "time_per_iteration": 2.8135592937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165445, + "balance_loss_mlp": 1.08438289, + "epoch": 0.5086571758368603, + "flos": 549812791296.0, + "grad_norm": 0.031221131167697595, + "language_loss": 0.92092431, + "learning_rate": 0.0005102801718050989, + "loss": 0.93257874, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.81054688, + "step": 2644, + "time_per_iteration": 2.684957981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.08413339, + "epoch": 0.5088495575221239, + "flos": 565078198272.0, + "grad_norm": 0.032204925975490975, + "language_loss": 0.95189679, + "learning_rate": 0.0005099686933754867, + "loss": 0.96354735, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.80908203, + "step": 2645, + "time_per_iteration": 2.6721112728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167263, + "balance_loss_mlp": 1.08620095, + "epoch": 0.5090419392073875, + "flos": 552511234560.0, + "grad_norm": 0.03332524240735616, + "language_loss": 0.90223062, + "learning_rate": 0.0005096572110756845, + "loss": 0.9139033, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.81054688, + "step": 2646, + "time_per_iteration": 2.6559739112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.08686149, + "epoch": 0.509234320892651, + "flos": 568883816448.0, + "grad_norm": 0.029529111031728714, + "language_loss": 0.90596855, + "learning_rate": 0.0005093457250266205, + "loss": 0.91764688, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.80957031, + "step": 2647, + "time_per_iteration": 2.7653987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167889, + "balance_loss_mlp": 1.08673143, + "epoch": 0.5094267025779146, + "flos": 583693327872.0, + "grad_norm": 0.03457257756125772, + "language_loss": 0.89727396, + "learning_rate": 0.000509034235349224, + "loss": 0.90895277, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.81152344, + "step": 2648, + "time_per_iteration": 2.690363645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159753, + "balance_loss_mlp": 1.07854819, + "epoch": 0.5096190842631781, + "flos": 593138880000.0, + "grad_norm": 0.0341546457293008, + "language_loss": 0.88255095, + "learning_rate": 0.0005087227421644266, + "loss": 0.89414853, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.81201172, + "step": 2649, + "time_per_iteration": 2.6982481479644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.07891166, + "epoch": 0.5098114659484417, + "flos": 514584033792.0, + "grad_norm": 0.030485361797949893, + "language_loss": 0.92298341, + "learning_rate": 0.0005084112455931602, + "loss": 0.93458325, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.81054688, + "step": 2650, + "time_per_iteration": 2.5739448070526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162291, + "balance_loss_mlp": 1.08170521, + "epoch": 0.5100038476337053, + "flos": 485600827392.0, + "grad_norm": 0.03052985498468287, + "language_loss": 0.91529775, + "learning_rate": 0.0005080997457563586, + "loss": 0.92692065, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.80566406, + "step": 2651, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165638, + "balance_loss_mlp": 1.08514845, + "epoch": 0.5101962293189688, + "flos": 462554996736.0, + "grad_norm": 0.037278277228963375, + "language_loss": 0.86181092, + "learning_rate": 0.0005077882427749569, + "loss": 0.87346727, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.8046875, + "step": 2652, + "time_per_iteration": 2.490943670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158092, + "balance_loss_mlp": 1.07745898, + "epoch": 0.5103886110042324, + "flos": 588132761088.0, + "grad_norm": 0.03182463194953253, + "language_loss": 0.91334021, + "learning_rate": 0.0005074767367698913, + "loss": 0.9249211, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.80615234, + "step": 2653, + "time_per_iteration": 2.6900839805603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.07847476, + "epoch": 0.510580992689496, + "flos": 846677945856.0, + "grad_norm": 0.027057922805634398, + "language_loss": 0.89024949, + "learning_rate": 0.0005071652278620988, + "loss": 0.90184009, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.80566406, + "step": 2654, + "time_per_iteration": 3.044296979904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115973, + "balance_loss_mlp": 1.07919204, + "epoch": 0.5107733743747596, + "flos": 659810242560.0, + "grad_norm": 0.0315385737613105, + "language_loss": 0.89305294, + "learning_rate": 0.0005068537161725186, + "loss": 0.90465021, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.80517578, + "step": 2655, + "time_per_iteration": 2.770669937133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160641, + "balance_loss_mlp": 1.08000755, + "epoch": 0.510965756060023, + "flos": 702960413184.0, + "grad_norm": 0.03531630249392906, + "language_loss": 0.91070223, + "learning_rate": 0.0005065422018220893, + "loss": 0.92230862, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.80615234, + "step": 2656, + "time_per_iteration": 2.833031177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165495, + "balance_loss_mlp": 1.08490956, + "epoch": 0.5111581377452866, + "flos": 560940936192.0, + "grad_norm": 0.03615724120857576, + "language_loss": 0.85921729, + "learning_rate": 0.0005062306849317521, + "loss": 0.87087226, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.80566406, + "step": 2657, + "time_per_iteration": 2.800971031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.0873909, + "epoch": 0.5113505194305502, + "flos": 610145276928.0, + "grad_norm": 0.029932060678028026, + "language_loss": 0.88435352, + "learning_rate": 0.0005059191656224487, + "loss": 0.89603281, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.80517578, + "step": 2658, + "time_per_iteration": 2.7075443267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159414, + "balance_loss_mlp": 1.07882822, + "epoch": 0.5115429011158138, + "flos": 535535036928.0, + "grad_norm": 0.028231439832000826, + "language_loss": 0.94975483, + "learning_rate": 0.0005056076440151212, + "loss": 0.96134901, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.80566406, + "step": 2659, + "time_per_iteration": 2.6906392574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162323, + "balance_loss_mlp": 1.0835495, + "epoch": 0.5117352828010774, + "flos": 1365273166848.0, + "grad_norm": 0.00971890017277948, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77450442, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.78515625, + "step": 2660, + "time_per_iteration": 4.880187273025513 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160968, + "balance_loss_mlp": 1.07990551, + "epoch": 0.5119276644863409, + "flos": 634930096128.0, + "grad_norm": 0.027317751888226913, + "language_loss": 0.91815728, + "learning_rate": 0.0005049845943901691, + "loss": 0.92976695, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.81054688, + "step": 2661, + "time_per_iteration": 2.8184986114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160969, + "balance_loss_mlp": 1.08004987, + "epoch": 0.5121200461716044, + "flos": 586780537344.0, + "grad_norm": 0.02944382500923868, + "language_loss": 0.91654462, + "learning_rate": 0.0005046730666144338, + "loss": 0.92815423, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.80908203, + "step": 2662, + "time_per_iteration": 2.755974769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.0798161, + "epoch": 0.512312427856868, + "flos": 1034223124992.0, + "grad_norm": 0.029507171441845153, + "language_loss": 0.93013144, + "learning_rate": 0.0005043615370244532, + "loss": 0.94174021, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.81054688, + "step": 2663, + "time_per_iteration": 3.3488211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177849, + "balance_loss_mlp": 1.09907532, + "epoch": 0.5125048095421316, + "flos": 1540899207168.0, + "grad_norm": 0.013662934984579522, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79422235, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.78710938, + "step": 2664, + "time_per_iteration": 4.6237993240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162223, + "balance_loss_mlp": 1.08130419, + "epoch": 0.5126971912273951, + "flos": 592327145472.0, + "grad_norm": 0.024418914459260154, + "language_loss": 0.89686567, + "learning_rate": 0.0005037384728855425, + "loss": 0.90848792, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.80908203, + "step": 2665, + "time_per_iteration": 2.8003761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163774, + "balance_loss_mlp": 1.08299828, + "epoch": 0.5128895729126587, + "flos": 552717351936.0, + "grad_norm": 0.03867267783646357, + "language_loss": 0.9114759, + "learning_rate": 0.0005034269385785075, + "loss": 0.9231137, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.80761719, + "step": 2666, + "time_per_iteration": 2.664607286453247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161159, + "balance_loss_mlp": 1.08047831, + "epoch": 0.5130819545979223, + "flos": 482231639040.0, + "grad_norm": 0.037339426134761385, + "language_loss": 0.92204285, + "learning_rate": 0.0005031154029410168, + "loss": 0.93365449, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.80664062, + "step": 2667, + "time_per_iteration": 2.5419206619262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157708, + "balance_loss_mlp": 1.0769316, + "epoch": 0.5132743362831859, + "flos": 476767623168.0, + "grad_norm": 0.03576788906651519, + "language_loss": 0.93073893, + "learning_rate": 0.0005028038660940197, + "loss": 0.942316, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.80761719, + "step": 2668, + "time_per_iteration": 2.5499191284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166102, + "balance_loss_mlp": 1.08542132, + "epoch": 0.5134667179684494, + "flos": 504902164992.0, + "grad_norm": 0.02981054719592371, + "language_loss": 0.89144588, + "learning_rate": 0.0005024923281584648, + "loss": 0.90310693, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.80664062, + "step": 2669, + "time_per_iteration": 2.6367011070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165888, + "balance_loss_mlp": 1.08496881, + "epoch": 0.5136590996537129, + "flos": 505004222976.0, + "grad_norm": 0.029270286325536108, + "language_loss": 0.87695622, + "learning_rate": 0.0005021807892553026, + "loss": 0.88861501, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.80908203, + "step": 2670, + "time_per_iteration": 2.697326421737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165807, + "balance_loss_mlp": 1.08522201, + "epoch": 0.5138514813389765, + "flos": 625799450112.0, + "grad_norm": 0.029434336289691197, + "language_loss": 0.8977018, + "learning_rate": 0.0005018692495054828, + "loss": 0.90935987, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.80566406, + "step": 2671, + "time_per_iteration": 2.848576784133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154912, + "balance_loss_mlp": 1.07394516, + "epoch": 0.5140438630242401, + "flos": 584633316864.0, + "grad_norm": 0.027486728027613972, + "language_loss": 0.85466325, + "learning_rate": 0.0005015577090299561, + "loss": 0.86621237, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.80957031, + "step": 2672, + "time_per_iteration": 2.698976993560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155424, + "balance_loss_mlp": 1.0744096, + "epoch": 0.5142362447095037, + "flos": 488904887808.0, + "grad_norm": 0.030629892529963922, + "language_loss": 0.92615306, + "learning_rate": 0.0005012461679496729, + "loss": 0.9377073, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.81005859, + "step": 2673, + "time_per_iteration": 2.5998294353485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115671, + "balance_loss_mlp": 1.07564759, + "epoch": 0.5144286263947672, + "flos": 527884869120.0, + "grad_norm": 0.029257555563523763, + "language_loss": 0.93652987, + "learning_rate": 0.0005009346263855848, + "loss": 0.94809699, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.81054688, + "step": 2674, + "time_per_iteration": 2.702364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156546, + "balance_loss_mlp": 1.07548332, + "epoch": 0.5146210080800308, + "flos": 487589594112.0, + "grad_norm": 0.025826040346785265, + "language_loss": 0.88576883, + "learning_rate": 0.0005006230844586422, + "loss": 0.89733428, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.81054688, + "step": 2675, + "time_per_iteration": 2.7889058589935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159564, + "balance_loss_mlp": 1.07845449, + "epoch": 0.5148133897652943, + "flos": 516974301696.0, + "grad_norm": 0.025127862595781116, + "language_loss": 0.83195055, + "learning_rate": 0.0005003115422897968, + "loss": 0.84354615, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.81103516, + "step": 2676, + "time_per_iteration": 2.7474374771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165139, + "balance_loss_mlp": 1.08436286, + "epoch": 0.5150057714505579, + "flos": 512211230208.0, + "grad_norm": 0.02805317572608274, + "language_loss": 0.92311704, + "learning_rate": 0.0005, + "loss": 0.93476844, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.80761719, + "step": 2677, + "time_per_iteration": 2.635801076889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167536, + "balance_loss_mlp": 1.08652139, + "epoch": 0.5151981531358215, + "flos": 912389853696.0, + "grad_norm": 0.03671017270530106, + "language_loss": 0.86270726, + "learning_rate": 0.0004996884577102033, + "loss": 0.87438262, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.81005859, + "step": 2678, + "time_per_iteration": 3.1016898155212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116356, + "balance_loss_mlp": 1.08264065, + "epoch": 0.515390534821085, + "flos": 472929804288.0, + "grad_norm": 0.02746999857609634, + "language_loss": 0.90178144, + "learning_rate": 0.000499376915541358, + "loss": 0.91341698, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.80908203, + "step": 2679, + "time_per_iteration": 2.7041540145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163327, + "balance_loss_mlp": 1.0826937, + "epoch": 0.5155829165063486, + "flos": 651357072384.0, + "grad_norm": 0.02786171231522906, + "language_loss": 0.85589147, + "learning_rate": 0.0004990653736144155, + "loss": 0.86752468, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.80615234, + "step": 2680, + "time_per_iteration": 2.883392572402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163916, + "balance_loss_mlp": 1.08280623, + "epoch": 0.5157752981916122, + "flos": 415160776704.0, + "grad_norm": 0.030701546031170052, + "language_loss": 0.92331398, + "learning_rate": 0.0004987538320503271, + "loss": 0.93495315, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.81103516, + "step": 2681, + "time_per_iteration": 2.4719676971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169918, + "balance_loss_mlp": 1.0890938, + "epoch": 0.5159676798768758, + "flos": 554931701760.0, + "grad_norm": 0.03041903817165714, + "language_loss": 0.89793313, + "learning_rate": 0.0004984422909700442, + "loss": 0.90963233, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.80810547, + "step": 2682, + "time_per_iteration": 2.7486019134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168893, + "balance_loss_mlp": 1.08816493, + "epoch": 0.5161600615621393, + "flos": 587620469760.0, + "grad_norm": 0.02833679783776788, + "language_loss": 0.89197505, + "learning_rate": 0.0004981307504945173, + "loss": 0.90366399, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.80712891, + "step": 2683, + "time_per_iteration": 2.6918153762817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161385, + "balance_loss_mlp": 1.08060837, + "epoch": 0.5163524432474028, + "flos": 589947611136.0, + "grad_norm": 0.03153559446680845, + "language_loss": 0.9527353, + "learning_rate": 0.0004978192107446976, + "loss": 0.96434915, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.80761719, + "step": 2684, + "time_per_iteration": 2.7622218132019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.07906806, + "epoch": 0.5165448249326664, + "flos": 504904166400.0, + "grad_norm": 0.029863924033148703, + "language_loss": 0.92634213, + "learning_rate": 0.0004975076718415353, + "loss": 0.93793911, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.80615234, + "step": 2685, + "time_per_iteration": 2.644228219985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172411, + "balance_loss_mlp": 1.09220684, + "epoch": 0.51673720661793, + "flos": 417646371840.0, + "grad_norm": 0.031084732221220036, + "language_loss": 0.95470178, + "learning_rate": 0.0004971961339059806, + "loss": 0.96642584, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.80175781, + "step": 2686, + "time_per_iteration": 2.469081401824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160009, + "balance_loss_mlp": 1.0795666, + "epoch": 0.5169295883031936, + "flos": 600074641920.0, + "grad_norm": 0.03147701291149863, + "language_loss": 0.89665824, + "learning_rate": 0.0004968845970589832, + "loss": 0.90825832, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.80419922, + "step": 2687, + "time_per_iteration": 2.7054736614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159105, + "balance_loss_mlp": 1.07847178, + "epoch": 0.517121969988457, + "flos": 557910122496.0, + "grad_norm": 0.03772331123991374, + "language_loss": 0.90882772, + "learning_rate": 0.0004965730614214926, + "loss": 0.92041886, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.80615234, + "step": 2688, + "time_per_iteration": 2.6433985233306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159068, + "balance_loss_mlp": 1.0787214, + "epoch": 0.5173143516737206, + "flos": 470374351872.0, + "grad_norm": 0.031353493154565384, + "language_loss": 0.9113276, + "learning_rate": 0.0004962615271144576, + "loss": 0.92291832, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.80322266, + "step": 2689, + "time_per_iteration": 2.5081796646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159566, + "balance_loss_mlp": 1.07912409, + "epoch": 0.5175067333589842, + "flos": 721378157568.0, + "grad_norm": 0.03531118205346665, + "language_loss": 0.88785195, + "learning_rate": 0.0004959499942588264, + "loss": 0.89944768, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.80419922, + "step": 2690, + "time_per_iteration": 2.8977034091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165977, + "balance_loss_mlp": 1.08682251, + "epoch": 0.5176991150442478, + "flos": 1469341974528.0, + "grad_norm": 0.00940812354228104, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79365999, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.79101562, + "step": 2691, + "time_per_iteration": 4.744166851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162816, + "balance_loss_mlp": 1.08227849, + "epoch": 0.5178914967295114, + "flos": 613783709184.0, + "grad_norm": 0.0285194405600695, + "language_loss": 0.91181535, + "learning_rate": 0.0004953269333855661, + "loss": 0.92344356, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.80517578, + "step": 2692, + "time_per_iteration": 2.7305634021759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164372, + "balance_loss_mlp": 1.0839293, + "epoch": 0.5180838784147749, + "flos": 501980140032.0, + "grad_norm": 0.03457473418848995, + "language_loss": 0.89626956, + "learning_rate": 0.0004950154056098309, + "loss": 0.90791321, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.80419922, + "step": 2693, + "time_per_iteration": 2.7358009815216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162458, + "balance_loss_mlp": 1.08215868, + "epoch": 0.5182762601000385, + "flos": 690041613312.0, + "grad_norm": 0.03333155233389222, + "language_loss": 0.90543425, + "learning_rate": 0.0004947038797692867, + "loss": 0.91705889, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.80273438, + "step": 2694, + "time_per_iteration": 2.8636367321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178055, + "balance_loss_mlp": 1.09775615, + "epoch": 0.518468641785302, + "flos": 666800398848.0, + "grad_norm": 0.03410817354988479, + "language_loss": 0.8335048, + "learning_rate": 0.0004943923559848789, + "loss": 0.84528536, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.80273438, + "step": 2695, + "time_per_iteration": 2.797072172164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117824, + "balance_loss_mlp": 1.09794104, + "epoch": 0.5186610234705656, + "flos": 567813571584.0, + "grad_norm": 0.02729227458516312, + "language_loss": 0.95474803, + "learning_rate": 0.0004940808343775515, + "loss": 0.96653044, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.80273438, + "step": 2696, + "time_per_iteration": 2.6839044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162534, + "balance_loss_mlp": 1.08204436, + "epoch": 0.5188534051558291, + "flos": 429792368640.0, + "grad_norm": 0.03355790964159957, + "language_loss": 0.87542081, + "learning_rate": 0.0004937693150682479, + "loss": 0.88704622, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.8046875, + "step": 2697, + "time_per_iteration": 2.5123825073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08045113, + "epoch": 0.5190457868410927, + "flos": 547411789824.0, + "grad_norm": 0.031455242836056954, + "language_loss": 0.81813598, + "learning_rate": 0.0004934577981779107, + "loss": 0.82974923, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.80859375, + "step": 2698, + "time_per_iteration": 2.662545919418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117247, + "balance_loss_mlp": 1.09159839, + "epoch": 0.5192381685263563, + "flos": 549745661952.0, + "grad_norm": 0.02804159255629041, + "language_loss": 0.86178321, + "learning_rate": 0.0004931462838274817, + "loss": 0.87350786, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.80859375, + "step": 2699, + "time_per_iteration": 2.877682685852051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172435, + "balance_loss_mlp": 1.09156311, + "epoch": 0.5194305502116199, + "flos": 576349334016.0, + "grad_norm": 0.03885998177020277, + "language_loss": 0.90400088, + "learning_rate": 0.0004928347721379011, + "loss": 0.91572523, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.80859375, + "step": 2700, + "time_per_iteration": 2.671849489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169932, + "balance_loss_mlp": 1.08906007, + "epoch": 0.5196229318968835, + "flos": 435217453056.0, + "grad_norm": 0.030583901836551724, + "language_loss": 0.87633044, + "learning_rate": 0.0004925232632301089, + "loss": 0.88802975, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.80859375, + "step": 2701, + "time_per_iteration": 2.57857608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166407, + "balance_loss_mlp": 1.08558309, + "epoch": 0.5198153135821469, + "flos": 559985484288.0, + "grad_norm": 0.03187287566803064, + "language_loss": 0.85556304, + "learning_rate": 0.0004922117572250431, + "loss": 0.86722708, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.80810547, + "step": 2702, + "time_per_iteration": 2.7037737369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166773, + "balance_loss_mlp": 1.08618808, + "epoch": 0.5200076952674105, + "flos": 566834651136.0, + "grad_norm": 0.03219739559056917, + "language_loss": 0.8641057, + "learning_rate": 0.0004919002542436414, + "loss": 0.87577343, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.80566406, + "step": 2703, + "time_per_iteration": 2.8919363021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08965361, + "epoch": 0.5202000769526741, + "flos": 572272470528.0, + "grad_norm": 0.0327510509858114, + "language_loss": 0.87948251, + "learning_rate": 0.0004915887544068399, + "loss": 0.89118207, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.80273438, + "step": 2704, + "time_per_iteration": 2.6497535705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169089, + "balance_loss_mlp": 1.08869386, + "epoch": 0.5203924586379377, + "flos": 695466697728.0, + "grad_norm": 0.02924473313894461, + "language_loss": 0.83824521, + "learning_rate": 0.0004912772578355736, + "loss": 0.84993607, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.80371094, + "step": 2705, + "time_per_iteration": 2.8862009048461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163429, + "balance_loss_mlp": 1.08274853, + "epoch": 0.5205848403232012, + "flos": 567690046464.0, + "grad_norm": 0.031189936278329552, + "language_loss": 0.88606453, + "learning_rate": 0.000490965764650776, + "loss": 0.89769882, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.80664062, + "step": 2706, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163571, + "balance_loss_mlp": 1.08308065, + "epoch": 0.5207772220084648, + "flos": 1216204231680.0, + "grad_norm": 0.03053180986383906, + "language_loss": 0.8816222, + "learning_rate": 0.0004906542749733798, + "loss": 0.89325786, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.8046875, + "step": 2707, + "time_per_iteration": 3.6396875381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162365, + "balance_loss_mlp": 1.08197033, + "epoch": 0.5209696036937284, + "flos": 594031205376.0, + "grad_norm": 0.027334962594272247, + "language_loss": 0.90568572, + "learning_rate": 0.0004903427889243156, + "loss": 0.91730928, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.80371094, + "step": 2708, + "time_per_iteration": 2.853013753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116169, + "balance_loss_mlp": 1.08129489, + "epoch": 0.5211619853789919, + "flos": 523955725824.0, + "grad_norm": 0.032301377197285666, + "language_loss": 0.91200471, + "learning_rate": 0.0004900313066245134, + "loss": 0.92362165, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.80371094, + "step": 2709, + "time_per_iteration": 2.706407070159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161577, + "balance_loss_mlp": 1.08146846, + "epoch": 0.5213543670642555, + "flos": 503860118016.0, + "grad_norm": 0.02918491733204221, + "language_loss": 0.86683327, + "learning_rate": 0.0004897198281949012, + "loss": 0.87844902, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.80078125, + "step": 2710, + "time_per_iteration": 2.6603598594665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.07654023, + "epoch": 0.521546748749519, + "flos": 587071248384.0, + "grad_norm": 0.0328837537508598, + "language_loss": 0.84538651, + "learning_rate": 0.0004894083537564057, + "loss": 0.85695493, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.80273438, + "step": 2711, + "time_per_iteration": 2.740659236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159708, + "balance_loss_mlp": 1.07955158, + "epoch": 0.5217391304347826, + "flos": 571265352192.0, + "grad_norm": 0.028894041826031003, + "language_loss": 0.85799223, + "learning_rate": 0.0004890968834299519, + "loss": 0.86958933, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.80126953, + "step": 2712, + "time_per_iteration": 2.7206225395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157432, + "balance_loss_mlp": 1.077371, + "epoch": 0.5219315121200462, + "flos": 543919076352.0, + "grad_norm": 0.029763432747936528, + "language_loss": 0.83741677, + "learning_rate": 0.0004887854173364633, + "loss": 0.84899104, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.80029297, + "step": 2713, + "time_per_iteration": 2.737755060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160512, + "balance_loss_mlp": 1.08097565, + "epoch": 0.5221238938053098, + "flos": 551530312704.0, + "grad_norm": 0.028214516718367867, + "language_loss": 0.86704654, + "learning_rate": 0.0004884739555968617, + "loss": 0.87865162, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.79492188, + "step": 2714, + "time_per_iteration": 2.872819185256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168823, + "balance_loss_mlp": 1.09100342, + "epoch": 0.5223162754905732, + "flos": 1358389797888.0, + "grad_norm": 0.012476009787944744, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80145878, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.77539062, + "step": 2715, + "time_per_iteration": 4.96741795539856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170398, + "balance_loss_mlp": 1.09028971, + "epoch": 0.5225086571758368, + "flos": 568973139456.0, + "grad_norm": 0.03267804467904664, + "language_loss": 0.92675197, + "learning_rate": 0.0004878510456629992, + "loss": 0.93845594, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.80078125, + "step": 2716, + "time_per_iteration": 2.9626121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160651, + "balance_loss_mlp": 1.08054268, + "epoch": 0.5227010388611004, + "flos": 501135478272.0, + "grad_norm": 0.033781088666230946, + "language_loss": 0.9089278, + "learning_rate": 0.00048753959771057314, + "loss": 0.92053425, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.80078125, + "step": 2717, + "time_per_iteration": 2.611691951751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157135, + "balance_loss_mlp": 1.07702601, + "epoch": 0.522893420546364, + "flos": 598798279680.0, + "grad_norm": 0.032963356718883376, + "language_loss": 0.88626194, + "learning_rate": 0.0004872281545957044, + "loss": 0.89783323, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.80078125, + "step": 2718, + "time_per_iteration": 2.7218518257141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116303, + "balance_loss_mlp": 1.08287394, + "epoch": 0.5230858022316276, + "flos": 665921534976.0, + "grad_norm": 0.02884991307967795, + "language_loss": 0.91186881, + "learning_rate": 0.0004869167164393055, + "loss": 0.92349917, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.80126953, + "step": 2719, + "time_per_iteration": 2.932335376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164195, + "balance_loss_mlp": 1.08403885, + "epoch": 0.5232781839168911, + "flos": 605033097216.0, + "grad_norm": 0.02708280335676697, + "language_loss": 0.94493294, + "learning_rate": 0.00048660528336228793, + "loss": 0.95657486, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.80126953, + "step": 2720, + "time_per_iteration": 2.8030405044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158564, + "balance_loss_mlp": 1.07840788, + "epoch": 0.5234705656021547, + "flos": 551840489472.0, + "grad_norm": 0.028885887647779437, + "language_loss": 0.95077229, + "learning_rate": 0.0004862938554855606, + "loss": 0.96235794, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.80126953, + "step": 2721, + "time_per_iteration": 2.797297716140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.0790993, + "epoch": 0.5236629472874182, + "flos": 505294934016.0, + "grad_norm": 0.03214550067861962, + "language_loss": 0.91548902, + "learning_rate": 0.0004859824329300304, + "loss": 0.92708111, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.80078125, + "step": 2722, + "time_per_iteration": 2.589529037475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.08444893, + "epoch": 0.5238553289726818, + "flos": 548696884224.0, + "grad_norm": 0.029959051591606282, + "language_loss": 0.88512689, + "learning_rate": 0.00048567101581660244, + "loss": 0.89677346, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.80175781, + "step": 2723, + "time_per_iteration": 2.6637237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160999, + "balance_loss_mlp": 1.08065164, + "epoch": 0.5240477106579453, + "flos": 533003779584.0, + "grad_norm": 0.031636293719806106, + "language_loss": 0.92529982, + "learning_rate": 0.00048535960426617956, + "loss": 0.93690991, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.80322266, + "step": 2724, + "time_per_iteration": 2.6061489582061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156405, + "balance_loss_mlp": 1.07620108, + "epoch": 0.5242400923432089, + "flos": 619089271296.0, + "grad_norm": 0.028230181756235023, + "language_loss": 0.87247139, + "learning_rate": 0.0004850481983996621, + "loss": 0.88403541, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.80175781, + "step": 2725, + "time_per_iteration": 2.7699060440063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.07787168, + "epoch": 0.5244324740284725, + "flos": 417589976064.0, + "grad_norm": 0.03201067328997522, + "language_loss": 0.93398654, + "learning_rate": 0.0004847367983379492, + "loss": 0.94556582, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.80029297, + "step": 2726, + "time_per_iteration": 2.521516799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07599604, + "epoch": 0.5246248557137361, + "flos": 627731821056.0, + "grad_norm": 0.028083517097400017, + "language_loss": 0.83866012, + "learning_rate": 0.00048442540420193643, + "loss": 0.8502202, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.79980469, + "step": 2727, + "time_per_iteration": 2.8968660831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155443, + "balance_loss_mlp": 1.07547724, + "epoch": 0.5248172373989997, + "flos": 1250401675776.0, + "grad_norm": 0.032601939018394276, + "language_loss": 0.85122609, + "learning_rate": 0.0004841140161125182, + "loss": 0.86278045, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.79931641, + "step": 2728, + "time_per_iteration": 3.585556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156303, + "balance_loss_mlp": 1.0764327, + "epoch": 0.5250096190842631, + "flos": 507882587136.0, + "grad_norm": 0.02942710549962748, + "language_loss": 0.90605354, + "learning_rate": 0.0004838026341905857, + "loss": 0.91761655, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.79833984, + "step": 2729, + "time_per_iteration": 2.7116506099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157734, + "balance_loss_mlp": 1.07781577, + "epoch": 0.5252020007695267, + "flos": 612507346944.0, + "grad_norm": 0.029260311632026755, + "language_loss": 0.9089191, + "learning_rate": 0.00048349125855702844, + "loss": 0.92049646, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.79882812, + "step": 2730, + "time_per_iteration": 2.772508144378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157575, + "balance_loss_mlp": 1.07780039, + "epoch": 0.5253943824547903, + "flos": 540291377664.0, + "grad_norm": 0.027039643287400304, + "language_loss": 0.86249292, + "learning_rate": 0.00048317988933273287, + "loss": 0.87406862, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.79736328, + "step": 2731, + "time_per_iteration": 2.7501025199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159148, + "balance_loss_mlp": 1.07918203, + "epoch": 0.5255867641400539, + "flos": 699337443840.0, + "grad_norm": 0.030025626211663315, + "language_loss": 0.87967253, + "learning_rate": 0.00048286852663858367, + "loss": 0.89126396, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.79931641, + "step": 2732, + "time_per_iteration": 2.9441256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156016, + "balance_loss_mlp": 1.07604992, + "epoch": 0.5257791458253175, + "flos": 668548119552.0, + "grad_norm": 0.03127119397180798, + "language_loss": 0.89405584, + "learning_rate": 0.000482557170595462, + "loss": 0.90561604, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.79931641, + "step": 2733, + "time_per_iteration": 2.875559091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158055, + "balance_loss_mlp": 1.07813704, + "epoch": 0.525971527510581, + "flos": 484604442624.0, + "grad_norm": 0.02914442262172993, + "language_loss": 0.93156296, + "learning_rate": 0.0004822458213242475, + "loss": 0.94314349, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.79882812, + "step": 2734, + "time_per_iteration": 2.5386509895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157288, + "balance_loss_mlp": 1.07737029, + "epoch": 0.5261639091958445, + "flos": 831347410944.0, + "grad_norm": 0.025020932409653307, + "language_loss": 0.90545583, + "learning_rate": 0.00048193447894581627, + "loss": 0.91702867, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.79882812, + "step": 2735, + "time_per_iteration": 3.087679862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07853508, + "epoch": 0.5263562908811081, + "flos": 521732643840.0, + "grad_norm": 0.03948252554958876, + "language_loss": 0.93270254, + "learning_rate": 0.00048162314358104243, + "loss": 0.94428849, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.80029297, + "step": 2736, + "time_per_iteration": 2.601278305053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156345, + "balance_loss_mlp": 1.07633209, + "epoch": 0.5265486725663717, + "flos": 576097554432.0, + "grad_norm": 0.032044906976615765, + "language_loss": 0.89525604, + "learning_rate": 0.0004813118153507969, + "loss": 0.90681952, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.79980469, + "step": 2737, + "time_per_iteration": 2.7360177040100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160461, + "balance_loss_mlp": 1.0820694, + "epoch": 0.5267410542516352, + "flos": 1550558333952.0, + "grad_norm": 0.008730383218555248, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.8360759, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.78320312, + "step": 2738, + "time_per_iteration": 4.80830717086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160505, + "balance_loss_mlp": 1.08039653, + "epoch": 0.5269334359368988, + "flos": 931460878848.0, + "grad_norm": 0.03056162512939441, + "language_loss": 0.89627469, + "learning_rate": 0.00048068918077736163, + "loss": 0.90787971, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.80078125, + "step": 2739, + "time_per_iteration": 3.228745222091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160328, + "balance_loss_mlp": 1.08021903, + "epoch": 0.5271258176221624, + "flos": 656634436608.0, + "grad_norm": 0.03221347808604687, + "language_loss": 0.87126762, + "learning_rate": 0.0004803778746759001, + "loss": 0.88287091, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.80078125, + "step": 2740, + "time_per_iteration": 2.888040542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161897, + "balance_loss_mlp": 1.08217001, + "epoch": 0.527318199307426, + "flos": 544062067200.0, + "grad_norm": 0.03125376981830108, + "language_loss": 0.87138033, + "learning_rate": 0.00048006657619242317, + "loss": 0.8829993, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.796875, + "step": 2741, + "time_per_iteration": 2.6788547039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156491, + "balance_loss_mlp": 1.07662046, + "epoch": 0.5275105809926895, + "flos": 448898322432.0, + "grad_norm": 0.035204553781932095, + "language_loss": 0.84527659, + "learning_rate": 0.00047975528544778775, + "loss": 0.8568415, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.79833984, + "step": 2742, + "time_per_iteration": 2.5953187942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156742, + "balance_loss_mlp": 1.07677603, + "epoch": 0.527702962677953, + "flos": 580052894208.0, + "grad_norm": 0.031790657619887884, + "language_loss": 0.9544906, + "learning_rate": 0.00047944400256284754, + "loss": 0.96605802, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.79931641, + "step": 2743, + "time_per_iteration": 2.6874876022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158128, + "balance_loss_mlp": 1.07821035, + "epoch": 0.5278953443632166, + "flos": 654009853440.0, + "grad_norm": 0.028533864641999515, + "language_loss": 0.84914398, + "learning_rate": 0.0004791327276584532, + "loss": 0.86072528, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.79882812, + "step": 2744, + "time_per_iteration": 2.851484537124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159082, + "balance_loss_mlp": 1.07902145, + "epoch": 0.5280877260484802, + "flos": 515048661504.0, + "grad_norm": 0.02936794285447426, + "language_loss": 0.85631824, + "learning_rate": 0.00047882146085545264, + "loss": 0.86790907, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.80029297, + "step": 2745, + "time_per_iteration": 2.6376991271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159996, + "balance_loss_mlp": 1.081604, + "epoch": 0.5282801077337438, + "flos": 1448712608256.0, + "grad_norm": 0.005116949586401208, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76562381, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.78125, + "step": 2746, + "time_per_iteration": 4.958376169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158639, + "balance_loss_mlp": 1.0789119, + "epoch": 0.5284724894190073, + "flos": 605966355456.0, + "grad_norm": 0.03386849685542916, + "language_loss": 0.85558748, + "learning_rate": 0.00047819895203700684, + "loss": 0.86717391, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.796875, + "step": 2747, + "time_per_iteration": 2.7103474140167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.08326721, + "epoch": 0.5286648711042709, + "flos": 1498103600640.0, + "grad_norm": 0.005524480658063938, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76674092, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.78125, + "step": 2748, + "time_per_iteration": 4.636225938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.077088, + "epoch": 0.5288572527895344, + "flos": 598833208320.0, + "grad_norm": 0.030227845431380972, + "language_loss": 0.94071984, + "learning_rate": 0.0004775764770742277, + "loss": 0.95228899, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.79785156, + "step": 2749, + "time_per_iteration": 2.7894628047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154981, + "balance_loss_mlp": 1.07496762, + "epoch": 0.529049634474798, + "flos": 558439878144.0, + "grad_norm": 0.038921610012438906, + "language_loss": 0.92515904, + "learning_rate": 0.00047726525259079777, + "loss": 0.93670887, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.79980469, + "step": 2750, + "time_per_iteration": 2.8399362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156308, + "balance_loss_mlp": 1.07643819, + "epoch": 0.5292420161600616, + "flos": 582434429952.0, + "grad_norm": 0.03493339209419754, + "language_loss": 0.94807124, + "learning_rate": 0.0004769540369337798, + "loss": 0.9596343, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.79833984, + "step": 2751, + "time_per_iteration": 2.7520663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171177, + "balance_loss_mlp": 1.09097254, + "epoch": 0.5294343978453251, + "flos": 609563854848.0, + "grad_norm": 0.029200425139457874, + "language_loss": 0.90377945, + "learning_rate": 0.00047664283022399794, + "loss": 0.91549122, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.80175781, + "step": 2752, + "time_per_iteration": 2.827075719833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.08904481, + "epoch": 0.5296267795305887, + "flos": 647709907968.0, + "grad_norm": 0.03322281077035965, + "language_loss": 0.85670567, + "learning_rate": 0.00047633163258227376, + "loss": 0.86839902, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.80273438, + "step": 2753, + "time_per_iteration": 2.8684630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168156, + "balance_loss_mlp": 1.08790445, + "epoch": 0.5298191612158523, + "flos": 560805950976.0, + "grad_norm": 0.0355054677596956, + "language_loss": 0.92337191, + "learning_rate": 0.0004760204441294247, + "loss": 0.93505347, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.80224609, + "step": 2754, + "time_per_iteration": 2.6347973346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162052, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5300115429011159, + "flos": 515131253760.0, + "grad_norm": 0.03178410473183971, + "language_loss": 0.90992713, + "learning_rate": 0.00047570926498626486, + "loss": 0.92154765, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.80078125, + "step": 2755, + "time_per_iteration": 2.6713931560516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165293, + "balance_loss_mlp": 1.08513677, + "epoch": 0.5302039245863793, + "flos": 674049065472.0, + "grad_norm": 0.025883205751119107, + "language_loss": 0.86624229, + "learning_rate": 0.00047539809527360474, + "loss": 0.87789524, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.80126953, + "step": 2756, + "time_per_iteration": 2.855339765548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163284, + "balance_loss_mlp": 1.08312809, + "epoch": 0.5303963062716429, + "flos": 732156467712.0, + "grad_norm": 0.025616439830169112, + "language_loss": 0.86757731, + "learning_rate": 0.0004750869351122511, + "loss": 0.87921017, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.80126953, + "step": 2757, + "time_per_iteration": 2.9861788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157792, + "balance_loss_mlp": 1.07773066, + "epoch": 0.5305886879569065, + "flos": 574551948288.0, + "grad_norm": 0.030995691560080724, + "language_loss": 0.87564695, + "learning_rate": 0.00047477578462300685, + "loss": 0.88722491, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.80029297, + "step": 2758, + "time_per_iteration": 2.711434841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158236, + "balance_loss_mlp": 1.07817531, + "epoch": 0.5307810696421701, + "flos": 696728323584.0, + "grad_norm": 0.030944173565867344, + "language_loss": 0.85500729, + "learning_rate": 0.0004744646439266718, + "loss": 0.86658955, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.80029297, + "step": 2759, + "time_per_iteration": 3.012730121612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159665, + "balance_loss_mlp": 1.07965159, + "epoch": 0.5309734513274337, + "flos": 650202233856.0, + "grad_norm": 0.02922555436454367, + "language_loss": 0.9794637, + "learning_rate": 0.000474153513144041, + "loss": 0.99106038, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.79980469, + "step": 2760, + "time_per_iteration": 2.9069197177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158721, + "balance_loss_mlp": 1.07866037, + "epoch": 0.5311658330126972, + "flos": 606055678464.0, + "grad_norm": 0.0324154212137011, + "language_loss": 0.92613202, + "learning_rate": 0.00047384239239590633, + "loss": 0.93771923, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.80029297, + "step": 2761, + "time_per_iteration": 2.8556571006774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159506, + "balance_loss_mlp": 1.07949257, + "epoch": 0.5313582146979607, + "flos": 559316740608.0, + "grad_norm": 0.03061440617121834, + "language_loss": 0.94290936, + "learning_rate": 0.0004735312818030556, + "loss": 0.95450437, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.79980469, + "step": 2762, + "time_per_iteration": 2.6934847831726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157514, + "balance_loss_mlp": 1.07764399, + "epoch": 0.5315505963832243, + "flos": 509445657600.0, + "grad_norm": 0.029953313176207894, + "language_loss": 0.88601178, + "learning_rate": 0.0004732201814862727, + "loss": 0.89758694, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.79833984, + "step": 2763, + "time_per_iteration": 2.7555651664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156859, + "balance_loss_mlp": 1.0773226, + "epoch": 0.5317429780684879, + "flos": 627668694528.0, + "grad_norm": 0.030098925618691368, + "language_loss": 0.87074947, + "learning_rate": 0.0004729090915663373, + "loss": 0.88231808, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.79492188, + "step": 2764, + "time_per_iteration": 2.83986496925354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157289, + "balance_loss_mlp": 1.07751369, + "epoch": 0.5319353597537514, + "flos": 477698880000.0, + "grad_norm": 0.035256009305486516, + "language_loss": 0.9145658, + "learning_rate": 0.00047259801216402534, + "loss": 0.92613864, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.79736328, + "step": 2765, + "time_per_iteration": 2.49153208732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158926, + "balance_loss_mlp": 1.07934201, + "epoch": 0.532127741439015, + "flos": 502633420800.0, + "grad_norm": 0.031216360034414494, + "language_loss": 0.91137969, + "learning_rate": 0.00047228694340010845, + "loss": 0.92296898, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.79541016, + "step": 2766, + "time_per_iteration": 2.5491669178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08385968, + "epoch": 0.5323201231242786, + "flos": 1166482870272.0, + "grad_norm": 0.028947902109049614, + "language_loss": 0.91277415, + "learning_rate": 0.0004719758853953544, + "loss": 0.92440999, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.796875, + "step": 2767, + "time_per_iteration": 3.576573610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167049, + "balance_loss_mlp": 1.08694029, + "epoch": 0.5325125048095422, + "flos": 379541251584.0, + "grad_norm": 0.04259356627609034, + "language_loss": 0.91498351, + "learning_rate": 0.00047166483827052645, + "loss": 0.92665404, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.80078125, + "step": 2768, + "time_per_iteration": 2.3893725872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172249, + "balance_loss_mlp": 1.09423828, + "epoch": 0.5327048864948057, + "flos": 1544747211264.0, + "grad_norm": 0.007240897484727242, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78250736, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.77929688, + "step": 2769, + "time_per_iteration": 4.972010374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167053, + "balance_loss_mlp": 1.08737326, + "epoch": 0.5328972681800692, + "flos": 912861212160.0, + "grad_norm": 0.03027786850862354, + "language_loss": 0.8989411, + "learning_rate": 0.000471042777143682, + "loss": 0.91061163, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.79638672, + "step": 2770, + "time_per_iteration": 3.1992523670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116085, + "balance_loss_mlp": 1.08126593, + "epoch": 0.5330896498653328, + "flos": 474850715136.0, + "grad_norm": 0.032478463467180745, + "language_loss": 0.85492694, + "learning_rate": 0.0004707317633831707, + "loss": 0.86653543, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.79541016, + "step": 2771, + "time_per_iteration": 2.636418342590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159417, + "balance_loss_mlp": 1.07983315, + "epoch": 0.5332820315505964, + "flos": 502633420800.0, + "grad_norm": 0.034509360784450445, + "language_loss": 0.84931278, + "learning_rate": 0.00047042076098559673, + "loss": 0.86090696, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.79541016, + "step": 2772, + "time_per_iteration": 2.587954521179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155982, + "balance_loss_mlp": 1.07615912, + "epoch": 0.53347441323586, + "flos": 926031791616.0, + "grad_norm": 0.036007721663536225, + "language_loss": 0.8042109, + "learning_rate": 0.00047010977007170174, + "loss": 0.81577075, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.79785156, + "step": 2773, + "time_per_iteration": 3.207517623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154797, + "balance_loss_mlp": 1.07497442, + "epoch": 0.5336667949211235, + "flos": 575539600896.0, + "grad_norm": 0.032460813123339774, + "language_loss": 0.88737571, + "learning_rate": 0.00046979879076222334, + "loss": 0.89892364, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.79785156, + "step": 2774, + "time_per_iteration": 2.711036443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154367, + "balance_loss_mlp": 1.07459235, + "epoch": 0.533859176606387, + "flos": 1066390869504.0, + "grad_norm": 0.02757600625184913, + "language_loss": 0.88843602, + "learning_rate": 0.0004694878231778939, + "loss": 0.89997971, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.79736328, + "step": 2775, + "time_per_iteration": 3.3735690116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154523, + "balance_loss_mlp": 1.07512975, + "epoch": 0.5340515582916506, + "flos": 747905968128.0, + "grad_norm": 0.025749810309272533, + "language_loss": 0.89188796, + "learning_rate": 0.0004691768674394423, + "loss": 0.9034332, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.79345703, + "step": 2776, + "time_per_iteration": 2.9947128295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171234, + "balance_loss_mlp": 1.09341431, + "epoch": 0.5342439399769142, + "flos": 1448818669056.0, + "grad_norm": 0.018487467205991936, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85655242, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.77734375, + "step": 2777, + "time_per_iteration": 4.765547275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166931, + "balance_loss_mlp": 1.08872986, + "epoch": 0.5344363216621778, + "flos": 1430696365056.0, + "grad_norm": 0.01490962088780182, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77820462, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.77929688, + "step": 2778, + "time_per_iteration": 4.979669570922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156329, + "balance_loss_mlp": 1.07636368, + "epoch": 0.5346287033474413, + "flos": 528675136512.0, + "grad_norm": 0.028255812601682327, + "language_loss": 0.84707999, + "learning_rate": 0.00046824407250656676, + "loss": 0.85864329, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.79931641, + "step": 2779, + "time_per_iteration": 2.6169135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161852, + "balance_loss_mlp": 1.08183897, + "epoch": 0.5348210850327049, + "flos": 511755334656.0, + "grad_norm": 0.02960487915529887, + "language_loss": 0.89552319, + "learning_rate": 0.0004679331653588161, + "loss": 0.90714169, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.79980469, + "step": 2780, + "time_per_iteration": 2.651503562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165567, + "balance_loss_mlp": 1.08536327, + "epoch": 0.5350134667179685, + "flos": 463625241600.0, + "grad_norm": 0.0331551624405392, + "language_loss": 0.91242051, + "learning_rate": 0.0004676222706605147, + "loss": 0.9240762, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.80175781, + "step": 2781, + "time_per_iteration": 2.609180450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171695, + "balance_loss_mlp": 1.09149086, + "epoch": 0.535205848403232, + "flos": 710117755392.0, + "grad_norm": 0.03114563748345981, + "language_loss": 0.9013232, + "learning_rate": 0.0004673113885323626, + "loss": 0.91304016, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.80175781, + "step": 2782, + "time_per_iteration": 2.889096736907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167285, + "balance_loss_mlp": 1.08708084, + "epoch": 0.5353982300884956, + "flos": 895791688704.0, + "grad_norm": 0.029628425021764316, + "language_loss": 0.840244, + "learning_rate": 0.00046700051909505494, + "loss": 0.85191679, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.80175781, + "step": 2783, + "time_per_iteration": 3.1921920776367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161558, + "balance_loss_mlp": 1.08130586, + "epoch": 0.5355906117737591, + "flos": 537024247296.0, + "grad_norm": 0.03383499561986932, + "language_loss": 0.89968938, + "learning_rate": 0.000466689662469282, + "loss": 0.91130495, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.80224609, + "step": 2784, + "time_per_iteration": 2.644693613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08009481, + "epoch": 0.5357829934590227, + "flos": 870327392256.0, + "grad_norm": 0.02956685166305249, + "language_loss": 0.89793074, + "learning_rate": 0.00046637881877572917, + "loss": 0.90953422, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.80224609, + "step": 2785, + "time_per_iteration": 3.134896755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.0797224, + "epoch": 0.5359753751442863, + "flos": 554445606912.0, + "grad_norm": 0.027747995864539122, + "language_loss": 0.88820761, + "learning_rate": 0.0004660679881350764, + "loss": 0.89980739, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.80224609, + "step": 2786, + "time_per_iteration": 2.7258269786834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186935, + "balance_loss_mlp": 1.10682678, + "epoch": 0.5361677568295499, + "flos": 1483756715520.0, + "grad_norm": 0.018012162763561924, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76795077, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.80078125, + "step": 2787, + "time_per_iteration": 5.011500835418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163662, + "balance_loss_mlp": 1.08345807, + "epoch": 0.5363601385148133, + "flos": 807641568768.0, + "grad_norm": 0.03200093229385197, + "language_loss": 0.83718783, + "learning_rate": 0.0004654463664951667, + "loss": 0.84882444, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.80175781, + "step": 2788, + "time_per_iteration": 3.0044353008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162852, + "balance_loss_mlp": 1.08274364, + "epoch": 0.5365525202000769, + "flos": 508878971904.0, + "grad_norm": 0.03055357919616021, + "language_loss": 0.89048028, + "learning_rate": 0.0004651355757372447, + "loss": 0.90210879, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.80078125, + "step": 2789, + "time_per_iteration": 2.6024739742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011626, + "balance_loss_mlp": 1.08277702, + "epoch": 0.5367449018853405, + "flos": 530014625280.0, + "grad_norm": 0.03243837084279447, + "language_loss": 0.90724301, + "learning_rate": 0.00046482479851489274, + "loss": 0.91886902, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.79785156, + "step": 2790, + "time_per_iteration": 2.7023818492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168089, + "balance_loss_mlp": 1.08840978, + "epoch": 0.5369372835706041, + "flos": 651216082944.0, + "grad_norm": 0.035661652748611536, + "language_loss": 0.83603406, + "learning_rate": 0.00046451403494876525, + "loss": 0.84771496, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.79443359, + "step": 2791, + "time_per_iteration": 2.9009790420532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169917, + "balance_loss_mlp": 1.09033263, + "epoch": 0.5371296652558677, + "flos": 585627700224.0, + "grad_norm": 0.03267915449635738, + "language_loss": 0.90313196, + "learning_rate": 0.0004642032851595111, + "loss": 0.91483116, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.79345703, + "step": 2792, + "time_per_iteration": 2.743093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.09196496, + "epoch": 0.5373220469411312, + "flos": 597083486208.0, + "grad_norm": 0.03226534649155799, + "language_loss": 0.89917493, + "learning_rate": 0.00046389254926777404, + "loss": 0.91089034, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.79345703, + "step": 2793, + "time_per_iteration": 2.816979169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162732, + "balance_loss_mlp": 1.08319557, + "epoch": 0.5375144286263948, + "flos": 1116277415424.0, + "grad_norm": 0.030732828924726157, + "language_loss": 0.83480382, + "learning_rate": 0.0004635818273941926, + "loss": 0.84643114, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.79443359, + "step": 2794, + "time_per_iteration": 3.538351058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156539, + "balance_loss_mlp": 1.07704997, + "epoch": 0.5377068103116583, + "flos": 596768580096.0, + "grad_norm": 0.03686105726392354, + "language_loss": 0.88212651, + "learning_rate": 0.0004632711196593997, + "loss": 0.8936919, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.79443359, + "step": 2795, + "time_per_iteration": 2.7304327487945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153271, + "balance_loss_mlp": 1.07383037, + "epoch": 0.5378991919969219, + "flos": 885649195008.0, + "grad_norm": 0.031821277780470766, + "language_loss": 0.90781128, + "learning_rate": 0.00046296042618402297, + "loss": 0.91934395, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.79394531, + "step": 2796, + "time_per_iteration": 3.117605447769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154822, + "balance_loss_mlp": 1.07523799, + "epoch": 0.5380915736821854, + "flos": 711950069760.0, + "grad_norm": 0.03181223121167454, + "language_loss": 0.84282267, + "learning_rate": 0.0004626497470886839, + "loss": 0.85437095, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.79541016, + "step": 2797, + "time_per_iteration": 2.943110704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154308, + "balance_loss_mlp": 1.07439017, + "epoch": 0.538283955367449, + "flos": 558114238464.0, + "grad_norm": 0.03131439333064892, + "language_loss": 0.87165904, + "learning_rate": 0.00046233908249399897, + "loss": 0.88320208, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.79882812, + "step": 2798, + "time_per_iteration": 2.753664970397949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156214, + "balance_loss_mlp": 1.0763911, + "epoch": 0.5384763370527126, + "flos": 514481975808.0, + "grad_norm": 0.02763164557850803, + "language_loss": 0.84223002, + "learning_rate": 0.00046202843252057905, + "loss": 0.85379213, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.79785156, + "step": 2799, + "time_per_iteration": 2.5850727558135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157398, + "balance_loss_mlp": 1.07767105, + "epoch": 0.5386687187379762, + "flos": 490719737856.0, + "grad_norm": 0.033199019667933, + "language_loss": 0.8910532, + "learning_rate": 0.00046171779728902896, + "loss": 0.90262723, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.796875, + "step": 2800, + "time_per_iteration": 2.54720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157318, + "balance_loss_mlp": 1.07730448, + "epoch": 0.5388611004232398, + "flos": 483627523584.0, + "grad_norm": 0.041719681603307614, + "language_loss": 0.92617553, + "learning_rate": 0.000461407176919948, + "loss": 0.93774867, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.79980469, + "step": 2801, + "time_per_iteration": 2.5201830863952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158799, + "balance_loss_mlp": 1.07868993, + "epoch": 0.5390534821085032, + "flos": 562089043968.0, + "grad_norm": 0.03196091571695152, + "language_loss": 0.90337479, + "learning_rate": 0.00046109657153392997, + "loss": 0.91496283, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.80078125, + "step": 2802, + "time_per_iteration": 2.694173574447632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160257, + "balance_loss_mlp": 1.08014798, + "epoch": 0.5392458637937668, + "flos": 489360783360.0, + "grad_norm": 0.039860159596143786, + "language_loss": 0.89760619, + "learning_rate": 0.0004607859812515622, + "loss": 0.90920877, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.80078125, + "step": 2803, + "time_per_iteration": 2.585549831390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164203, + "balance_loss_mlp": 1.08404684, + "epoch": 0.5394382454790304, + "flos": 513049161216.0, + "grad_norm": 0.03534563174473093, + "language_loss": 0.94152969, + "learning_rate": 0.00046047540619342667, + "loss": 0.95317167, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.80126953, + "step": 2804, + "time_per_iteration": 2.589845895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08835244, + "epoch": 0.539630627164294, + "flos": 568688432640.0, + "grad_norm": 0.02864783436473809, + "language_loss": 0.85705817, + "learning_rate": 0.00046016484648009933, + "loss": 0.86873901, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.796875, + "step": 2805, + "time_per_iteration": 2.687539577484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162339, + "balance_loss_mlp": 1.08246911, + "epoch": 0.5398230088495575, + "flos": 527502833664.0, + "grad_norm": 0.03312242512211549, + "language_loss": 0.8782742, + "learning_rate": 0.0004598543022321501, + "loss": 0.88989753, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.79833984, + "step": 2806, + "time_per_iteration": 2.6111719608306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159262, + "balance_loss_mlp": 1.07910562, + "epoch": 0.5400153905348211, + "flos": 539852946432.0, + "grad_norm": 0.03059923694994547, + "language_loss": 0.85068846, + "learning_rate": 0.0004595437735701433, + "loss": 0.86228108, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.80126953, + "step": 2807, + "time_per_iteration": 2.668133020401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158376, + "balance_loss_mlp": 1.07826769, + "epoch": 0.5402077722200846, + "flos": 514664624640.0, + "grad_norm": 0.03937747929323063, + "language_loss": 0.88849455, + "learning_rate": 0.00045923326061463623, + "loss": 0.90007836, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.80078125, + "step": 2808, + "time_per_iteration": 2.76680588722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152678, + "balance_loss_mlp": 1.07261717, + "epoch": 0.5404001539053482, + "flos": 677565974016.0, + "grad_norm": 0.030976456011377742, + "language_loss": 0.87454319, + "learning_rate": 0.00045892276348618113, + "loss": 0.88606995, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.80029297, + "step": 2809, + "time_per_iteration": 2.9939539432525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173447, + "balance_loss_mlp": 1.09410095, + "epoch": 0.5405925355906118, + "flos": 1558189036032.0, + "grad_norm": 0.015961767794208704, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79434276, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.79296875, + "step": 2810, + "time_per_iteration": 4.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157913, + "balance_loss_mlp": 1.07818568, + "epoch": 0.5407849172758753, + "flos": 648537105408.0, + "grad_norm": 0.02696900388574031, + "language_loss": 0.85372365, + "learning_rate": 0.000458301817192603, + "loss": 0.8653028, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.796875, + "step": 2811, + "time_per_iteration": 2.8575778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118454, + "balance_loss_mlp": 1.1057663, + "epoch": 0.5409772989611389, + "flos": 1410481234944.0, + "grad_norm": 0.012734794042181983, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.82026327, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.78710938, + "step": 2812, + "time_per_iteration": 4.809651613235474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163077, + "balance_loss_mlp": 1.0835402, + "epoch": 0.5411696806464025, + "flos": 555544049664.0, + "grad_norm": 0.031759632467193835, + "language_loss": 0.91974443, + "learning_rate": 0.00045768093565369983, + "loss": 0.93137515, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.79492188, + "step": 2813, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164131, + "balance_loss_mlp": 1.0847373, + "epoch": 0.5413620623316661, + "flos": 529204892160.0, + "grad_norm": 0.03127565438509195, + "language_loss": 0.8788538, + "learning_rate": 0.0004573705194685646, + "loss": 0.89049512, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.79199219, + "step": 2814, + "time_per_iteration": 2.645961284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164839, + "balance_loss_mlp": 1.08544588, + "epoch": 0.5415544440169295, + "flos": 599851060224.0, + "grad_norm": 0.03485280634812332, + "language_loss": 0.91058564, + "learning_rate": 0.00045706011983366157, + "loss": 0.92223406, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.79199219, + "step": 2815, + "time_per_iteration": 2.6676552295684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161812, + "balance_loss_mlp": 1.08237088, + "epoch": 0.5417468257021931, + "flos": 471713840640.0, + "grad_norm": 0.03625185410953689, + "language_loss": 0.88930029, + "learning_rate": 0.00045674973686949847, + "loss": 0.90091836, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.79199219, + "step": 2816, + "time_per_iteration": 2.51118540763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116016, + "balance_loss_mlp": 1.08076715, + "epoch": 0.5419392073874567, + "flos": 682190057472.0, + "grad_norm": 0.02856526912727588, + "language_loss": 0.90316737, + "learning_rate": 0.0004564393706965766, + "loss": 0.91476899, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.79199219, + "step": 2817, + "time_per_iteration": 2.9563546180725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160311, + "balance_loss_mlp": 1.0809654, + "epoch": 0.5421315890727203, + "flos": 463336531968.0, + "grad_norm": 0.032507832188727104, + "language_loss": 0.87249088, + "learning_rate": 0.00045612902143539116, + "loss": 0.884094, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.79199219, + "step": 2818, + "time_per_iteration": 2.5383646488189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162479, + "balance_loss_mlp": 1.08294284, + "epoch": 0.5423239707579839, + "flos": 437889699840.0, + "grad_norm": 0.03622660962153638, + "language_loss": 0.8863132, + "learning_rate": 0.00045581868920642986, + "loss": 0.89793801, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.79296875, + "step": 2819, + "time_per_iteration": 2.4692800045013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163903, + "balance_loss_mlp": 1.08441401, + "epoch": 0.5425163524432474, + "flos": 459305330688.0, + "grad_norm": 0.036307438946012835, + "language_loss": 0.86308074, + "learning_rate": 0.00045550837413017457, + "loss": 0.8747198, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.79296875, + "step": 2820, + "time_per_iteration": 2.59252667427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160476, + "balance_loss_mlp": 1.08089161, + "epoch": 0.542708734128511, + "flos": 420409943040.0, + "grad_norm": 0.028561818537522772, + "language_loss": 0.89964175, + "learning_rate": 0.0004551980763271005, + "loss": 0.91124654, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.79394531, + "step": 2821, + "time_per_iteration": 2.64975643157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158342, + "balance_loss_mlp": 1.07880592, + "epoch": 0.5429011158137745, + "flos": 679708465152.0, + "grad_norm": 0.03014006642218495, + "language_loss": 0.89564693, + "learning_rate": 0.0004548877959176756, + "loss": 0.90723038, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.79345703, + "step": 2822, + "time_per_iteration": 2.881334066390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166233, + "balance_loss_mlp": 1.08693492, + "epoch": 0.5430934974990381, + "flos": 541967239680.0, + "grad_norm": 0.03201888254331298, + "language_loss": 0.91779578, + "learning_rate": 0.00045457753302236166, + "loss": 0.92945808, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.79150391, + "step": 2823, + "time_per_iteration": 2.615506887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160293, + "balance_loss_mlp": 1.08075619, + "epoch": 0.5432858791843016, + "flos": 659643056640.0, + "grad_norm": 0.03397006228821556, + "language_loss": 0.93680996, + "learning_rate": 0.00045426728776161353, + "loss": 0.94841284, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.79443359, + "step": 2824, + "time_per_iteration": 2.815668821334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160478, + "balance_loss_mlp": 1.08084619, + "epoch": 0.5434782608695652, + "flos": 532966849536.0, + "grad_norm": 0.030340926449950675, + "language_loss": 0.86484039, + "learning_rate": 0.00045395706025587863, + "loss": 0.87644517, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.79589844, + "step": 2825, + "time_per_iteration": 2.677969455718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159818, + "balance_loss_mlp": 1.0802815, + "epoch": 0.5436706425548288, + "flos": 609632985600.0, + "grad_norm": 0.032758454025991736, + "language_loss": 0.88250875, + "learning_rate": 0.00045364685062559843, + "loss": 0.89410686, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.79492188, + "step": 2826, + "time_per_iteration": 2.7975664138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160655, + "balance_loss_mlp": 1.08111823, + "epoch": 0.5438630242400924, + "flos": 706772762112.0, + "grad_norm": 0.047560346967580276, + "language_loss": 0.96112239, + "learning_rate": 0.0004533366589912067, + "loss": 0.97272885, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.79492188, + "step": 2827, + "time_per_iteration": 2.9455690383911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.08232152, + "epoch": 0.544055405925356, + "flos": 857838291456.0, + "grad_norm": 0.035082604549872, + "language_loss": 0.84527165, + "learning_rate": 0.0004530264854731306, + "loss": 0.8568902, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.79492188, + "step": 2828, + "time_per_iteration": 3.0149006843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161186, + "balance_loss_mlp": 1.08160186, + "epoch": 0.5442477876106194, + "flos": 572967410688.0, + "grad_norm": 0.029506216108961765, + "language_loss": 0.89973861, + "learning_rate": 0.00045271633019179034, + "loss": 0.91135049, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.79541016, + "step": 2829, + "time_per_iteration": 2.7735414505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162764, + "balance_loss_mlp": 1.08313203, + "epoch": 0.544440169295883, + "flos": 626802565632.0, + "grad_norm": 0.028700635940731967, + "language_loss": 0.92908496, + "learning_rate": 0.0004524061932675986, + "loss": 0.94071257, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.79589844, + "step": 2830, + "time_per_iteration": 2.828461170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116197, + "balance_loss_mlp": 1.08224237, + "epoch": 0.5446325509811466, + "flos": 837640625664.0, + "grad_norm": 0.03503891147687097, + "language_loss": 0.92219722, + "learning_rate": 0.00045209607482096125, + "loss": 0.93381691, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.79541016, + "step": 2831, + "time_per_iteration": 3.0058434009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162561, + "balance_loss_mlp": 1.08292878, + "epoch": 0.5448249326664102, + "flos": 484389593088.0, + "grad_norm": 0.03287703969217422, + "language_loss": 0.89665288, + "learning_rate": 0.0004517859749722772, + "loss": 0.90827847, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.79443359, + "step": 2832, + "time_per_iteration": 2.6527607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116426, + "balance_loss_mlp": 1.08453321, + "epoch": 0.5450173143516738, + "flos": 562345552896.0, + "grad_norm": 0.03300449363670703, + "language_loss": 0.84396762, + "learning_rate": 0.0004514758938419376, + "loss": 0.85561025, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.79541016, + "step": 2833, + "time_per_iteration": 2.799923896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176773, + "balance_loss_mlp": 1.09971619, + "epoch": 0.5452096960369373, + "flos": 1473586023936.0, + "grad_norm": 0.016868588983801922, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78097355, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.76953125, + "step": 2834, + "time_per_iteration": 4.904434442520142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116397, + "balance_loss_mlp": 1.08414805, + "epoch": 0.5454020777222008, + "flos": 466017510912.0, + "grad_norm": 0.028290923396431526, + "language_loss": 0.88719809, + "learning_rate": 0.00045085578821782175, + "loss": 0.8988378, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.79589844, + "step": 2835, + "time_per_iteration": 2.5375516414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116069, + "balance_loss_mlp": 1.08325195, + "epoch": 0.5455944594074644, + "flos": 1472615109120.0, + "grad_norm": 0.00840245760684232, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77295429, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.7734375, + "step": 2836, + "time_per_iteration": 4.908621549606323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161179, + "balance_loss_mlp": 1.08121371, + "epoch": 0.545786841092728, + "flos": 534304336896.0, + "grad_norm": 0.026675001792915147, + "language_loss": 0.85451794, + "learning_rate": 0.00045023575891159866, + "loss": 0.86612976, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.79931641, + "step": 2837, + "time_per_iteration": 2.77382230758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167343, + "balance_loss_mlp": 1.08952332, + "epoch": 0.5459792227779915, + "flos": 1355426113536.0, + "grad_norm": 0.010026273514264956, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75931144, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.77734375, + "step": 2838, + "time_per_iteration": 4.8985395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163141, + "balance_loss_mlp": 1.08322346, + "epoch": 0.5461716044632551, + "flos": 639072087552.0, + "grad_norm": 0.03170534586871267, + "language_loss": 0.83100337, + "learning_rate": 0.0004496158068861354, + "loss": 0.8426348, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.79833984, + "step": 2839, + "time_per_iteration": 2.8032078742980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163887, + "balance_loss_mlp": 1.08396888, + "epoch": 0.5463639861485187, + "flos": 603925922304.0, + "grad_norm": 0.031486344316249366, + "language_loss": 0.85257053, + "learning_rate": 0.00044930586015455207, + "loss": 0.86420941, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.79833984, + "step": 2840, + "time_per_iteration": 2.780024290084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168265, + "balance_loss_mlp": 1.08834755, + "epoch": 0.5465563678337823, + "flos": 643752566784.0, + "grad_norm": 0.02832807598538896, + "language_loss": 0.93569458, + "learning_rate": 0.000448995933104179, + "loss": 0.9473772, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.79736328, + "step": 2841, + "time_per_iteration": 2.848741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168336, + "balance_loss_mlp": 1.08841801, + "epoch": 0.5467487495190458, + "flos": 615364243968.0, + "grad_norm": 0.03451251764660495, + "language_loss": 0.86641318, + "learning_rate": 0.00044868602585534077, + "loss": 0.87809658, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.796875, + "step": 2842, + "time_per_iteration": 2.8590362071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166404, + "balance_loss_mlp": 1.08677208, + "epoch": 0.5469411312043093, + "flos": 462127299072.0, + "grad_norm": 0.03329693034046033, + "language_loss": 0.9437651, + "learning_rate": 0.0004483761385283541, + "loss": 0.95542908, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.79443359, + "step": 2843, + "time_per_iteration": 2.523390769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116664, + "balance_loss_mlp": 1.08691323, + "epoch": 0.5471335128895729, + "flos": 562266963456.0, + "grad_norm": 0.03201679454384124, + "language_loss": 0.87509483, + "learning_rate": 0.0004480662712435281, + "loss": 0.88676119, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.79492188, + "step": 2844, + "time_per_iteration": 2.7186124324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162399, + "balance_loss_mlp": 1.08286297, + "epoch": 0.5473258945748365, + "flos": 519685479936.0, + "grad_norm": 0.032165214678065886, + "language_loss": 0.93768156, + "learning_rate": 0.0004477564241211635, + "loss": 0.94930553, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.79345703, + "step": 2845, + "time_per_iteration": 2.5637102127075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159503, + "balance_loss_mlp": 1.08034766, + "epoch": 0.5475182762601001, + "flos": 434744093184.0, + "grad_norm": 0.03138398317411523, + "language_loss": 0.92521811, + "learning_rate": 0.0004474465972815541, + "loss": 0.93681312, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.79101562, + "step": 2846, + "time_per_iteration": 2.470494508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162403, + "balance_loss_mlp": 1.08348668, + "epoch": 0.5477106579453636, + "flos": 512573799936.0, + "grad_norm": 0.02767233380819538, + "language_loss": 0.92665255, + "learning_rate": 0.000447136790844985, + "loss": 0.93827659, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.78759766, + "step": 2847, + "time_per_iteration": 2.7123520374298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164922, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5479030396306271, + "flos": 677140277760.0, + "grad_norm": 0.030326073882101023, + "language_loss": 0.85917926, + "learning_rate": 0.00044682700493173385, + "loss": 0.87082845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.78710938, + "step": 2848, + "time_per_iteration": 2.826556921005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166552, + "balance_loss_mlp": 1.08787405, + "epoch": 0.5480954213158907, + "flos": 877578060288.0, + "grad_norm": 0.033676298977630685, + "language_loss": 0.86673969, + "learning_rate": 0.00044651723966207004, + "loss": 0.87840521, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.78564453, + "step": 2849, + "time_per_iteration": 3.192443370819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164243, + "balance_loss_mlp": 1.08556521, + "epoch": 0.5482878030011543, + "flos": 623174866944.0, + "grad_norm": 0.03042847520175512, + "language_loss": 0.83109522, + "learning_rate": 0.00044620749515625536, + "loss": 0.84273762, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.78564453, + "step": 2850, + "time_per_iteration": 2.7753841876983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164392, + "balance_loss_mlp": 1.08528447, + "epoch": 0.5484801846864179, + "flos": 498257114112.0, + "grad_norm": 0.03264010932273605, + "language_loss": 0.90008557, + "learning_rate": 0.00044589777153454334, + "loss": 0.91172945, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.78857422, + "step": 2851, + "time_per_iteration": 2.7295939922332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162977, + "balance_loss_mlp": 1.08391714, + "epoch": 0.5486725663716814, + "flos": 443353715712.0, + "grad_norm": 0.029420479903708215, + "language_loss": 0.88820338, + "learning_rate": 0.00044558806891717895, + "loss": 0.8998332, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.78808594, + "step": 2852, + "time_per_iteration": 2.4784035682678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164311, + "balance_loss_mlp": 1.08548951, + "epoch": 0.548864948056945, + "flos": 656347728384.0, + "grad_norm": 0.02822438724303185, + "language_loss": 0.84744209, + "learning_rate": 0.0004452783874243998, + "loss": 0.8590852, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.78759766, + "step": 2853, + "time_per_iteration": 2.821592092514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159975, + "balance_loss_mlp": 1.08105898, + "epoch": 0.5490573297422086, + "flos": 547140544512.0, + "grad_norm": 0.03150495246723179, + "language_loss": 0.90787637, + "learning_rate": 0.00044496872717643475, + "loss": 0.91947615, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.78710938, + "step": 2854, + "time_per_iteration": 2.6908938884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011614, + "balance_loss_mlp": 1.08415222, + "epoch": 0.5492497114274721, + "flos": 1593760897536.0, + "grad_norm": 0.006862097523809848, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78250694, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.77148438, + "step": 2855, + "time_per_iteration": 4.92158579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159374, + "balance_loss_mlp": 1.08036256, + "epoch": 0.5494420931127356, + "flos": 752269539840.0, + "grad_norm": 0.030842116299214104, + "language_loss": 0.87009478, + "learning_rate": 0.0004443494708958217, + "loss": 0.88168848, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.78759766, + "step": 2856, + "time_per_iteration": 2.952693223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155384, + "balance_loss_mlp": 1.07627714, + "epoch": 0.5496344747979992, + "flos": 627304123392.0, + "grad_norm": 0.026887140123268247, + "language_loss": 0.85396117, + "learning_rate": 0.0004440398751035906, + "loss": 0.86551499, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.79052734, + "step": 2857, + "time_per_iteration": 2.8657121658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156313, + "balance_loss_mlp": 1.07691979, + "epoch": 0.5498268564832628, + "flos": 524124913152.0, + "grad_norm": 0.03681476772579859, + "language_loss": 0.90347362, + "learning_rate": 0.00044373030103700645, + "loss": 0.9150368, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.79248047, + "step": 2858, + "time_per_iteration": 2.6372759342193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161253, + "balance_loss_mlp": 1.08185947, + "epoch": 0.5500192381685264, + "flos": 605777702400.0, + "grad_norm": 0.027579474955625485, + "language_loss": 0.8405782, + "learning_rate": 0.000443420748816257, + "loss": 0.85219079, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.79248047, + "step": 2859, + "time_per_iteration": 2.832864999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08395553, + "epoch": 0.55021161985379, + "flos": 521654780928.0, + "grad_norm": 0.03409053016014856, + "language_loss": 0.84214079, + "learning_rate": 0.0004431112185615208, + "loss": 0.85377669, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.79443359, + "step": 2860, + "time_per_iteration": 2.7533481121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165675, + "balance_loss_mlp": 1.0862813, + "epoch": 0.5504040015390534, + "flos": 490654609920.0, + "grad_norm": 0.028251427239966796, + "language_loss": 0.84584463, + "learning_rate": 0.00044280171039296845, + "loss": 0.85750139, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.79296875, + "step": 2861, + "time_per_iteration": 2.6798369884490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116251, + "balance_loss_mlp": 1.08306909, + "epoch": 0.550596383224317, + "flos": 576861625344.0, + "grad_norm": 0.030462386563617952, + "language_loss": 0.93688512, + "learning_rate": 0.0004424922244307616, + "loss": 0.94851023, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.79296875, + "step": 2862, + "time_per_iteration": 2.7042698860168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164147, + "balance_loss_mlp": 1.08461094, + "epoch": 0.5507887649095806, + "flos": 643633044480.0, + "grad_norm": 0.03244616812289036, + "language_loss": 0.87943101, + "learning_rate": 0.00044218276079505315, + "loss": 0.89107251, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.79296875, + "step": 2863, + "time_per_iteration": 2.869657278060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116435, + "balance_loss_mlp": 1.08490932, + "epoch": 0.5509811465948442, + "flos": 532864791552.0, + "grad_norm": 0.03309127401700594, + "language_loss": 0.80069649, + "learning_rate": 0.0004418733196059876, + "loss": 0.81234002, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.79248047, + "step": 2864, + "time_per_iteration": 2.694439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164051, + "balance_loss_mlp": 1.08489633, + "epoch": 0.5511735282801077, + "flos": 655983157248.0, + "grad_norm": 0.031218908498787497, + "language_loss": 0.85167533, + "learning_rate": 0.0004415639009837008, + "loss": 0.86331582, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.79101562, + "step": 2865, + "time_per_iteration": 2.8214035034179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160959, + "balance_loss_mlp": 1.08175683, + "epoch": 0.5513659099653713, + "flos": 530609508864.0, + "grad_norm": 0.029306479659861318, + "language_loss": 0.87106019, + "learning_rate": 0.00044125450504831955, + "loss": 0.88266975, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.79150391, + "step": 2866, + "time_per_iteration": 2.7755370140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157699, + "balance_loss_mlp": 1.0782584, + "epoch": 0.5515582916506349, + "flos": 555973748736.0, + "grad_norm": 0.03358668454464356, + "language_loss": 0.88577026, + "learning_rate": 0.0004409451319199622, + "loss": 0.89734721, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.79248047, + "step": 2867, + "time_per_iteration": 2.700601577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160497, + "balance_loss_mlp": 1.08105552, + "epoch": 0.5517506733358984, + "flos": 736771819008.0, + "grad_norm": 0.033780629576782226, + "language_loss": 0.90037191, + "learning_rate": 0.0004406357817187381, + "loss": 0.91197693, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.79248047, + "step": 2868, + "time_per_iteration": 2.9809505939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160757, + "balance_loss_mlp": 1.0816493, + "epoch": 0.551943055021162, + "flos": 1117189206528.0, + "grad_norm": 0.02667902344135768, + "language_loss": 0.86254233, + "learning_rate": 0.0004403264545647474, + "loss": 0.87414992, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.79052734, + "step": 2869, + "time_per_iteration": 3.5932819843292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156378, + "balance_loss_mlp": 1.07727027, + "epoch": 0.5521354367064255, + "flos": 545501612544.0, + "grad_norm": 0.024843999573841903, + "language_loss": 0.89363241, + "learning_rate": 0.00044001715057808154, + "loss": 0.90519619, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.79052734, + "step": 2870, + "time_per_iteration": 2.7333626747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159059, + "balance_loss_mlp": 1.07999909, + "epoch": 0.5523278183916891, + "flos": 937871614464.0, + "grad_norm": 0.027996488517333572, + "language_loss": 0.86652702, + "learning_rate": 0.0004397078698788232, + "loss": 0.87811756, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.79003906, + "step": 2871, + "time_per_iteration": 3.199366807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168602, + "balance_loss_mlp": 1.0909729, + "epoch": 0.5525202000769527, + "flos": 1469098927104.0, + "grad_norm": 0.009568898658781464, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81610966, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.77539062, + "step": 2872, + "time_per_iteration": 4.912739515304565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163231, + "balance_loss_mlp": 1.08426642, + "epoch": 0.5527125817622163, + "flos": 490784865792.0, + "grad_norm": 0.03313805620558485, + "language_loss": 0.83656394, + "learning_rate": 0.00043908937882281343, + "loss": 0.84819627, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.78808594, + "step": 2873, + "time_per_iteration": 2.6517224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163059, + "balance_loss_mlp": 1.08409429, + "epoch": 0.5529049634474797, + "flos": 636148061184.0, + "grad_norm": 0.033554896267230024, + "language_loss": 0.87775517, + "learning_rate": 0.0004387801687061814, + "loss": 0.88938576, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.78710938, + "step": 2874, + "time_per_iteration": 2.8159070014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159743, + "balance_loss_mlp": 1.08073115, + "epoch": 0.5530973451327433, + "flos": 582434429952.0, + "grad_norm": 0.02986403100144585, + "language_loss": 0.86760765, + "learning_rate": 0.0004384709823571958, + "loss": 0.87920505, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.78857422, + "step": 2875, + "time_per_iteration": 2.755831480026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158961, + "balance_loss_mlp": 1.08004439, + "epoch": 0.5532897268180069, + "flos": 1124329084416.0, + "grad_norm": 0.02992932493519035, + "language_loss": 0.88625169, + "learning_rate": 0.0004381618198958932, + "loss": 0.89784127, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.78662109, + "step": 2876, + "time_per_iteration": 3.504112720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157815, + "balance_loss_mlp": 1.0788027, + "epoch": 0.5534821085032705, + "flos": 638512132608.0, + "grad_norm": 0.032170459842753865, + "language_loss": 0.89321101, + "learning_rate": 0.00043785268144230137, + "loss": 0.90478921, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.78808594, + "step": 2877, + "time_per_iteration": 2.889683961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158076, + "balance_loss_mlp": 1.07911134, + "epoch": 0.5536744901885341, + "flos": 572216074752.0, + "grad_norm": 0.0339903958733494, + "language_loss": 0.87417912, + "learning_rate": 0.00043754356711643837, + "loss": 0.88575995, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.78759766, + "step": 2878, + "time_per_iteration": 2.6604373455047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115856, + "balance_loss_mlp": 1.07950056, + "epoch": 0.5538668718737976, + "flos": 596916300288.0, + "grad_norm": 0.029580626213001865, + "language_loss": 0.88473797, + "learning_rate": 0.0004372344770383132, + "loss": 0.89632356, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.78808594, + "step": 2879, + "time_per_iteration": 2.7906830310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011565, + "balance_loss_mlp": 1.07753599, + "epoch": 0.5540592535590612, + "flos": 533718185472.0, + "grad_norm": 0.030293675767491222, + "language_loss": 0.88174736, + "learning_rate": 0.00043692541132792507, + "loss": 0.89331234, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.78710938, + "step": 2880, + "time_per_iteration": 2.7152342796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156751, + "balance_loss_mlp": 1.07764363, + "epoch": 0.5542516352443247, + "flos": 413504380416.0, + "grad_norm": 0.03343546183057337, + "language_loss": 0.89203489, + "learning_rate": 0.00043661637010526384, + "loss": 0.90360242, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.78857422, + "step": 2881, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156889, + "balance_loss_mlp": 1.07792521, + "epoch": 0.5544440169295883, + "flos": 548677418496.0, + "grad_norm": 0.03944129006740139, + "language_loss": 0.89678496, + "learning_rate": 0.00043630735349031025, + "loss": 0.90835381, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.78759766, + "step": 2882, + "time_per_iteration": 2.6376428604125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157119, + "balance_loss_mlp": 1.07815528, + "epoch": 0.5546363986148518, + "flos": 623033877504.0, + "grad_norm": 0.025659357486645176, + "language_loss": 0.85712773, + "learning_rate": 0.00043599836160303495, + "loss": 0.86869895, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.78710938, + "step": 2883, + "time_per_iteration": 2.861966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155488, + "balance_loss_mlp": 1.07633352, + "epoch": 0.5548287803001154, + "flos": 706579379712.0, + "grad_norm": 0.03141972013571756, + "language_loss": 0.82934201, + "learning_rate": 0.0004356893945633995, + "loss": 0.8408969, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.7890625, + "step": 2884, + "time_per_iteration": 2.9471499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.07534921, + "epoch": 0.555021161985379, + "flos": 505184143872.0, + "grad_norm": 0.031430850490502316, + "language_loss": 0.85807753, + "learning_rate": 0.0004353804524913551, + "loss": 0.86962205, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.78857422, + "step": 2885, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154918, + "balance_loss_mlp": 1.07576323, + "epoch": 0.5552135436706426, + "flos": 617209293312.0, + "grad_norm": 0.033803824808406595, + "language_loss": 0.88278472, + "learning_rate": 0.0004350715355068441, + "loss": 0.89433384, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.7890625, + "step": 2886, + "time_per_iteration": 2.815993547439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154719, + "balance_loss_mlp": 1.07556415, + "epoch": 0.5554059253559062, + "flos": 464817010176.0, + "grad_norm": 0.03994579560883884, + "language_loss": 0.85848737, + "learning_rate": 0.00043476264372979847, + "loss": 0.87003452, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.7890625, + "step": 2887, + "time_per_iteration": 2.5898871421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154914, + "balance_loss_mlp": 1.07618785, + "epoch": 0.5555983070411696, + "flos": 1564874841600.0, + "grad_norm": 0.03588081892536478, + "language_loss": 0.85341823, + "learning_rate": 0.0004344537772801408, + "loss": 0.86496735, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.78613281, + "step": 2888, + "time_per_iteration": 3.880375385284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158798, + "balance_loss_mlp": 1.0821228, + "epoch": 0.5557906887264332, + "flos": 1471226681856.0, + "grad_norm": 0.005822600355857551, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74581254, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.76757812, + "step": 2889, + "time_per_iteration": 4.9117255210876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155617, + "balance_loss_mlp": 1.07670069, + "epoch": 0.5559830704116968, + "flos": 530863289856.0, + "grad_norm": 0.03666523888945824, + "language_loss": 0.89283395, + "learning_rate": 0.0004338361208426298, + "loss": 0.90439016, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.78710938, + "step": 2890, + "time_per_iteration": 2.6093485355377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155534, + "balance_loss_mlp": 1.07671309, + "epoch": 0.5561754520969604, + "flos": 652518641664.0, + "grad_norm": 0.027207956668339604, + "language_loss": 0.85981715, + "learning_rate": 0.00043352733109457164, + "loss": 0.87137252, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.78710938, + "step": 2891, + "time_per_iteration": 2.929133892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155522, + "balance_loss_mlp": 1.07670057, + "epoch": 0.556367833782224, + "flos": 735618981888.0, + "grad_norm": 0.028477777137297752, + "language_loss": 0.89055073, + "learning_rate": 0.00043321856715349244, + "loss": 0.90210593, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.78662109, + "step": 2892, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154528, + "balance_loss_mlp": 1.0758971, + "epoch": 0.5565602154674875, + "flos": 673640833536.0, + "grad_norm": 0.028305708839331062, + "language_loss": 0.85380936, + "learning_rate": 0.00043290982913926466, + "loss": 0.8653546, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.78564453, + "step": 2893, + "time_per_iteration": 2.797816038131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.07449973, + "epoch": 0.556752597152751, + "flos": 587503675392.0, + "grad_norm": 0.03108865563447884, + "language_loss": 0.90100253, + "learning_rate": 0.0004326011171717514, + "loss": 0.91253483, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.78613281, + "step": 2894, + "time_per_iteration": 2.885183334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153367, + "balance_loss_mlp": 1.07426023, + "epoch": 0.5569449788380146, + "flos": 438690700800.0, + "grad_norm": 0.03571349027789826, + "language_loss": 0.87187707, + "learning_rate": 0.0004322924313708051, + "loss": 0.88341075, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.78857422, + "step": 2895, + "time_per_iteration": 2.505321502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115508, + "balance_loss_mlp": 1.07635403, + "epoch": 0.5571373605232782, + "flos": 503247770112.0, + "grad_norm": 0.03410983593663488, + "language_loss": 0.90630054, + "learning_rate": 0.0004319837718562681, + "loss": 0.91785133, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.78613281, + "step": 2896, + "time_per_iteration": 2.6243269443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154122, + "balance_loss_mlp": 1.07530081, + "epoch": 0.5573297422085417, + "flos": 578589880320.0, + "grad_norm": 0.033933273128928194, + "language_loss": 0.88206899, + "learning_rate": 0.0004316751387479726, + "loss": 0.89361024, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.78662109, + "step": 2897, + "time_per_iteration": 2.7566635608673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.074579, + "epoch": 0.5575221238938053, + "flos": 1346047512576.0, + "grad_norm": 0.03456307454544867, + "language_loss": 0.88955474, + "learning_rate": 0.0004313665321657409, + "loss": 0.90108681, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.78564453, + "step": 2898, + "time_per_iteration": 3.766465187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155323, + "balance_loss_mlp": 1.07616794, + "epoch": 0.5577145055790689, + "flos": 603098724864.0, + "grad_norm": 0.03371138021934881, + "language_loss": 0.86232543, + "learning_rate": 0.00043105795222938436, + "loss": 0.8738786, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.7890625, + "step": 2899, + "time_per_iteration": 2.7334022521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155497, + "balance_loss_mlp": 1.07658088, + "epoch": 0.5579068872643325, + "flos": 563691045888.0, + "grad_norm": 0.045182395108838744, + "language_loss": 0.86075807, + "learning_rate": 0.00043074939905870467, + "loss": 0.87231296, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.78759766, + "step": 2900, + "time_per_iteration": 2.696669340133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155611, + "balance_loss_mlp": 1.0766468, + "epoch": 0.558099268949596, + "flos": 545588207616.0, + "grad_norm": 0.03640236345196184, + "language_loss": 0.86178941, + "learning_rate": 0.0004304408727734927, + "loss": 0.87334555, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.78759766, + "step": 2901, + "time_per_iteration": 2.62982439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115605, + "balance_loss_mlp": 1.07727695, + "epoch": 0.5582916506348595, + "flos": 553852724736.0, + "grad_norm": 0.027303392187282394, + "language_loss": 0.9274894, + "learning_rate": 0.0004301323734935288, + "loss": 0.93904984, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.78613281, + "step": 2902, + "time_per_iteration": 2.705291986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164959, + "balance_loss_mlp": 1.08632815, + "epoch": 0.5584840323201231, + "flos": 544424636928.0, + "grad_norm": 0.032065850930778406, + "language_loss": 0.92794406, + "learning_rate": 0.000429823901338583, + "loss": 0.93959367, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.78564453, + "step": 2903, + "time_per_iteration": 2.620115041732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162843, + "balance_loss_mlp": 1.08421218, + "epoch": 0.5586764140053867, + "flos": 817021992960.0, + "grad_norm": 0.03266293414683286, + "language_loss": 0.92888266, + "learning_rate": 0.00042951545642841513, + "loss": 0.94051105, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.78564453, + "step": 2904, + "time_per_iteration": 3.066140651702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160381, + "balance_loss_mlp": 1.08165538, + "epoch": 0.5588687956906503, + "flos": 487415677440.0, + "grad_norm": 0.02932995016233391, + "language_loss": 0.91419339, + "learning_rate": 0.0004292070388827737, + "loss": 0.92579722, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.78613281, + "step": 2905, + "time_per_iteration": 2.5493688583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153401, + "balance_loss_mlp": 1.07453251, + "epoch": 0.5590611773759138, + "flos": 453068511744.0, + "grad_norm": 0.02745082882239035, + "language_loss": 0.85835731, + "learning_rate": 0.00042889864882139753, + "loss": 0.86989129, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.78710938, + "step": 2906, + "time_per_iteration": 2.572270631790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115253, + "balance_loss_mlp": 1.07347012, + "epoch": 0.5592535590611774, + "flos": 521956225536.0, + "grad_norm": 0.03525028250709423, + "language_loss": 0.87143886, + "learning_rate": 0.0004285902863640139, + "loss": 0.88296419, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.78857422, + "step": 2907, + "time_per_iteration": 2.657799482345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.07448292, + "epoch": 0.5594459407464409, + "flos": 553600945152.0, + "grad_norm": 0.02873947635122419, + "language_loss": 0.90871602, + "learning_rate": 0.00042828195163033966, + "loss": 0.92024809, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.78613281, + "step": 2908, + "time_per_iteration": 2.6421632766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152251, + "balance_loss_mlp": 1.07323921, + "epoch": 0.5596383224317045, + "flos": 485787479040.0, + "grad_norm": 0.030747286656696786, + "language_loss": 0.84394485, + "learning_rate": 0.0004279736447400812, + "loss": 0.85546738, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.78808594, + "step": 2909, + "time_per_iteration": 2.571681022644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152122, + "balance_loss_mlp": 1.07344413, + "epoch": 0.5598307041169681, + "flos": 612379092480.0, + "grad_norm": 0.030942423142950287, + "language_loss": 0.83957374, + "learning_rate": 0.00042766536581293385, + "loss": 0.85109496, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.78613281, + "step": 2910, + "time_per_iteration": 2.7282116413116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155729, + "balance_loss_mlp": 1.07662177, + "epoch": 0.5600230858022316, + "flos": 489916735488.0, + "grad_norm": 0.03226747500803281, + "language_loss": 0.85277241, + "learning_rate": 0.0004273571149685819, + "loss": 0.86432964, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.78857422, + "step": 2911, + "time_per_iteration": 2.787032127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154593, + "balance_loss_mlp": 1.0759151, + "epoch": 0.5602154674874952, + "flos": 599981316096.0, + "grad_norm": 0.03215276166374932, + "language_loss": 0.88704693, + "learning_rate": 0.00042704889232669937, + "loss": 0.89859283, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.78613281, + "step": 2912, + "time_per_iteration": 2.686586856842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.07611275, + "epoch": 0.5604078491727588, + "flos": 587062516224.0, + "grad_norm": 0.032254540051477425, + "language_loss": 0.9111523, + "learning_rate": 0.0004267406980069484, + "loss": 0.92269969, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.78466797, + "step": 2913, + "time_per_iteration": 2.6899847984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154041, + "balance_loss_mlp": 1.07545817, + "epoch": 0.5606002308580224, + "flos": 542327808000.0, + "grad_norm": 0.028324891167666608, + "language_loss": 0.8452785, + "learning_rate": 0.0004264325321289808, + "loss": 0.85681891, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.78515625, + "step": 2914, + "time_per_iteration": 2.770299196243286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151899, + "balance_loss_mlp": 1.07331622, + "epoch": 0.5607926125432858, + "flos": 585078478848.0, + "grad_norm": 0.03365993170310601, + "language_loss": 0.91764051, + "learning_rate": 0.00042612439481243736, + "loss": 0.92915952, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.78515625, + "step": 2915, + "time_per_iteration": 2.7451834678649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162837, + "balance_loss_mlp": 1.08406377, + "epoch": 0.5609849942285494, + "flos": 628630150656.0, + "grad_norm": 0.03395322139017605, + "language_loss": 0.95402431, + "learning_rate": 0.00042581628617694735, + "loss": 0.96565264, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.78613281, + "step": 2916, + "time_per_iteration": 2.7379772663116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157541, + "balance_loss_mlp": 1.07871938, + "epoch": 0.561177375913813, + "flos": 589454785536.0, + "grad_norm": 0.03197816551531196, + "language_loss": 0.86920869, + "learning_rate": 0.0004255082063421296, + "loss": 0.88078409, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.78759766, + "step": 2917, + "time_per_iteration": 2.7153422832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161631, + "balance_loss_mlp": 1.08285797, + "epoch": 0.5613697575990766, + "flos": 528143379456.0, + "grad_norm": 0.03128753614155992, + "language_loss": 0.89917612, + "learning_rate": 0.00042520015542759065, + "loss": 0.91079247, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.78710938, + "step": 2918, + "time_per_iteration": 2.8688042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165136, + "balance_loss_mlp": 1.08636212, + "epoch": 0.5615621392843402, + "flos": 643874090496.0, + "grad_norm": 0.03249260096588731, + "language_loss": 0.93211949, + "learning_rate": 0.00042489213355292687, + "loss": 0.94377089, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.78613281, + "step": 2919, + "time_per_iteration": 2.8982832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167734, + "balance_loss_mlp": 1.08900821, + "epoch": 0.5617545209696037, + "flos": 428656995840.0, + "grad_norm": 0.034334958581954525, + "language_loss": 0.87036526, + "learning_rate": 0.00042458414083772276, + "loss": 0.88204259, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.78466797, + "step": 2920, + "time_per_iteration": 2.5067636966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164187, + "balance_loss_mlp": 1.08536625, + "epoch": 0.5619469026548672, + "flos": 569589490176.0, + "grad_norm": 0.025989129211014445, + "language_loss": 0.89547098, + "learning_rate": 0.000424276177401552, + "loss": 0.90711284, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.78710938, + "step": 2921, + "time_per_iteration": 2.810723304748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.07975173, + "epoch": 0.5621392843401308, + "flos": 506243655168.0, + "grad_norm": 0.03554030610259364, + "language_loss": 0.91916943, + "learning_rate": 0.0004239682433639763, + "loss": 0.93075705, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.7890625, + "step": 2922, + "time_per_iteration": 2.6607391834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159452, + "balance_loss_mlp": 1.08034527, + "epoch": 0.5623316660253944, + "flos": 518009617920.0, + "grad_norm": 0.03283867999662062, + "language_loss": 0.91225737, + "learning_rate": 0.0004236603388445467, + "loss": 0.92385185, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.78955078, + "step": 2923, + "time_per_iteration": 2.586524248123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159206, + "balance_loss_mlp": 1.08043242, + "epoch": 0.5625240477106579, + "flos": 607138658304.0, + "grad_norm": 0.07898356089021562, + "language_loss": 0.87176222, + "learning_rate": 0.00042335246396280166, + "loss": 0.88335431, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.78710938, + "step": 2924, + "time_per_iteration": 2.7597639560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.08004844, + "epoch": 0.5627164293959215, + "flos": 451340256768.0, + "grad_norm": 0.0302800933285396, + "language_loss": 0.96241242, + "learning_rate": 0.0004230446188382693, + "loss": 0.97400308, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.7890625, + "step": 2925, + "time_per_iteration": 2.573899030685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07977474, + "epoch": 0.5629088110811851, + "flos": 743436335616.0, + "grad_norm": 0.03229142562201564, + "language_loss": 0.85888505, + "learning_rate": 0.0004227368035904654, + "loss": 0.87047106, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.78759766, + "step": 2926, + "time_per_iteration": 2.9811575412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161727, + "balance_loss_mlp": 1.08295333, + "epoch": 0.5631011927664487, + "flos": 497979138048.0, + "grad_norm": 0.030188812186764755, + "language_loss": 0.88692701, + "learning_rate": 0.00042242901833889474, + "loss": 0.89854425, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.78710938, + "step": 2927, + "time_per_iteration": 2.6326565742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160764, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5632935744517122, + "flos": 887594300928.0, + "grad_norm": 0.033144673445412554, + "language_loss": 0.91819888, + "learning_rate": 0.0004221212632030501, + "loss": 0.92980659, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.78759766, + "step": 2928, + "time_per_iteration": 3.0669453144073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115887, + "balance_loss_mlp": 1.08014381, + "epoch": 0.5634859561369757, + "flos": 605901227520.0, + "grad_norm": 0.03167965641147859, + "language_loss": 0.85548306, + "learning_rate": 0.0004218135383024124, + "loss": 0.86707169, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.78662109, + "step": 2929, + "time_per_iteration": 2.704127788543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154152, + "balance_loss_mlp": 1.07542574, + "epoch": 0.5636783378222393, + "flos": 454902827520.0, + "grad_norm": 0.0331862396137692, + "language_loss": 0.91072655, + "learning_rate": 0.0004215058437564511, + "loss": 0.92226809, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.78662109, + "step": 2930, + "time_per_iteration": 2.5648486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07496285, + "epoch": 0.5638707195075029, + "flos": 519461898240.0, + "grad_norm": 0.030026295980520465, + "language_loss": 0.87243164, + "learning_rate": 0.00042119817968462397, + "loss": 0.88396895, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.78613281, + "step": 2931, + "time_per_iteration": 2.596165895462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.07572603, + "epoch": 0.5640631011927665, + "flos": 565844270592.0, + "grad_norm": 0.035813464167598875, + "language_loss": 0.92307299, + "learning_rate": 0.0004208905462063766, + "loss": 0.934618, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.78564453, + "step": 2932, + "time_per_iteration": 2.6596782207489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161524, + "balance_loss_mlp": 1.0827024, + "epoch": 0.56425548287803, + "flos": 518037815808.0, + "grad_norm": 0.03163601566095553, + "language_loss": 0.90576756, + "learning_rate": 0.00042058294344114315, + "loss": 0.91738278, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.78564453, + "step": 2933, + "time_per_iteration": 2.6681416034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5644478645632935, + "flos": 855669603840.0, + "grad_norm": 0.031443670044009366, + "language_loss": 0.83703303, + "learning_rate": 0.0004202753715083456, + "loss": 0.84863651, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.78515625, + "step": 2934, + "time_per_iteration": 3.1047325134277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159543, + "balance_loss_mlp": 1.08081746, + "epoch": 0.5646402462485571, + "flos": 554495271936.0, + "grad_norm": 0.034946601892201584, + "language_loss": 0.87802339, + "learning_rate": 0.0004199678305273936, + "loss": 0.88961881, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.78613281, + "step": 2935, + "time_per_iteration": 2.649768352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159598, + "balance_loss_mlp": 1.08092046, + "epoch": 0.5648326279338207, + "flos": 687310969344.0, + "grad_norm": 0.04027660967531297, + "language_loss": 0.86366433, + "learning_rate": 0.0004196603206176854, + "loss": 0.87526035, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.78613281, + "step": 2936, + "time_per_iteration": 2.916745662689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.08003819, + "epoch": 0.5650250096190843, + "flos": 804682613760.0, + "grad_norm": 0.03045212290633188, + "language_loss": 0.89034498, + "learning_rate": 0.000419352841898607, + "loss": 0.9019326, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.78662109, + "step": 2937, + "time_per_iteration": 3.019742250442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154573, + "balance_loss_mlp": 1.07541847, + "epoch": 0.5652173913043478, + "flos": 583144106496.0, + "grad_norm": 0.0352415717236192, + "language_loss": 0.82975399, + "learning_rate": 0.000419045394489532, + "loss": 0.84129971, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.79003906, + "step": 2938, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155775, + "balance_loss_mlp": 1.07661998, + "epoch": 0.5654097729896114, + "flos": 822167099904.0, + "grad_norm": 0.030545896529673648, + "language_loss": 0.81679785, + "learning_rate": 0.0004187379785098224, + "loss": 0.82835561, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.7890625, + "step": 2939, + "time_per_iteration": 3.125208854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155934, + "balance_loss_mlp": 1.07682657, + "epoch": 0.565602154674875, + "flos": 785481332736.0, + "grad_norm": 0.038076573598017076, + "language_loss": 0.89879513, + "learning_rate": 0.00041843059407882744, + "loss": 0.9103545, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.78857422, + "step": 2940, + "time_per_iteration": 2.9577417373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.07814884, + "epoch": 0.5657945363601385, + "flos": 550744048128.0, + "grad_norm": 0.03292975836505615, + "language_loss": 0.88439214, + "learning_rate": 0.0004181232413158842, + "loss": 0.89596379, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.78759766, + "step": 2941, + "time_per_iteration": 2.636016845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156819, + "balance_loss_mlp": 1.07771146, + "epoch": 0.5659869180454021, + "flos": 669331656192.0, + "grad_norm": 0.0384606105275957, + "language_loss": 0.88344961, + "learning_rate": 0.0004178159203403179, + "loss": 0.89501786, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.78857422, + "step": 2942, + "time_per_iteration": 2.873724937438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157408, + "balance_loss_mlp": 1.07839596, + "epoch": 0.5661792997306656, + "flos": 500948826624.0, + "grad_norm": 0.031907837289758996, + "language_loss": 0.86677325, + "learning_rate": 0.0004175086312714409, + "loss": 0.8783474, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.78808594, + "step": 2943, + "time_per_iteration": 2.553450107574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.08138418, + "epoch": 0.5663716814159292, + "flos": 602362851840.0, + "grad_norm": 0.02897032807353051, + "language_loss": 0.8872959, + "learning_rate": 0.00041720137422855366, + "loss": 0.89889991, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.78759766, + "step": 2944, + "time_per_iteration": 2.7116591930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159876, + "balance_loss_mlp": 1.08095932, + "epoch": 0.5665640631011928, + "flos": 542032367616.0, + "grad_norm": 0.031139658556859174, + "language_loss": 0.83964241, + "learning_rate": 0.00041689414933094383, + "loss": 0.85124123, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.78710938, + "step": 2945, + "time_per_iteration": 2.638216495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158364, + "balance_loss_mlp": 1.07968628, + "epoch": 0.5667564447864564, + "flos": 603061794816.0, + "grad_norm": 0.037847476611961306, + "language_loss": 0.8757143, + "learning_rate": 0.00041658695669788653, + "loss": 0.88729787, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.78613281, + "step": 2946, + "time_per_iteration": 2.736724615097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159515, + "balance_loss_mlp": 1.08074152, + "epoch": 0.5669488264717198, + "flos": 660722033664.0, + "grad_norm": 0.03809672024086723, + "language_loss": 0.87564874, + "learning_rate": 0.00041627979644864453, + "loss": 0.88724387, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.78662109, + "step": 2947, + "time_per_iteration": 2.787102460861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160652, + "balance_loss_mlp": 1.08192623, + "epoch": 0.5671412081569834, + "flos": 486382362624.0, + "grad_norm": 0.028726289994514737, + "language_loss": 0.86769605, + "learning_rate": 0.0004159726687024683, + "loss": 0.87930262, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.78662109, + "step": 2948, + "time_per_iteration": 2.627268075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157953, + "balance_loss_mlp": 1.07917941, + "epoch": 0.567333589842247, + "flos": 731060026368.0, + "grad_norm": 0.031224685517340662, + "language_loss": 0.85094821, + "learning_rate": 0.00041566557357859506, + "loss": 0.86252779, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.78710938, + "step": 2949, + "time_per_iteration": 2.903480052947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115639, + "balance_loss_mlp": 1.07737851, + "epoch": 0.5675259715275106, + "flos": 970558381056.0, + "grad_norm": 0.02889906202993953, + "language_loss": 0.84761345, + "learning_rate": 0.0004153585111962502, + "loss": 0.85917735, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.78857422, + "step": 2950, + "time_per_iteration": 3.327157497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155395, + "balance_loss_mlp": 1.07638264, + "epoch": 0.5677183532127742, + "flos": 566213571072.0, + "grad_norm": 0.036221800053715905, + "language_loss": 0.90357536, + "learning_rate": 0.0004150514816746453, + "loss": 0.9151293, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.78857422, + "step": 2951, + "time_per_iteration": 2.664881467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155178, + "balance_loss_mlp": 1.07640433, + "epoch": 0.5679107348980377, + "flos": 552745549824.0, + "grad_norm": 0.032718571293428464, + "language_loss": 0.90599716, + "learning_rate": 0.0004147444851329802, + "loss": 0.91754901, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.78710938, + "step": 2952, + "time_per_iteration": 2.659607410430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156293, + "balance_loss_mlp": 1.07752001, + "epoch": 0.5681031165833013, + "flos": 820840346112.0, + "grad_norm": 0.029462667986489877, + "language_loss": 0.91018391, + "learning_rate": 0.00041443752169044126, + "loss": 0.92174685, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.78710938, + "step": 2953, + "time_per_iteration": 3.0214719772338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115648, + "balance_loss_mlp": 1.07775402, + "epoch": 0.5682954982685648, + "flos": 619145667072.0, + "grad_norm": 0.03021657930021912, + "language_loss": 0.89565808, + "learning_rate": 0.0004141305914662025, + "loss": 0.90722287, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.78662109, + "step": 2954, + "time_per_iteration": 2.7215545177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154854, + "balance_loss_mlp": 1.07608008, + "epoch": 0.5684878799538284, + "flos": 649251511296.0, + "grad_norm": 0.03170231797387521, + "language_loss": 0.85884857, + "learning_rate": 0.0004138236945794246, + "loss": 0.87039715, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.78613281, + "step": 2955, + "time_per_iteration": 2.896960496902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154548, + "balance_loss_mlp": 1.07587004, + "epoch": 0.5686802616390919, + "flos": 807352859136.0, + "grad_norm": 0.03477888356704498, + "language_loss": 0.88849628, + "learning_rate": 0.00041351683114925576, + "loss": 0.90004176, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.78564453, + "step": 2956, + "time_per_iteration": 3.056138753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155475, + "balance_loss_mlp": 1.07698798, + "epoch": 0.5688726433243555, + "flos": 548175860736.0, + "grad_norm": 0.02988071875067647, + "language_loss": 0.91774637, + "learning_rate": 0.0004132100012948308, + "loss": 0.92930108, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.78320312, + "step": 2957, + "time_per_iteration": 2.620039701461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153148, + "balance_loss_mlp": 1.07475579, + "epoch": 0.5690650250096191, + "flos": 487545933312.0, + "grad_norm": 0.03388139796228596, + "language_loss": 0.90210378, + "learning_rate": 0.00041290320513527145, + "loss": 0.91363525, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.78222656, + "step": 2958, + "time_per_iteration": 2.5424137115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158065, + "balance_loss_mlp": 1.07953036, + "epoch": 0.5692574066948827, + "flos": 578554951680.0, + "grad_norm": 0.03065337308060062, + "language_loss": 0.9014492, + "learning_rate": 0.0004125964427896867, + "loss": 0.91302985, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.78369141, + "step": 2959, + "time_per_iteration": 2.6540746688842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157828, + "balance_loss_mlp": 1.07924569, + "epoch": 0.5694497883801463, + "flos": 455219735040.0, + "grad_norm": 0.03288997710459115, + "language_loss": 0.8486557, + "learning_rate": 0.0004122897143771723, + "loss": 0.86023396, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.78515625, + "step": 2960, + "time_per_iteration": 2.5677952766418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157581, + "balance_loss_mlp": 1.07899833, + "epoch": 0.5696421700654097, + "flos": 560582369280.0, + "grad_norm": 0.029260680521972587, + "language_loss": 0.86686659, + "learning_rate": 0.0004119830200168109, + "loss": 0.87844241, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.78515625, + "step": 2961, + "time_per_iteration": 2.661398410797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116102, + "balance_loss_mlp": 1.08243668, + "epoch": 0.5698345517506733, + "flos": 466501604352.0, + "grad_norm": 0.06131137217333051, + "language_loss": 0.93434393, + "learning_rate": 0.0004116763598276714, + "loss": 0.94595408, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.78515625, + "step": 2962, + "time_per_iteration": 2.5421509742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161307, + "balance_loss_mlp": 1.08267653, + "epoch": 0.5700269334359369, + "flos": 607191051264.0, + "grad_norm": 0.033090735660708526, + "language_loss": 0.8645342, + "learning_rate": 0.00041136973392881017, + "loss": 0.87614727, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.78515625, + "step": 2963, + "time_per_iteration": 2.826312303543091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116111, + "balance_loss_mlp": 1.08233654, + "epoch": 0.5702193151212005, + "flos": 563856230400.0, + "grad_norm": 0.029371137494056676, + "language_loss": 0.87366056, + "learning_rate": 0.00041106314243926983, + "loss": 0.88527167, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.78613281, + "step": 2964, + "time_per_iteration": 2.729848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163001, + "balance_loss_mlp": 1.08432257, + "epoch": 0.570411696806464, + "flos": 524309563392.0, + "grad_norm": 0.030081020285570834, + "language_loss": 0.91922152, + "learning_rate": 0.0004107565854780798, + "loss": 0.93085158, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.78564453, + "step": 2965, + "time_per_iteration": 2.6243247985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162398, + "balance_loss_mlp": 1.08348167, + "epoch": 0.5706040784917276, + "flos": 719471983104.0, + "grad_norm": 0.03134673766290682, + "language_loss": 0.86833286, + "learning_rate": 0.000410450063164256, + "loss": 0.87995684, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.78710938, + "step": 2966, + "time_per_iteration": 2.8488268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160157, + "balance_loss_mlp": 1.08109784, + "epoch": 0.5707964601769911, + "flos": 477670682112.0, + "grad_norm": 0.03469711129941245, + "language_loss": 0.88420385, + "learning_rate": 0.00041014357561680115, + "loss": 0.89580548, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.78808594, + "step": 2967, + "time_per_iteration": 2.531399965286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158843, + "balance_loss_mlp": 1.07997382, + "epoch": 0.5709888418622547, + "flos": 581216464896.0, + "grad_norm": 0.0299141756983156, + "language_loss": 0.91230297, + "learning_rate": 0.0004098371229547039, + "loss": 0.92389137, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.78662109, + "step": 2968, + "time_per_iteration": 2.7010715007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166153, + "balance_loss_mlp": 1.08947754, + "epoch": 0.5711812235475183, + "flos": 1583192707584.0, + "grad_norm": 0.007250174551889785, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8117696, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.76757812, + "step": 2969, + "time_per_iteration": 4.720959663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158975, + "balance_loss_mlp": 1.08001077, + "epoch": 0.5713736052327818, + "flos": 469497489408.0, + "grad_norm": 0.030927251593918268, + "language_loss": 0.85219097, + "learning_rate": 0.00040922432276247107, + "loss": 0.86378068, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.78710938, + "step": 2970, + "time_per_iteration": 2.5976855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155112, + "balance_loss_mlp": 1.07610035, + "epoch": 0.5715659869180454, + "flos": 538754503680.0, + "grad_norm": 0.02782082883725602, + "language_loss": 0.88734138, + "learning_rate": 0.0004089179754702457, + "loss": 0.89889252, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.78759766, + "step": 2971, + "time_per_iteration": 2.735511064529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155002, + "balance_loss_mlp": 1.07608509, + "epoch": 0.571758368603309, + "flos": 657250787328.0, + "grad_norm": 0.03021364085019089, + "language_loss": 0.86246514, + "learning_rate": 0.00040861166353919843, + "loss": 0.87401509, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.78710938, + "step": 2972, + "time_per_iteration": 2.784243583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156758, + "balance_loss_mlp": 1.07808018, + "epoch": 0.5719507502885726, + "flos": 669099342336.0, + "grad_norm": 0.04093131787913085, + "language_loss": 0.87037605, + "learning_rate": 0.00040830538708824983, + "loss": 0.8819437, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.78564453, + "step": 2973, + "time_per_iteration": 2.847334861755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156641, + "balance_loss_mlp": 1.07815385, + "epoch": 0.572143131973836, + "flos": 477279914496.0, + "grad_norm": 0.029260532033913305, + "language_loss": 0.87478364, + "learning_rate": 0.000407999146236307, + "loss": 0.88635004, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.78417969, + "step": 2974, + "time_per_iteration": 2.5809874534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156886, + "balance_loss_mlp": 1.07849395, + "epoch": 0.5723355136590996, + "flos": 540534425088.0, + "grad_norm": 0.03484414683288605, + "language_loss": 0.89636898, + "learning_rate": 0.0004076929411022634, + "loss": 0.90793782, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.78320312, + "step": 2975, + "time_per_iteration": 2.631016969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156314, + "balance_loss_mlp": 1.07782686, + "epoch": 0.5725278953443632, + "flos": 825649079808.0, + "grad_norm": 0.03393435544828211, + "language_loss": 0.84972572, + "learning_rate": 0.0004073867718049982, + "loss": 0.86128891, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.78369141, + "step": 2976, + "time_per_iteration": 3.09523606300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158881, + "balance_loss_mlp": 1.08044088, + "epoch": 0.5727202770296268, + "flos": 588569190912.0, + "grad_norm": 0.031011693938846972, + "language_loss": 0.87586653, + "learning_rate": 0.00040708063846337704, + "loss": 0.88745534, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.78222656, + "step": 2977, + "time_per_iteration": 2.7148561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159545, + "balance_loss_mlp": 1.08100963, + "epoch": 0.5729126587148904, + "flos": 447940869120.0, + "grad_norm": 0.0318916011479424, + "language_loss": 0.87124234, + "learning_rate": 0.00040677454119625143, + "loss": 0.88283777, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.78320312, + "step": 2978, + "time_per_iteration": 2.6003363132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.0804776, + "epoch": 0.5731050404001539, + "flos": 520467015168.0, + "grad_norm": 0.03318988951179658, + "language_loss": 0.88396186, + "learning_rate": 0.0004064684801224587, + "loss": 0.89555109, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.78173828, + "step": 2979, + "time_per_iteration": 2.6103272438049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160698, + "balance_loss_mlp": 1.08225846, + "epoch": 0.5732974220854175, + "flos": 505770295296.0, + "grad_norm": 0.029710652762807207, + "language_loss": 0.85663891, + "learning_rate": 0.00040616245536082224, + "loss": 0.86824596, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.78222656, + "step": 2980, + "time_per_iteration": 2.5594868659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.08078313, + "epoch": 0.573489803770681, + "flos": 593677367808.0, + "grad_norm": 0.027966372317681742, + "language_loss": 0.86258745, + "learning_rate": 0.00040585646703015165, + "loss": 0.87417924, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.78320312, + "step": 2981, + "time_per_iteration": 2.789937734603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.07878125, + "epoch": 0.5736821854559446, + "flos": 490869459456.0, + "grad_norm": 0.031111464824263694, + "language_loss": 0.83780992, + "learning_rate": 0.0004055505152492419, + "loss": 0.84938312, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.78466797, + "step": 2982, + "time_per_iteration": 2.6471428871154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158265, + "balance_loss_mlp": 1.07963431, + "epoch": 0.5738745671412081, + "flos": 459201271296.0, + "grad_norm": 0.03311000411840089, + "language_loss": 0.79528159, + "learning_rate": 0.00040524460013687425, + "loss": 0.80686426, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.78564453, + "step": 2983, + "time_per_iteration": 2.708540678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155372, + "balance_loss_mlp": 1.07650268, + "epoch": 0.5740669488264717, + "flos": 581620694016.0, + "grad_norm": 0.028109694322635652, + "language_loss": 0.86855406, + "learning_rate": 0.0004049387218118155, + "loss": 0.88010776, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.78759766, + "step": 2984, + "time_per_iteration": 2.926750421524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155283, + "balance_loss_mlp": 1.07622325, + "epoch": 0.5742593305117353, + "flos": 525573190656.0, + "grad_norm": 0.03395381439898354, + "language_loss": 0.91635472, + "learning_rate": 0.00040463288039281777, + "loss": 0.92790747, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.78857422, + "step": 2985, + "time_per_iteration": 2.704287528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162964, + "balance_loss_mlp": 1.08666992, + "epoch": 0.5744517121969989, + "flos": 1557266511360.0, + "grad_norm": 0.007878379047691413, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.79039383, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.76367188, + "step": 2986, + "time_per_iteration": 4.989194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155742, + "balance_loss_mlp": 1.07677734, + "epoch": 0.5746440938822625, + "flos": 753202798080.0, + "grad_norm": 0.03402997808137808, + "language_loss": 0.87620312, + "learning_rate": 0.0004040213087479444, + "loss": 0.88776052, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.78759766, + "step": 2987, + "time_per_iteration": 2.9275078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163311, + "balance_loss_mlp": 1.08453715, + "epoch": 0.5748364755675259, + "flos": 502857002496.0, + "grad_norm": 0.03361733343242669, + "language_loss": 0.90824878, + "learning_rate": 0.0004037155787595018, + "loss": 0.91988194, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.78710938, + "step": 2988, + "time_per_iteration": 2.576448440551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160011, + "balance_loss_mlp": 1.08109498, + "epoch": 0.5750288572527895, + "flos": 505197605376.0, + "grad_norm": 0.02880586923954642, + "language_loss": 0.85724807, + "learning_rate": 0.000403409886151987, + "loss": 0.86884815, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.78759766, + "step": 2989, + "time_per_iteration": 2.916322946548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157013, + "balance_loss_mlp": 1.08033752, + "epoch": 0.5752212389380531, + "flos": 1544675352576.0, + "grad_norm": 0.005932241765552608, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83156121, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.765625, + "step": 2990, + "time_per_iteration": 4.758445978164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.08269501, + "epoch": 0.5754136206233167, + "flos": 1570671406080.0, + "grad_norm": 0.005822498768858246, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.7935797, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.765625, + "step": 2991, + "time_per_iteration": 4.785308122634888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163075, + "balance_loss_mlp": 1.08420658, + "epoch": 0.5756060023085803, + "flos": 799561701888.0, + "grad_norm": 0.0320241684810352, + "language_loss": 0.81581879, + "learning_rate": 0.00040249303380173807, + "loss": 0.82744956, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.78808594, + "step": 2992, + "time_per_iteration": 3.060910940170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160943, + "balance_loss_mlp": 1.08202648, + "epoch": 0.5757983839938438, + "flos": 589033818624.0, + "grad_norm": 0.033230938583522406, + "language_loss": 0.85061818, + "learning_rate": 0.00040218749190459126, + "loss": 0.86222756, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.78857422, + "step": 2993, + "time_per_iteration": 2.722538948059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159067, + "balance_loss_mlp": 1.08029306, + "epoch": 0.5759907656791073, + "flos": 517851164160.0, + "grad_norm": 0.036503805232005304, + "language_loss": 0.88598883, + "learning_rate": 0.00040188198798162775, + "loss": 0.89757949, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.78662109, + "step": 2994, + "time_per_iteration": 2.626763105392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157444, + "balance_loss_mlp": 1.078861, + "epoch": 0.5761831473643709, + "flos": 588289213440.0, + "grad_norm": 0.030677551313055676, + "language_loss": 0.90523088, + "learning_rate": 0.000401576522151455, + "loss": 0.91680533, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.78466797, + "step": 2995, + "time_per_iteration": 2.8290417194366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156979, + "balance_loss_mlp": 1.07839644, + "epoch": 0.5763755290496345, + "flos": 545008786944.0, + "grad_norm": 0.030026851509959627, + "language_loss": 0.87201327, + "learning_rate": 0.0004012710945326651, + "loss": 0.88358307, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.78515625, + "step": 2996, + "time_per_iteration": 2.78725004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156215, + "balance_loss_mlp": 1.07767999, + "epoch": 0.576567910734898, + "flos": 627427648512.0, + "grad_norm": 0.03065527687354923, + "language_loss": 0.86651611, + "learning_rate": 0.0004009657052438355, + "loss": 0.87807822, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.78271484, + "step": 2997, + "time_per_iteration": 2.8221359252929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156096, + "balance_loss_mlp": 1.07756102, + "epoch": 0.5767602924201616, + "flos": 539277528576.0, + "grad_norm": 0.032463443859892846, + "language_loss": 0.9117527, + "learning_rate": 0.00040066035440352904, + "loss": 0.92331362, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.78271484, + "step": 2998, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169762, + "balance_loss_mlp": 1.09403992, + "epoch": 0.5769526741054252, + "flos": 1563023239680.0, + "grad_norm": 0.012552051598097233, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80462897, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.7578125, + "step": 2999, + "time_per_iteration": 4.9131574630737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.0844425, + "epoch": 0.5771450557906888, + "flos": 469171849728.0, + "grad_norm": 0.03695219944655869, + "language_loss": 0.82297212, + "learning_rate": 0.00040004976854266145, + "loss": 0.83459759, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.78027344, + "step": 3000, + "time_per_iteration": 2.599562406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08321714, + "epoch": 0.5773374374759523, + "flos": 575632926720.0, + "grad_norm": 0.03253250172707863, + "language_loss": 0.86701882, + "learning_rate": 0.0003997445337591505, + "loss": 0.87863207, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.78027344, + "step": 3001, + "time_per_iteration": 2.651052951812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161912, + "balance_loss_mlp": 1.08380568, + "epoch": 0.5775298191612158, + "flos": 529504335360.0, + "grad_norm": 0.030455172240490772, + "language_loss": 0.78589356, + "learning_rate": 0.0003994393378982635, + "loss": 0.79751271, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.78027344, + "step": 3002, + "time_per_iteration": 2.6081488132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162445, + "balance_loss_mlp": 1.08576965, + "epoch": 0.5777222008464794, + "flos": 1306896520704.0, + "grad_norm": 0.00976162227486582, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80700445, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.765625, + "step": 3003, + "time_per_iteration": 4.794616460800171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154088, + "balance_loss_mlp": 1.07550502, + "epoch": 0.577914582531743, + "flos": 604792051200.0, + "grad_norm": 0.035927509548420514, + "language_loss": 0.93844306, + "learning_rate": 0.0003988290634182961, + "loss": 0.94998395, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.78417969, + "step": 3004, + "time_per_iteration": 2.7580206394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152956, + "balance_loss_mlp": 1.07465923, + "epoch": 0.5781069642170066, + "flos": 487832641536.0, + "grad_norm": 0.03166140659951907, + "language_loss": 0.85788441, + "learning_rate": 0.0003985239850361453, + "loss": 0.86941397, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.78173828, + "step": 3005, + "time_per_iteration": 2.5811102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148402, + "balance_loss_mlp": 1.0700103, + "epoch": 0.5782993459022701, + "flos": 507413956608.0, + "grad_norm": 0.03361154868402879, + "language_loss": 0.90845788, + "learning_rate": 0.0003982189460504777, + "loss": 0.9199419, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.78271484, + "step": 3006, + "time_per_iteration": 2.701486349105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.07208133, + "epoch": 0.5784917275875336, + "flos": 603294108672.0, + "grad_norm": 0.03266847587020217, + "language_loss": 0.84488243, + "learning_rate": 0.00039791394657971935, + "loss": 0.85638666, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.78222656, + "step": 3007, + "time_per_iteration": 2.7029902935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114812, + "balance_loss_mlp": 1.06953716, + "epoch": 0.5786841092727972, + "flos": 522588039168.0, + "grad_norm": 0.03327041662205967, + "language_loss": 0.89717233, + "learning_rate": 0.00039760898674228205, + "loss": 0.90865356, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.78466797, + "step": 3008, + "time_per_iteration": 2.6650431156158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163782, + "balance_loss_mlp": 1.08510339, + "epoch": 0.5788764909580608, + "flos": 768835504128.0, + "grad_norm": 0.02880825356575122, + "language_loss": 0.85863519, + "learning_rate": 0.0003973040666565613, + "loss": 0.87027305, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.78515625, + "step": 3009, + "time_per_iteration": 3.0480079650878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165461, + "balance_loss_mlp": 1.08668745, + "epoch": 0.5790688726433244, + "flos": 600331150848.0, + "grad_norm": 0.03153140111016463, + "language_loss": 0.87491179, + "learning_rate": 0.000396999186440938, + "loss": 0.8865664, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.78515625, + "step": 3010, + "time_per_iteration": 2.866971254348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.08517945, + "epoch": 0.5792612543285879, + "flos": 524105447424.0, + "grad_norm": 0.03493307290908607, + "language_loss": 0.90569246, + "learning_rate": 0.000396694346213777, + "loss": 0.91733146, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.78564453, + "step": 3011, + "time_per_iteration": 2.6576690673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160972, + "balance_loss_mlp": 1.08234167, + "epoch": 0.5794536360138515, + "flos": 878079618048.0, + "grad_norm": 0.028681737588389107, + "language_loss": 0.88734698, + "learning_rate": 0.0003963895460934276, + "loss": 0.89895672, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.78369141, + "step": 3012, + "time_per_iteration": 3.1439104080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159333, + "balance_loss_mlp": 1.08065438, + "epoch": 0.5796460176991151, + "flos": 402298372608.0, + "grad_norm": 0.038884721414284784, + "language_loss": 0.92029333, + "learning_rate": 0.00039608478619822376, + "loss": 0.93188667, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.78613281, + "step": 3013, + "time_per_iteration": 2.4331459999084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115895, + "balance_loss_mlp": 1.08032, + "epoch": 0.5798383993843786, + "flos": 619675422720.0, + "grad_norm": 0.029275699876953817, + "language_loss": 0.87518513, + "learning_rate": 0.00039578006664648394, + "loss": 0.88677466, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.78417969, + "step": 3014, + "time_per_iteration": 2.770930290222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157872, + "balance_loss_mlp": 1.07928884, + "epoch": 0.5800307810696421, + "flos": 845792351232.0, + "grad_norm": 0.03304881172222658, + "language_loss": 0.8676393, + "learning_rate": 0.0003954753875565105, + "loss": 0.87921804, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.78320312, + "step": 3015, + "time_per_iteration": 3.08627986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155618, + "balance_loss_mlp": 1.0769875, + "epoch": 0.5802231627549057, + "flos": 570364294656.0, + "grad_norm": 0.02949140039649942, + "language_loss": 0.86755216, + "learning_rate": 0.00039517074904659057, + "loss": 0.87910825, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.78369141, + "step": 3016, + "time_per_iteration": 2.685842990875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155954, + "balance_loss_mlp": 1.07732403, + "epoch": 0.5804155444401693, + "flos": 661662022656.0, + "grad_norm": 0.030068480846806175, + "language_loss": 0.90490985, + "learning_rate": 0.00039486615123499535, + "loss": 0.91646945, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.78369141, + "step": 3017, + "time_per_iteration": 2.8422367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158277, + "balance_loss_mlp": 1.07950318, + "epoch": 0.5806079261254329, + "flos": 515057393664.0, + "grad_norm": 0.0339975061302382, + "language_loss": 0.90716887, + "learning_rate": 0.00039456159423997996, + "loss": 0.91875166, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.78515625, + "step": 3018, + "time_per_iteration": 2.6301286220550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159604, + "balance_loss_mlp": 1.08116388, + "epoch": 0.5808003078106965, + "flos": 529717183488.0, + "grad_norm": 0.035522237622510534, + "language_loss": 0.94178265, + "learning_rate": 0.00039425707817978406, + "loss": 0.95337874, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.78320312, + "step": 3019, + "time_per_iteration": 2.6516103744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159065, + "balance_loss_mlp": 1.08033943, + "epoch": 0.58099268949596, + "flos": 477996321792.0, + "grad_norm": 0.033660479575399194, + "language_loss": 0.88736534, + "learning_rate": 0.00039395260317263124, + "loss": 0.89895594, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.78466797, + "step": 3020, + "time_per_iteration": 2.5736000537872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158964, + "balance_loss_mlp": 1.08033383, + "epoch": 0.5811850711812235, + "flos": 518687093760.0, + "grad_norm": 0.032372571582398105, + "language_loss": 0.90171605, + "learning_rate": 0.0003936481693367291, + "loss": 0.9133057, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.78417969, + "step": 3021, + "time_per_iteration": 2.655585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152938, + "balance_loss_mlp": 1.07416463, + "epoch": 0.5813774528664871, + "flos": 617626257408.0, + "grad_norm": 0.037353178472421755, + "language_loss": 0.94038713, + "learning_rate": 0.0003933437767902697, + "loss": 0.95191658, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.78564453, + "step": 3022, + "time_per_iteration": 2.7785356044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155749, + "balance_loss_mlp": 1.07707083, + "epoch": 0.5815698345517507, + "flos": 568603838976.0, + "grad_norm": 0.03237494754713459, + "language_loss": 0.83540273, + "learning_rate": 0.00039303942565142825, + "loss": 0.84696019, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.78466797, + "step": 3023, + "time_per_iteration": 2.8082921504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115966, + "balance_loss_mlp": 1.08122075, + "epoch": 0.5817622162370142, + "flos": 564303393792.0, + "grad_norm": 0.030406133972166762, + "language_loss": 0.81602162, + "learning_rate": 0.0003927351160383644, + "loss": 0.82761824, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.78369141, + "step": 3024, + "time_per_iteration": 2.8258216381073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115841, + "balance_loss_mlp": 1.07992303, + "epoch": 0.5819545979222778, + "flos": 460153995264.0, + "grad_norm": 0.0330231934286986, + "language_loss": 0.82985759, + "learning_rate": 0.000392430848069222, + "loss": 0.84144175, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.78369141, + "step": 3025, + "time_per_iteration": 2.552351713180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155737, + "balance_loss_mlp": 1.0769639, + "epoch": 0.5821469796075414, + "flos": 542516461056.0, + "grad_norm": 0.03445814315346002, + "language_loss": 0.88443869, + "learning_rate": 0.00039212662186212795, + "loss": 0.89599597, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.78515625, + "step": 3026, + "time_per_iteration": 2.6369402408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157395, + "balance_loss_mlp": 1.07890785, + "epoch": 0.582339361292805, + "flos": 553340433408.0, + "grad_norm": 0.029462079730168216, + "language_loss": 0.82325065, + "learning_rate": 0.0003918224375351934, + "loss": 0.83482456, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.78369141, + "step": 3027, + "time_per_iteration": 2.698915958404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116249, + "balance_loss_mlp": 1.08386004, + "epoch": 0.5825317429780685, + "flos": 497447380992.0, + "grad_norm": 0.03190253080273137, + "language_loss": 0.83360291, + "learning_rate": 0.0003915182952065135, + "loss": 0.84522784, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.78417969, + "step": 3028, + "time_per_iteration": 2.6572346687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160994, + "balance_loss_mlp": 1.08265007, + "epoch": 0.582724124663332, + "flos": 565254116352.0, + "grad_norm": 0.030478660984130428, + "language_loss": 0.92836106, + "learning_rate": 0.0003912141949941664, + "loss": 0.93997103, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.78271484, + "step": 3029, + "time_per_iteration": 2.683072090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153282, + "balance_loss_mlp": 1.07484198, + "epoch": 0.5829165063485956, + "flos": 493112007168.0, + "grad_norm": 0.03294557051603365, + "language_loss": 0.89173961, + "learning_rate": 0.0003909101370162143, + "loss": 0.90327239, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.78369141, + "step": 3030, + "time_per_iteration": 2.575670003890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160767, + "balance_loss_mlp": 1.08370972, + "epoch": 0.5831088880338592, + "flos": 1531877349888.0, + "grad_norm": 0.012849020092446796, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7359466, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.76953125, + "step": 3031, + "time_per_iteration": 4.9284889698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.07370639, + "epoch": 0.5833012697191228, + "flos": 619208793600.0, + "grad_norm": 0.02929875839371022, + "language_loss": 0.87939668, + "learning_rate": 0.0003903021482356622, + "loss": 0.89092004, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.78466797, + "step": 3032, + "time_per_iteration": 2.8254482746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152205, + "balance_loss_mlp": 1.07362223, + "epoch": 0.5834936514043862, + "flos": 769293401088.0, + "grad_norm": 0.02695668391828596, + "language_loss": 0.87565535, + "learning_rate": 0.00038999821766910465, + "loss": 0.88717741, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.78417969, + "step": 3033, + "time_per_iteration": 3.006687641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156362, + "balance_loss_mlp": 1.07796979, + "epoch": 0.5836860330896498, + "flos": 459316064256.0, + "grad_norm": 0.030677066462792797, + "language_loss": 0.91205192, + "learning_rate": 0.00038969432980902606, + "loss": 0.92361552, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.78320312, + "step": 3034, + "time_per_iteration": 2.550684690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011586, + "balance_loss_mlp": 1.08192444, + "epoch": 0.5838784147749134, + "flos": 1364196191232.0, + "grad_norm": 0.008170267563240248, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80943102, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.765625, + "step": 3035, + "time_per_iteration": 4.859564304351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154223, + "balance_loss_mlp": 1.07592607, + "epoch": 0.584070796460177, + "flos": 568288932864.0, + "grad_norm": 0.030253680936045732, + "language_loss": 0.87217242, + "learning_rate": 0.00038908668268020953, + "loss": 0.88371468, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.78222656, + "step": 3036, + "time_per_iteration": 2.7140538692474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154776, + "balance_loss_mlp": 1.07624114, + "epoch": 0.5842631781454406, + "flos": 612665800704.0, + "grad_norm": 0.02904438680956131, + "language_loss": 0.90014827, + "learning_rate": 0.00038878292364738097, + "loss": 0.91169608, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.78271484, + "step": 3037, + "time_per_iteration": 2.787289619445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157819, + "balance_loss_mlp": 1.07923615, + "epoch": 0.5844555598307041, + "flos": 464332916736.0, + "grad_norm": 0.03338514659593435, + "language_loss": 0.93144816, + "learning_rate": 0.0003884792077928508, + "loss": 0.94302636, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.78320312, + "step": 3038, + "time_per_iteration": 2.513655185699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155243, + "balance_loss_mlp": 1.07666051, + "epoch": 0.5846479415159677, + "flos": 411057716736.0, + "grad_norm": 0.039769663121131886, + "language_loss": 0.82121253, + "learning_rate": 0.0003881755352345322, + "loss": 0.83276498, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.78320312, + "step": 3039, + "time_per_iteration": 2.5270330905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154802, + "balance_loss_mlp": 1.07641041, + "epoch": 0.5848403232012312, + "flos": 492265344000.0, + "grad_norm": 0.02801571871014385, + "language_loss": 0.90901846, + "learning_rate": 0.0003878719060903207, + "loss": 0.9205665, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.78222656, + "step": 3040, + "time_per_iteration": 2.5588507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.07644928, + "epoch": 0.5850327048864948, + "flos": 585508177920.0, + "grad_norm": 0.037771067006053156, + "language_loss": 0.89005375, + "learning_rate": 0.0003875683204780961, + "loss": 0.90160316, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.78271484, + "step": 3041, + "time_per_iteration": 2.668827533721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152572, + "balance_loss_mlp": 1.07408428, + "epoch": 0.5852250865717584, + "flos": 652718028288.0, + "grad_norm": 0.037622145269810676, + "language_loss": 0.92115968, + "learning_rate": 0.00038726477851572043, + "loss": 0.93268543, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.78271484, + "step": 3042, + "time_per_iteration": 2.813145160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152742, + "balance_loss_mlp": 1.07434952, + "epoch": 0.5854174682570219, + "flos": 535619630592.0, + "grad_norm": 0.034632487357399135, + "language_loss": 0.85911977, + "learning_rate": 0.0003869612803210395, + "loss": 0.87064719, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.78222656, + "step": 3043, + "time_per_iteration": 2.6411526203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150207, + "balance_loss_mlp": 1.07176721, + "epoch": 0.5856098499422855, + "flos": 510758949888.0, + "grad_norm": 0.03364322076393535, + "language_loss": 0.8838582, + "learning_rate": 0.0003866578260118817, + "loss": 0.89536023, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.78271484, + "step": 3044, + "time_per_iteration": 2.59216570854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160339, + "balance_loss_mlp": 1.08228123, + "epoch": 0.5858022316275491, + "flos": 594992661504.0, + "grad_norm": 0.03592243508466687, + "language_loss": 0.87963545, + "learning_rate": 0.0003863544157060581, + "loss": 0.89123881, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.77978516, + "step": 3045, + "time_per_iteration": 2.6693618297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159373, + "balance_loss_mlp": 1.08131468, + "epoch": 0.5859946133128127, + "flos": 560317854720.0, + "grad_norm": 0.029657376615259006, + "language_loss": 0.86909235, + "learning_rate": 0.0003860510495213634, + "loss": 0.88068604, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.77978516, + "step": 3046, + "time_per_iteration": 2.799967050552368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159061, + "balance_loss_mlp": 1.08085966, + "epoch": 0.5861869949980761, + "flos": 554755783680.0, + "grad_norm": 0.03663253930872626, + "language_loss": 0.84493214, + "learning_rate": 0.0003857477275755746, + "loss": 0.85652274, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.78125, + "step": 3047, + "time_per_iteration": 2.6989481449127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116382, + "balance_loss_mlp": 1.08566678, + "epoch": 0.5863793766833397, + "flos": 720054131712.0, + "grad_norm": 0.029238524404730352, + "language_loss": 0.89394152, + "learning_rate": 0.00038544444998645167, + "loss": 0.90557969, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.78076172, + "step": 3048, + "time_per_iteration": 3.0829827785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162492, + "balance_loss_mlp": 1.0843389, + "epoch": 0.5865717583686033, + "flos": 473285643264.0, + "grad_norm": 0.03316519352776713, + "language_loss": 0.8619799, + "learning_rate": 0.00038514121687173767, + "loss": 0.87360477, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.78076172, + "step": 3049, + "time_per_iteration": 2.575395107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157324, + "balance_loss_mlp": 1.07897997, + "epoch": 0.5867641400538669, + "flos": 814846574592.0, + "grad_norm": 0.0318856413902076, + "language_loss": 0.87874395, + "learning_rate": 0.00038483802834915807, + "loss": 0.8903172, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.78271484, + "step": 3050, + "time_per_iteration": 2.973144292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153006, + "balance_loss_mlp": 1.07461429, + "epoch": 0.5869565217391305, + "flos": 487517735424.0, + "grad_norm": 0.034960474960603255, + "language_loss": 0.8386789, + "learning_rate": 0.00038453488453642074, + "loss": 0.85020894, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.78320312, + "step": 3051, + "time_per_iteration": 2.7100586891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152299, + "balance_loss_mlp": 1.0736686, + "epoch": 0.587148903424394, + "flos": 570512014848.0, + "grad_norm": 0.03111841936731719, + "language_loss": 0.91899282, + "learning_rate": 0.00038423178555121697, + "loss": 0.93051583, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.78466797, + "step": 3052, + "time_per_iteration": 2.713294744491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151746, + "balance_loss_mlp": 1.07316351, + "epoch": 0.5873412851096576, + "flos": 748694234112.0, + "grad_norm": 0.039836143626506074, + "language_loss": 0.90698159, + "learning_rate": 0.00038392873151121994, + "loss": 0.91849899, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.78466797, + "step": 3053, + "time_per_iteration": 3.0334441661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151743, + "balance_loss_mlp": 1.07320774, + "epoch": 0.5875336667949211, + "flos": 529187427840.0, + "grad_norm": 0.03304313685691396, + "language_loss": 0.89048851, + "learning_rate": 0.0003836257225340859, + "loss": 0.90200597, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.78417969, + "step": 3054, + "time_per_iteration": 2.612002372741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152089, + "balance_loss_mlp": 1.07360125, + "epoch": 0.5877260484801847, + "flos": 825640347648.0, + "grad_norm": 0.04168388263761463, + "language_loss": 0.87033945, + "learning_rate": 0.00038332275873745336, + "loss": 0.88186038, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.78369141, + "step": 3055, + "time_per_iteration": 3.0469071865081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07472539, + "epoch": 0.5879184301654482, + "flos": 592693718016.0, + "grad_norm": 0.028534237237830384, + "language_loss": 0.87091875, + "learning_rate": 0.0003830198402389431, + "loss": 0.88245273, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.78466797, + "step": 3056, + "time_per_iteration": 2.7129743099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116227, + "balance_loss_mlp": 1.08635712, + "epoch": 0.5881108118507118, + "flos": 1549223574528.0, + "grad_norm": 0.013735077759529469, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78511202, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.75976562, + "step": 3057, + "time_per_iteration": 4.971419334411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155251, + "balance_loss_mlp": 1.0767163, + "epoch": 0.5883031935359754, + "flos": 490598214144.0, + "grad_norm": 0.03703880470659913, + "language_loss": 0.88891268, + "learning_rate": 0.0003824141396066855, + "loss": 0.90046519, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.78417969, + "step": 3058, + "time_per_iteration": 2.5657668113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153431, + "balance_loss_mlp": 1.0749433, + "epoch": 0.588495575221239, + "flos": 583980036096.0, + "grad_norm": 0.04132288833299083, + "language_loss": 0.89364433, + "learning_rate": 0.000382111357708092, + "loss": 0.90517867, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.78417969, + "step": 3059, + "time_per_iteration": 2.7690227031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152167, + "balance_loss_mlp": 1.07377541, + "epoch": 0.5886879569065026, + "flos": 662239441920.0, + "grad_norm": 0.03195995960407152, + "language_loss": 0.89352429, + "learning_rate": 0.00038180862157792864, + "loss": 0.90504599, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.78320312, + "step": 3060, + "time_per_iteration": 2.797255039215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149069, + "balance_loss_mlp": 1.07048619, + "epoch": 0.588880338591766, + "flos": 563719243776.0, + "grad_norm": 0.031223560866560994, + "language_loss": 0.86781317, + "learning_rate": 0.0003815059313337279, + "loss": 0.87930381, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.78369141, + "step": 3061, + "time_per_iteration": 2.6690454483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149002, + "balance_loss_mlp": 1.07056284, + "epoch": 0.5890727202770296, + "flos": 555852225024.0, + "grad_norm": 0.029451906852367885, + "language_loss": 0.83063936, + "learning_rate": 0.00038120328709300436, + "loss": 0.84212935, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.78271484, + "step": 3062, + "time_per_iteration": 2.902662515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149399, + "balance_loss_mlp": 1.07095897, + "epoch": 0.5892651019622932, + "flos": 656701565952.0, + "grad_norm": 0.028569643240873292, + "language_loss": 0.89099294, + "learning_rate": 0.0003809006889732549, + "loss": 0.90248692, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.78320312, + "step": 3063, + "time_per_iteration": 2.8155622482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150242, + "balance_loss_mlp": 1.07185006, + "epoch": 0.5894574836475568, + "flos": 454132025856.0, + "grad_norm": 0.03219128848339896, + "language_loss": 0.93056011, + "learning_rate": 0.0003805981370919589, + "loss": 0.9420625, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.78173828, + "step": 3064, + "time_per_iteration": 2.533978223800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156603, + "balance_loss_mlp": 1.07840204, + "epoch": 0.5896498653328203, + "flos": 520111176192.0, + "grad_norm": 0.0315116121131164, + "language_loss": 0.89031386, + "learning_rate": 0.0003802956315665771, + "loss": 0.90187985, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.78125, + "step": 3065, + "time_per_iteration": 2.6914567947387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151617, + "balance_loss_mlp": 1.07341576, + "epoch": 0.5898422470180839, + "flos": 550084036608.0, + "grad_norm": 0.037269486879405754, + "language_loss": 0.87739515, + "learning_rate": 0.0003799931725145529, + "loss": 0.88891131, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.78125, + "step": 3066, + "time_per_iteration": 2.6040141582489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151797, + "balance_loss_mlp": 1.07359576, + "epoch": 0.5900346287033474, + "flos": 525379808256.0, + "grad_norm": 0.03210441330274425, + "language_loss": 0.90831029, + "learning_rate": 0.00037969076005331083, + "loss": 0.9198283, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.78125, + "step": 3067, + "time_per_iteration": 2.773045301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151142, + "balance_loss_mlp": 1.07298875, + "epoch": 0.590227010388611, + "flos": 568215072768.0, + "grad_norm": 0.03944068050463326, + "language_loss": 0.93933421, + "learning_rate": 0.00037938839430025817, + "loss": 0.9508456, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.78076172, + "step": 3068, + "time_per_iteration": 2.6502816677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.07148039, + "epoch": 0.5904193920738746, + "flos": 584455397376.0, + "grad_norm": 0.029602074998044806, + "language_loss": 0.90136111, + "learning_rate": 0.0003790860753727835, + "loss": 0.91285884, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.78173828, + "step": 3069, + "time_per_iteration": 2.8173305988311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148373, + "balance_loss_mlp": 1.07007682, + "epoch": 0.5906117737591381, + "flos": 530796160512.0, + "grad_norm": 0.03761421694137887, + "language_loss": 0.88493633, + "learning_rate": 0.00037878380338825766, + "loss": 0.89642012, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.78173828, + "step": 3070, + "time_per_iteration": 2.6682841777801514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148557, + "balance_loss_mlp": 1.07059419, + "epoch": 0.5908041554444017, + "flos": 685515585024.0, + "grad_norm": 0.029847469423829834, + "language_loss": 0.85616612, + "learning_rate": 0.00037848157846403287, + "loss": 0.86765176, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.77880859, + "step": 3071, + "time_per_iteration": 2.942607879638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148313, + "balance_loss_mlp": 1.07015908, + "epoch": 0.5909965371296653, + "flos": 551132814336.0, + "grad_norm": 0.030659229377642858, + "language_loss": 0.88636756, + "learning_rate": 0.0003781794007174435, + "loss": 0.89785063, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.78076172, + "step": 3072, + "time_per_iteration": 2.7619588375091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159439, + "balance_loss_mlp": 1.08276367, + "epoch": 0.5911889188149289, + "flos": 1495642200576.0, + "grad_norm": 0.009662354088300913, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75233972, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.765625, + "step": 3073, + "time_per_iteration": 4.855187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115096, + "balance_loss_mlp": 1.07275867, + "epoch": 0.5913813005001923, + "flos": 488885422080.0, + "grad_norm": 0.030913240812320716, + "language_loss": 0.86239564, + "learning_rate": 0.0003775751872264152, + "loss": 0.87390518, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.78125, + "step": 3074, + "time_per_iteration": 2.7676284313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150595, + "balance_loss_mlp": 1.0724895, + "epoch": 0.5915736821854559, + "flos": 574521748992.0, + "grad_norm": 0.02774902568268271, + "language_loss": 0.91979122, + "learning_rate": 0.0003772731517165527, + "loss": 0.93129718, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.78027344, + "step": 3075, + "time_per_iteration": 2.7969858646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146907, + "balance_loss_mlp": 1.06884861, + "epoch": 0.5917660638707195, + "flos": 790860754944.0, + "grad_norm": 0.032083383212934545, + "language_loss": 0.88416231, + "learning_rate": 0.0003769711638534784, + "loss": 0.89563137, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.77978516, + "step": 3076, + "time_per_iteration": 2.966887950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147265, + "balance_loss_mlp": 1.06915915, + "epoch": 0.5919584455559831, + "flos": 529756114944.0, + "grad_norm": 0.039188776409307895, + "language_loss": 0.84855187, + "learning_rate": 0.00037666922375443446, + "loss": 0.86002445, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.78027344, + "step": 3077, + "time_per_iteration": 2.6466495990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146857, + "balance_loss_mlp": 1.06889355, + "epoch": 0.5921508272412467, + "flos": 561752670720.0, + "grad_norm": 0.03396925526876144, + "language_loss": 0.87058771, + "learning_rate": 0.00037636733153664396, + "loss": 0.88205624, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.77880859, + "step": 3078, + "time_per_iteration": 2.868244171142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147912, + "balance_loss_mlp": 1.06980658, + "epoch": 0.5923432089265102, + "flos": 564333593088.0, + "grad_norm": 0.03405949699736924, + "language_loss": 0.86518288, + "learning_rate": 0.0003760654873173124, + "loss": 0.87666202, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.78027344, + "step": 3079, + "time_per_iteration": 2.665978193283081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148871, + "balance_loss_mlp": 1.07095611, + "epoch": 0.5925355906117737, + "flos": 496750439424.0, + "grad_norm": 0.031078530741144403, + "language_loss": 0.87091482, + "learning_rate": 0.00037576369121362566, + "loss": 0.88240349, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.77832031, + "step": 3080, + "time_per_iteration": 2.5879437923431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.07483089, + "epoch": 0.5927279722970373, + "flos": 567492661248.0, + "grad_norm": 0.029886004026783125, + "language_loss": 0.86116624, + "learning_rate": 0.0003754619433427516, + "loss": 0.87269318, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.77783203, + "step": 3081, + "time_per_iteration": 2.911530017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149482, + "balance_loss_mlp": 1.07156706, + "epoch": 0.5929203539823009, + "flos": 668159353344.0, + "grad_norm": 0.03611880785888225, + "language_loss": 0.84511012, + "learning_rate": 0.0003751602438218392, + "loss": 0.85660493, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.77832031, + "step": 3082, + "time_per_iteration": 2.767104148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148924, + "balance_loss_mlp": 1.07105672, + "epoch": 0.5931127356675644, + "flos": 556785483264.0, + "grad_norm": 0.03271098535749721, + "language_loss": 0.89783478, + "learning_rate": 0.0003748585927680186, + "loss": 0.90932405, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.77783203, + "step": 3083, + "time_per_iteration": 2.6630167961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148966, + "balance_loss_mlp": 1.07100332, + "epoch": 0.593305117352828, + "flos": 536242712064.0, + "grad_norm": 0.03028975884774044, + "language_loss": 0.88271487, + "learning_rate": 0.00037455699029840086, + "loss": 0.89420456, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.77880859, + "step": 3084, + "time_per_iteration": 2.647643566131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.07020473, + "epoch": 0.5934974990380916, + "flos": 595057789440.0, + "grad_norm": 0.028668930156423956, + "language_loss": 0.89615595, + "learning_rate": 0.0003742554365300787, + "loss": 0.9076376, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.77880859, + "step": 3085, + "time_per_iteration": 2.743479013442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148026, + "balance_loss_mlp": 1.07015836, + "epoch": 0.5936898807233552, + "flos": 714014697984.0, + "grad_norm": 0.030266517596009415, + "language_loss": 0.84002471, + "learning_rate": 0.0003739539315801255, + "loss": 0.85150492, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.77783203, + "step": 3086, + "time_per_iteration": 2.9327478408813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147359, + "balance_loss_mlp": 1.06944346, + "epoch": 0.5938822624086187, + "flos": 392748761088.0, + "grad_norm": 0.030603721844952317, + "language_loss": 0.96139234, + "learning_rate": 0.000373652475565596, + "loss": 0.97286594, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.77832031, + "step": 3087, + "time_per_iteration": 2.471726417541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146572, + "balance_loss_mlp": 1.06860876, + "epoch": 0.5940746440938822, + "flos": 481335310848.0, + "grad_norm": 0.033612762678092996, + "language_loss": 0.86454874, + "learning_rate": 0.00037335106860352587, + "loss": 0.87601447, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.77880859, + "step": 3088, + "time_per_iteration": 2.692692756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148512, + "balance_loss_mlp": 1.07045376, + "epoch": 0.5942670257791458, + "flos": 484307000832.0, + "grad_norm": 0.031191733120893732, + "language_loss": 0.87924445, + "learning_rate": 0.00037304971081093146, + "loss": 0.89072955, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.77978516, + "step": 3089, + "time_per_iteration": 2.568676710128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149298, + "balance_loss_mlp": 1.071383, + "epoch": 0.5944594074644094, + "flos": 549057452544.0, + "grad_norm": 0.027833968511861495, + "language_loss": 0.85559821, + "learning_rate": 0.00037274840230481024, + "loss": 0.86709118, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.77832031, + "step": 3090, + "time_per_iteration": 2.7224090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.07009256, + "epoch": 0.594651789149673, + "flos": 450129022464.0, + "grad_norm": 0.03399265003555819, + "language_loss": 0.85464221, + "learning_rate": 0.00037244714320214077, + "loss": 0.86612326, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.77929688, + "step": 3091, + "time_per_iteration": 2.545518398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07034016, + "epoch": 0.5948441708349365, + "flos": 597465521664.0, + "grad_norm": 0.029759995876706483, + "language_loss": 0.88336015, + "learning_rate": 0.000372145933619882, + "loss": 0.89484322, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.77880859, + "step": 3092, + "time_per_iteration": 2.8612496852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147536, + "balance_loss_mlp": 1.06952572, + "epoch": 0.5950365525202, + "flos": 549580477440.0, + "grad_norm": 0.03567164883764641, + "language_loss": 0.87935793, + "learning_rate": 0.000371844773674974, + "loss": 0.89083326, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.77929688, + "step": 3093, + "time_per_iteration": 2.6431939601898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147858, + "balance_loss_mlp": 1.06980002, + "epoch": 0.5952289342054636, + "flos": 655963691520.0, + "grad_norm": 0.03489323159702664, + "language_loss": 0.87669003, + "learning_rate": 0.0003715436634843375, + "loss": 0.88816857, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.77978516, + "step": 3094, + "time_per_iteration": 2.889326572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115074, + "balance_loss_mlp": 1.07268155, + "epoch": 0.5954213158907272, + "flos": 604603398144.0, + "grad_norm": 0.02937888511977547, + "language_loss": 0.85120195, + "learning_rate": 0.00037124260316487355, + "loss": 0.86270934, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.77978516, + "step": 3095, + "time_per_iteration": 2.8256890773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011487, + "balance_loss_mlp": 1.07064188, + "epoch": 0.5956136975759908, + "flos": 487267957248.0, + "grad_norm": 0.03289727477229571, + "language_loss": 0.94411993, + "learning_rate": 0.0003709415928334643, + "loss": 0.95560694, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.77978516, + "step": 3096, + "time_per_iteration": 2.587526559829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148362, + "balance_loss_mlp": 1.07025576, + "epoch": 0.5958060792612543, + "flos": 660040555008.0, + "grad_norm": 0.03760653483237211, + "language_loss": 0.8629458, + "learning_rate": 0.00037064063260697233, + "loss": 0.8744294, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.78027344, + "step": 3097, + "time_per_iteration": 2.8921737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149624, + "balance_loss_mlp": 1.07170904, + "epoch": 0.5959984609465179, + "flos": 724995122688.0, + "grad_norm": 0.02933465569925715, + "language_loss": 0.84228349, + "learning_rate": 0.0003703397226022407, + "loss": 0.85377973, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.77832031, + "step": 3098, + "time_per_iteration": 3.0898213386535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115181, + "balance_loss_mlp": 1.07627869, + "epoch": 0.5961908426317815, + "flos": 1523218788864.0, + "grad_norm": 0.004520881067607934, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7665168, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.75585938, + "step": 3099, + "time_per_iteration": 4.9205827713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148148, + "balance_loss_mlp": 1.07023323, + "epoch": 0.596383224317045, + "flos": 533646326784.0, + "grad_norm": 0.03064762726337019, + "language_loss": 0.87394881, + "learning_rate": 0.0003697380537253339, + "loss": 0.88543034, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.77832031, + "step": 3100, + "time_per_iteration": 2.6238889694213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07065213, + "epoch": 0.5965756060023086, + "flos": 592366076928.0, + "grad_norm": 0.03279417600266174, + "language_loss": 0.87095284, + "learning_rate": 0.0003694372950867471, + "loss": 0.88243759, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.77734375, + "step": 3101, + "time_per_iteration": 2.754004955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.0715313, + "epoch": 0.5967679876875721, + "flos": 863469493248.0, + "grad_norm": 0.096940863219985, + "language_loss": 0.82642257, + "learning_rate": 0.0003691365871370976, + "loss": 0.83791614, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.77734375, + "step": 3102, + "time_per_iteration": 3.027898073196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148471, + "balance_loss_mlp": 1.07065165, + "epoch": 0.5969603693728357, + "flos": 554877307392.0, + "grad_norm": 0.03194116769832037, + "language_loss": 0.90513253, + "learning_rate": 0.00036883592999313093, + "loss": 0.91661727, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.77734375, + "step": 3103, + "time_per_iteration": 2.6555323600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114931, + "balance_loss_mlp": 1.07158601, + "epoch": 0.5971527510580993, + "flos": 719936610816.0, + "grad_norm": 0.037867869271097296, + "language_loss": 0.85018742, + "learning_rate": 0.0003685353237715722, + "loss": 0.86168051, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.77636719, + "step": 3104, + "time_per_iteration": 2.88739013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115032, + "balance_loss_mlp": 1.07245219, + "epoch": 0.5973451327433629, + "flos": 648862745088.0, + "grad_norm": 0.032062315519195535, + "language_loss": 0.86408043, + "learning_rate": 0.0003682347685891274, + "loss": 0.87558353, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.77783203, + "step": 3105, + "time_per_iteration": 2.8420920372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.07162631, + "epoch": 0.5975375144286263, + "flos": 723088948224.0, + "grad_norm": 0.03318206210872103, + "language_loss": 0.86870039, + "learning_rate": 0.0003679342645624822, + "loss": 0.88019389, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.77636719, + "step": 3106, + "time_per_iteration": 2.995124578475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150114, + "balance_loss_mlp": 1.07248521, + "epoch": 0.5977298961138899, + "flos": 752343399936.0, + "grad_norm": 0.029134934835651077, + "language_loss": 0.86725187, + "learning_rate": 0.0003676338118083025, + "loss": 0.87875295, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.77539062, + "step": 3107, + "time_per_iteration": 2.972302198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150139, + "balance_loss_mlp": 1.07251036, + "epoch": 0.5979222777991535, + "flos": 531998662656.0, + "grad_norm": 0.035100601373903646, + "language_loss": 0.857481, + "learning_rate": 0.0003673334104432347, + "loss": 0.86898237, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.77539062, + "step": 3108, + "time_per_iteration": 2.6626758575439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149837, + "balance_loss_mlp": 1.07230318, + "epoch": 0.5981146594844171, + "flos": 622914355200.0, + "grad_norm": 0.0316193314504938, + "language_loss": 0.88024735, + "learning_rate": 0.0003670330605839048, + "loss": 0.89174569, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.77441406, + "step": 3109, + "time_per_iteration": 2.8445565700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149651, + "balance_loss_mlp": 1.07216513, + "epoch": 0.5983070411696807, + "flos": 604709458944.0, + "grad_norm": 0.030685816325192888, + "language_loss": 0.81470084, + "learning_rate": 0.0003667327623469191, + "loss": 0.82619739, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.77392578, + "step": 3110, + "time_per_iteration": 2.7507362365722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151646, + "balance_loss_mlp": 1.07406473, + "epoch": 0.5984994228549442, + "flos": 634669584384.0, + "grad_norm": 0.03251456811802211, + "language_loss": 0.83321273, + "learning_rate": 0.00036643251584886333, + "loss": 0.84472924, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.77490234, + "step": 3111, + "time_per_iteration": 2.816390037536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156112, + "balance_loss_mlp": 1.07848299, + "epoch": 0.5986918045402078, + "flos": 526293600768.0, + "grad_norm": 0.03439308421341756, + "language_loss": 0.88026524, + "learning_rate": 0.00036613232120630393, + "loss": 0.89182639, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.77539062, + "step": 3112, + "time_per_iteration": 2.610931396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151048, + "balance_loss_mlp": 1.07332325, + "epoch": 0.5988841862254713, + "flos": 484139814912.0, + "grad_norm": 0.040537518995664656, + "language_loss": 0.85835981, + "learning_rate": 0.00036583217853578643, + "loss": 0.86987036, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.77636719, + "step": 3113, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.07369328, + "epoch": 0.5990765679107349, + "flos": 1142121745920.0, + "grad_norm": 0.03045218931470109, + "language_loss": 0.82758361, + "learning_rate": 0.000365532087953837, + "loss": 0.83909732, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.77587891, + "step": 3114, + "time_per_iteration": 3.635089159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150692, + "balance_loss_mlp": 1.07282436, + "epoch": 0.5992689495959984, + "flos": 518018350080.0, + "grad_norm": 0.03475345450765353, + "language_loss": 0.94564217, + "learning_rate": 0.00036523204957696065, + "loss": 0.95714909, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.77783203, + "step": 3115, + "time_per_iteration": 2.6130504608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150235, + "balance_loss_mlp": 1.07231951, + "epoch": 0.599461331281262, + "flos": 745941396480.0, + "grad_norm": 0.03954805443520273, + "language_loss": 0.86356986, + "learning_rate": 0.00036493206352164324, + "loss": 0.87507224, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.77832031, + "step": 3116, + "time_per_iteration": 2.902606964111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115079, + "balance_loss_mlp": 1.07282686, + "epoch": 0.5996537129665256, + "flos": 593483985408.0, + "grad_norm": 0.030263025154964335, + "language_loss": 0.90265405, + "learning_rate": 0.000364632129904349, + "loss": 0.91416192, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.77880859, + "step": 3117, + "time_per_iteration": 2.728739023208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148018, + "balance_loss_mlp": 1.0701983, + "epoch": 0.5998460946517892, + "flos": 560115740160.0, + "grad_norm": 0.03726043771871862, + "language_loss": 0.8256759, + "learning_rate": 0.00036433224884152283, + "loss": 0.83715606, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.77734375, + "step": 3118, + "time_per_iteration": 2.7763798236846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146842, + "balance_loss_mlp": 1.06897449, + "epoch": 0.6000384763370528, + "flos": 485535699456.0, + "grad_norm": 0.03789921911219481, + "language_loss": 0.83006287, + "learning_rate": 0.00036403242044958875, + "loss": 0.84153128, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.77783203, + "step": 3119, + "time_per_iteration": 2.549102783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156248, + "balance_loss_mlp": 1.07842839, + "epoch": 0.6002308580223162, + "flos": 597877756416.0, + "grad_norm": 0.03490542571663494, + "language_loss": 0.96794367, + "learning_rate": 0.0003637326448449507, + "loss": 0.97950613, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.77734375, + "step": 3120, + "time_per_iteration": 2.7004034519195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153861, + "balance_loss_mlp": 1.07608855, + "epoch": 0.6004232397075798, + "flos": 546220021248.0, + "grad_norm": 0.03097014244858331, + "language_loss": 0.90828121, + "learning_rate": 0.00036343292214399177, + "loss": 0.91981983, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.77685547, + "step": 3121, + "time_per_iteration": 2.7137558460235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149916, + "balance_loss_mlp": 1.07195354, + "epoch": 0.6006156213928434, + "flos": 631150674432.0, + "grad_norm": 0.035271472923777164, + "language_loss": 0.82629979, + "learning_rate": 0.00036313325246307456, + "loss": 0.83779889, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.77880859, + "step": 3122, + "time_per_iteration": 2.7764761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149471, + "balance_loss_mlp": 1.07179451, + "epoch": 0.600808003078107, + "flos": 583404618240.0, + "grad_norm": 0.03572948741638757, + "language_loss": 0.92888528, + "learning_rate": 0.0003628336359185411, + "loss": 0.94037998, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.77587891, + "step": 3123, + "time_per_iteration": 2.658597707748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149832, + "balance_loss_mlp": 1.07215571, + "epoch": 0.6010003847633705, + "flos": 636438772224.0, + "grad_norm": 0.033415641646833916, + "language_loss": 0.81693363, + "learning_rate": 0.000362534072626713, + "loss": 0.8284319, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.77587891, + "step": 3124, + "time_per_iteration": 2.7385804653167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.06857181, + "epoch": 0.6011927664486341, + "flos": 720029936640.0, + "grad_norm": 0.0314556326919405, + "language_loss": 0.85929549, + "learning_rate": 0.00036223456270389093, + "loss": 0.87075609, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.77392578, + "step": 3125, + "time_per_iteration": 2.9184412956237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148457, + "balance_loss_mlp": 1.0710187, + "epoch": 0.6013851481338977, + "flos": 500054499840.0, + "grad_norm": 0.03211121673376429, + "language_loss": 0.85866034, + "learning_rate": 0.00036193510626635517, + "loss": 0.87014484, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.7734375, + "step": 3126, + "time_per_iteration": 2.6580941677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151789, + "balance_loss_mlp": 1.07439816, + "epoch": 0.6015775298191612, + "flos": 750875656704.0, + "grad_norm": 0.03289877663507899, + "language_loss": 0.86000574, + "learning_rate": 0.0003616357034303649, + "loss": 0.87152362, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.77294922, + "step": 3127, + "time_per_iteration": 2.925900459289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154509, + "balance_loss_mlp": 1.07730949, + "epoch": 0.6017699115044248, + "flos": 594263519232.0, + "grad_norm": 0.026386451784686567, + "language_loss": 0.83912927, + "learning_rate": 0.0003613363543121584, + "loss": 0.85067433, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.77099609, + "step": 3128, + "time_per_iteration": 2.8285086154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149104, + "balance_loss_mlp": 1.07185686, + "epoch": 0.6019622931896883, + "flos": 516201498624.0, + "grad_norm": 0.032335523729292034, + "language_loss": 0.89489174, + "learning_rate": 0.00036103705902795357, + "loss": 0.90638286, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.77148438, + "step": 3129, + "time_per_iteration": 2.7369625568389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153149, + "balance_loss_mlp": 1.0759964, + "epoch": 0.6021546748749519, + "flos": 491473075200.0, + "grad_norm": 0.037053521707819316, + "language_loss": 0.86282051, + "learning_rate": 0.0003607378176939471, + "loss": 0.87435198, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.77050781, + "step": 3130, + "time_per_iteration": 2.6015982627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155234, + "balance_loss_mlp": 1.07832015, + "epoch": 0.6023470565602155, + "flos": 542114959872.0, + "grad_norm": 0.03769359789833061, + "language_loss": 0.87922359, + "learning_rate": 0.00036043863042631465, + "loss": 0.89077592, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.76806641, + "step": 3131, + "time_per_iteration": 2.870999813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151659, + "balance_loss_mlp": 1.07436335, + "epoch": 0.6025394382454791, + "flos": 846463096320.0, + "grad_norm": 0.03206429015818981, + "language_loss": 0.81416667, + "learning_rate": 0.00036013949734121133, + "loss": 0.82568324, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.77197266, + "step": 3132, + "time_per_iteration": 3.1543962955474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115191, + "balance_loss_mlp": 1.0745194, + "epoch": 0.6027318199307425, + "flos": 578257509888.0, + "grad_norm": 0.03267549496137676, + "language_loss": 0.87371534, + "learning_rate": 0.00035984041855477043, + "loss": 0.88523442, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.77294922, + "step": 3133, + "time_per_iteration": 2.7443673610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143524, + "balance_loss_mlp": 1.06837463, + "epoch": 0.6029242016160061, + "flos": 1474252766208.0, + "grad_norm": 0.006811691070041734, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79853421, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.75195312, + "step": 3134, + "time_per_iteration": 4.92242431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145401, + "balance_loss_mlp": 1.06810546, + "epoch": 0.6031165833012697, + "flos": 481782474240.0, + "grad_norm": 0.029444679170183622, + "language_loss": 0.84435833, + "learning_rate": 0.00035924242434230637, + "loss": 0.85581231, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.77197266, + "step": 3135, + "time_per_iteration": 2.6391186714172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154302, + "balance_loss_mlp": 1.07700658, + "epoch": 0.6033089649865333, + "flos": 500464733184.0, + "grad_norm": 0.036345783287305373, + "language_loss": 0.85093319, + "learning_rate": 0.00035894350914844516, + "loss": 0.86247623, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.77197266, + "step": 3136, + "time_per_iteration": 2.6352477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150224, + "balance_loss_mlp": 1.07259464, + "epoch": 0.6035013466717969, + "flos": 557723470848.0, + "grad_norm": 0.0365408898732846, + "language_loss": 0.89268684, + "learning_rate": 0.0003586446487175703, + "loss": 0.90418905, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.77539062, + "step": 3137, + "time_per_iteration": 2.693071126937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149547, + "balance_loss_mlp": 1.07215679, + "epoch": 0.6036937283570604, + "flos": 595995777024.0, + "grad_norm": 0.02904364912520073, + "language_loss": 0.90167797, + "learning_rate": 0.0003583458431657099, + "loss": 0.91317338, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.77294922, + "step": 3138, + "time_per_iteration": 2.738223075866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.07178628, + "epoch": 0.603886110042324, + "flos": 542058564096.0, + "grad_norm": 0.037255533971674665, + "language_loss": 0.87546921, + "learning_rate": 0.00035804709260887056, + "loss": 0.88696241, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.77441406, + "step": 3139, + "time_per_iteration": 2.6814053058624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_mlp": 1.07072818, + "epoch": 0.6040784917275875, + "flos": 519655280640.0, + "grad_norm": 0.02881429249122551, + "language_loss": 0.93902391, + "learning_rate": 0.0003577483971630373, + "loss": 0.95050937, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.77734375, + "step": 3140, + "time_per_iteration": 2.6691088676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011483, + "balance_loss_mlp": 1.07052839, + "epoch": 0.6042708734128511, + "flos": 662013858816.0, + "grad_norm": 0.0304544298908833, + "language_loss": 0.89555264, + "learning_rate": 0.00035744975694417414, + "loss": 0.90703559, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.77685547, + "step": 3141, + "time_per_iteration": 2.872135877609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148852, + "balance_loss_mlp": 1.07107973, + "epoch": 0.6044632550981146, + "flos": 573516632064.0, + "grad_norm": 0.03378277324120908, + "language_loss": 0.88105464, + "learning_rate": 0.00035715117206822344, + "loss": 0.89254314, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.77685547, + "step": 3142, + "time_per_iteration": 2.790640354156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150783, + "balance_loss_mlp": 1.07315397, + "epoch": 0.6046556367833782, + "flos": 547728697344.0, + "grad_norm": 0.0341385163456541, + "language_loss": 0.86351824, + "learning_rate": 0.0003568526426511065, + "loss": 0.87502599, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.77539062, + "step": 3143, + "time_per_iteration": 2.622870683670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150768, + "balance_loss_mlp": 1.07318711, + "epoch": 0.6048480184686418, + "flos": 778174268928.0, + "grad_norm": 0.03443143260722225, + "language_loss": 0.88285363, + "learning_rate": 0.000356554168808722, + "loss": 0.89436138, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.77490234, + "step": 3144, + "time_per_iteration": 2.9785499572753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151515, + "balance_loss_mlp": 1.07393324, + "epoch": 0.6050404001539054, + "flos": 658375426560.0, + "grad_norm": 0.03050523278027174, + "language_loss": 0.89547616, + "learning_rate": 0.00035625575065694837, + "loss": 0.9069913, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.77490234, + "step": 3145, + "time_per_iteration": 2.893160343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151106, + "balance_loss_mlp": 1.07347679, + "epoch": 0.605232781839169, + "flos": 550082035200.0, + "grad_norm": 0.03434592875619572, + "language_loss": 0.82820475, + "learning_rate": 0.0003559573883116415, + "loss": 0.83971578, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.77539062, + "step": 3146, + "time_per_iteration": 2.703378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152026, + "balance_loss_mlp": 1.07434905, + "epoch": 0.6054251635244324, + "flos": 606641829888.0, + "grad_norm": 0.028306929425565355, + "language_loss": 0.90180922, + "learning_rate": 0.00035565908188863604, + "loss": 0.91332948, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.77587891, + "step": 3147, + "time_per_iteration": 2.8178632259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149693, + "balance_loss_mlp": 1.07201612, + "epoch": 0.605617545209696, + "flos": 614808291840.0, + "grad_norm": 0.03167283444801755, + "language_loss": 0.85591269, + "learning_rate": 0.00035536083150374464, + "loss": 0.86740971, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.77587891, + "step": 3148, + "time_per_iteration": 2.7630088329315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151665, + "balance_loss_mlp": 1.07613373, + "epoch": 0.6058099268949596, + "flos": 1501607774208.0, + "grad_norm": 0.006039709216806875, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75899613, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.75585938, + "step": 3149, + "time_per_iteration": 4.826624870300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148811, + "balance_loss_mlp": 1.07108641, + "epoch": 0.6060023085802232, + "flos": 671704459776.0, + "grad_norm": 0.03325996872858785, + "language_loss": 0.90532559, + "learning_rate": 0.0003547644993114475, + "loss": 0.91681373, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.77636719, + "step": 3150, + "time_per_iteration": 2.802644729614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149051, + "balance_loss_mlp": 1.07127893, + "epoch": 0.6061946902654868, + "flos": 607305844224.0, + "grad_norm": 0.03277875295758358, + "language_loss": 0.85509253, + "learning_rate": 0.00035446641773555806, + "loss": 0.86658305, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.77685547, + "step": 3151, + "time_per_iteration": 2.7055504322052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148261, + "balance_loss_mlp": 1.07082272, + "epoch": 0.6063870719507503, + "flos": 558952169472.0, + "grad_norm": 0.029065175404624204, + "language_loss": 0.91512465, + "learning_rate": 0.000354168392660816, + "loss": 0.92660725, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.7734375, + "step": 3152, + "time_per_iteration": 2.7494730949401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145734, + "balance_loss_mlp": 1.06829596, + "epoch": 0.6065794536360138, + "flos": 558281424384.0, + "grad_norm": 0.03244852665251002, + "language_loss": 0.88397223, + "learning_rate": 0.0003538704242029252, + "loss": 0.89542961, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.7734375, + "step": 3153, + "time_per_iteration": 2.675692558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146957, + "balance_loss_mlp": 1.06932831, + "epoch": 0.6067718353212774, + "flos": 691381102080.0, + "grad_norm": 0.033220307719005866, + "language_loss": 0.83031321, + "learning_rate": 0.0003535725124775672, + "loss": 0.84178281, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.77539062, + "step": 3154, + "time_per_iteration": 2.843881607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156617, + "balance_loss_mlp": 1.07903516, + "epoch": 0.606964217006541, + "flos": 522902945280.0, + "grad_norm": 0.035561743978846455, + "language_loss": 0.91791475, + "learning_rate": 0.00035327465760040126, + "loss": 0.92948091, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.77490234, + "step": 3155, + "time_per_iteration": 2.684056043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158513, + "balance_loss_mlp": 1.08112192, + "epoch": 0.6071565986918045, + "flos": 642712521216.0, + "grad_norm": 0.03594986649837803, + "language_loss": 0.89308429, + "learning_rate": 0.00035297685968706526, + "loss": 0.9046694, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.77294922, + "step": 3156, + "time_per_iteration": 2.7834246158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160171, + "balance_loss_mlp": 1.08278084, + "epoch": 0.6073489803770681, + "flos": 561652614144.0, + "grad_norm": 0.034893913409009325, + "language_loss": 0.88205332, + "learning_rate": 0.00035267911885317454, + "loss": 0.89365506, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.77294922, + "step": 3157, + "time_per_iteration": 2.669710397720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158828, + "balance_loss_mlp": 1.08143747, + "epoch": 0.6075413620623317, + "flos": 587201504256.0, + "grad_norm": 0.030643892610273542, + "language_loss": 0.86383843, + "learning_rate": 0.0003523814352143222, + "loss": 0.87542671, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.77294922, + "step": 3158, + "time_per_iteration": 2.822089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154763, + "balance_loss_mlp": 1.07741952, + "epoch": 0.6077337437475953, + "flos": 631971141120.0, + "grad_norm": 0.03639599054768475, + "language_loss": 0.96294606, + "learning_rate": 0.00035208380888607937, + "loss": 0.97449374, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.77246094, + "step": 3159, + "time_per_iteration": 2.7675912380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156998, + "balance_loss_mlp": 1.08184814, + "epoch": 0.6079261254328588, + "flos": 1471623453696.0, + "grad_norm": 0.01008994969394602, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80618984, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.75195312, + "step": 3160, + "time_per_iteration": 4.839691638946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155136, + "balance_loss_mlp": 1.07998657, + "epoch": 0.6081185071181223, + "flos": 1526203213824.0, + "grad_norm": 0.005930182573689796, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76847368, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.75195312, + "step": 3161, + "time_per_iteration": 4.991135835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154947, + "balance_loss_mlp": 1.07746089, + "epoch": 0.6083108888033859, + "flos": 557434761216.0, + "grad_norm": 0.030736279817991784, + "language_loss": 0.86955488, + "learning_rate": 0.00035119127492038446, + "loss": 0.88110441, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.77392578, + "step": 3162, + "time_per_iteration": 2.8129284381866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115361, + "balance_loss_mlp": 1.07631505, + "epoch": 0.6085032704886495, + "flos": 842555420160.0, + "grad_norm": 0.033332341835850446, + "language_loss": 0.88169372, + "learning_rate": 0.00035089387898984436, + "loss": 0.89322984, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.77197266, + "step": 3163, + "time_per_iteration": 3.0287744998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151661, + "balance_loss_mlp": 1.07412744, + "epoch": 0.6086956521739131, + "flos": 685992947712.0, + "grad_norm": 0.03500074735075155, + "language_loss": 0.87286401, + "learning_rate": 0.0003505965409474343, + "loss": 0.88438058, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.77441406, + "step": 3164, + "time_per_iteration": 2.8668415546417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155467, + "balance_loss_mlp": 1.07802904, + "epoch": 0.6088880338591766, + "flos": 536865793536.0, + "grad_norm": 0.03207560682458212, + "language_loss": 0.90936065, + "learning_rate": 0.0003502992609085913, + "loss": 0.92091525, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.7734375, + "step": 3165, + "time_per_iteration": 2.6344704627990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152911, + "balance_loss_mlp": 1.07552052, + "epoch": 0.6090804155444401, + "flos": 732881607168.0, + "grad_norm": 0.03068132972373785, + "language_loss": 0.86756754, + "learning_rate": 0.00035000203898872954, + "loss": 0.87909669, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.77294922, + "step": 3166, + "time_per_iteration": 3.007883071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151975, + "balance_loss_mlp": 1.07458472, + "epoch": 0.6092727972297037, + "flos": 700242504192.0, + "grad_norm": 0.033743959402083586, + "language_loss": 0.89530504, + "learning_rate": 0.0003497048753032406, + "loss": 0.90682483, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.77294922, + "step": 3167, + "time_per_iteration": 2.903841018676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150053, + "balance_loss_mlp": 1.07285297, + "epoch": 0.6094651789149673, + "flos": 1053676185600.0, + "grad_norm": 0.029535454603069295, + "language_loss": 0.85045445, + "learning_rate": 0.000349407769967494, + "loss": 0.86195493, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.77099609, + "step": 3168, + "time_per_iteration": 3.4178872108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155901, + "balance_loss_mlp": 1.07860577, + "epoch": 0.6096575606002309, + "flos": 504094433280.0, + "grad_norm": 0.02941914211290898, + "language_loss": 0.89039332, + "learning_rate": 0.0003491107230968361, + "loss": 0.90195233, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.77197266, + "step": 3169, + "time_per_iteration": 2.6551673412323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156463, + "balance_loss_mlp": 1.07921588, + "epoch": 0.6098499422854944, + "flos": 586863129600.0, + "grad_norm": 0.02719917666416643, + "language_loss": 0.85504711, + "learning_rate": 0.00034881373480659085, + "loss": 0.86661172, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.77148438, + "step": 3170, + "time_per_iteration": 2.851252317428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157705, + "balance_loss_mlp": 1.08040965, + "epoch": 0.610042323970758, + "flos": 470159502336.0, + "grad_norm": 0.06140035445399593, + "language_loss": 0.85159725, + "learning_rate": 0.0003485168052120594, + "loss": 0.86317426, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.77197266, + "step": 3171, + "time_per_iteration": 2.5498504638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156725, + "balance_loss_mlp": 1.07938242, + "epoch": 0.6102347056560216, + "flos": 515198383104.0, + "grad_norm": 0.03549166492948706, + "language_loss": 0.85369307, + "learning_rate": 0.00034821993442851973, + "loss": 0.86526036, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.77246094, + "step": 3172, + "time_per_iteration": 2.571030378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153351, + "balance_loss_mlp": 1.07600832, + "epoch": 0.6104270873412851, + "flos": 469964118528.0, + "grad_norm": 0.03723847696421654, + "language_loss": 0.87251568, + "learning_rate": 0.00034792312257122735, + "loss": 0.88404918, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.77246094, + "step": 3173, + "time_per_iteration": 2.601289987564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153196, + "balance_loss_mlp": 1.07580578, + "epoch": 0.6106194690265486, + "flos": 550939431936.0, + "grad_norm": 0.03428989424028707, + "language_loss": 0.85585618, + "learning_rate": 0.00034762636975541506, + "loss": 0.86738813, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.77294922, + "step": 3174, + "time_per_iteration": 2.623203754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155286, + "balance_loss_mlp": 1.07784736, + "epoch": 0.6108118507118122, + "flos": 473880526848.0, + "grad_norm": 0.03492975408157665, + "language_loss": 0.85685778, + "learning_rate": 0.0003473296760962923, + "loss": 0.86841059, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.7734375, + "step": 3175, + "time_per_iteration": 2.6674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157349, + "balance_loss_mlp": 1.08181763, + "epoch": 0.6110042323970758, + "flos": 1448180124672.0, + "grad_norm": 0.011972836775056764, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79691088, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.75585938, + "step": 3176, + "time_per_iteration": 4.719567060470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150349, + "balance_loss_mlp": 1.07286298, + "epoch": 0.6111966140823394, + "flos": 795541234176.0, + "grad_norm": 0.03714406101939167, + "language_loss": 0.87063801, + "learning_rate": 0.00034673646670883976, + "loss": 0.88214147, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.77392578, + "step": 3177, + "time_per_iteration": 2.973940134048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155601, + "balance_loss_mlp": 1.0800705, + "epoch": 0.611388995767603, + "flos": 1561063397376.0, + "grad_norm": 0.00949552405530534, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76870626, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.75585938, + "step": 3178, + "time_per_iteration": 5.061004400253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152416, + "balance_loss_mlp": 1.07488239, + "epoch": 0.6115813774528664, + "flos": 713484942336.0, + "grad_norm": 0.03541902083866898, + "language_loss": 0.87553525, + "learning_rate": 0.0003461434953300865, + "loss": 0.88705945, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.77441406, + "step": 3179, + "time_per_iteration": 2.916708469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153239, + "balance_loss_mlp": 1.07556212, + "epoch": 0.61177375913813, + "flos": 685689501696.0, + "grad_norm": 0.028499371872006348, + "language_loss": 0.85970306, + "learning_rate": 0.0003458470991817515, + "loss": 0.87123549, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.77587891, + "step": 3180, + "time_per_iteration": 2.9950902462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115431, + "balance_loss_mlp": 1.07677627, + "epoch": 0.6119661408233936, + "flos": 512667125760.0, + "grad_norm": 0.035557395139189776, + "language_loss": 0.89999539, + "learning_rate": 0.0003455507628808802, + "loss": 0.91153848, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.77441406, + "step": 3181, + "time_per_iteration": 2.5897092819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07629788, + "epoch": 0.6121585225086572, + "flos": 557855728128.0, + "grad_norm": 0.03617294918278912, + "language_loss": 0.90379083, + "learning_rate": 0.00034525448654252076, + "loss": 0.9153282, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.7734375, + "step": 3182, + "time_per_iteration": 2.636446714401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157583, + "balance_loss_mlp": 1.08047891, + "epoch": 0.6123509041939207, + "flos": 562909510656.0, + "grad_norm": 0.037973624968581914, + "language_loss": 0.88617527, + "learning_rate": 0.0003449582702816976, + "loss": 0.89775109, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.77001953, + "step": 3183, + "time_per_iteration": 2.6636195182800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155826, + "balance_loss_mlp": 1.0786258, + "epoch": 0.6125432858791843, + "flos": 559130088960.0, + "grad_norm": 0.03254272947638904, + "language_loss": 0.87538117, + "learning_rate": 0.0003446621142134122, + "loss": 0.88693941, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.77099609, + "step": 3184, + "time_per_iteration": 2.6456782817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154505, + "balance_loss_mlp": 1.07711458, + "epoch": 0.6127356675644479, + "flos": 415896649728.0, + "grad_norm": 0.03534541862410296, + "language_loss": 0.89029509, + "learning_rate": 0.0003443660184526424, + "loss": 0.90184009, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.77294922, + "step": 3185, + "time_per_iteration": 2.4446170330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153301, + "balance_loss_mlp": 1.07586265, + "epoch": 0.6129280492497114, + "flos": 605033097216.0, + "grad_norm": 0.03004060948026975, + "language_loss": 0.92148149, + "learning_rate": 0.0003440699831143429, + "loss": 0.93301451, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.7734375, + "step": 3186, + "time_per_iteration": 2.738818407058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114756, + "balance_loss_mlp": 1.07007372, + "epoch": 0.613120430934975, + "flos": 520864513536.0, + "grad_norm": 0.031842648163895024, + "language_loss": 0.87123644, + "learning_rate": 0.0003437740083134449, + "loss": 0.88271207, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.77392578, + "step": 3187, + "time_per_iteration": 0.013826608657836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145943, + "balance_loss_mlp": 1.06850421, + "epoch": 0.6133128126202385, + "flos": 512080974336.0, + "grad_norm": 0.03697103993803325, + "language_loss": 0.8916111, + "learning_rate": 0.00034347809416485574, + "loss": 0.90307051, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.7734375, + "step": 3188, + "time_per_iteration": 2.626657724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152707, + "balance_loss_mlp": 1.07517374, + "epoch": 0.6135051943055021, + "flos": 608756123136.0, + "grad_norm": 0.032275068446110486, + "language_loss": 0.8676489, + "learning_rate": 0.0003431822407834597, + "loss": 0.87917596, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.77441406, + "step": 3189, + "time_per_iteration": 2.784728765487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153516, + "balance_loss_mlp": 1.07588649, + "epoch": 0.6136975759907657, + "flos": 1162008508416.0, + "grad_norm": 0.035345487562752465, + "language_loss": 0.90027606, + "learning_rate": 0.00034288644828411706, + "loss": 0.91181111, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.77539062, + "step": 3190, + "time_per_iteration": 3.453296661376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147033, + "balance_loss_mlp": 1.06959414, + "epoch": 0.6138899576760293, + "flos": 708172649472.0, + "grad_norm": 0.033974370465757506, + "language_loss": 0.80322051, + "learning_rate": 0.0003425907167816649, + "loss": 0.81469083, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.7734375, + "step": 3191, + "time_per_iteration": 2.9247496128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147023, + "balance_loss_mlp": 1.0697751, + "epoch": 0.6140823393612928, + "flos": 587618468352.0, + "grad_norm": 0.031154822121678163, + "language_loss": 0.89756465, + "learning_rate": 0.00034229504639091623, + "loss": 0.90903485, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.77148438, + "step": 3192, + "time_per_iteration": 2.772437810897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150486, + "balance_loss_mlp": 1.07342911, + "epoch": 0.6142747210465563, + "flos": 805618599936.0, + "grad_norm": 0.03412621705623903, + "language_loss": 0.84789693, + "learning_rate": 0.0003419994372266606, + "loss": 0.85940182, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.76953125, + "step": 3193, + "time_per_iteration": 3.096266984939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148388, + "balance_loss_mlp": 1.07094979, + "epoch": 0.6144671027318199, + "flos": 530544380928.0, + "grad_norm": 0.028061755795717326, + "language_loss": 0.86464483, + "learning_rate": 0.00034170388940366335, + "loss": 0.87612873, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.7734375, + "step": 3194, + "time_per_iteration": 2.6779158115386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152877, + "balance_loss_mlp": 1.07539093, + "epoch": 0.6146594844170835, + "flos": 806912426496.0, + "grad_norm": 0.030674949388275172, + "language_loss": 0.8474896, + "learning_rate": 0.0003414084030366667, + "loss": 0.85901833, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.77392578, + "step": 3195, + "time_per_iteration": 3.106736898422241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153391, + "balance_loss_mlp": 1.07590497, + "epoch": 0.6148518661023471, + "flos": 502761675264.0, + "grad_norm": 0.03337820573482111, + "language_loss": 0.87897015, + "learning_rate": 0.0003411129782403883, + "loss": 0.89050412, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.77392578, + "step": 3196, + "time_per_iteration": 2.643308639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154002, + "balance_loss_mlp": 1.07642102, + "epoch": 0.6150442477876106, + "flos": 511698938880.0, + "grad_norm": 0.038534572595061774, + "language_loss": 0.91158688, + "learning_rate": 0.0003408176151295225, + "loss": 0.92312694, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.77490234, + "step": 3197, + "time_per_iteration": 2.5714070796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157848, + "balance_loss_mlp": 1.08040917, + "epoch": 0.6152366294728742, + "flos": 527997660672.0, + "grad_norm": 0.045085971427018416, + "language_loss": 0.83155811, + "learning_rate": 0.00034052231381873944, + "loss": 0.84313661, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.7734375, + "step": 3198, + "time_per_iteration": 2.607335329055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158567, + "balance_loss_mlp": 1.0808903, + "epoch": 0.6154290111581378, + "flos": 474282028032.0, + "grad_norm": 0.03501094506345523, + "language_loss": 0.90176225, + "learning_rate": 0.00034022707442268494, + "loss": 0.91334796, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.77587891, + "step": 3199, + "time_per_iteration": 2.541625499725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160121, + "balance_loss_mlp": 1.08244419, + "epoch": 0.6156213928434013, + "flos": 551933815296.0, + "grad_norm": 0.028863713644250544, + "language_loss": 0.85985374, + "learning_rate": 0.0003399318970559813, + "loss": 0.87145495, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.77587891, + "step": 3200, + "time_per_iteration": 2.796062707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156186, + "balance_loss_mlp": 1.07850885, + "epoch": 0.6158137745286649, + "flos": 752360864256.0, + "grad_norm": 0.02911689008620782, + "language_loss": 0.8882643, + "learning_rate": 0.00033963678183322656, + "loss": 0.89982617, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.77587891, + "step": 3201, + "time_per_iteration": 3.0142765045166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150573, + "balance_loss_mlp": 1.07313454, + "epoch": 0.6160061562139284, + "flos": 556905005568.0, + "grad_norm": 0.026867696213324778, + "language_loss": 0.87175548, + "learning_rate": 0.0003393417288689945, + "loss": 0.8832612, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.7734375, + "step": 3202, + "time_per_iteration": 2.655984401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149993, + "balance_loss_mlp": 1.07250667, + "epoch": 0.616198537899192, + "flos": 743466534912.0, + "grad_norm": 0.03671255454087467, + "language_loss": 0.83013773, + "learning_rate": 0.00033904673827783504, + "loss": 0.84163767, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.77392578, + "step": 3203, + "time_per_iteration": 2.937826633453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148812, + "balance_loss_mlp": 1.07142162, + "epoch": 0.6163909195844556, + "flos": 479774241792.0, + "grad_norm": 0.030568222552849134, + "language_loss": 0.8708697, + "learning_rate": 0.00033875181017427357, + "loss": 0.88235784, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.77294922, + "step": 3204, + "time_per_iteration": 2.6731438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.07325339, + "epoch": 0.6165833012697192, + "flos": 532665404928.0, + "grad_norm": 0.031792873085422224, + "language_loss": 0.85750729, + "learning_rate": 0.00033845694467281133, + "loss": 0.86901325, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.77246094, + "step": 3205, + "time_per_iteration": 2.876248598098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.07268083, + "epoch": 0.6167756829549826, + "flos": 809293962240.0, + "grad_norm": 0.03236962907615372, + "language_loss": 0.88327932, + "learning_rate": 0.00033816214188792516, + "loss": 0.89477909, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.77197266, + "step": 3206, + "time_per_iteration": 3.1564157009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151124, + "balance_loss_mlp": 1.07378113, + "epoch": 0.6169680646402462, + "flos": 489910004736.0, + "grad_norm": 0.03290410688193805, + "language_loss": 0.91087395, + "learning_rate": 0.00033786740193406784, + "loss": 0.92238522, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.77246094, + "step": 3207, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149659, + "balance_loss_mlp": 1.07236373, + "epoch": 0.6171604463255098, + "flos": 620203176960.0, + "grad_norm": 0.032558146678985676, + "language_loss": 0.86120403, + "learning_rate": 0.00033757272492566736, + "loss": 0.87270063, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.77197266, + "step": 3208, + "time_per_iteration": 2.915374994277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150778, + "balance_loss_mlp": 1.07333994, + "epoch": 0.6173528280107734, + "flos": 529895102976.0, + "grad_norm": 0.029217733611236158, + "language_loss": 0.91618085, + "learning_rate": 0.0003372781109771278, + "loss": 0.9276886, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.7734375, + "step": 3209, + "time_per_iteration": 2.7093894481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158751, + "balance_loss_mlp": 1.08155119, + "epoch": 0.617545209696037, + "flos": 597736766976.0, + "grad_norm": 0.03128870869992161, + "language_loss": 0.81418395, + "learning_rate": 0.0003369835602028281, + "loss": 0.82577139, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.77099609, + "step": 3210, + "time_per_iteration": 2.7591042518615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156243, + "balance_loss_mlp": 1.07885218, + "epoch": 0.6177375913813005, + "flos": 476105610240.0, + "grad_norm": 0.03246928186554176, + "language_loss": 0.85136282, + "learning_rate": 0.0003366890727171232, + "loss": 0.86292523, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.77294922, + "step": 3211, + "time_per_iteration": 2.663344144821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155968, + "balance_loss_mlp": 1.07881546, + "epoch": 0.617929973066564, + "flos": 530880754176.0, + "grad_norm": 0.03620138157042922, + "language_loss": 0.83830607, + "learning_rate": 0.00033639464863434313, + "loss": 0.84986579, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.77050781, + "step": 3212, + "time_per_iteration": 2.6296675205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117601, + "balance_loss_mlp": 1.10105133, + "epoch": 0.6181223547518276, + "flos": 1422832622592.0, + "grad_norm": 0.023588472816246354, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79618478, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.75, + "step": 3213, + "time_per_iteration": 4.6863789558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148522, + "balance_loss_mlp": 1.07122719, + "epoch": 0.6183147364370912, + "flos": 741695345664.0, + "grad_norm": 0.0331085707194938, + "language_loss": 0.84652448, + "learning_rate": 0.00033580599113475543, + "loss": 0.8580097, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.77197266, + "step": 3214, + "time_per_iteration": 2.9692540168762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148587, + "balance_loss_mlp": 1.07138717, + "epoch": 0.6185071181223547, + "flos": 382482742272.0, + "grad_norm": 0.030292285906144818, + "language_loss": 0.9191429, + "learning_rate": 0.00033551175794648507, + "loss": 0.93062878, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.77099609, + "step": 3215, + "time_per_iteration": 2.4922029972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157146, + "balance_loss_mlp": 1.07970774, + "epoch": 0.6186994998076183, + "flos": 464304718848.0, + "grad_norm": 0.029842780568851025, + "language_loss": 0.8691783, + "learning_rate": 0.00033521758861821365, + "loss": 0.88074982, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.7734375, + "step": 3216, + "time_per_iteration": 2.599022150039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152488, + "balance_loss_mlp": 1.07485938, + "epoch": 0.6188918814928819, + "flos": 486252106752.0, + "grad_norm": 0.03103316495727489, + "language_loss": 0.9338237, + "learning_rate": 0.0003349234832641479, + "loss": 0.94534856, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.77539062, + "step": 3217, + "time_per_iteration": 2.602800130844116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152031, + "balance_loss_mlp": 1.0744493, + "epoch": 0.6190842631781455, + "flos": 658597006848.0, + "grad_norm": 0.03734469861973323, + "language_loss": 0.85810769, + "learning_rate": 0.00033462944199846975, + "loss": 0.86962795, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.77490234, + "step": 3218, + "time_per_iteration": 3.070335626602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151884, + "balance_loss_mlp": 1.07425499, + "epoch": 0.619276644863409, + "flos": 404467060224.0, + "grad_norm": 0.03666199268188377, + "language_loss": 0.91774654, + "learning_rate": 0.00033433546493533606, + "loss": 0.92926538, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.77539062, + "step": 3219, + "time_per_iteration": 2.468400716781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149767, + "balance_loss_mlp": 1.07223368, + "epoch": 0.6194690265486725, + "flos": 584240547840.0, + "grad_norm": 0.03534009375651296, + "language_loss": 0.89686239, + "learning_rate": 0.00033404155218887897, + "loss": 0.90836006, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.77441406, + "step": 3220, + "time_per_iteration": 2.695805788040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150834, + "balance_loss_mlp": 1.07329988, + "epoch": 0.6196614082339361, + "flos": 505384257024.0, + "grad_norm": 0.028059763946118966, + "language_loss": 0.91884506, + "learning_rate": 0.00033374770387320534, + "loss": 0.93035334, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.77441406, + "step": 3221, + "time_per_iteration": 2.7483606338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151577, + "balance_loss_mlp": 1.07409084, + "epoch": 0.6198537899191997, + "flos": 576525252096.0, + "grad_norm": 0.031050662157407424, + "language_loss": 0.90087008, + "learning_rate": 0.00033345392010239737, + "loss": 0.91238588, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.77392578, + "step": 3222, + "time_per_iteration": 2.714914560317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114984, + "balance_loss_mlp": 1.07249725, + "epoch": 0.6200461716044633, + "flos": 594302450688.0, + "grad_norm": 0.03255490958660124, + "language_loss": 0.88128847, + "learning_rate": 0.0003331602009905118, + "loss": 0.89278692, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.77246094, + "step": 3223, + "time_per_iteration": 2.7981505393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148595, + "balance_loss_mlp": 1.0711087, + "epoch": 0.6202385532897268, + "flos": 667410745344.0, + "grad_norm": 0.028478674888367996, + "language_loss": 0.88510197, + "learning_rate": 0.00033286654665158085, + "loss": 0.89658791, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.77392578, + "step": 3224, + "time_per_iteration": 2.950357437133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147753, + "balance_loss_mlp": 1.07045746, + "epoch": 0.6204309349749904, + "flos": 485926467072.0, + "grad_norm": 0.03296106773090735, + "language_loss": 0.92470849, + "learning_rate": 0.0003325729571996109, + "loss": 0.93618602, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.77197266, + "step": 3225, + "time_per_iteration": 2.632589340209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150501, + "balance_loss_mlp": 1.07325304, + "epoch": 0.6206233166602539, + "flos": 585217466880.0, + "grad_norm": 0.0318626759985495, + "language_loss": 0.89139777, + "learning_rate": 0.000332279432748584, + "loss": 0.90290284, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.77148438, + "step": 3226, + "time_per_iteration": 2.704615592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149408, + "balance_loss_mlp": 1.07235157, + "epoch": 0.6208156983455175, + "flos": 477911728128.0, + "grad_norm": 0.029634304247413663, + "language_loss": 0.91940343, + "learning_rate": 0.00033198597341245576, + "loss": 0.93089747, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.76953125, + "step": 3227, + "time_per_iteration": 2.582554340362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149434, + "balance_loss_mlp": 1.07228148, + "epoch": 0.6210080800307811, + "flos": 790467985920.0, + "grad_norm": 0.031063189419047472, + "language_loss": 0.86885202, + "learning_rate": 0.00033169257930515763, + "loss": 0.88034642, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.77050781, + "step": 3228, + "time_per_iteration": 3.0251591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152373, + "balance_loss_mlp": 1.07526827, + "epoch": 0.6212004617160446, + "flos": 608916578304.0, + "grad_norm": 0.037247869916732776, + "language_loss": 0.87339175, + "learning_rate": 0.0003313992505405951, + "loss": 0.88491547, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.77001953, + "step": 3229, + "time_per_iteration": 2.697026014328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149107, + "balance_loss_mlp": 1.07209802, + "epoch": 0.6213928434013082, + "flos": 587611737600.0, + "grad_norm": 0.03555615318912057, + "language_loss": 0.87367719, + "learning_rate": 0.0003311059872326487, + "loss": 0.88516825, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.76904297, + "step": 3230, + "time_per_iteration": 2.7712976932525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.08017004, + "epoch": 0.6215852250865718, + "flos": 537108840960.0, + "grad_norm": 0.03130868556859839, + "language_loss": 0.84262764, + "learning_rate": 0.0003308127894951734, + "loss": 0.85419852, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.76806641, + "step": 3231, + "time_per_iteration": 2.6406192779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114749, + "balance_loss_mlp": 1.07038534, + "epoch": 0.6217776067718354, + "flos": 619312852992.0, + "grad_norm": 0.034917389789924605, + "language_loss": 0.91667497, + "learning_rate": 0.00033051965744199834, + "loss": 0.92814988, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.77001953, + "step": 3232, + "time_per_iteration": 2.750717878341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147898, + "balance_loss_mlp": 1.07084131, + "epoch": 0.6219699884570988, + "flos": 547099611648.0, + "grad_norm": 0.02871355385068571, + "language_loss": 0.9457683, + "learning_rate": 0.0003302265911869276, + "loss": 0.95724726, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.76953125, + "step": 3233, + "time_per_iteration": 2.930553436279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147703, + "balance_loss_mlp": 1.07059801, + "epoch": 0.6221623701423624, + "flos": 482155777536.0, + "grad_norm": 0.03278824818574476, + "language_loss": 0.89681149, + "learning_rate": 0.0003299335908437397, + "loss": 0.90828854, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.77001953, + "step": 3234, + "time_per_iteration": 2.5631237030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_mlp": 1.07211912, + "epoch": 0.622354751827626, + "flos": 380872008192.0, + "grad_norm": 0.04189689360611541, + "language_loss": 0.86520332, + "learning_rate": 0.0003296406565261873, + "loss": 0.8766942, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.76855469, + "step": 3235, + "time_per_iteration": 2.457258701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.07129693, + "epoch": 0.6225471335128896, + "flos": 669071144448.0, + "grad_norm": 0.03023362442836584, + "language_loss": 0.89682841, + "learning_rate": 0.0003293477883479978, + "loss": 0.90831059, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.76806641, + "step": 3236, + "time_per_iteration": 2.8200809955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148687, + "balance_loss_mlp": 1.07172537, + "epoch": 0.6227395151981532, + "flos": 772627660800.0, + "grad_norm": 0.038353629459733245, + "language_loss": 0.85627455, + "learning_rate": 0.0003290549864228727, + "loss": 0.86776143, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.76855469, + "step": 3237, + "time_per_iteration": 2.9402804374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151344, + "balance_loss_mlp": 1.07419205, + "epoch": 0.6229318968834167, + "flos": 485357779968.0, + "grad_norm": 0.030356371486713406, + "language_loss": 0.91371596, + "learning_rate": 0.0003287622508644875, + "loss": 0.92522943, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.77050781, + "step": 3238, + "time_per_iteration": 2.761613368988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152584, + "balance_loss_mlp": 1.07543159, + "epoch": 0.6231242785686802, + "flos": 463877021184.0, + "grad_norm": 0.03773116735562404, + "language_loss": 0.92044532, + "learning_rate": 0.0003284695817864923, + "loss": 0.93197119, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.77050781, + "step": 3239, + "time_per_iteration": 2.496115207672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152871, + "balance_loss_mlp": 1.07562304, + "epoch": 0.6233166602539438, + "flos": 610210404864.0, + "grad_norm": 0.04001521730964561, + "language_loss": 0.91216815, + "learning_rate": 0.0003281769793025116, + "loss": 0.92369688, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.77148438, + "step": 3240, + "time_per_iteration": 2.737149953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153333, + "balance_loss_mlp": 1.07613325, + "epoch": 0.6235090419392074, + "flos": 440114783232.0, + "grad_norm": 0.039001077055099004, + "language_loss": 0.95066154, + "learning_rate": 0.00032788444352614346, + "loss": 0.9621948, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.77099609, + "step": 3241, + "time_per_iteration": 2.5000274181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152362, + "balance_loss_mlp": 1.07520986, + "epoch": 0.6237014236244709, + "flos": 505900551168.0, + "grad_norm": 0.03351386174888394, + "language_loss": 0.86000109, + "learning_rate": 0.0003275919745709606, + "loss": 0.87152469, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.77050781, + "step": 3242, + "time_per_iteration": 2.5560779571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150845, + "balance_loss_mlp": 1.07359755, + "epoch": 0.6238938053097345, + "flos": 513995880960.0, + "grad_norm": 0.02989991495254077, + "language_loss": 0.86827087, + "learning_rate": 0.00032729957255050936, + "loss": 0.87977934, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.77148438, + "step": 3243, + "time_per_iteration": 2.7240655422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151606, + "balance_loss_mlp": 1.07440567, + "epoch": 0.6240861869949981, + "flos": 738021984768.0, + "grad_norm": 0.03287270457650662, + "language_loss": 0.87638962, + "learning_rate": 0.0003270072375783102, + "loss": 0.88790572, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.77099609, + "step": 3244, + "time_per_iteration": 2.9896130561828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151989, + "balance_loss_mlp": 1.07469356, + "epoch": 0.6242785686802617, + "flos": 495708392448.0, + "grad_norm": 0.032661081616998364, + "language_loss": 0.84373832, + "learning_rate": 0.00032671496976785774, + "loss": 0.85525823, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.77197266, + "step": 3245, + "time_per_iteration": 2.635254144668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152307, + "balance_loss_mlp": 1.0751549, + "epoch": 0.6244709503655252, + "flos": 747233221632.0, + "grad_norm": 0.0292375931838659, + "language_loss": 0.80339247, + "learning_rate": 0.0003264227692326205, + "loss": 0.81491554, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.77050781, + "step": 3246, + "time_per_iteration": 3.037773609161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.07523346, + "epoch": 0.6246633320507887, + "flos": 493550438400.0, + "grad_norm": 0.03477244782189641, + "language_loss": 0.90644753, + "learning_rate": 0.00032613063608604055, + "loss": 0.91797233, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.77148438, + "step": 3247, + "time_per_iteration": 2.537938117980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151924, + "balance_loss_mlp": 1.07462883, + "epoch": 0.6248557137360523, + "flos": 518391653376.0, + "grad_norm": 0.03220304016525991, + "language_loss": 0.89104807, + "learning_rate": 0.0003258385704415343, + "loss": 0.90256733, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.77197266, + "step": 3248, + "time_per_iteration": 2.6050169467926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.08005083, + "epoch": 0.6250480954213159, + "flos": 520428083712.0, + "grad_norm": 0.030644735245645434, + "language_loss": 0.87455463, + "learning_rate": 0.0003255465724124915, + "loss": 0.88612568, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.76953125, + "step": 3249, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152527, + "balance_loss_mlp": 1.07532752, + "epoch": 0.6252404771065795, + "flos": 517069628928.0, + "grad_norm": 0.031780137669166014, + "language_loss": 0.87919134, + "learning_rate": 0.00032525464211227587, + "loss": 0.89071667, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.77099609, + "step": 3250, + "time_per_iteration": 2.601846933364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150948, + "balance_loss_mlp": 1.07403469, + "epoch": 0.6254328587918431, + "flos": 577996998144.0, + "grad_norm": 0.033725560308058275, + "language_loss": 0.90909386, + "learning_rate": 0.0003249627796542249, + "loss": 0.92060328, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.76806641, + "step": 3251, + "time_per_iteration": 2.653550148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152607, + "balance_loss_mlp": 1.07578814, + "epoch": 0.6256252404771065, + "flos": 599104453632.0, + "grad_norm": 0.030197281894512866, + "language_loss": 0.89177507, + "learning_rate": 0.00032467098515164943, + "loss": 0.90330118, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.76708984, + "step": 3252, + "time_per_iteration": 2.896319627761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153096, + "balance_loss_mlp": 1.07622945, + "epoch": 0.6258176221623701, + "flos": 509361063936.0, + "grad_norm": 0.03670659852857571, + "language_loss": 0.90126091, + "learning_rate": 0.00032437925871783456, + "loss": 0.91279185, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.76757812, + "step": 3253, + "time_per_iteration": 2.6326792240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151923, + "balance_loss_mlp": 1.07500935, + "epoch": 0.6260100038476337, + "flos": 640804345344.0, + "grad_norm": 0.03617334498196145, + "language_loss": 0.90267026, + "learning_rate": 0.00032408760046603803, + "loss": 0.91418946, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.76806641, + "step": 3254, + "time_per_iteration": 2.803849697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.07458711, + "epoch": 0.6262023855328973, + "flos": 842451360768.0, + "grad_norm": 0.034269487661108974, + "language_loss": 0.82522523, + "learning_rate": 0.00032379601050949193, + "loss": 0.83674121, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.76904297, + "step": 3255, + "time_per_iteration": 3.1005427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150946, + "balance_loss_mlp": 1.07422304, + "epoch": 0.6263947672181608, + "flos": 523156726272.0, + "grad_norm": 0.032816276182318284, + "language_loss": 0.93856758, + "learning_rate": 0.0003235044889614013, + "loss": 0.950077, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.76611328, + "step": 3256, + "time_per_iteration": 2.6180245876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151059, + "balance_loss_mlp": 1.07419276, + "epoch": 0.6265871489034244, + "flos": 608289494016.0, + "grad_norm": 0.03305761610211967, + "language_loss": 0.8896969, + "learning_rate": 0.0003232130359349451, + "loss": 0.90120745, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.76757812, + "step": 3257, + "time_per_iteration": 2.845158576965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152664, + "balance_loss_mlp": 1.07579827, + "epoch": 0.626779530588688, + "flos": 589593773568.0, + "grad_norm": 0.030590175923720698, + "language_loss": 0.86119747, + "learning_rate": 0.0003229216515432751, + "loss": 0.87272418, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.76757812, + "step": 3258, + "time_per_iteration": 2.776336193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151745, + "balance_loss_mlp": 1.07473612, + "epoch": 0.6269719122739515, + "flos": 439537363968.0, + "grad_norm": 0.03493081590414929, + "language_loss": 0.86540627, + "learning_rate": 0.0003226303358995174, + "loss": 0.87692368, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.76904297, + "step": 3259, + "time_per_iteration": 2.589393377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151325, + "balance_loss_mlp": 1.07431602, + "epoch": 0.6271642939592151, + "flos": 564014684160.0, + "grad_norm": 0.02751327310294224, + "language_loss": 0.92896867, + "learning_rate": 0.00032233908911677, + "loss": 0.9404819, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.76904297, + "step": 3260, + "time_per_iteration": 2.834845781326294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_mlp": 1.07153916, + "epoch": 0.6273566756444786, + "flos": 515652277248.0, + "grad_norm": 0.03305165048168085, + "language_loss": 0.86257023, + "learning_rate": 0.0003220479113081053, + "loss": 0.87405574, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.76904297, + "step": 3261, + "time_per_iteration": 2.7153472900390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151278, + "balance_loss_mlp": 1.07431674, + "epoch": 0.6275490573297422, + "flos": 586587154944.0, + "grad_norm": 0.03255760599660819, + "language_loss": 0.84347677, + "learning_rate": 0.00032175680258656836, + "loss": 0.85498953, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.76855469, + "step": 3262, + "time_per_iteration": 2.7178304195404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153298, + "balance_loss_mlp": 1.07638431, + "epoch": 0.6277414390150058, + "flos": 560543437824.0, + "grad_norm": 0.03084786969473793, + "language_loss": 0.84701777, + "learning_rate": 0.00032146576306517794, + "loss": 0.85855073, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.76806641, + "step": 3263, + "time_per_iteration": 2.730602502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153924, + "balance_loss_mlp": 1.07686687, + "epoch": 0.6279338207002694, + "flos": 613840104960.0, + "grad_norm": 0.03145910939226107, + "language_loss": 0.86918247, + "learning_rate": 0.0003211747928569255, + "loss": 0.88072169, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.76953125, + "step": 3264, + "time_per_iteration": 2.724712371826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155736, + "balance_loss_mlp": 1.07882273, + "epoch": 0.6281262023855329, + "flos": 626932821504.0, + "grad_norm": 0.028624354652689574, + "language_loss": 0.87177598, + "learning_rate": 0.0003208838920747754, + "loss": 0.88333333, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.76806641, + "step": 3265, + "time_per_iteration": 2.830962896347046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.07405066, + "epoch": 0.6283185840707964, + "flos": 1125418795008.0, + "grad_norm": 0.03154411123335471, + "language_loss": 0.82117403, + "learning_rate": 0.0003205930608316656, + "loss": 0.83268464, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.76904297, + "step": 3266, + "time_per_iteration": 3.4846274852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152648, + "balance_loss_mlp": 1.07573402, + "epoch": 0.62851096575606, + "flos": 516331754496.0, + "grad_norm": 0.032694316072136534, + "language_loss": 0.89774895, + "learning_rate": 0.00032030229924050673, + "loss": 0.90927541, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.76806641, + "step": 3267, + "time_per_iteration": 2.6537904739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150261, + "balance_loss_mlp": 1.07320464, + "epoch": 0.6287033474413236, + "flos": 405061943808.0, + "grad_norm": 0.03610764341116815, + "language_loss": 0.86379248, + "learning_rate": 0.00032001160741418247, + "loss": 0.8752951, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.76953125, + "step": 3268, + "time_per_iteration": 2.6072278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.0729655, + "epoch": 0.6288957291265872, + "flos": 526758228480.0, + "grad_norm": 0.03519251125136882, + "language_loss": 0.87577492, + "learning_rate": 0.0003197209854655494, + "loss": 0.88727468, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.76904297, + "step": 3269, + "time_per_iteration": 2.624221086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151498, + "balance_loss_mlp": 1.07458413, + "epoch": 0.6290881108118507, + "flos": 604957235712.0, + "grad_norm": 0.03303529236450534, + "language_loss": 0.79662859, + "learning_rate": 0.0003194304335074371, + "loss": 0.80814356, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.76806641, + "step": 3270, + "time_per_iteration": 2.842299461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153904, + "balance_loss_mlp": 1.07703781, + "epoch": 0.6292804924971143, + "flos": 438597374976.0, + "grad_norm": 0.03323676651467279, + "language_loss": 0.93520898, + "learning_rate": 0.0003191399516526475, + "loss": 0.94674796, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.76757812, + "step": 3271, + "time_per_iteration": 2.534921169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151771, + "balance_loss_mlp": 1.07500029, + "epoch": 0.6294728741823779, + "flos": 607844332032.0, + "grad_norm": 0.029188592887849887, + "language_loss": 0.84005713, + "learning_rate": 0.0003188495400139559, + "loss": 0.8515749, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.76660156, + "step": 3272, + "time_per_iteration": 2.783825397491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149208, + "balance_loss_mlp": 1.07229424, + "epoch": 0.6296652558676414, + "flos": 702773761536.0, + "grad_norm": 0.03427526038841549, + "language_loss": 0.89267194, + "learning_rate": 0.00031855919870411013, + "loss": 0.90416408, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.76806641, + "step": 3273, + "time_per_iteration": 2.8276174068450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148805, + "balance_loss_mlp": 1.07189095, + "epoch": 0.6298576375529049, + "flos": 524943378432.0, + "grad_norm": 0.029237647029809653, + "language_loss": 0.89991713, + "learning_rate": 0.0003182689278358305, + "loss": 0.91140521, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.76806641, + "step": 3274, + "time_per_iteration": 2.706908941268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148394, + "balance_loss_mlp": 1.07143247, + "epoch": 0.6300500192381685, + "flos": 476926076928.0, + "grad_norm": 0.034587260543346605, + "language_loss": 0.85421312, + "learning_rate": 0.0003179787275218105, + "loss": 0.86569709, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.76855469, + "step": 3275, + "time_per_iteration": 2.537382125854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147946, + "balance_loss_mlp": 1.07117569, + "epoch": 0.6302424009234321, + "flos": 521891097600.0, + "grad_norm": 0.02794771765960627, + "language_loss": 0.8894403, + "learning_rate": 0.0003176885978747155, + "loss": 0.9009198, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.76660156, + "step": 3276, + "time_per_iteration": 2.6045258045196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148975, + "balance_loss_mlp": 1.07225204, + "epoch": 0.6304347826086957, + "flos": 695857465344.0, + "grad_norm": 0.03251661514625025, + "language_loss": 0.87684363, + "learning_rate": 0.0003173985390071839, + "loss": 0.88833332, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.76611328, + "step": 3277, + "time_per_iteration": 2.858759641647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167786, + "balance_loss_mlp": 1.09187317, + "epoch": 0.6306271642939593, + "flos": 1470030183936.0, + "grad_norm": 0.015221211739027024, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.79068244, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.7578125, + "step": 3278, + "time_per_iteration": 4.767859220504761 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148, + "balance_loss_mlp": 1.07122958, + "epoch": 0.6308195459792227, + "flos": 602929537536.0, + "grad_norm": 0.03309702536338572, + "language_loss": 0.87110293, + "learning_rate": 0.00031681863406122704, + "loss": 0.8825829, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.76660156, + "step": 3279, + "time_per_iteration": 2.7526352405548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151009, + "balance_loss_mlp": 1.0742383, + "epoch": 0.6310119276644863, + "flos": 728236056576.0, + "grad_norm": 0.03127249771985471, + "language_loss": 0.90830934, + "learning_rate": 0.00031652878820794087, + "loss": 0.91981947, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.76660156, + "step": 3280, + "time_per_iteration": 2.980374813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152955, + "balance_loss_mlp": 1.07623196, + "epoch": 0.6312043093497499, + "flos": 520818851328.0, + "grad_norm": 0.035871108010903825, + "language_loss": 0.91415131, + "learning_rate": 0.00031623901358449627, + "loss": 0.92568088, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.76611328, + "step": 3281, + "time_per_iteration": 2.6661479473114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153626, + "balance_loss_mlp": 1.07685518, + "epoch": 0.6313966910350135, + "flos": 532222244352.0, + "grad_norm": 0.03104696980992861, + "language_loss": 0.93473637, + "learning_rate": 0.0003159493103033936, + "loss": 0.94627267, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.76660156, + "step": 3282, + "time_per_iteration": 2.7015254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156189, + "balance_loss_mlp": 1.08065796, + "epoch": 0.631589072720277, + "flos": 1382993969664.0, + "grad_norm": 0.006807831796281711, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.81075245, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.75585938, + "step": 3283, + "time_per_iteration": 4.893282890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153518, + "balance_loss_mlp": 1.07674742, + "epoch": 0.6317814544055406, + "flos": 625873310208.0, + "grad_norm": 0.03000778283215098, + "language_loss": 0.87091964, + "learning_rate": 0.0003153701182180776, + "loss": 0.88245487, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.76660156, + "step": 3284, + "time_per_iteration": 2.785921335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153113, + "balance_loss_mlp": 1.07643747, + "epoch": 0.6319738360908042, + "flos": 499097046528.0, + "grad_norm": 0.030580966863201303, + "language_loss": 0.86424339, + "learning_rate": 0.00031508062963872655, + "loss": 0.8757745, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.765625, + "step": 3285, + "time_per_iteration": 2.6083192825317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152348, + "balance_loss_mlp": 1.07567286, + "epoch": 0.6321662177760677, + "flos": 580908289536.0, + "grad_norm": 0.03249956938477427, + "language_loss": 0.84091449, + "learning_rate": 0.0003147912128514423, + "loss": 0.85243797, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.765625, + "step": 3286, + "time_per_iteration": 2.7065303325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114775, + "balance_loss_mlp": 1.07107508, + "epoch": 0.6323585994613313, + "flos": 602605899264.0, + "grad_norm": 0.03060189068927108, + "language_loss": 0.92241961, + "learning_rate": 0.0003145018679685859, + "loss": 0.93389714, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.765625, + "step": 3287, + "time_per_iteration": 2.724647045135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147567, + "balance_loss_mlp": 1.07093954, + "epoch": 0.6325509811465948, + "flos": 529632589824.0, + "grad_norm": 0.026442764297463384, + "language_loss": 0.9133988, + "learning_rate": 0.00031421259510249134, + "loss": 0.92487442, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.76513672, + "step": 3288, + "time_per_iteration": 2.7890970706939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146868, + "balance_loss_mlp": 1.07019234, + "epoch": 0.6327433628318584, + "flos": 575344217088.0, + "grad_norm": 0.03165563146125425, + "language_loss": 0.8638919, + "learning_rate": 0.00031392339436546414, + "loss": 0.87536061, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.765625, + "step": 3289, + "time_per_iteration": 2.8359181880950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147105, + "balance_loss_mlp": 1.07042992, + "epoch": 0.632935744517122, + "flos": 518111675904.0, + "grad_norm": 0.040669622782204255, + "language_loss": 0.87612778, + "learning_rate": 0.00031363426586978205, + "loss": 0.88759887, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.765625, + "step": 3290, + "time_per_iteration": 2.755444288253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148795, + "balance_loss_mlp": 1.07216728, + "epoch": 0.6331281262023856, + "flos": 618596445696.0, + "grad_norm": 0.029293061792341625, + "language_loss": 0.89532119, + "learning_rate": 0.0003133452097276947, + "loss": 0.90680915, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.76513672, + "step": 3291, + "time_per_iteration": 2.731522560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153422, + "balance_loss_mlp": 1.07674634, + "epoch": 0.633320507887649, + "flos": 594115799040.0, + "grad_norm": 0.032525593419921936, + "language_loss": 0.88528687, + "learning_rate": 0.0003130562260514238, + "loss": 0.89682108, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.765625, + "step": 3292, + "time_per_iteration": 2.7816312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150685, + "balance_loss_mlp": 1.07396197, + "epoch": 0.6335128895729126, + "flos": 583495942656.0, + "grad_norm": 0.0277750610234457, + "language_loss": 0.86754191, + "learning_rate": 0.0003127673149531626, + "loss": 0.87904876, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.76611328, + "step": 3293, + "time_per_iteration": 2.7256717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151215, + "balance_loss_mlp": 1.0744915, + "epoch": 0.6337052712581762, + "flos": 453973572096.0, + "grad_norm": 0.0366063114700609, + "language_loss": 0.89718056, + "learning_rate": 0.0003124784765450762, + "loss": 0.90869272, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.76611328, + "step": 3294, + "time_per_iteration": 2.557979106903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152531, + "balance_loss_mlp": 1.07585573, + "epoch": 0.6338976529434398, + "flos": 574515018240.0, + "grad_norm": 0.03914872981780459, + "language_loss": 0.86348414, + "learning_rate": 0.0003121897109393017, + "loss": 0.87500942, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.765625, + "step": 3295, + "time_per_iteration": 2.7648093700408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150647, + "balance_loss_mlp": 1.0738759, + "epoch": 0.6340900346287034, + "flos": 509808227328.0, + "grad_norm": 0.03170073477682662, + "language_loss": 0.93116355, + "learning_rate": 0.0003119010182479481, + "loss": 0.94267005, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.76660156, + "step": 3296, + "time_per_iteration": 2.6290597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152399, + "balance_loss_mlp": 1.07562852, + "epoch": 0.6342824163139669, + "flos": 480714230784.0, + "grad_norm": 0.034261076448020254, + "language_loss": 0.8817153, + "learning_rate": 0.00031161239858309563, + "loss": 0.89323932, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.76660156, + "step": 3297, + "time_per_iteration": 2.5535776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152393, + "balance_loss_mlp": 1.07571757, + "epoch": 0.6344747979992305, + "flos": 573110401536.0, + "grad_norm": 0.038934995330749234, + "language_loss": 0.89182544, + "learning_rate": 0.0003113238520567964, + "loss": 0.9033494, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.765625, + "step": 3298, + "time_per_iteration": 2.6296586990356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.07486486, + "epoch": 0.634667179684494, + "flos": 607045332480.0, + "grad_norm": 0.035281643877612956, + "language_loss": 0.86709571, + "learning_rate": 0.00031103537878107403, + "loss": 0.87861156, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.76611328, + "step": 3299, + "time_per_iteration": 2.7374937534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156066, + "balance_loss_mlp": 1.07934332, + "epoch": 0.6348595613697576, + "flos": 648128873472.0, + "grad_norm": 0.04012685096431152, + "language_loss": 0.85757369, + "learning_rate": 0.0003107469788679238, + "loss": 0.86913437, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.76611328, + "step": 3300, + "time_per_iteration": 2.763896942138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150162, + "balance_loss_mlp": 1.07329571, + "epoch": 0.6350519430550212, + "flos": 640272588288.0, + "grad_norm": 0.03353321054785192, + "language_loss": 0.91748559, + "learning_rate": 0.00031045865242931267, + "loss": 0.92898715, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.76757812, + "step": 3301, + "time_per_iteration": 2.775559186935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115092, + "balance_loss_mlp": 1.07405412, + "epoch": 0.6352443247402847, + "flos": 687829991424.0, + "grad_norm": 0.033769350364135475, + "language_loss": 0.89046073, + "learning_rate": 0.00031017039957717877, + "loss": 0.90196997, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.76757812, + "step": 3302, + "time_per_iteration": 2.9990227222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150719, + "balance_loss_mlp": 1.07399607, + "epoch": 0.6354367064255483, + "flos": 560525973504.0, + "grad_norm": 0.03207500130867294, + "language_loss": 0.93455017, + "learning_rate": 0.0003098822204234318, + "loss": 0.94605732, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.76611328, + "step": 3303, + "time_per_iteration": 2.6589555740356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149713, + "balance_loss_mlp": 1.07294202, + "epoch": 0.6356290881108119, + "flos": 981060716544.0, + "grad_norm": 0.03119033938257745, + "language_loss": 0.92425978, + "learning_rate": 0.00030959411507995273, + "loss": 0.93575692, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.76660156, + "step": 3304, + "time_per_iteration": 3.2027275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156004, + "balance_loss_mlp": 1.07932901, + "epoch": 0.6358214697960755, + "flos": 529372078080.0, + "grad_norm": 0.037691107664773085, + "language_loss": 0.88209277, + "learning_rate": 0.00030930608365859407, + "loss": 0.8936528, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.765625, + "step": 3305, + "time_per_iteration": 2.672909736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153167, + "balance_loss_mlp": 1.07663476, + "epoch": 0.6360138514813389, + "flos": 517868628480.0, + "grad_norm": 0.0314628318508628, + "language_loss": 0.93278992, + "learning_rate": 0.00030901812627117943, + "loss": 0.94432157, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.76416016, + "step": 3306, + "time_per_iteration": 2.6096842288970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152004, + "balance_loss_mlp": 1.07556736, + "epoch": 0.6362062331666025, + "flos": 467469791232.0, + "grad_norm": 0.03698857716885425, + "language_loss": 0.90082693, + "learning_rate": 0.000308730243029504, + "loss": 0.91234696, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.76318359, + "step": 3307, + "time_per_iteration": 2.625368595123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148049, + "balance_loss_mlp": 1.07151699, + "epoch": 0.6363986148518661, + "flos": 550772246016.0, + "grad_norm": 0.03499213724407888, + "language_loss": 0.85284883, + "learning_rate": 0.0003084424340453339, + "loss": 0.86432934, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.76416016, + "step": 3308, + "time_per_iteration": 2.79801082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154866, + "balance_loss_mlp": 1.07842863, + "epoch": 0.6365909965371297, + "flos": 584157955584.0, + "grad_norm": 0.034280921655294554, + "language_loss": 0.87936795, + "learning_rate": 0.0003081546994304064, + "loss": 0.89091659, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.76318359, + "step": 3309, + "time_per_iteration": 2.805798053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151839, + "balance_loss_mlp": 1.0753541, + "epoch": 0.6367833782223933, + "flos": 532287372288.0, + "grad_norm": 0.031184654205402413, + "language_loss": 0.87230557, + "learning_rate": 0.0003078670392964298, + "loss": 0.88382399, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.76367188, + "step": 3310, + "time_per_iteration": 2.637089729309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114879, + "balance_loss_mlp": 1.07211447, + "epoch": 0.6369757599076568, + "flos": 570587876352.0, + "grad_norm": 0.03249753882493018, + "language_loss": 0.8737638, + "learning_rate": 0.00030757945375508406, + "loss": 0.88525176, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.765625, + "step": 3311, + "time_per_iteration": 2.6652672290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157923, + "balance_loss_mlp": 1.08139026, + "epoch": 0.6371681415929203, + "flos": 541053447168.0, + "grad_norm": 0.03561310839394214, + "language_loss": 0.86446404, + "learning_rate": 0.00030729194291801944, + "loss": 0.8760432, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.76416016, + "step": 3312, + "time_per_iteration": 2.685426712036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152462, + "balance_loss_mlp": 1.07588232, + "epoch": 0.6373605232781839, + "flos": 484530582528.0, + "grad_norm": 0.03615999538834489, + "language_loss": 0.82315236, + "learning_rate": 0.00030700450689685787, + "loss": 0.83467698, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.76464844, + "step": 3313, + "time_per_iteration": 2.5285892486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115278, + "balance_loss_mlp": 1.07629561, + "epoch": 0.6375529049634475, + "flos": 579816577536.0, + "grad_norm": 0.031570559387627636, + "language_loss": 0.90687287, + "learning_rate": 0.00030671714580319186, + "loss": 0.91840065, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.76367188, + "step": 3314, + "time_per_iteration": 2.7918403148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149531, + "balance_loss_mlp": 1.07290328, + "epoch": 0.637745286648711, + "flos": 683479154688.0, + "grad_norm": 0.03649458581150707, + "language_loss": 0.8839801, + "learning_rate": 0.0003064298597485846, + "loss": 0.89547539, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.76513672, + "step": 3315, + "time_per_iteration": 2.8336853981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157103, + "balance_loss_mlp": 1.08066618, + "epoch": 0.6379376683339746, + "flos": 505648771584.0, + "grad_norm": 0.03434060192765891, + "language_loss": 0.89178324, + "learning_rate": 0.00030614264884457054, + "loss": 0.90335435, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.76318359, + "step": 3316, + "time_per_iteration": 2.610029697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156868, + "balance_loss_mlp": 1.08038342, + "epoch": 0.6381300500192382, + "flos": 503024188416.0, + "grad_norm": 0.037738287263273475, + "language_loss": 0.83208811, + "learning_rate": 0.000305855513202655, + "loss": 0.8436569, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.76367188, + "step": 3317, + "time_per_iteration": 2.56390118598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115293, + "balance_loss_mlp": 1.07663572, + "epoch": 0.6383224317045018, + "flos": 401367115776.0, + "grad_norm": 0.03934464683594442, + "language_loss": 0.83537889, + "learning_rate": 0.0003055684529343138, + "loss": 0.84690815, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.76171875, + "step": 3318, + "time_per_iteration": 2.4260315895080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011523, + "balance_loss_mlp": 1.07600558, + "epoch": 0.6385148133897653, + "flos": 500362675200.0, + "grad_norm": 0.03558980854731561, + "language_loss": 0.8376438, + "learning_rate": 0.00030528146815099374, + "loss": 0.84916675, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.76171875, + "step": 3319, + "time_per_iteration": 2.6329188346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151029, + "balance_loss_mlp": 1.07468724, + "epoch": 0.6387071950750288, + "flos": 528694602240.0, + "grad_norm": 0.0315122399919932, + "language_loss": 0.76854849, + "learning_rate": 0.00030499455896411203, + "loss": 0.78005874, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.76220703, + "step": 3320, + "time_per_iteration": 2.6750285625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156959, + "balance_loss_mlp": 1.0823822, + "epoch": 0.6388995767602924, + "flos": 1459104153600.0, + "grad_norm": 0.009844305017815533, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77457774, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.74609375, + "step": 3321, + "time_per_iteration": 4.953099489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151078, + "balance_loss_mlp": 1.07459378, + "epoch": 0.639091958445556, + "flos": 605170083840.0, + "grad_norm": 0.03456514545296231, + "language_loss": 0.8206768, + "learning_rate": 0.0003044209678251865, + "loss": 0.83218759, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.76367188, + "step": 3322, + "time_per_iteration": 2.8895435333251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149613, + "balance_loss_mlp": 1.07312858, + "epoch": 0.6392843401308196, + "flos": 585664630272.0, + "grad_norm": 0.030325412861609304, + "language_loss": 0.89598596, + "learning_rate": 0.0003041342860958306, + "loss": 0.90748215, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.76367188, + "step": 3323, + "time_per_iteration": 2.8267457485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115059, + "balance_loss_mlp": 1.07401037, + "epoch": 0.6394767218160831, + "flos": 515728138752.0, + "grad_norm": 0.035461056589808096, + "language_loss": 0.97089493, + "learning_rate": 0.00030384768040828857, + "loss": 0.98240083, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.76464844, + "step": 3324, + "time_per_iteration": 2.6604483127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147614, + "balance_loss_mlp": 1.07127237, + "epoch": 0.6396691035013466, + "flos": 542776972800.0, + "grad_norm": 0.029879293671496117, + "language_loss": 0.90136957, + "learning_rate": 0.00030356115087383094, + "loss": 0.91284573, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.76220703, + "step": 3325, + "time_per_iteration": 2.61624813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.07561517, + "epoch": 0.6398614851866102, + "flos": 526554112512.0, + "grad_norm": 0.03633717350328365, + "language_loss": 0.8974539, + "learning_rate": 0.00030327469760369803, + "loss": 0.90897352, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.76220703, + "step": 3326, + "time_per_iteration": 2.5705959796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.0753485, + "epoch": 0.6400538668718738, + "flos": 624134321664.0, + "grad_norm": 0.04101147906430089, + "language_loss": 0.90274537, + "learning_rate": 0.0003029883207091009, + "loss": 0.91426039, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.76025391, + "step": 3327, + "time_per_iteration": 2.710705280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153226, + "balance_loss_mlp": 1.07712281, + "epoch": 0.6402462485571374, + "flos": 504455001600.0, + "grad_norm": 0.03565756181750687, + "language_loss": 0.8369143, + "learning_rate": 0.00030270202030122095, + "loss": 0.84844655, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.75976562, + "step": 3328, + "time_per_iteration": 2.6669437885284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.07706201, + "epoch": 0.6404386302424009, + "flos": 820662426624.0, + "grad_norm": 0.035758844093176624, + "language_loss": 0.90348649, + "learning_rate": 0.00030241579649121, + "loss": 0.91501862, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.76025391, + "step": 3329, + "time_per_iteration": 2.9946744441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153577, + "balance_loss_mlp": 1.07747424, + "epoch": 0.6406310119276645, + "flos": 472792817664.0, + "grad_norm": 0.031682669944134774, + "language_loss": 0.84166616, + "learning_rate": 0.00030212964939018994, + "loss": 0.85320187, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.75976562, + "step": 3330, + "time_per_iteration": 2.529780864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153483, + "balance_loss_mlp": 1.07738006, + "epoch": 0.6408233936129281, + "flos": 426488308224.0, + "grad_norm": 0.0317787576762172, + "language_loss": 0.90697497, + "learning_rate": 0.0003018435791092527, + "loss": 0.91850984, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.75976562, + "step": 3331, + "time_per_iteration": 2.482226848602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154163, + "balance_loss_mlp": 1.07810771, + "epoch": 0.6410157752981916, + "flos": 550837373952.0, + "grad_norm": 0.03245017993162029, + "language_loss": 0.86073428, + "learning_rate": 0.00030155758575946083, + "loss": 0.87227595, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.75927734, + "step": 3332, + "time_per_iteration": 2.7268691062927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154009, + "balance_loss_mlp": 1.07785761, + "epoch": 0.6412081569834551, + "flos": 476860948992.0, + "grad_norm": 0.03331397331841687, + "language_loss": 0.88895929, + "learning_rate": 0.0003012716694518467, + "loss": 0.9004994, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.76025391, + "step": 3333, + "time_per_iteration": 2.5955138206481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154559, + "balance_loss_mlp": 1.07845628, + "epoch": 0.6414005386687187, + "flos": 542030366208.0, + "grad_norm": 0.03145594160852774, + "language_loss": 0.89824158, + "learning_rate": 0.000300985830297413, + "loss": 0.90978718, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.75976562, + "step": 3334, + "time_per_iteration": 2.675809144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151014, + "balance_loss_mlp": 1.07476771, + "epoch": 0.6415929203539823, + "flos": 1042956272640.0, + "grad_norm": 0.03442120912103133, + "language_loss": 0.92276573, + "learning_rate": 0.00030070006840713205, + "loss": 0.93427593, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.76123047, + "step": 3335, + "time_per_iteration": 3.3598873615264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.07696736, + "epoch": 0.6417853020392459, + "flos": 649579152384.0, + "grad_norm": 0.03234716357342597, + "language_loss": 0.78466761, + "learning_rate": 0.000300414383891947, + "loss": 0.79620028, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.76171875, + "step": 3336, + "time_per_iteration": 2.8177781105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153044, + "balance_loss_mlp": 1.07679784, + "epoch": 0.6419776837245095, + "flos": 501943209984.0, + "grad_norm": 0.029578655992370296, + "language_loss": 0.93100476, + "learning_rate": 0.00030012877686276973, + "loss": 0.94253522, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.76123047, + "step": 3337, + "time_per_iteration": 2.6656994819641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153274, + "balance_loss_mlp": 1.07688439, + "epoch": 0.642170065409773, + "flos": 621778982400.0, + "grad_norm": 0.030467733780945628, + "language_loss": 0.91408634, + "learning_rate": 0.0002998432474304832, + "loss": 0.92561901, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.76269531, + "step": 3338, + "time_per_iteration": 2.7804837226867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156387, + "balance_loss_mlp": 1.08161926, + "epoch": 0.6423624470950365, + "flos": 1426638967296.0, + "grad_norm": 0.010632522477168303, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80393732, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.74804688, + "step": 3339, + "time_per_iteration": 4.905744791030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151035, + "balance_loss_mlp": 1.07493174, + "epoch": 0.6425548287803001, + "flos": 563439266304.0, + "grad_norm": 0.028877045256785867, + "language_loss": 0.92764187, + "learning_rate": 0.00029927242179996107, + "loss": 0.93915224, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.75976562, + "step": 3340, + "time_per_iteration": 2.6661758422851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145869, + "balance_loss_mlp": 1.0697186, + "epoch": 0.6427472104655637, + "flos": 586613351424.0, + "grad_norm": 0.0300822513158231, + "language_loss": 0.88234377, + "learning_rate": 0.0002989871258233398, + "loss": 0.8938024, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.76025391, + "step": 3341, + "time_per_iteration": 2.7374660968780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144033, + "balance_loss_mlp": 1.06773865, + "epoch": 0.6429395921508272, + "flos": 405146537472.0, + "grad_norm": 0.038389287644004705, + "language_loss": 0.88664877, + "learning_rate": 0.0002987019078868373, + "loss": 0.89808905, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.76171875, + "step": 3342, + "time_per_iteration": 2.4243760108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140022, + "balance_loss_mlp": 1.06377542, + "epoch": 0.6431319738360908, + "flos": 549832257024.0, + "grad_norm": 0.03024016811094423, + "language_loss": 0.8722378, + "learning_rate": 0.00029841676810118484, + "loss": 0.88363802, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.76123047, + "step": 3343, + "time_per_iteration": 2.6617236137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147432, + "balance_loss_mlp": 1.07118535, + "epoch": 0.6433243555213544, + "flos": 794705304576.0, + "grad_norm": 0.037506118612829445, + "language_loss": 0.92627275, + "learning_rate": 0.0002981317065770839, + "loss": 0.93774706, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.76123047, + "step": 3344, + "time_per_iteration": 3.082211494445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149276, + "balance_loss_mlp": 1.07288682, + "epoch": 0.643516737206618, + "flos": 584112293376.0, + "grad_norm": 0.03767314060719249, + "language_loss": 0.87199879, + "learning_rate": 0.00029784672342520493, + "loss": 0.88349158, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.76269531, + "step": 3345, + "time_per_iteration": 2.7258007526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114915, + "balance_loss_mlp": 1.07276022, + "epoch": 0.6437091188918815, + "flos": 519750607872.0, + "grad_norm": 0.03533085288020931, + "language_loss": 0.88640958, + "learning_rate": 0.00029756181875618834, + "loss": 0.89790106, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.76269531, + "step": 3346, + "time_per_iteration": 2.569779634475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144811, + "balance_loss_mlp": 1.06846941, + "epoch": 0.643901500577145, + "flos": 385786802688.0, + "grad_norm": 0.034542585210818905, + "language_loss": 0.89738131, + "learning_rate": 0.0002972769926806439, + "loss": 0.90882939, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.76220703, + "step": 3347, + "time_per_iteration": 2.497853994369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147673, + "balance_loss_mlp": 1.07128322, + "epoch": 0.6440938822624086, + "flos": 484697768448.0, + "grad_norm": 0.03553288196721846, + "language_loss": 0.94382805, + "learning_rate": 0.0002969922453091508, + "loss": 0.95530474, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.76269531, + "step": 3348, + "time_per_iteration": 2.5491795539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147538, + "balance_loss_mlp": 1.07124412, + "epoch": 0.6442862639476722, + "flos": 541637597184.0, + "grad_norm": 0.03037104728594501, + "language_loss": 0.89609063, + "learning_rate": 0.00029670757675225777, + "loss": 0.90756601, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.76171875, + "step": 3349, + "time_per_iteration": 2.721752882003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148396, + "balance_loss_mlp": 1.07234049, + "epoch": 0.6444786456329358, + "flos": 527958729216.0, + "grad_norm": 0.03079951019721412, + "language_loss": 0.85068369, + "learning_rate": 0.0002964229871204831, + "loss": 0.8621676, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.75927734, + "step": 3350, + "time_per_iteration": 2.6219635009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146722, + "balance_loss_mlp": 1.07076228, + "epoch": 0.6446710273181993, + "flos": 699161525760.0, + "grad_norm": 0.03075522523020309, + "language_loss": 0.88979256, + "learning_rate": 0.00029613847652431403, + "loss": 0.90125972, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.75830078, + "step": 3351, + "time_per_iteration": 2.8463754653930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143843, + "balance_loss_mlp": 1.06778741, + "epoch": 0.6448634090034628, + "flos": 626299006464.0, + "grad_norm": 0.030404862420189395, + "language_loss": 0.8409062, + "learning_rate": 0.0002958540450742078, + "loss": 0.85234463, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.75927734, + "step": 3352, + "time_per_iteration": 2.9119668006896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145662, + "balance_loss_mlp": 1.0695591, + "epoch": 0.6450557906887264, + "flos": 602165466624.0, + "grad_norm": 0.030375965559079645, + "language_loss": 0.81268156, + "learning_rate": 0.0002955696928805901, + "loss": 0.82413822, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.75976562, + "step": 3353, + "time_per_iteration": 2.8792967796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146049, + "balance_loss_mlp": 1.06989837, + "epoch": 0.64524817237399, + "flos": 647384268288.0, + "grad_norm": 0.032745807535614124, + "language_loss": 0.90629518, + "learning_rate": 0.0002952854200538563, + "loss": 0.91775572, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.76025391, + "step": 3354, + "time_per_iteration": 2.7729763984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144907, + "balance_loss_mlp": 1.06870866, + "epoch": 0.6454405540592536, + "flos": 474366621696.0, + "grad_norm": 0.04216820116254093, + "language_loss": 0.87584448, + "learning_rate": 0.000295001226704371, + "loss": 0.88729358, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.76074219, + "step": 3355, + "time_per_iteration": 2.5655300617218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146195, + "balance_loss_mlp": 1.06994879, + "epoch": 0.6456329357445171, + "flos": 613019638272.0, + "grad_norm": 0.03469469169647009, + "language_loss": 0.88972664, + "learning_rate": 0.00029471711294246783, + "loss": 0.90118861, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.76123047, + "step": 3356, + "time_per_iteration": 2.7737839221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149475, + "balance_loss_mlp": 1.07322907, + "epoch": 0.6458253174297807, + "flos": 732931272192.0, + "grad_norm": 0.03845226629357448, + "language_loss": 0.87651891, + "learning_rate": 0.0002944330788784494, + "loss": 0.88801372, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.76123047, + "step": 3357, + "time_per_iteration": 2.9011571407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151964, + "balance_loss_mlp": 1.07552743, + "epoch": 0.6460176991150443, + "flos": 571554061824.0, + "grad_norm": 0.03220756952294772, + "language_loss": 0.89507246, + "learning_rate": 0.00029414912462258786, + "loss": 0.90659207, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.76318359, + "step": 3358, + "time_per_iteration": 2.87532901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150464, + "balance_loss_mlp": 1.07397914, + "epoch": 0.6462100808003078, + "flos": 584242549248.0, + "grad_norm": 0.034688747990618336, + "language_loss": 0.87649322, + "learning_rate": 0.00029386525028512366, + "loss": 0.88799781, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.76367188, + "step": 3359, + "time_per_iteration": 2.701509714126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115358, + "balance_loss_mlp": 1.07709527, + "epoch": 0.6464024624855714, + "flos": 485010673152.0, + "grad_norm": 0.035268388031257245, + "language_loss": 0.92228907, + "learning_rate": 0.0002935814559762666, + "loss": 0.9338249, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.76367188, + "step": 3360, + "time_per_iteration": 2.7698283195495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149719, + "balance_loss_mlp": 1.07332945, + "epoch": 0.6465948441708349, + "flos": 528842322432.0, + "grad_norm": 0.029604921797993008, + "language_loss": 0.84675246, + "learning_rate": 0.0002932977418061957, + "loss": 0.85824966, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.76269531, + "step": 3361, + "time_per_iteration": 2.637636661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148892, + "balance_loss_mlp": 1.07245517, + "epoch": 0.6467872258560985, + "flos": 670625482752.0, + "grad_norm": 0.035318648220588056, + "language_loss": 0.86576068, + "learning_rate": 0.00029301410788505833, + "loss": 0.8772496, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.76318359, + "step": 3362, + "time_per_iteration": 2.7763969898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144507, + "balance_loss_mlp": 1.06826067, + "epoch": 0.6469796075413621, + "flos": 433040033280.0, + "grad_norm": 0.03731380273504302, + "language_loss": 0.87366712, + "learning_rate": 0.00029273055432297126, + "loss": 0.88511223, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.76123047, + "step": 3363, + "time_per_iteration": 2.5110268592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.06842613, + "epoch": 0.6471719892266257, + "flos": 805101579264.0, + "grad_norm": 0.03447928292768335, + "language_loss": 0.85973775, + "learning_rate": 0.00029244708123001917, + "loss": 0.87118536, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.76220703, + "step": 3364, + "time_per_iteration": 2.9464926719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145161, + "balance_loss_mlp": 1.06896257, + "epoch": 0.6473643709118891, + "flos": 578348834304.0, + "grad_norm": 0.03376367371908884, + "language_loss": 0.88996613, + "learning_rate": 0.0002921636887162565, + "loss": 0.90141773, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.76074219, + "step": 3365, + "time_per_iteration": 2.7177810668945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144996, + "balance_loss_mlp": 1.06879795, + "epoch": 0.6475567525971527, + "flos": 762787338240.0, + "grad_norm": 0.03409968089483679, + "language_loss": 0.89139444, + "learning_rate": 0.00029188037689170595, + "loss": 0.90284443, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.76074219, + "step": 3366, + "time_per_iteration": 2.94266676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144468, + "balance_loss_mlp": 1.06817389, + "epoch": 0.6477491342824163, + "flos": 844500526080.0, + "grad_norm": 0.03525364957484555, + "language_loss": 0.88880944, + "learning_rate": 0.0002915971458663586, + "loss": 0.90025413, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.76171875, + "step": 3367, + "time_per_iteration": 3.037111282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144688, + "balance_loss_mlp": 1.06844163, + "epoch": 0.6479415159676799, + "flos": 886381065216.0, + "grad_norm": 0.02613941789873103, + "language_loss": 0.85508728, + "learning_rate": 0.00029131399575017494, + "loss": 0.86653411, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.76123047, + "step": 3368, + "time_per_iteration": 3.1630287170410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144843, + "balance_loss_mlp": 1.06859708, + "epoch": 0.6481338976529435, + "flos": 616723198464.0, + "grad_norm": 0.02777106453890135, + "language_loss": 0.9063583, + "learning_rate": 0.0002910309266530836, + "loss": 0.91780674, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.76123047, + "step": 3369, + "time_per_iteration": 2.7928354740142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154935, + "balance_loss_mlp": 1.07859313, + "epoch": 0.648326279338207, + "flos": 511019461632.0, + "grad_norm": 0.03366950054230419, + "language_loss": 0.90075457, + "learning_rate": 0.0002907479386849814, + "loss": 0.91230392, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.76220703, + "step": 3370, + "time_per_iteration": 2.673582077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154894, + "balance_loss_mlp": 1.07869589, + "epoch": 0.6485186610234706, + "flos": 703868201472.0, + "grad_norm": 0.031297921332288904, + "language_loss": 0.8459866, + "learning_rate": 0.0002904650319557339, + "loss": 0.8575356, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.76074219, + "step": 3371, + "time_per_iteration": 2.984816789627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149879, + "balance_loss_mlp": 1.07358491, + "epoch": 0.6487110427087341, + "flos": 561745939968.0, + "grad_norm": 0.03993640989964456, + "language_loss": 0.8677696, + "learning_rate": 0.0002901822065751758, + "loss": 0.87926841, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.76171875, + "step": 3372, + "time_per_iteration": 2.642890691757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149196, + "balance_loss_mlp": 1.0729022, + "epoch": 0.6489034243939977, + "flos": 681301734912.0, + "grad_norm": 0.03031559078625196, + "language_loss": 0.90163612, + "learning_rate": 0.0002898994626531093, + "loss": 0.91312808, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.76171875, + "step": 3373, + "time_per_iteration": 2.838804006576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149133, + "balance_loss_mlp": 1.07303011, + "epoch": 0.6490958060792612, + "flos": 475371738624.0, + "grad_norm": 0.03229066647304318, + "language_loss": 0.92974752, + "learning_rate": 0.00028961680029930526, + "loss": 0.94123888, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.75976562, + "step": 3374, + "time_per_iteration": 2.5095248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149591, + "balance_loss_mlp": 1.07339203, + "epoch": 0.6492881877645248, + "flos": 590002005504.0, + "grad_norm": 0.03422977569034653, + "language_loss": 0.8249414, + "learning_rate": 0.00028933421962350317, + "loss": 0.83643734, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.76074219, + "step": 3375, + "time_per_iteration": 2.733698606491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149303, + "balance_loss_mlp": 1.07310462, + "epoch": 0.6494805694497884, + "flos": 643587382272.0, + "grad_norm": 0.03276895180859608, + "language_loss": 0.88882941, + "learning_rate": 0.0002890517207354104, + "loss": 0.90032244, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.76074219, + "step": 3376, + "time_per_iteration": 2.8495798110961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149673, + "balance_loss_mlp": 1.07347465, + "epoch": 0.649672951135052, + "flos": 532836593664.0, + "grad_norm": 0.031246089180930747, + "language_loss": 0.86472917, + "learning_rate": 0.0002887693037447029, + "loss": 0.87622589, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.76074219, + "step": 3377, + "time_per_iteration": 2.588364601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147339, + "balance_loss_mlp": 1.07109332, + "epoch": 0.6498653328203156, + "flos": 548445104640.0, + "grad_norm": 0.03311172972858422, + "language_loss": 0.87447202, + "learning_rate": 0.00028848696876102443, + "loss": 0.88594544, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.76123047, + "step": 3378, + "time_per_iteration": 2.6357853412628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114593, + "balance_loss_mlp": 1.06977868, + "epoch": 0.650057714505579, + "flos": 463160613888.0, + "grad_norm": 0.0392849096276736, + "language_loss": 0.89328945, + "learning_rate": 0.00028820471589398723, + "loss": 0.90474874, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.76025391, + "step": 3379, + "time_per_iteration": 2.530264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161056, + "balance_loss_mlp": 1.08519137, + "epoch": 0.6502500961908426, + "flos": 511241041920.0, + "grad_norm": 0.03964181246795499, + "language_loss": 0.82806408, + "learning_rate": 0.00028792254525317196, + "loss": 0.83967471, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.75732422, + "step": 3380, + "time_per_iteration": 2.677969217300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158758, + "balance_loss_mlp": 1.08279765, + "epoch": 0.6504424778761062, + "flos": 580910290944.0, + "grad_norm": 0.031350821569318954, + "language_loss": 0.8659088, + "learning_rate": 0.00028764045694812645, + "loss": 0.87749636, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.75830078, + "step": 3381, + "time_per_iteration": 2.7509915828704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.0813086, + "epoch": 0.6506348595613698, + "flos": 520467015168.0, + "grad_norm": 0.04066104102632486, + "language_loss": 0.82166147, + "learning_rate": 0.0002873584510883671, + "loss": 0.83323467, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.75878906, + "step": 3382, + "time_per_iteration": 2.5591564178466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153945, + "balance_loss_mlp": 1.07769895, + "epoch": 0.6508272412466333, + "flos": 511362565632.0, + "grad_norm": 0.02912056326895262, + "language_loss": 0.91856563, + "learning_rate": 0.0002870765277833788, + "loss": 0.93010509, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.76123047, + "step": 3383, + "time_per_iteration": 2.7396798133850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150716, + "balance_loss_mlp": 1.07461333, + "epoch": 0.6510196229318969, + "flos": 626804567040.0, + "grad_norm": 0.032638591105191926, + "language_loss": 0.86156708, + "learning_rate": 0.00028679468714261347, + "loss": 0.87307423, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.75976562, + "step": 3384, + "time_per_iteration": 2.762810230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148501, + "balance_loss_mlp": 1.07239771, + "epoch": 0.6512120046171604, + "flos": 475669180416.0, + "grad_norm": 0.033246821782095315, + "language_loss": 0.80913359, + "learning_rate": 0.0002865129292754918, + "loss": 0.82061851, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.75976562, + "step": 3385, + "time_per_iteration": 2.6017582416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151379, + "balance_loss_mlp": 1.07513273, + "epoch": 0.651404386302424, + "flos": 553030256640.0, + "grad_norm": 0.0304228647826632, + "language_loss": 0.86788058, + "learning_rate": 0.00028623125429140105, + "loss": 0.87939441, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.76123047, + "step": 3386, + "time_per_iteration": 2.8177084922790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114874, + "balance_loss_mlp": 1.07230258, + "epoch": 0.6515967679876876, + "flos": 524374691328.0, + "grad_norm": 0.03154749952631653, + "language_loss": 0.92443657, + "learning_rate": 0.00028594966229969785, + "loss": 0.93592393, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.76318359, + "step": 3387, + "time_per_iteration": 2.654865264892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145456, + "balance_loss_mlp": 1.06925726, + "epoch": 0.6517891496729511, + "flos": 575016576000.0, + "grad_norm": 0.03711897249096357, + "language_loss": 0.87118483, + "learning_rate": 0.00028566815340970577, + "loss": 0.88263941, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.76074219, + "step": 3388, + "time_per_iteration": 2.724337339401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148749, + "balance_loss_mlp": 1.07240736, + "epoch": 0.6519815313582147, + "flos": 556989599232.0, + "grad_norm": 0.03038600941725792, + "language_loss": 0.85638821, + "learning_rate": 0.0002853867277307162, + "loss": 0.8678757, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.76220703, + "step": 3389, + "time_per_iteration": 2.6384835243225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.0695653, + "epoch": 0.6521739130434783, + "flos": 481521962496.0, + "grad_norm": 0.03095245810395829, + "language_loss": 0.87876832, + "learning_rate": 0.00028510538537198824, + "loss": 0.89022881, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.76367188, + "step": 3390, + "time_per_iteration": 2.6401560306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143269, + "balance_loss_mlp": 1.06664157, + "epoch": 0.6523662947287419, + "flos": 667019977728.0, + "grad_norm": 0.029103127011675372, + "language_loss": 0.90833724, + "learning_rate": 0.00028482412644274867, + "loss": 0.91976994, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.76513672, + "step": 3391, + "time_per_iteration": 2.914109945297241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143322, + "balance_loss_mlp": 1.06645572, + "epoch": 0.6525586764140053, + "flos": 549702001152.0, + "grad_norm": 0.036601963047289736, + "language_loss": 0.80285096, + "learning_rate": 0.00028454295105219207, + "loss": 0.81428421, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.76757812, + "step": 3392, + "time_per_iteration": 2.6647682189941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142157, + "balance_loss_mlp": 1.06557703, + "epoch": 0.6527510580992689, + "flos": 804389901312.0, + "grad_norm": 0.025027747425113815, + "language_loss": 0.83011138, + "learning_rate": 0.0002842618593094802, + "loss": 0.84153295, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.76464844, + "step": 3393, + "time_per_iteration": 3.116758108139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144173, + "balance_loss_mlp": 1.06744993, + "epoch": 0.6529434397845325, + "flos": 672375204864.0, + "grad_norm": 0.042372987357860006, + "language_loss": 0.85526049, + "learning_rate": 0.00028398085132374243, + "loss": 0.8667022, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.76611328, + "step": 3394, + "time_per_iteration": 2.7683980464935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142459, + "balance_loss_mlp": 1.06592691, + "epoch": 0.6531358214697961, + "flos": 829875664896.0, + "grad_norm": 0.03113385731669579, + "language_loss": 0.89394134, + "learning_rate": 0.0002836999272040761, + "loss": 0.90536594, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.76416016, + "step": 3395, + "time_per_iteration": 3.102487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140812, + "balance_loss_mlp": 1.06432748, + "epoch": 0.6533282031550597, + "flos": 488392596480.0, + "grad_norm": 0.0404739719167322, + "language_loss": 0.89987487, + "learning_rate": 0.00028341908705954575, + "loss": 0.91128296, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.76367188, + "step": 3396, + "time_per_iteration": 2.692906618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146225, + "balance_loss_mlp": 1.07183838, + "epoch": 0.6535205848403232, + "flos": 1561102328832.0, + "grad_norm": 0.005117457515533169, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82908034, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.74414062, + "step": 3397, + "time_per_iteration": 4.795916557312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144233, + "balance_loss_mlp": 1.06793857, + "epoch": 0.6537129665255867, + "flos": 494703275520.0, + "grad_norm": 0.03597932641299946, + "language_loss": 0.82677722, + "learning_rate": 0.00028285765913198604, + "loss": 0.83821958, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.76171875, + "step": 3398, + "time_per_iteration": 2.5658674240112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114427, + "balance_loss_mlp": 1.06788087, + "epoch": 0.6539053482108503, + "flos": 606142273536.0, + "grad_norm": 0.0350820826110483, + "language_loss": 0.88009775, + "learning_rate": 0.0002825770715669227, + "loss": 0.89154047, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.76269531, + "step": 3399, + "time_per_iteration": 2.7702410221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145417, + "balance_loss_mlp": 1.06902778, + "epoch": 0.6540977298961139, + "flos": 578880591360.0, + "grad_norm": 0.0325786381033819, + "language_loss": 0.8578831, + "learning_rate": 0.00028229656841292634, + "loss": 0.86933732, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.76269531, + "step": 3400, + "time_per_iteration": 2.6832401752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145045, + "balance_loss_mlp": 1.06865597, + "epoch": 0.6542901115813774, + "flos": 512769183744.0, + "grad_norm": 0.039852870614421367, + "language_loss": 0.82027632, + "learning_rate": 0.0002820161497788979, + "loss": 0.83172679, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.76269531, + "step": 3401, + "time_per_iteration": 2.5679121017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149696, + "balance_loss_mlp": 1.07330704, + "epoch": 0.654482493266641, + "flos": 626674311168.0, + "grad_norm": 0.030416914651843395, + "language_loss": 0.91325247, + "learning_rate": 0.00028173581577370545, + "loss": 0.92474937, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.76269531, + "step": 3402, + "time_per_iteration": 2.7601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150076, + "balance_loss_mlp": 1.07368624, + "epoch": 0.6546748749519046, + "flos": 525062900736.0, + "grad_norm": 0.030820927894649717, + "language_loss": 0.83866602, + "learning_rate": 0.0002814555665061844, + "loss": 0.8501668, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.76269531, + "step": 3403, + "time_per_iteration": 2.688485860824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153093, + "balance_loss_mlp": 1.07641792, + "epoch": 0.6548672566371682, + "flos": 480273798144.0, + "grad_norm": 0.03553217015928594, + "language_loss": 0.82424521, + "learning_rate": 0.00028117540208513715, + "loss": 0.83577615, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.765625, + "step": 3404, + "time_per_iteration": 2.6906890869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150931, + "balance_loss_mlp": 1.07425523, + "epoch": 0.6550596383224317, + "flos": 617135433216.0, + "grad_norm": 0.03288416711071717, + "language_loss": 0.89287072, + "learning_rate": 0.00028089532261933313, + "loss": 0.90438002, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.765625, + "step": 3405, + "time_per_iteration": 2.718001127243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147316, + "balance_loss_mlp": 1.07078385, + "epoch": 0.6552520200076952, + "flos": 489807946752.0, + "grad_norm": 0.040144975574141664, + "language_loss": 0.91147745, + "learning_rate": 0.0002806153282175087, + "loss": 0.92295063, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.76416016, + "step": 3406, + "time_per_iteration": 2.5618858337402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114632, + "balance_loss_mlp": 1.06983495, + "epoch": 0.6554444016929588, + "flos": 688858576896.0, + "grad_norm": 0.034942224339764696, + "language_loss": 0.88083732, + "learning_rate": 0.0002803354189883679, + "loss": 0.89230049, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.76367188, + "step": 3407, + "time_per_iteration": 2.893331527709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114697, + "balance_loss_mlp": 1.07039022, + "epoch": 0.6556367833782224, + "flos": 544170855936.0, + "grad_norm": 0.02881485242285111, + "language_loss": 0.89870715, + "learning_rate": 0.00028005559504058053, + "loss": 0.91017687, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.76464844, + "step": 3408, + "time_per_iteration": 2.750748634338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146342, + "balance_loss_mlp": 1.06980956, + "epoch": 0.655829165063486, + "flos": 674730544128.0, + "grad_norm": 0.03409829385099465, + "language_loss": 0.82774001, + "learning_rate": 0.0002797758564827838, + "loss": 0.83920342, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.76416016, + "step": 3409, + "time_per_iteration": 2.7883474826812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114755, + "balance_loss_mlp": 1.07111335, + "epoch": 0.6560215467487496, + "flos": 532836593664.0, + "grad_norm": 0.03847218102070899, + "language_loss": 0.89379394, + "learning_rate": 0.0002794962034235824, + "loss": 0.9052695, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.76318359, + "step": 3410, + "time_per_iteration": 2.6389691829681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147509, + "balance_loss_mlp": 1.07102418, + "epoch": 0.656213928434013, + "flos": 592459402752.0, + "grad_norm": 0.035948217838460056, + "language_loss": 0.79690081, + "learning_rate": 0.00027921663597154695, + "loss": 0.80837584, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.76367188, + "step": 3411, + "time_per_iteration": 2.8345415592193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146787, + "balance_loss_mlp": 1.07030261, + "epoch": 0.6564063101192766, + "flos": 416678184960.0, + "grad_norm": 0.038637742097161205, + "language_loss": 0.87214196, + "learning_rate": 0.00027893715423521525, + "loss": 0.88360977, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.76367188, + "step": 3412, + "time_per_iteration": 2.4819529056549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146018, + "balance_loss_mlp": 1.06953347, + "epoch": 0.6565986918045402, + "flos": 454271013888.0, + "grad_norm": 0.03334091944582967, + "language_loss": 0.89441139, + "learning_rate": 0.00027865775832309163, + "loss": 0.90587157, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.76367188, + "step": 3413, + "time_per_iteration": 2.728583335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145861, + "balance_loss_mlp": 1.06956708, + "epoch": 0.6567910734898038, + "flos": 548798942208.0, + "grad_norm": 0.03367441290021015, + "language_loss": 0.91664404, + "learning_rate": 0.00027837844834364733, + "loss": 0.92810267, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.76171875, + "step": 3414, + "time_per_iteration": 2.6371517181396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145504, + "balance_loss_mlp": 1.06925821, + "epoch": 0.6569834551750673, + "flos": 656764692480.0, + "grad_norm": 0.030804659012074204, + "language_loss": 0.9116472, + "learning_rate": 0.00027809922440532, + "loss": 0.92310226, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.76123047, + "step": 3415, + "time_per_iteration": 2.8265881538391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148906, + "balance_loss_mlp": 1.07265973, + "epoch": 0.6571758368603309, + "flos": 540810399744.0, + "grad_norm": 0.030022936132040084, + "language_loss": 0.8532089, + "learning_rate": 0.00027782008661651406, + "loss": 0.86469799, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.76123047, + "step": 3416, + "time_per_iteration": 2.7672157287597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149293, + "balance_loss_mlp": 1.07314205, + "epoch": 0.6573682185455945, + "flos": 498378637824.0, + "grad_norm": 0.029653574310281386, + "language_loss": 0.91551638, + "learning_rate": 0.00027754103508560013, + "loss": 0.92700928, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.76025391, + "step": 3417, + "time_per_iteration": 2.6405131816864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114713, + "balance_loss_mlp": 1.07088423, + "epoch": 0.657560600230858, + "flos": 448353103872.0, + "grad_norm": 0.03576987566134107, + "language_loss": 0.87917447, + "learning_rate": 0.0002772620699209163, + "loss": 0.89064574, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.76123047, + "step": 3418, + "time_per_iteration": 2.5418612957000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145939, + "balance_loss_mlp": 1.06983602, + "epoch": 0.6577529819161216, + "flos": 482919848448.0, + "grad_norm": 0.03527260419864515, + "language_loss": 0.85359573, + "learning_rate": 0.0002769831912307658, + "loss": 0.86505508, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.75976562, + "step": 3419, + "time_per_iteration": 2.604675054550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147112, + "balance_loss_mlp": 1.07081771, + "epoch": 0.6579453636013851, + "flos": 531859674624.0, + "grad_norm": 0.03824872762512091, + "language_loss": 0.86228991, + "learning_rate": 0.00027670439912341917, + "loss": 0.87376106, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.76171875, + "step": 3420, + "time_per_iteration": 2.6483054161071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_mlp": 1.06975985, + "epoch": 0.6581377452866487, + "flos": 629242498560.0, + "grad_norm": 0.03412485031630486, + "language_loss": 0.89059192, + "learning_rate": 0.0002764256937071129, + "loss": 0.90205252, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.76171875, + "step": 3421, + "time_per_iteration": 2.839137077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146563, + "balance_loss_mlp": 1.07031691, + "epoch": 0.6583301269719123, + "flos": 549673803264.0, + "grad_norm": 0.030144943579318143, + "language_loss": 0.91856694, + "learning_rate": 0.00027614707509005036, + "loss": 0.93003255, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.76123047, + "step": 3422, + "time_per_iteration": 2.680708408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114715, + "balance_loss_mlp": 1.07095134, + "epoch": 0.6585225086571759, + "flos": 428396484096.0, + "grad_norm": 0.04026315039628517, + "language_loss": 0.84251142, + "learning_rate": 0.0002758685433804008, + "loss": 0.85398293, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.76074219, + "step": 3423, + "time_per_iteration": 2.5081021785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146911, + "balance_loss_mlp": 1.07052183, + "epoch": 0.6587148903424394, + "flos": 861049026048.0, + "grad_norm": 0.03441249575164818, + "language_loss": 0.84824026, + "learning_rate": 0.00027559009868630005, + "loss": 0.85970938, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.76269531, + "step": 3424, + "time_per_iteration": 3.1415717601776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114833, + "balance_loss_mlp": 1.07213128, + "epoch": 0.6589072720277029, + "flos": 807035951616.0, + "grad_norm": 0.03717672501292478, + "language_loss": 0.86237669, + "learning_rate": 0.0002753117411158491, + "loss": 0.87386, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.76074219, + "step": 3425, + "time_per_iteration": 3.041346788406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148297, + "balance_loss_mlp": 1.07195568, + "epoch": 0.6590996537129665, + "flos": 549673803264.0, + "grad_norm": 0.03250683157775158, + "language_loss": 0.94800514, + "learning_rate": 0.0002750334707771168, + "loss": 0.95948815, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.76220703, + "step": 3426, + "time_per_iteration": 2.6350677013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149524, + "balance_loss_mlp": 1.07318223, + "epoch": 0.6592920353982301, + "flos": 455108944896.0, + "grad_norm": 0.0355046198758662, + "language_loss": 0.86040199, + "learning_rate": 0.0002747552877781369, + "loss": 0.87189716, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.76220703, + "step": 3427, + "time_per_iteration": 2.5129551887512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114868, + "balance_loss_mlp": 1.07233834, + "epoch": 0.6594844170834937, + "flos": 568260734976.0, + "grad_norm": 0.034595379074033504, + "language_loss": 0.88492763, + "learning_rate": 0.0002744771922269097, + "loss": 0.8964144, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.76220703, + "step": 3428, + "time_per_iteration": 2.694378137588501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147393, + "balance_loss_mlp": 1.07114637, + "epoch": 0.6596767987687572, + "flos": 1189754284032.0, + "grad_norm": 0.030854411324183387, + "language_loss": 0.86799264, + "learning_rate": 0.0002741991842314015, + "loss": 0.87946653, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.76123047, + "step": 3429, + "time_per_iteration": 3.48809552192688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145028, + "balance_loss_mlp": 1.0686388, + "epoch": 0.6598691804540208, + "flos": 504467736576.0, + "grad_norm": 0.03376941001539595, + "language_loss": 0.89963281, + "learning_rate": 0.0002739212638995445, + "loss": 0.9110831, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.76269531, + "step": 3430, + "time_per_iteration": 2.532970428466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114449, + "balance_loss_mlp": 1.06814861, + "epoch": 0.6600615621392844, + "flos": 532398162432.0, + "grad_norm": 0.038613055067671744, + "language_loss": 0.88853264, + "learning_rate": 0.00027364343133923696, + "loss": 0.89997756, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.76220703, + "step": 3431, + "time_per_iteration": 2.6269612312316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144024, + "balance_loss_mlp": 1.06768203, + "epoch": 0.6602539438245479, + "flos": 566556675072.0, + "grad_norm": 0.03520560530434118, + "language_loss": 0.8882376, + "learning_rate": 0.0002733656866583431, + "loss": 0.89967781, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.76220703, + "step": 3432, + "time_per_iteration": 2.682663679122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07995379, + "epoch": 0.6604463255098114, + "flos": 858591628800.0, + "grad_norm": 0.04099855509153074, + "language_loss": 0.88963896, + "learning_rate": 0.0002730880299646927, + "loss": 0.90119904, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.75927734, + "step": 3433, + "time_per_iteration": 3.050039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157407, + "balance_loss_mlp": 1.08149505, + "epoch": 0.660638707195075, + "flos": 675679265280.0, + "grad_norm": 0.03297285173612762, + "language_loss": 0.89854127, + "learning_rate": 0.0002728104613660821, + "loss": 0.91011536, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.7578125, + "step": 3434, + "time_per_iteration": 2.8358242511749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148511, + "balance_loss_mlp": 1.07236028, + "epoch": 0.6608310888803386, + "flos": 890523056640.0, + "grad_norm": 0.03459988631627961, + "language_loss": 0.88072419, + "learning_rate": 0.0002725329809702729, + "loss": 0.89220929, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.76025391, + "step": 3435, + "time_per_iteration": 3.181201457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146577, + "balance_loss_mlp": 1.07033134, + "epoch": 0.6610234705656022, + "flos": 1138107282432.0, + "grad_norm": 0.04279733621824939, + "language_loss": 0.82982898, + "learning_rate": 0.0002722555888849921, + "loss": 0.84129477, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.76123047, + "step": 3436, + "time_per_iteration": 3.423975706100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147052, + "balance_loss_mlp": 1.07099605, + "epoch": 0.6612158522508658, + "flos": 468959001600.0, + "grad_norm": 0.03231258951929261, + "language_loss": 0.84970325, + "learning_rate": 0.00027197828521793334, + "loss": 0.86117375, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.75927734, + "step": 3437, + "time_per_iteration": 2.5456013679504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147897, + "balance_loss_mlp": 1.07179344, + "epoch": 0.6614082339361292, + "flos": 572774028288.0, + "grad_norm": 0.03152032613188321, + "language_loss": 0.8887009, + "learning_rate": 0.0002717010700767552, + "loss": 0.90017986, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.75976562, + "step": 3438, + "time_per_iteration": 2.6809959411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149118, + "balance_loss_mlp": 1.07306218, + "epoch": 0.6616006156213928, + "flos": 499459616256.0, + "grad_norm": 0.039698826906756704, + "language_loss": 0.82129598, + "learning_rate": 0.00027142394356908226, + "loss": 0.8327871, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.75927734, + "step": 3439, + "time_per_iteration": 2.5949456691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148646, + "balance_loss_mlp": 1.07254267, + "epoch": 0.6617929973066564, + "flos": 603609014784.0, + "grad_norm": 0.030441774907891187, + "language_loss": 0.8967098, + "learning_rate": 0.00027114690580250456, + "loss": 0.90819627, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.75976562, + "step": 3440, + "time_per_iteration": 2.749826431274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147854, + "balance_loss_mlp": 1.07175064, + "epoch": 0.66198537899192, + "flos": 523994657280.0, + "grad_norm": 0.033263511323201614, + "language_loss": 0.91719675, + "learning_rate": 0.0002708699568845776, + "loss": 0.92867529, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.75976562, + "step": 3441, + "time_per_iteration": 2.65191912651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162537, + "balance_loss_mlp": 1.08815002, + "epoch": 0.6621777606771835, + "flos": 1569609893376.0, + "grad_norm": 0.01497403906155291, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.8045032, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.74414062, + "step": 3442, + "time_per_iteration": 4.957901239395142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154184, + "balance_loss_mlp": 1.07817662, + "epoch": 0.6623701423624471, + "flos": 527689485312.0, + "grad_norm": 0.03191394261297454, + "language_loss": 0.8795507, + "learning_rate": 0.0002703163260247261, + "loss": 0.89109254, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.75878906, + "step": 3443, + "time_per_iteration": 2.6025161743164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151931, + "balance_loss_mlp": 1.07601833, + "epoch": 0.6625625240477107, + "flos": 529215625728.0, + "grad_norm": 0.035865829187726836, + "language_loss": 0.87189507, + "learning_rate": 0.0002700396442977399, + "loss": 0.88341439, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.7578125, + "step": 3444, + "time_per_iteration": 2.624119758605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152482, + "balance_loss_mlp": 1.07652199, + "epoch": 0.6627549057329742, + "flos": 474195432960.0, + "grad_norm": 0.03160775147122319, + "language_loss": 0.890499, + "learning_rate": 0.0002697630518492817, + "loss": 0.90202379, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.75830078, + "step": 3445, + "time_per_iteration": 2.7382802963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151051, + "balance_loss_mlp": 1.07494795, + "epoch": 0.6629472874182378, + "flos": 529011509760.0, + "grad_norm": 0.03595555935138165, + "language_loss": 0.89779699, + "learning_rate": 0.0002694865487867343, + "loss": 0.90930748, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.75976562, + "step": 3446, + "time_per_iteration": 2.704895257949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150785, + "balance_loss_mlp": 1.0749681, + "epoch": 0.6631396691035013, + "flos": 614378592768.0, + "grad_norm": 0.031003429121565652, + "language_loss": 0.8906312, + "learning_rate": 0.0002692101352174453, + "loss": 0.90213907, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.75683594, + "step": 3447, + "time_per_iteration": 2.8165597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.07207584, + "epoch": 0.6633320507887649, + "flos": 610433986560.0, + "grad_norm": 0.03537124525005162, + "language_loss": 0.89763427, + "learning_rate": 0.00026893381124872787, + "loss": 0.90911466, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.75830078, + "step": 3448, + "time_per_iteration": 2.698657512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146339, + "balance_loss_mlp": 1.07033098, + "epoch": 0.6635244324740285, + "flos": 751140897792.0, + "grad_norm": 0.037519042250439116, + "language_loss": 0.85281086, + "learning_rate": 0.00026865757698786097, + "loss": 0.86427426, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.75878906, + "step": 3449, + "time_per_iteration": 3.055635452270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145472, + "balance_loss_mlp": 1.06932163, + "epoch": 0.6637168141592921, + "flos": 665747618304.0, + "grad_norm": 0.03493094826481752, + "language_loss": 0.85618043, + "learning_rate": 0.000268381432542088, + "loss": 0.86763519, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.76025391, + "step": 3450, + "time_per_iteration": 2.8057384490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145353, + "balance_loss_mlp": 1.06934512, + "epoch": 0.6639091958445555, + "flos": 607920193536.0, + "grad_norm": 0.03317215274134995, + "language_loss": 0.85111237, + "learning_rate": 0.00026810537801861807, + "loss": 0.86256593, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.75878906, + "step": 3451, + "time_per_iteration": 2.7435052394866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149606, + "balance_loss_mlp": 1.0735507, + "epoch": 0.6641015775298191, + "flos": 477679414272.0, + "grad_norm": 0.03227894360580252, + "language_loss": 0.85315323, + "learning_rate": 0.0002678294135246243, + "loss": 0.8646493, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.75927734, + "step": 3452, + "time_per_iteration": 2.7193186283111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147818, + "balance_loss_mlp": 1.07171512, + "epoch": 0.6642939592150827, + "flos": 905595081216.0, + "grad_norm": 0.03357369585289791, + "language_loss": 0.91588908, + "learning_rate": 0.0002675535391672463, + "loss": 0.92736733, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.75976562, + "step": 3453, + "time_per_iteration": 3.0945043563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07236886, + "epoch": 0.6644863409003463, + "flos": 582937989120.0, + "grad_norm": 0.030535675570776123, + "language_loss": 0.90264779, + "learning_rate": 0.0002672777550535877, + "loss": 0.91413254, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.75976562, + "step": 3454, + "time_per_iteration": 2.7741284370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150279, + "balance_loss_mlp": 1.07398534, + "epoch": 0.6646787225856099, + "flos": 479969625600.0, + "grad_norm": 0.03106835211233169, + "language_loss": 0.89111888, + "learning_rate": 0.00026700206129071747, + "loss": 0.90262163, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.76171875, + "step": 3455, + "time_per_iteration": 2.5455679893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149274, + "balance_loss_mlp": 1.07302773, + "epoch": 0.6648711042708734, + "flos": 450827965440.0, + "grad_norm": 0.034343549963822835, + "language_loss": 0.92980659, + "learning_rate": 0.00026672645798566925, + "loss": 0.94129932, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.76123047, + "step": 3456, + "time_per_iteration": 2.5500409603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149485, + "balance_loss_mlp": 1.07319152, + "epoch": 0.665063485956137, + "flos": 860595858432.0, + "grad_norm": 0.03429824706439816, + "language_loss": 0.85038483, + "learning_rate": 0.00026645094524544225, + "loss": 0.86187971, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.76171875, + "step": 3457, + "time_per_iteration": 3.2861030101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149344, + "balance_loss_mlp": 1.07290661, + "epoch": 0.6652558676414005, + "flos": 605471528448.0, + "grad_norm": 0.02726612159362192, + "language_loss": 0.79581773, + "learning_rate": 0.00026617552317699945, + "loss": 0.80731118, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.76318359, + "step": 3458, + "time_per_iteration": 2.8133809566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149341, + "balance_loss_mlp": 1.07299888, + "epoch": 0.6654482493266641, + "flos": 511410229248.0, + "grad_norm": 0.030741900207522484, + "language_loss": 0.92019296, + "learning_rate": 0.0002659001918872693, + "loss": 0.9316864, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.76220703, + "step": 3459, + "time_per_iteration": 2.719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.07302606, + "epoch": 0.6656406310119277, + "flos": 566660734464.0, + "grad_norm": 0.03268721915470487, + "language_loss": 0.8501879, + "learning_rate": 0.0002656249514831449, + "loss": 0.86168158, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.76220703, + "step": 3460, + "time_per_iteration": 2.7105963230133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150225, + "balance_loss_mlp": 1.07383597, + "epoch": 0.6658330126971912, + "flos": 1026058664448.0, + "grad_norm": 0.029696729072264432, + "language_loss": 0.91355968, + "learning_rate": 0.00026534980207148416, + "loss": 0.92506194, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.76269531, + "step": 3461, + "time_per_iteration": 3.3982574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145433, + "balance_loss_mlp": 1.06894886, + "epoch": 0.6660253943824548, + "flos": 818233227264.0, + "grad_norm": 0.03528061567962845, + "language_loss": 0.78412712, + "learning_rate": 0.0002650747437591097, + "loss": 0.79558146, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.76367188, + "step": 3462, + "time_per_iteration": 2.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149719, + "balance_loss_mlp": 1.07533264, + "epoch": 0.6662177760677184, + "flos": 1499530411008.0, + "grad_norm": 0.00830594189347842, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.83029294, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.74414062, + "step": 3463, + "time_per_iteration": 6.524547815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145921, + "balance_loss_mlp": 1.06953192, + "epoch": 0.666410157752982, + "flos": 501107280384.0, + "grad_norm": 0.03076087992809579, + "language_loss": 0.91384947, + "learning_rate": 0.00026452490085933155, + "loss": 0.9253087, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.76269531, + "step": 3464, + "time_per_iteration": 2.598808765411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145896, + "balance_loss_mlp": 1.06955457, + "epoch": 0.6666025394382454, + "flos": 482138313216.0, + "grad_norm": 0.03618588438682257, + "language_loss": 0.95199478, + "learning_rate": 0.00026425011648539614, + "loss": 0.96345377, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.76220703, + "step": 3465, + "time_per_iteration": 2.5265092849731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145501, + "balance_loss_mlp": 1.06906354, + "epoch": 0.666794921123509, + "flos": 547691767296.0, + "grad_norm": 0.03394030373238319, + "language_loss": 0.87548077, + "learning_rate": 0.00026397542363768267, + "loss": 0.88693571, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.76318359, + "step": 3466, + "time_per_iteration": 2.645876407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145746, + "balance_loss_mlp": 1.06935704, + "epoch": 0.6669873028087726, + "flos": 472942539264.0, + "grad_norm": 0.0340202515012301, + "language_loss": 0.87299979, + "learning_rate": 0.0002637008224228362, + "loss": 0.88445723, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.76269531, + "step": 3467, + "time_per_iteration": 2.5271472930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147048, + "balance_loss_mlp": 1.07070661, + "epoch": 0.6671796844940362, + "flos": 548499499008.0, + "grad_norm": 0.029468894408270302, + "language_loss": 0.89176929, + "learning_rate": 0.00026342631294746653, + "loss": 0.90323979, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.76220703, + "step": 3468, + "time_per_iteration": 2.694568395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146376, + "balance_loss_mlp": 1.07008207, + "epoch": 0.6673720661792998, + "flos": 1072122127872.0, + "grad_norm": 0.03284045124327485, + "language_loss": 0.85731959, + "learning_rate": 0.0002631518953181476, + "loss": 0.86878335, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.76171875, + "step": 3469, + "time_per_iteration": 3.4704368114471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148972, + "balance_loss_mlp": 1.07458496, + "epoch": 0.6675644478645633, + "flos": 1527111002112.0, + "grad_norm": 0.004792795584487496, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.7747426, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.74414062, + "step": 3470, + "time_per_iteration": 4.929240465164185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146143, + "balance_loss_mlp": 1.06989694, + "epoch": 0.6677568295498268, + "flos": 580843161600.0, + "grad_norm": 0.032107654736022645, + "language_loss": 0.84914112, + "learning_rate": 0.00026260333602377985, + "loss": 0.86060262, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.76123047, + "step": 3471, + "time_per_iteration": 2.740605592727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146874, + "balance_loss_mlp": 1.07072294, + "epoch": 0.6679492112350904, + "flos": 384790417920.0, + "grad_norm": 0.036226919771653675, + "language_loss": 0.91317421, + "learning_rate": 0.0002623291945717007, + "loss": 0.92464286, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.76025391, + "step": 3472, + "time_per_iteration": 2.4707448482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146308, + "balance_loss_mlp": 1.07015693, + "epoch": 0.668141592920354, + "flos": 1152615349248.0, + "grad_norm": 0.02851459994850691, + "language_loss": 0.88269627, + "learning_rate": 0.00026205514539161175, + "loss": 0.89415932, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.76025391, + "step": 3473, + "time_per_iteration": 3.5094759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146143, + "balance_loss_mlp": 1.07008779, + "epoch": 0.6683339746056175, + "flos": 562291158528.0, + "grad_norm": 0.030234261038109174, + "language_loss": 0.88653791, + "learning_rate": 0.00026178118858990773, + "loss": 0.89799941, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.75927734, + "step": 3474, + "time_per_iteration": 2.8636863231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.07096648, + "epoch": 0.6685263562908811, + "flos": 515328638976.0, + "grad_norm": 0.030631239249789746, + "language_loss": 0.89337111, + "learning_rate": 0.0002615073242729483, + "loss": 0.9048413, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.75927734, + "step": 3475, + "time_per_iteration": 2.6223714351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148783, + "balance_loss_mlp": 1.07267952, + "epoch": 0.6687187379761447, + "flos": 631000952832.0, + "grad_norm": 0.03058857090132586, + "language_loss": 0.88941103, + "learning_rate": 0.0002612335525470573, + "loss": 0.90089881, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.75976562, + "step": 3476, + "time_per_iteration": 2.8004729747772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148335, + "balance_loss_mlp": 1.07242274, + "epoch": 0.6689111196614083, + "flos": 536687874048.0, + "grad_norm": 0.03636459478392294, + "language_loss": 0.82775843, + "learning_rate": 0.0002609598735185221, + "loss": 0.8392418, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.7578125, + "step": 3477, + "time_per_iteration": 2.668614149093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148386, + "balance_loss_mlp": 1.0723784, + "epoch": 0.6691035013466718, + "flos": 604160237568.0, + "grad_norm": 0.03359617144199284, + "language_loss": 0.87902224, + "learning_rate": 0.00026068628729359445, + "loss": 0.89050609, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.75878906, + "step": 3478, + "time_per_iteration": 2.7584378719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147866, + "balance_loss_mlp": 1.07185841, + "epoch": 0.6692958830319353, + "flos": 634127093760.0, + "grad_norm": 0.030871112113608438, + "language_loss": 0.80438709, + "learning_rate": 0.00026041279397848996, + "loss": 0.81586581, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.75878906, + "step": 3479, + "time_per_iteration": 2.8838839530944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011474, + "balance_loss_mlp": 1.07143939, + "epoch": 0.6694882647171989, + "flos": 646748451840.0, + "grad_norm": 0.03180979016390224, + "language_loss": 0.87201416, + "learning_rate": 0.00026013939367938797, + "loss": 0.88348818, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.75830078, + "step": 3480, + "time_per_iteration": 2.908734083175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148147, + "balance_loss_mlp": 1.07213914, + "epoch": 0.6696806464024625, + "flos": 570761793024.0, + "grad_norm": 0.030473361279484277, + "language_loss": 0.85594642, + "learning_rate": 0.00025986608650243204, + "loss": 0.86742783, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.75878906, + "step": 3481, + "time_per_iteration": 2.85624098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147434, + "balance_loss_mlp": 1.07137847, + "epoch": 0.6698730280877261, + "flos": 623963132928.0, + "grad_norm": 0.033030030502012045, + "language_loss": 0.84301388, + "learning_rate": 0.0002595928725537293, + "loss": 0.85448819, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.75927734, + "step": 3482, + "time_per_iteration": 2.9488890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147834, + "balance_loss_mlp": 1.07177854, + "epoch": 0.6700654097729896, + "flos": 503508281856.0, + "grad_norm": 0.03256709943741325, + "language_loss": 0.93030363, + "learning_rate": 0.0002593197519393509, + "loss": 0.941782, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.75927734, + "step": 3483, + "time_per_iteration": 2.6505393981933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146862, + "balance_loss_mlp": 1.07085407, + "epoch": 0.6702577914582531, + "flos": 625117971456.0, + "grad_norm": 0.031176357525406213, + "language_loss": 0.83921826, + "learning_rate": 0.00025904672476533165, + "loss": 0.85068691, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.75878906, + "step": 3484, + "time_per_iteration": 2.859121084213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147102, + "balance_loss_mlp": 1.07109404, + "epoch": 0.6704501731435167, + "flos": 457212504576.0, + "grad_norm": 0.03137206075835519, + "language_loss": 0.87799835, + "learning_rate": 0.0002587737911376704, + "loss": 0.88946939, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.75878906, + "step": 3485, + "time_per_iteration": 2.599365711212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147337, + "balance_loss_mlp": 1.07137716, + "epoch": 0.6706425548287803, + "flos": 544257451008.0, + "grad_norm": 0.033540892991266884, + "language_loss": 0.88788569, + "learning_rate": 0.00025850095116232885, + "loss": 0.89935905, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.75830078, + "step": 3486, + "time_per_iteration": 2.6457767486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143978, + "balance_loss_mlp": 1.06787491, + "epoch": 0.6708349365140439, + "flos": 635179874304.0, + "grad_norm": 0.030051375529732832, + "language_loss": 0.82181835, + "learning_rate": 0.000258228204945233, + "loss": 0.83325815, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.75976562, + "step": 3487, + "time_per_iteration": 2.8957583904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147088, + "balance_loss_mlp": 1.07117581, + "epoch": 0.6710273181993074, + "flos": 641902788096.0, + "grad_norm": 0.03500138254568088, + "language_loss": 0.89155853, + "learning_rate": 0.00025795555259227254, + "loss": 0.90302938, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.7578125, + "step": 3488, + "time_per_iteration": 2.814859628677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147178, + "balance_loss_mlp": 1.0712657, + "epoch": 0.671219699884571, + "flos": 555025027584.0, + "grad_norm": 0.029480168700917284, + "language_loss": 0.88153946, + "learning_rate": 0.00025768299420930046, + "loss": 0.89301121, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.7578125, + "step": 3489, + "time_per_iteration": 2.723747491836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146316, + "balance_loss_mlp": 1.07045078, + "epoch": 0.6714120815698346, + "flos": 732781550592.0, + "grad_norm": 0.031857153656531974, + "language_loss": 0.87735152, + "learning_rate": 0.0002574105299021332, + "loss": 0.88881469, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.75732422, + "step": 3490, + "time_per_iteration": 2.8996829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145484, + "balance_loss_mlp": 1.06957209, + "epoch": 0.6716044632550981, + "flos": 689946286080.0, + "grad_norm": 0.030584806240151117, + "language_loss": 0.88189107, + "learning_rate": 0.00025713815977655084, + "loss": 0.89334595, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.7578125, + "step": 3491, + "time_per_iteration": 2.8675849437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161081, + "balance_loss_mlp": 1.08545506, + "epoch": 0.6717968449403616, + "flos": 461586809856.0, + "grad_norm": 0.035565643494579496, + "language_loss": 0.89158142, + "learning_rate": 0.0002568658839382969, + "loss": 0.90319222, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.75488281, + "step": 3492, + "time_per_iteration": 2.542618989944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161108, + "balance_loss_mlp": 1.08538604, + "epoch": 0.6719892266256252, + "flos": 502596490752.0, + "grad_norm": 0.03871127770917694, + "language_loss": 0.90369606, + "learning_rate": 0.00025659370249307814, + "loss": 0.91530716, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.75585938, + "step": 3493, + "time_per_iteration": 2.617976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155204, + "balance_loss_mlp": 1.07938695, + "epoch": 0.6721816083108888, + "flos": 684736051200.0, + "grad_norm": 0.030709352042026482, + "language_loss": 0.89865196, + "learning_rate": 0.00025632161554656473, + "loss": 0.91020399, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.75683594, + "step": 3494, + "time_per_iteration": 2.9416136741638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153333, + "balance_loss_mlp": 1.07742059, + "epoch": 0.6723739899961524, + "flos": 586895330304.0, + "grad_norm": 0.035401445630926676, + "language_loss": 0.86814046, + "learning_rate": 0.00025604962320439017, + "loss": 0.87967384, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.7578125, + "step": 3495, + "time_per_iteration": 2.709865093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152259, + "balance_loss_mlp": 1.07639432, + "epoch": 0.672566371681416, + "flos": 507739596288.0, + "grad_norm": 0.03037394710394358, + "language_loss": 0.86663043, + "learning_rate": 0.0002557777255721516, + "loss": 0.87815297, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.75732422, + "step": 3496, + "time_per_iteration": 2.7064080238342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144717, + "balance_loss_mlp": 1.06870878, + "epoch": 0.6727587533666795, + "flos": 536735537664.0, + "grad_norm": 0.03895269185794194, + "language_loss": 0.8665306, + "learning_rate": 0.0002555059227554087, + "loss": 0.87797779, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.75878906, + "step": 3497, + "time_per_iteration": 2.725748062133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144891, + "balance_loss_mlp": 1.06897879, + "epoch": 0.672951135051943, + "flos": 604036712448.0, + "grad_norm": 0.03298671193976436, + "language_loss": 0.82722509, + "learning_rate": 0.00025523421485968453, + "loss": 0.83867407, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.7578125, + "step": 3498, + "time_per_iteration": 2.7769460678100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143713, + "balance_loss_mlp": 1.06780005, + "epoch": 0.6731435167372066, + "flos": 812677886976.0, + "grad_norm": 0.03548022480956623, + "language_loss": 0.90755463, + "learning_rate": 0.00025496260199046585, + "loss": 0.91899168, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.7578125, + "step": 3499, + "time_per_iteration": 2.952929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143579, + "balance_loss_mlp": 1.06766629, + "epoch": 0.6733358984224702, + "flos": 612750394368.0, + "grad_norm": 0.030145588081223078, + "language_loss": 0.89167559, + "learning_rate": 0.000254691084253202, + "loss": 0.90311134, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.7578125, + "step": 3500, + "time_per_iteration": 2.798442840576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144185, + "balance_loss_mlp": 1.06827235, + "epoch": 0.6735282801077337, + "flos": 559968019968.0, + "grad_norm": 0.034844314373587704, + "language_loss": 0.83049423, + "learning_rate": 0.00025441966175330567, + "loss": 0.84193599, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.7578125, + "step": 3501, + "time_per_iteration": 2.712158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143612, + "balance_loss_mlp": 1.06769979, + "epoch": 0.6737206617929973, + "flos": 673632101376.0, + "grad_norm": 0.033990412363220264, + "language_loss": 0.84750879, + "learning_rate": 0.00025414833459615183, + "loss": 0.85894495, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.7578125, + "step": 3502, + "time_per_iteration": 2.801419973373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143927, + "balance_loss_mlp": 1.06801498, + "epoch": 0.6739130434782609, + "flos": 634641386496.0, + "grad_norm": 0.0329145119302939, + "language_loss": 0.85179496, + "learning_rate": 0.0002538771028870796, + "loss": 0.86323422, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.7578125, + "step": 3503, + "time_per_iteration": 2.775928497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143743, + "balance_loss_mlp": 1.06783044, + "epoch": 0.6741054251635245, + "flos": 532545882624.0, + "grad_norm": 0.03235573519036691, + "language_loss": 0.85924655, + "learning_rate": 0.0002536059667313903, + "loss": 0.87068391, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.7578125, + "step": 3504, + "time_per_iteration": 2.7243404388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142972, + "balance_loss_mlp": 1.06705964, + "epoch": 0.674297806848788, + "flos": 543651833856.0, + "grad_norm": 0.0371245910075902, + "language_loss": 0.94068909, + "learning_rate": 0.0002533349262343483, + "loss": 0.95211881, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.7578125, + "step": 3505, + "time_per_iteration": 2.672279119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144049, + "balance_loss_mlp": 1.06818378, + "epoch": 0.6744901885340515, + "flos": 464454440448.0, + "grad_norm": 0.03655603062575672, + "language_loss": 0.87737519, + "learning_rate": 0.0002530639815011807, + "loss": 0.88881564, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.75732422, + "step": 3506, + "time_per_iteration": 2.4994444847106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147432, + "balance_loss_mlp": 1.07156682, + "epoch": 0.6746825702193151, + "flos": 633021920256.0, + "grad_norm": 0.03414682593561894, + "language_loss": 0.89147329, + "learning_rate": 0.0002527931326370781, + "loss": 0.90294766, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.75732422, + "step": 3507, + "time_per_iteration": 2.8101861476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147739, + "balance_loss_mlp": 1.07201719, + "epoch": 0.6748749519045787, + "flos": 672392669184.0, + "grad_norm": 0.03604109956687097, + "language_loss": 0.87794244, + "learning_rate": 0.00025252237974719276, + "loss": 0.88941985, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.75585938, + "step": 3508, + "time_per_iteration": 2.8684208393096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147125, + "balance_loss_mlp": 1.07140362, + "epoch": 0.6750673335898423, + "flos": 768492400128.0, + "grad_norm": 0.03252394082616114, + "language_loss": 0.85605073, + "learning_rate": 0.00025225172293664056, + "loss": 0.867522, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.75585938, + "step": 3509, + "time_per_iteration": 2.979069232940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161552, + "balance_loss_mlp": 1.08716583, + "epoch": 0.6752597152751059, + "flos": 1515904994304.0, + "grad_norm": 0.012789123044337823, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.78094685, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.74414062, + "step": 3510, + "time_per_iteration": 4.922729015350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115617, + "balance_loss_mlp": 1.0805434, + "epoch": 0.6754520969603693, + "flos": 688532937216.0, + "grad_norm": 0.03719909461445286, + "language_loss": 0.8963424, + "learning_rate": 0.00025171069797381106, + "loss": 0.90790415, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.75488281, + "step": 3511, + "time_per_iteration": 2.8566861152648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151796, + "balance_loss_mlp": 1.07621729, + "epoch": 0.6756444786456329, + "flos": 501617570304.0, + "grad_norm": 0.03363675466936639, + "language_loss": 0.85946679, + "learning_rate": 0.00025144033003157864, + "loss": 0.87098479, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.75439453, + "step": 3512, + "time_per_iteration": 2.579599142074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152227, + "balance_loss_mlp": 1.07650506, + "epoch": 0.6758368603308965, + "flos": 493659227136.0, + "grad_norm": 0.044346995690068114, + "language_loss": 0.8418451, + "learning_rate": 0.00025117005858876806, + "loss": 0.85336733, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.75585938, + "step": 3513, + "time_per_iteration": 2.694627285003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115062, + "balance_loss_mlp": 1.07485056, + "epoch": 0.6760292420161601, + "flos": 557043993600.0, + "grad_norm": 0.034337257206957794, + "language_loss": 0.90733004, + "learning_rate": 0.000250899883750308, + "loss": 0.91883624, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.75634766, + "step": 3514, + "time_per_iteration": 2.6701719760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150513, + "balance_loss_mlp": 1.07474315, + "epoch": 0.6762216237014236, + "flos": 608721194496.0, + "grad_norm": 0.03416515328617874, + "language_loss": 0.87787104, + "learning_rate": 0.00025062980562109006, + "loss": 0.8893761, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.75634766, + "step": 3515, + "time_per_iteration": 2.7225759029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150722, + "balance_loss_mlp": 1.07499993, + "epoch": 0.6764140053866872, + "flos": 534927418368.0, + "grad_norm": 0.03854621654418095, + "language_loss": 0.89246118, + "learning_rate": 0.0002503598243059677, + "loss": 0.90396839, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.75585938, + "step": 3516, + "time_per_iteration": 2.808784008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143883, + "balance_loss_mlp": 1.06797004, + "epoch": 0.6766063870719508, + "flos": 505861619712.0, + "grad_norm": 0.034298651238093614, + "language_loss": 0.84964311, + "learning_rate": 0.0002500899399097568, + "loss": 0.86108196, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.7578125, + "step": 3517, + "time_per_iteration": 2.713134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142698, + "balance_loss_mlp": 1.0667851, + "epoch": 0.6767987687572143, + "flos": 514193266176.0, + "grad_norm": 0.03865641767048317, + "language_loss": 0.91341412, + "learning_rate": 0.0002498201525372359, + "loss": 0.92484111, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.7578125, + "step": 3518, + "time_per_iteration": 2.5997681617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141854, + "balance_loss_mlp": 1.0659889, + "epoch": 0.6769911504424779, + "flos": 526078751232.0, + "grad_norm": 0.04161600440053586, + "language_loss": 0.877231, + "learning_rate": 0.00024955046229314584, + "loss": 0.88864952, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.75732422, + "step": 3519, + "time_per_iteration": 2.6678366661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114153, + "balance_loss_mlp": 1.06576014, + "epoch": 0.6771835321277414, + "flos": 450836697600.0, + "grad_norm": 0.03317329770903154, + "language_loss": 0.91456813, + "learning_rate": 0.00024928086928218947, + "loss": 0.92598343, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.75634766, + "step": 3520, + "time_per_iteration": 2.599364995956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142519, + "balance_loss_mlp": 1.06689274, + "epoch": 0.677375913813005, + "flos": 710673707520.0, + "grad_norm": 0.03540178465545925, + "language_loss": 0.81423402, + "learning_rate": 0.00024901137360903216, + "loss": 0.82565916, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.75488281, + "step": 3521, + "time_per_iteration": 2.9810547828674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114229, + "balance_loss_mlp": 1.06671166, + "epoch": 0.6775682954982686, + "flos": 429345205248.0, + "grad_norm": 0.03804572823020318, + "language_loss": 0.86387855, + "learning_rate": 0.00024874197537830115, + "loss": 0.87530142, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.75439453, + "step": 3522, + "time_per_iteration": 2.5273780822753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148684, + "balance_loss_mlp": 1.07281935, + "epoch": 0.6777606771835322, + "flos": 438820956672.0, + "grad_norm": 0.03795067145757124, + "language_loss": 0.88304371, + "learning_rate": 0.00024847267469458684, + "loss": 0.89453053, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.75732422, + "step": 3523, + "time_per_iteration": 2.5473203659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151175, + "balance_loss_mlp": 1.07516694, + "epoch": 0.6779530588687956, + "flos": 776787116544.0, + "grad_norm": 0.03277402838986502, + "language_loss": 0.82546473, + "learning_rate": 0.00024820347166244034, + "loss": 0.83697653, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.75878906, + "step": 3524, + "time_per_iteration": 3.006762742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151614, + "balance_loss_mlp": 1.07551062, + "epoch": 0.6781454405540592, + "flos": 572904284160.0, + "grad_norm": 0.03398425592449901, + "language_loss": 0.89193916, + "learning_rate": 0.0002479343663863755, + "loss": 0.90345526, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.75976562, + "step": 3525, + "time_per_iteration": 2.7708120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149362, + "balance_loss_mlp": 1.07325864, + "epoch": 0.6783378222393228, + "flos": 485982862848.0, + "grad_norm": 0.03421790564553063, + "language_loss": 0.81340361, + "learning_rate": 0.00024766535897086876, + "loss": 0.82489729, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.75976562, + "step": 3526, + "time_per_iteration": 2.5445010662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149936, + "balance_loss_mlp": 1.07383275, + "epoch": 0.6785302039245864, + "flos": 483831639552.0, + "grad_norm": 0.03533862611113949, + "language_loss": 0.84491217, + "learning_rate": 0.0002473964495203578, + "loss": 0.85641158, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.75976562, + "step": 3527, + "time_per_iteration": 2.6606431007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151402, + "balance_loss_mlp": 1.07525146, + "epoch": 0.67872258560985, + "flos": 525861900288.0, + "grad_norm": 0.03371892559640898, + "language_loss": 0.90057969, + "learning_rate": 0.0002471276381392425, + "loss": 0.9120937, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.76025391, + "step": 3528, + "time_per_iteration": 2.782986640930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156944, + "balance_loss_mlp": 1.08255768, + "epoch": 0.6789149672951135, + "flos": 1555892093952.0, + "grad_norm": 0.008577357919530966, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79345584, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.74414062, + "step": 3529, + "time_per_iteration": 4.9733335971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.07594013, + "epoch": 0.6791073489803771, + "flos": 742684999680.0, + "grad_norm": 0.033404033149465266, + "language_loss": 0.89312834, + "learning_rate": 0.00024659031000260826, + "loss": 0.90464872, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.75976562, + "step": 3530, + "time_per_iteration": 2.901157855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145514, + "balance_loss_mlp": 1.06936264, + "epoch": 0.6792997306656406, + "flos": 577447776768.0, + "grad_norm": 0.04256917362285044, + "language_loss": 0.86884272, + "learning_rate": 0.0002463217934556985, + "loss": 0.8802979, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.76025391, + "step": 3531, + "time_per_iteration": 2.6534667015075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153702, + "balance_loss_mlp": 1.07931519, + "epoch": 0.6794921123509042, + "flos": 1506544035840.0, + "grad_norm": 0.006337226155731696, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77685791, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.74414062, + "step": 3532, + "time_per_iteration": 4.827699899673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.07089639, + "epoch": 0.6796844940361677, + "flos": 700140446208.0, + "grad_norm": 0.038428315777117805, + "language_loss": 0.89542228, + "learning_rate": 0.0002457850559259306, + "loss": 0.90689325, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.76074219, + "step": 3533, + "time_per_iteration": 2.827556610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147708, + "balance_loss_mlp": 1.07160449, + "epoch": 0.6798768757214313, + "flos": 553815794688.0, + "grad_norm": 0.03257941751207101, + "language_loss": 0.86952329, + "learning_rate": 0.00024551683515145275, + "loss": 0.88100034, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.75976562, + "step": 3534, + "time_per_iteration": 2.664051055908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146574, + "balance_loss_mlp": 1.07051849, + "epoch": 0.6800692574066949, + "flos": 523975191552.0, + "grad_norm": 0.03399690480422162, + "language_loss": 0.91393268, + "learning_rate": 0.0002452487131761014, + "loss": 0.92539847, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.75927734, + "step": 3535, + "time_per_iteration": 2.733736276626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146041, + "balance_loss_mlp": 1.06993783, + "epoch": 0.6802616390919585, + "flos": 575129367552.0, + "grad_norm": 0.03256850712762242, + "language_loss": 0.84912848, + "learning_rate": 0.00024498069010397093, + "loss": 0.86058891, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.75976562, + "step": 3536, + "time_per_iteration": 2.687980890274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144058, + "balance_loss_mlp": 1.06805015, + "epoch": 0.6804540207772221, + "flos": 489128469504.0, + "grad_norm": 0.03259916802392139, + "language_loss": 0.89844334, + "learning_rate": 0.00024471276603911697, + "loss": 0.90988398, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.75878906, + "step": 3537, + "time_per_iteration": 2.5977725982666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144351, + "balance_loss_mlp": 1.06834352, + "epoch": 0.6806464024624855, + "flos": 579744718848.0, + "grad_norm": 0.031208373438408543, + "language_loss": 0.83636969, + "learning_rate": 0.0002444449410855572, + "loss": 0.84781325, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.75878906, + "step": 3538, + "time_per_iteration": 2.806182384490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151082, + "balance_loss_mlp": 1.0752176, + "epoch": 0.6808387841477491, + "flos": 554792713728.0, + "grad_norm": 0.02619955396666995, + "language_loss": 0.88271046, + "learning_rate": 0.00024417721534727033, + "loss": 0.89422125, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.75732422, + "step": 3539, + "time_per_iteration": 2.6672027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153254, + "balance_loss_mlp": 1.07753205, + "epoch": 0.6810311658330127, + "flos": 427753936896.0, + "grad_norm": 0.03954259059998535, + "language_loss": 0.8817929, + "learning_rate": 0.00024390958892819687, + "loss": 0.89332551, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.75585938, + "step": 3540, + "time_per_iteration": 2.4914028644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152621, + "balance_loss_mlp": 1.07685137, + "epoch": 0.6812235475182763, + "flos": 573460236288.0, + "grad_norm": 0.03041439482605579, + "language_loss": 0.85729158, + "learning_rate": 0.0002436420619322381, + "loss": 0.86881781, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.75634766, + "step": 3541, + "time_per_iteration": 2.8284380435943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152525, + "balance_loss_mlp": 1.07675517, + "epoch": 0.6814159292035398, + "flos": 502993989120.0, + "grad_norm": 0.031050490172735493, + "language_loss": 0.87018108, + "learning_rate": 0.0002433746344632577, + "loss": 0.88170624, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.75634766, + "step": 3542, + "time_per_iteration": 2.6791961193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155107, + "balance_loss_mlp": 1.07919419, + "epoch": 0.6816083108888034, + "flos": 766955526144.0, + "grad_norm": 0.032327379337262395, + "language_loss": 0.85101521, + "learning_rate": 0.00024310730662508006, + "loss": 0.86256623, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.7578125, + "step": 3543, + "time_per_iteration": 3.091520309448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154554, + "balance_loss_mlp": 1.07854629, + "epoch": 0.681800692574067, + "flos": 480479915520.0, + "grad_norm": 0.03033872617251452, + "language_loss": 0.91889656, + "learning_rate": 0.0002428400785214911, + "loss": 0.93044209, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.75878906, + "step": 3544, + "time_per_iteration": 2.6075758934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148266, + "balance_loss_mlp": 1.07216299, + "epoch": 0.6819930742593305, + "flos": 692833382400.0, + "grad_norm": 0.035894178949101116, + "language_loss": 0.8798629, + "learning_rate": 0.00024257295025623794, + "loss": 0.89134556, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.75976562, + "step": 3545, + "time_per_iteration": 2.835088014602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148628, + "balance_loss_mlp": 1.07257295, + "epoch": 0.6821854559445941, + "flos": 679354627584.0, + "grad_norm": 0.03140204473065851, + "language_loss": 0.85909534, + "learning_rate": 0.00024230592193302892, + "loss": 0.87058157, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.75927734, + "step": 3546, + "time_per_iteration": 2.8806655406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115113, + "balance_loss_mlp": 1.07517004, + "epoch": 0.6823778376298576, + "flos": 463132416000.0, + "grad_norm": 0.035932436170819634, + "language_loss": 0.89696717, + "learning_rate": 0.00024203899365553372, + "loss": 0.9084785, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.75830078, + "step": 3547, + "time_per_iteration": 2.538266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147926, + "balance_loss_mlp": 1.07411194, + "epoch": 0.6825702193151212, + "flos": 1478174452224.0, + "grad_norm": 0.007345057771589815, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77882284, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.73828125, + "step": 3548, + "time_per_iteration": 4.545760154724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143597, + "balance_loss_mlp": 1.06768405, + "epoch": 0.6827626010003848, + "flos": 724412974080.0, + "grad_norm": 0.035220397583358556, + "language_loss": 0.88068932, + "learning_rate": 0.00024150543765216848, + "loss": 0.89212525, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.7578125, + "step": 3549, + "time_per_iteration": 2.9486939907073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.06718683, + "epoch": 0.6829549826856484, + "flos": 559939822080.0, + "grad_norm": 0.03492974535391861, + "language_loss": 0.89375067, + "learning_rate": 0.00024123881013344352, + "loss": 0.90518171, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.7578125, + "step": 3550, + "time_per_iteration": 2.651604413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150936, + "balance_loss_mlp": 1.07502353, + "epoch": 0.6831473643709118, + "flos": 626133821952.0, + "grad_norm": 0.03217647010825034, + "language_loss": 0.83963066, + "learning_rate": 0.00024097228307472202, + "loss": 0.85114002, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.7578125, + "step": 3551, + "time_per_iteration": 2.7857072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011508, + "balance_loss_mlp": 1.07479274, + "epoch": 0.6833397460561754, + "flos": 715097677824.0, + "grad_norm": 0.03621401947072565, + "language_loss": 0.87106031, + "learning_rate": 0.00024070585657947846, + "loss": 0.88256836, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.75878906, + "step": 3552, + "time_per_iteration": 2.8683760166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114886, + "balance_loss_mlp": 1.07299471, + "epoch": 0.683532127741439, + "flos": 465726799872.0, + "grad_norm": 0.03128688144219445, + "language_loss": 0.89219671, + "learning_rate": 0.00024043953075114934, + "loss": 0.90368527, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.75732422, + "step": 3553, + "time_per_iteration": 2.704216241836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114847, + "balance_loss_mlp": 1.07251036, + "epoch": 0.6837245094267026, + "flos": 583339490304.0, + "grad_norm": 0.0349442822995555, + "language_loss": 0.93869305, + "learning_rate": 0.00024017330569313128, + "loss": 0.95017779, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.75830078, + "step": 3554, + "time_per_iteration": 2.691981554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148981, + "balance_loss_mlp": 1.07287753, + "epoch": 0.6839168911119662, + "flos": 795523769856.0, + "grad_norm": 0.0402217191104916, + "language_loss": 0.80629432, + "learning_rate": 0.0002399071815087821, + "loss": 0.81778413, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.75976562, + "step": 3555, + "time_per_iteration": 2.984731912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148889, + "balance_loss_mlp": 1.07302415, + "epoch": 0.6841092727972297, + "flos": 581114406912.0, + "grad_norm": 0.035602777463953614, + "language_loss": 0.89145899, + "learning_rate": 0.00023964115830142025, + "loss": 0.9029479, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.75732422, + "step": 3556, + "time_per_iteration": 2.7377610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148814, + "balance_loss_mlp": 1.07294965, + "epoch": 0.6843016544824932, + "flos": 384595034112.0, + "grad_norm": 0.03918339808288278, + "language_loss": 0.92691845, + "learning_rate": 0.00023937523617432522, + "loss": 0.93840659, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.75732422, + "step": 3557, + "time_per_iteration": 2.571953535079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148872, + "balance_loss_mlp": 1.07305455, + "epoch": 0.6844940361677568, + "flos": 1441287845376.0, + "grad_norm": 0.033291217727089636, + "language_loss": 0.91850209, + "learning_rate": 0.00023910941523073705, + "loss": 0.92999083, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.75683594, + "step": 3558, + "time_per_iteration": 3.910876512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148946, + "balance_loss_mlp": 1.07317698, + "epoch": 0.6846864178530204, + "flos": 521899829760.0, + "grad_norm": 0.03402610589420279, + "language_loss": 0.9203999, + "learning_rate": 0.0002388436955738566, + "loss": 0.93188941, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.75634766, + "step": 3559, + "time_per_iteration": 2.6723177433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148516, + "balance_loss_mlp": 1.07279444, + "epoch": 0.6848787995382839, + "flos": 719228935680.0, + "grad_norm": 0.031030975541128533, + "language_loss": 0.86168528, + "learning_rate": 0.00023857807730684523, + "loss": 0.87317038, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.75585938, + "step": 3560, + "time_per_iteration": 2.90830135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114827, + "balance_loss_mlp": 1.07254827, + "epoch": 0.6850711812235475, + "flos": 512161565184.0, + "grad_norm": 0.040096201780059196, + "language_loss": 0.88262463, + "learning_rate": 0.00023831256053282547, + "loss": 0.89410734, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.75585938, + "step": 3561, + "time_per_iteration": 2.671116352081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148991, + "balance_loss_mlp": 1.07336485, + "epoch": 0.6852635629088111, + "flos": 669431712768.0, + "grad_norm": 0.03641568128756266, + "language_loss": 0.83697838, + "learning_rate": 0.00023804714535488003, + "loss": 0.8484683, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.75488281, + "step": 3562, + "time_per_iteration": 2.861722946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149231, + "balance_loss_mlp": 1.0756073, + "epoch": 0.6854559445940747, + "flos": 1526364395520.0, + "grad_norm": 0.005446048976110769, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80958861, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.73632812, + "step": 3563, + "time_per_iteration": 5.001219272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145764, + "balance_loss_mlp": 1.07037604, + "epoch": 0.6856483262793382, + "flos": 455137142784.0, + "grad_norm": 0.035220734339555373, + "language_loss": 0.86132681, + "learning_rate": 0.00023751662019934488, + "loss": 0.8727845, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.75244141, + "step": 3564, + "time_per_iteration": 2.4870924949645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146111, + "balance_loss_mlp": 1.07077074, + "epoch": 0.6858407079646017, + "flos": 616688269824.0, + "grad_norm": 0.032854756712223265, + "language_loss": 0.84736019, + "learning_rate": 0.00023725151042772364, + "loss": 0.85882127, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.75195312, + "step": 3565, + "time_per_iteration": 2.7391157150268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146, + "balance_loss_mlp": 1.07056403, + "epoch": 0.6860330896498653, + "flos": 467094486528.0, + "grad_norm": 0.03197662147757374, + "language_loss": 0.88051426, + "learning_rate": 0.00023698650266411276, + "loss": 0.89197421, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.75292969, + "step": 3566, + "time_per_iteration": 2.6070899963378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114589, + "balance_loss_mlp": 1.07054949, + "epoch": 0.6862254713351289, + "flos": 865838294016.0, + "grad_norm": 0.03137777844297811, + "language_loss": 0.88001108, + "learning_rate": 0.00023672159701139755, + "loss": 0.89146996, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.75195312, + "step": 3567, + "time_per_iteration": 3.252197504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145133, + "balance_loss_mlp": 1.06979275, + "epoch": 0.6864178530203925, + "flos": 448090590720.0, + "grad_norm": 0.03718741839919542, + "language_loss": 0.90576816, + "learning_rate": 0.00023645679357242296, + "loss": 0.91721952, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.75195312, + "step": 3568, + "time_per_iteration": 2.551252841949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146099, + "balance_loss_mlp": 1.07052052, + "epoch": 0.6866102347056561, + "flos": 425211945984.0, + "grad_norm": 0.041154591725143186, + "language_loss": 0.89051086, + "learning_rate": 0.00023619209244999534, + "loss": 0.90197182, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.75439453, + "step": 3569, + "time_per_iteration": 2.5833351612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148567, + "balance_loss_mlp": 1.07289267, + "epoch": 0.6868026163909196, + "flos": 473333306880.0, + "grad_norm": 0.045387721995194655, + "language_loss": 0.91211587, + "learning_rate": 0.0002359274937468806, + "loss": 0.92360151, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.75537109, + "step": 3570, + "time_per_iteration": 2.5472187995910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07258165, + "epoch": 0.6869949980761831, + "flos": 465205776384.0, + "grad_norm": 0.03150793163610154, + "language_loss": 0.82095093, + "learning_rate": 0.00023566299756580512, + "loss": 0.83243394, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.75585938, + "step": 3571, + "time_per_iteration": 2.65720534324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149363, + "balance_loss_mlp": 1.07364154, + "epoch": 0.6871873797614467, + "flos": 427130855424.0, + "grad_norm": 0.03812414034627887, + "language_loss": 0.83773518, + "learning_rate": 0.0002353986040094551, + "loss": 0.84922886, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.75585938, + "step": 3572, + "time_per_iteration": 2.5081918239593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150153, + "balance_loss_mlp": 1.07443094, + "epoch": 0.6873797614467103, + "flos": 444554216448.0, + "grad_norm": 0.03780966347325107, + "language_loss": 0.84840351, + "learning_rate": 0.00023513431318047796, + "loss": 0.859905, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.75585938, + "step": 3573, + "time_per_iteration": 2.5093369483947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151367, + "balance_loss_mlp": 1.07564497, + "epoch": 0.6875721431319738, + "flos": 993914388480.0, + "grad_norm": 0.03609225050037203, + "language_loss": 0.82789201, + "learning_rate": 0.00023487012518147977, + "loss": 0.83940566, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.75585938, + "step": 3574, + "time_per_iteration": 3.209183692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147663, + "balance_loss_mlp": 1.07194114, + "epoch": 0.6877645248172374, + "flos": 1287447284736.0, + "grad_norm": 0.03474054925627609, + "language_loss": 0.8951385, + "learning_rate": 0.00023460604011502772, + "loss": 0.90661514, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.75585938, + "step": 3575, + "time_per_iteration": 3.6102471351623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147349, + "balance_loss_mlp": 1.07162762, + "epoch": 0.687956906502501, + "flos": 878229339648.0, + "grad_norm": 0.03667268861696713, + "language_loss": 0.90602195, + "learning_rate": 0.00023434205808364845, + "loss": 0.91749543, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.75585938, + "step": 3576, + "time_per_iteration": 3.1072838306427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145679, + "balance_loss_mlp": 1.07014775, + "epoch": 0.6881492881877646, + "flos": 564470579712.0, + "grad_norm": 0.03470071742143998, + "language_loss": 0.90143359, + "learning_rate": 0.00023407817918982932, + "loss": 0.91289037, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.75390625, + "step": 3577, + "time_per_iteration": 2.7108538150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144131, + "balance_loss_mlp": 1.06869566, + "epoch": 0.6883416698730281, + "flos": 796509421056.0, + "grad_norm": 0.03216167904462723, + "language_loss": 0.83329225, + "learning_rate": 0.00023381440353601718, + "loss": 0.84473354, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.75292969, + "step": 3578, + "time_per_iteration": 3.00079345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144719, + "balance_loss_mlp": 1.06933129, + "epoch": 0.6885340515582916, + "flos": 724879603200.0, + "grad_norm": 0.03602954458915834, + "language_loss": 0.91766059, + "learning_rate": 0.00023355073122461822, + "loss": 0.92910779, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.75244141, + "step": 3579, + "time_per_iteration": 2.8793976306915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144346, + "balance_loss_mlp": 1.06891012, + "epoch": 0.6887264332435552, + "flos": 1012520785920.0, + "grad_norm": 0.032157968991135766, + "language_loss": 0.87754709, + "learning_rate": 0.00023328716235799973, + "loss": 0.88899052, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.75292969, + "step": 3580, + "time_per_iteration": 3.262232780456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145628, + "balance_loss_mlp": 1.07028747, + "epoch": 0.6889188149288188, + "flos": 586346108928.0, + "grad_norm": 0.030956213624598772, + "language_loss": 0.88613558, + "learning_rate": 0.00023302369703848803, + "loss": 0.89759183, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.75195312, + "step": 3581, + "time_per_iteration": 2.6781458854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155772, + "balance_loss_mlp": 1.08043158, + "epoch": 0.6891111966140824, + "flos": 637276703232.0, + "grad_norm": 0.03960885447101306, + "language_loss": 0.85706222, + "learning_rate": 0.00023276033536836937, + "loss": 0.86861998, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.75195312, + "step": 3582, + "time_per_iteration": 2.8019070625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155352, + "balance_loss_mlp": 1.08005941, + "epoch": 0.6893035782993459, + "flos": 496312008192.0, + "grad_norm": 0.03332092041619006, + "language_loss": 0.89310157, + "learning_rate": 0.00023249707744988984, + "loss": 0.9046551, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.75146484, + "step": 3583, + "time_per_iteration": 2.6462185382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.07421863, + "epoch": 0.6894959599846094, + "flos": 459148878336.0, + "grad_norm": 0.037983425016063846, + "language_loss": 0.88022619, + "learning_rate": 0.00023223392338525529, + "loss": 0.89172179, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.75195312, + "step": 3584, + "time_per_iteration": 2.493164539337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149866, + "balance_loss_mlp": 1.07457304, + "epoch": 0.689688341669873, + "flos": 506057003520.0, + "grad_norm": 0.03394886477629218, + "language_loss": 0.83439797, + "learning_rate": 0.00023197087327663107, + "loss": 0.84589666, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.75146484, + "step": 3585, + "time_per_iteration": 2.6373069286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149128, + "balance_loss_mlp": 1.0738833, + "epoch": 0.6898807233551366, + "flos": 765218539008.0, + "grad_norm": 0.04715187460336584, + "language_loss": 0.87040132, + "learning_rate": 0.00023170792722614243, + "loss": 0.88189256, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.75097656, + "step": 3586, + "time_per_iteration": 2.9102606773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147386, + "balance_loss_mlp": 1.07218862, + "epoch": 0.6900731050404002, + "flos": 584572918272.0, + "grad_norm": 0.029046800456262803, + "language_loss": 0.87808621, + "learning_rate": 0.00023144508533587377, + "loss": 0.88955998, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.75048828, + "step": 3587, + "time_per_iteration": 2.8061466217041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146464, + "balance_loss_mlp": 1.07112408, + "epoch": 0.6902654867256637, + "flos": 713204964864.0, + "grad_norm": 0.038780286956444227, + "language_loss": 0.83763909, + "learning_rate": 0.0002311823477078698, + "loss": 0.84910375, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.75195312, + "step": 3588, + "time_per_iteration": 2.943735122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145799, + "balance_loss_mlp": 1.0705539, + "epoch": 0.6904578684109273, + "flos": 598303452672.0, + "grad_norm": 0.03424930843273271, + "language_loss": 0.89383221, + "learning_rate": 0.00023091971444413428, + "loss": 0.90529013, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.75097656, + "step": 3589, + "time_per_iteration": 2.8112401962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144927, + "balance_loss_mlp": 1.06958711, + "epoch": 0.6906502500961909, + "flos": 586176921600.0, + "grad_norm": 0.03337983464568353, + "language_loss": 0.87353265, + "learning_rate": 0.00023065718564663012, + "loss": 0.88498187, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.75195312, + "step": 3590, + "time_per_iteration": 2.712702512741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148972, + "balance_loss_mlp": 1.0753479, + "epoch": 0.6908426317814544, + "flos": 1591140317184.0, + "grad_norm": 0.007217245787203084, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.75060558, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.73632812, + "step": 3591, + "time_per_iteration": 4.975476980209351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011435, + "balance_loss_mlp": 1.06830287, + "epoch": 0.6910350134667179, + "flos": 501804221952.0, + "grad_norm": 0.03486357436652247, + "language_loss": 0.85128838, + "learning_rate": 0.0002301324418579666, + "loss": 0.86272335, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.75048828, + "step": 3592, + "time_per_iteration": 2.6776154041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144028, + "balance_loss_mlp": 1.07040405, + "epoch": 0.6912273951519815, + "flos": 1412132901888.0, + "grad_norm": 0.003146877221363815, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.798325, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.73632812, + "step": 3593, + "time_per_iteration": 4.794835567474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143307, + "balance_loss_mlp": 1.06806242, + "epoch": 0.6914197768372451, + "flos": 636556293120.0, + "grad_norm": 0.03715032708342992, + "language_loss": 0.8555156, + "learning_rate": 0.00022960811715677415, + "loss": 0.86694872, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.75097656, + "step": 3594, + "time_per_iteration": 2.8951711654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147289, + "balance_loss_mlp": 1.07213938, + "epoch": 0.6916121585225087, + "flos": 559201947648.0, + "grad_norm": 0.03507172785049161, + "language_loss": 0.86282074, + "learning_rate": 0.00022934611221845608, + "loss": 0.87429363, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.75, + "step": 3595, + "time_per_iteration": 2.8272645473480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145219, + "balance_loss_mlp": 1.0699265, + "epoch": 0.6918045402077723, + "flos": 530292601344.0, + "grad_norm": 0.04349078621871699, + "language_loss": 0.82568008, + "learning_rate": 0.00022908421235729609, + "loss": 0.83713228, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.75146484, + "step": 3596, + "time_per_iteration": 2.7838826179504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146035, + "balance_loss_mlp": 1.07074213, + "epoch": 0.6919969218930357, + "flos": 571425807360.0, + "grad_norm": 0.03178884209281711, + "language_loss": 0.89899623, + "learning_rate": 0.0002288224176749728, + "loss": 0.9104566, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.75146484, + "step": 3597, + "time_per_iteration": 2.6271378993988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114544, + "balance_loss_mlp": 1.07009995, + "epoch": 0.6921893035782993, + "flos": 684503737344.0, + "grad_norm": 0.040516365330590415, + "language_loss": 0.84238005, + "learning_rate": 0.00022856072827312385, + "loss": 0.85383451, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.75195312, + "step": 3598, + "time_per_iteration": 2.8102614879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145578, + "balance_loss_mlp": 1.07028556, + "epoch": 0.6923816852635629, + "flos": 547793825280.0, + "grad_norm": 0.038084466235788844, + "language_loss": 0.82715267, + "learning_rate": 0.00022829914425334598, + "loss": 0.83860844, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.75146484, + "step": 3599, + "time_per_iteration": 2.6669743061065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.06852686, + "epoch": 0.6925740669488265, + "flos": 511056391680.0, + "grad_norm": 0.034117111871926384, + "language_loss": 0.85557401, + "learning_rate": 0.0002280376657171956, + "loss": 0.86701274, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.75195312, + "step": 3600, + "time_per_iteration": 2.655038356781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144019, + "balance_loss_mlp": 1.0685358, + "epoch": 0.69276644863409, + "flos": 870913543680.0, + "grad_norm": 0.03423377398605859, + "language_loss": 0.81733924, + "learning_rate": 0.00022777629276618706, + "loss": 0.82877946, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.75341797, + "step": 3601, + "time_per_iteration": 3.1143221855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114446, + "balance_loss_mlp": 1.06897676, + "epoch": 0.6929588303193536, + "flos": 626917358592.0, + "grad_norm": 0.03471097371374876, + "language_loss": 0.82267404, + "learning_rate": 0.0002275150255017947, + "loss": 0.8341186, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.75341797, + "step": 3602, + "time_per_iteration": 2.7638230323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149361, + "balance_loss_mlp": 1.07592773, + "epoch": 0.6931512120046172, + "flos": 1548804609024.0, + "grad_norm": 0.009029231118545568, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76882035, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.734375, + "step": 3603, + "time_per_iteration": 5.028877019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.09183502, + "epoch": 0.6933435936898807, + "flos": 1451323729920.0, + "grad_norm": 0.01657275533774484, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76292562, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.73632812, + "step": 3604, + "time_per_iteration": 4.7287609577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157905, + "balance_loss_mlp": 1.08204055, + "epoch": 0.6935359753751443, + "flos": 541930309632.0, + "grad_norm": 0.03919534439322985, + "language_loss": 0.90026039, + "learning_rate": 0.0002267318588424379, + "loss": 0.91183943, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.75732422, + "step": 3605, + "time_per_iteration": 2.6615920066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150618, + "balance_loss_mlp": 1.07484841, + "epoch": 0.6937283570604078, + "flos": 720689948160.0, + "grad_norm": 0.03558950704948247, + "language_loss": 0.91988891, + "learning_rate": 0.00022647101533842845, + "loss": 0.93139505, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.75634766, + "step": 3606, + "time_per_iteration": 2.875670909881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152588, + "balance_loss_mlp": 1.07658041, + "epoch": 0.6939207387456714, + "flos": 523193656320.0, + "grad_norm": 0.041224980702036104, + "language_loss": 0.83253193, + "learning_rate": 0.00022621027802778872, + "loss": 0.84405786, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.75878906, + "step": 3607, + "time_per_iteration": 2.6125805377960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151349, + "balance_loss_mlp": 1.07519805, + "epoch": 0.694113120430935, + "flos": 536401165824.0, + "grad_norm": 0.03463828866617186, + "language_loss": 0.85144913, + "learning_rate": 0.00022594964701174586, + "loss": 0.86296266, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.76025391, + "step": 3608, + "time_per_iteration": 2.6021461486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150974, + "balance_loss_mlp": 1.07496643, + "epoch": 0.6943055021161986, + "flos": 524394157056.0, + "grad_norm": 0.03515633419070769, + "language_loss": 0.89070058, + "learning_rate": 0.00022568912239148586, + "loss": 0.9022103, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.75878906, + "step": 3609, + "time_per_iteration": 2.636577844619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.06904817, + "epoch": 0.694497883801462, + "flos": 485970127872.0, + "grad_norm": 0.037176872987451946, + "language_loss": 0.86671317, + "learning_rate": 0.00022542870426815344, + "loss": 0.87816465, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.75976562, + "step": 3610, + "time_per_iteration": 2.6800506114959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114419, + "balance_loss_mlp": 1.06818187, + "epoch": 0.6946902654867256, + "flos": 462424740864.0, + "grad_norm": 0.03708376402785258, + "language_loss": 0.9062373, + "learning_rate": 0.00022516839274285173, + "loss": 0.91767919, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.75878906, + "step": 3611, + "time_per_iteration": 2.516231060028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144878, + "balance_loss_mlp": 1.06906128, + "epoch": 0.6948826471719892, + "flos": 513867626496.0, + "grad_norm": 0.032040517416043905, + "language_loss": 0.80424583, + "learning_rate": 0.00022490818791664265, + "loss": 0.81569457, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.75683594, + "step": 3612, + "time_per_iteration": 2.5825564861297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.07768571, + "epoch": 0.6950750288572528, + "flos": 558255227904.0, + "grad_norm": 0.03220148028893399, + "language_loss": 0.90256339, + "learning_rate": 0.00022464808989054676, + "loss": 0.91409791, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.75634766, + "step": 3613, + "time_per_iteration": 2.673570394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.07763827, + "epoch": 0.6952674105425164, + "flos": 543521577984.0, + "grad_norm": 0.03708971382778387, + "language_loss": 0.80475914, + "learning_rate": 0.00022438809876554284, + "loss": 0.81629372, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.75683594, + "step": 3614, + "time_per_iteration": 2.6276586055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114766, + "balance_loss_mlp": 1.07179534, + "epoch": 0.6954597922277799, + "flos": 547856951808.0, + "grad_norm": 0.035809532178513556, + "language_loss": 0.85295904, + "learning_rate": 0.00022412821464256873, + "loss": 0.86443567, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.75732422, + "step": 3615, + "time_per_iteration": 2.675262689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144404, + "balance_loss_mlp": 1.06887305, + "epoch": 0.6956521739130435, + "flos": 520540875264.0, + "grad_norm": 0.03660154684653836, + "language_loss": 0.87111717, + "learning_rate": 0.00022386843762252023, + "loss": 0.88256121, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.75390625, + "step": 3616, + "time_per_iteration": 2.601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145995, + "balance_loss_mlp": 1.07055974, + "epoch": 0.695844555598307, + "flos": 467263673856.0, + "grad_norm": 0.03600236468041408, + "language_loss": 0.85243946, + "learning_rate": 0.00022360876780625193, + "loss": 0.86389947, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.75292969, + "step": 3617, + "time_per_iteration": 2.6009066104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146046, + "balance_loss_mlp": 1.0705148, + "epoch": 0.6960369372835706, + "flos": 601931151360.0, + "grad_norm": 0.03135963801145649, + "language_loss": 0.84376919, + "learning_rate": 0.00022334920529457604, + "loss": 0.85522962, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.75390625, + "step": 3618, + "time_per_iteration": 2.919830322265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152209, + "balance_loss_mlp": 1.07662988, + "epoch": 0.6962293189688342, + "flos": 645465358848.0, + "grad_norm": 0.03118514394285757, + "language_loss": 0.91862655, + "learning_rate": 0.00022308975018826423, + "loss": 0.9301486, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.75439453, + "step": 3619, + "time_per_iteration": 2.8989925384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152559, + "balance_loss_mlp": 1.07688463, + "epoch": 0.6964217006540977, + "flos": 639957682176.0, + "grad_norm": 0.03812258215137557, + "language_loss": 0.9018597, + "learning_rate": 0.00022283040258804564, + "loss": 0.91338527, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.75537109, + "step": 3620, + "time_per_iteration": 2.74235200881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115248, + "balance_loss_mlp": 1.07680559, + "epoch": 0.6966140823393613, + "flos": 653386771968.0, + "grad_norm": 0.03521446946003712, + "language_loss": 0.88482189, + "learning_rate": 0.00022257116259460802, + "loss": 0.89634669, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.75537109, + "step": 3621, + "time_per_iteration": 2.819164991378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152109, + "balance_loss_mlp": 1.07657778, + "epoch": 0.6968064640246249, + "flos": 705824040960.0, + "grad_norm": 0.033483575769838334, + "language_loss": 0.86131644, + "learning_rate": 0.00022231203030859725, + "loss": 0.87283748, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.75390625, + "step": 3622, + "time_per_iteration": 2.9764678478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.07596815, + "epoch": 0.6969988457098885, + "flos": 493530972672.0, + "grad_norm": 0.03689827849321225, + "language_loss": 0.88673711, + "learning_rate": 0.00022205300583061737, + "loss": 0.89825207, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.75390625, + "step": 3623, + "time_per_iteration": 2.56077241897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160957, + "balance_loss_mlp": 1.08676147, + "epoch": 0.6971912273951519, + "flos": 1355612765184.0, + "grad_norm": 0.01051210233646139, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83999157, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.7421875, + "step": 3624, + "time_per_iteration": 4.901975393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_mlp": 1.07529247, + "epoch": 0.6973836090804155, + "flos": 603574086144.0, + "grad_norm": 0.03562483559578549, + "language_loss": 0.82784301, + "learning_rate": 0.00022153528070095735, + "loss": 0.83934939, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.75195312, + "step": 3625, + "time_per_iteration": 2.6827454566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147557, + "balance_loss_mlp": 1.07226419, + "epoch": 0.6975759907656791, + "flos": 525110564352.0, + "grad_norm": 0.03740891525888632, + "language_loss": 0.94177675, + "learning_rate": 0.00022127658025027568, + "loss": 0.95325232, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.75146484, + "step": 3626, + "time_per_iteration": 2.6243293285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.07014167, + "epoch": 0.6977683724509427, + "flos": 481877801472.0, + "grad_norm": 0.03606674013608827, + "language_loss": 0.91052938, + "learning_rate": 0.00022101798800962258, + "loss": 0.92198616, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.75390625, + "step": 3627, + "time_per_iteration": 2.585353374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145537, + "balance_loss_mlp": 1.07005322, + "epoch": 0.6979607541362063, + "flos": 523640819712.0, + "grad_norm": 0.043695073898502274, + "language_loss": 0.852063, + "learning_rate": 0.00022075950407939227, + "loss": 0.86351836, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.75341797, + "step": 3628, + "time_per_iteration": 2.6018002033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145624, + "balance_loss_mlp": 1.07023609, + "epoch": 0.6981531358214698, + "flos": 549115849728.0, + "grad_norm": 0.039500919644618576, + "language_loss": 0.87787813, + "learning_rate": 0.0002205011285599367, + "loss": 0.88933432, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.75244141, + "step": 3629, + "time_per_iteration": 2.6909217834472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114526, + "balance_loss_mlp": 1.06991994, + "epoch": 0.6983455175067333, + "flos": 701275819008.0, + "grad_norm": 0.03293425746388738, + "language_loss": 0.8505758, + "learning_rate": 0.00022024286155156658, + "loss": 0.86202836, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.75195312, + "step": 3630, + "time_per_iteration": 2.8668339252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145235, + "balance_loss_mlp": 1.07008553, + "epoch": 0.6985378991919969, + "flos": 486119849472.0, + "grad_norm": 0.03293145354984791, + "language_loss": 0.9093079, + "learning_rate": 0.00021998470315454994, + "loss": 0.92076027, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.75, + "step": 3631, + "time_per_iteration": 2.6536853313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145252, + "balance_loss_mlp": 1.07010257, + "epoch": 0.6987302808772605, + "flos": 559892158464.0, + "grad_norm": 0.03487739632649299, + "language_loss": 0.90976024, + "learning_rate": 0.00021972665346911275, + "loss": 0.92121279, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.75, + "step": 3632, + "time_per_iteration": 2.705947160720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145801, + "balance_loss_mlp": 1.07046092, + "epoch": 0.698922662562524, + "flos": 484567512576.0, + "grad_norm": 0.03530100295621196, + "language_loss": 0.84786582, + "learning_rate": 0.00021946871259543877, + "loss": 0.85932386, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.75195312, + "step": 3633, + "time_per_iteration": 2.585474729537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146213, + "balance_loss_mlp": 1.07106328, + "epoch": 0.6991150442477876, + "flos": 720205854720.0, + "grad_norm": 0.031838987726816204, + "language_loss": 0.87710065, + "learning_rate": 0.00021921088063366957, + "loss": 0.88856274, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.75, + "step": 3634, + "time_per_iteration": 2.9367825984954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0748167, + "epoch": 0.6993074259330512, + "flos": 490159782912.0, + "grad_norm": 0.031688179497796835, + "language_loss": 0.86258936, + "learning_rate": 0.00021895315768390435, + "loss": 0.87408948, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.75048828, + "step": 3635, + "time_per_iteration": 2.6028146743774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150052, + "balance_loss_mlp": 1.07490218, + "epoch": 0.6994998076183148, + "flos": 719467980288.0, + "grad_norm": 0.03153013749596923, + "language_loss": 0.92548811, + "learning_rate": 0.00021869554384619999, + "loss": 0.93698871, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.75, + "step": 3636, + "time_per_iteration": 2.998966932296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146553, + "balance_loss_mlp": 1.07126021, + "epoch": 0.6996921893035783, + "flos": 580163684352.0, + "grad_norm": 0.03271766083883028, + "language_loss": 0.86055148, + "learning_rate": 0.00021843803922057115, + "loss": 0.87201703, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.75146484, + "step": 3637, + "time_per_iteration": 2.745859384536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145131, + "balance_loss_mlp": 1.06983805, + "epoch": 0.6998845709888418, + "flos": 519674746368.0, + "grad_norm": 0.033737468180216806, + "language_loss": 0.86839747, + "learning_rate": 0.00021818064390698977, + "loss": 0.87984878, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.75146484, + "step": 3638, + "time_per_iteration": 2.632795810699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146648, + "balance_loss_mlp": 1.07130754, + "epoch": 0.7000769526741054, + "flos": 622095889920.0, + "grad_norm": 0.03373596031982573, + "language_loss": 0.91870159, + "learning_rate": 0.0002179233580053861, + "loss": 0.93016809, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.75195312, + "step": 3639, + "time_per_iteration": 2.753880023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115047, + "balance_loss_mlp": 1.07512987, + "epoch": 0.700269334359369, + "flos": 561055729152.0, + "grad_norm": 0.03325206970104953, + "language_loss": 0.90108448, + "learning_rate": 0.00021766618161564688, + "loss": 0.91258919, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.75195312, + "step": 3640, + "time_per_iteration": 2.724479913711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114817, + "balance_loss_mlp": 1.07273436, + "epoch": 0.7004617160446326, + "flos": 484361395200.0, + "grad_norm": 0.03152672477913245, + "language_loss": 0.91440845, + "learning_rate": 0.00021740911483761677, + "loss": 0.92589015, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.75292969, + "step": 3641, + "time_per_iteration": 2.5502066612243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146714, + "balance_loss_mlp": 1.07137418, + "epoch": 0.7006540977298961, + "flos": 698321593344.0, + "grad_norm": 0.030766047541437955, + "language_loss": 0.95812565, + "learning_rate": 0.00021715215777109837, + "loss": 0.96959281, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.75195312, + "step": 3642, + "time_per_iteration": 2.9363698959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150063, + "balance_loss_mlp": 1.07477081, + "epoch": 0.7008464794151597, + "flos": 505770295296.0, + "grad_norm": 0.03557511475331178, + "language_loss": 0.88907003, + "learning_rate": 0.00021689531051585103, + "loss": 0.90057063, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.75146484, + "step": 3643, + "time_per_iteration": 2.6452667713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150173, + "balance_loss_mlp": 1.07483232, + "epoch": 0.7010388611004232, + "flos": 538272411648.0, + "grad_norm": 0.036527368416016295, + "language_loss": 0.85649168, + "learning_rate": 0.00021663857317159196, + "loss": 0.86799347, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.75195312, + "step": 3644, + "time_per_iteration": 2.661463499069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149991, + "balance_loss_mlp": 1.07465088, + "epoch": 0.7012312427856868, + "flos": 548314848768.0, + "grad_norm": 0.031074257387366924, + "language_loss": 0.86441541, + "learning_rate": 0.00021638194583799487, + "loss": 0.87591535, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.75195312, + "step": 3645, + "time_per_iteration": 2.6630945205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114701, + "balance_loss_mlp": 1.07166946, + "epoch": 0.7014236244709504, + "flos": 942973060608.0, + "grad_norm": 0.03710031332944713, + "language_loss": 0.87637782, + "learning_rate": 0.00021612542861469176, + "loss": 0.8878479, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.75195312, + "step": 3646, + "time_per_iteration": 3.1664998531341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146595, + "balance_loss_mlp": 1.07120693, + "epoch": 0.7016160061562139, + "flos": 526209007104.0, + "grad_norm": 0.036568631884181475, + "language_loss": 0.87361133, + "learning_rate": 0.00021586902160127135, + "loss": 0.88507724, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.75244141, + "step": 3647, + "time_per_iteration": 2.588329792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145686, + "balance_loss_mlp": 1.07029808, + "epoch": 0.7018083878414775, + "flos": 374244421632.0, + "grad_norm": 0.046770994216465425, + "language_loss": 0.81241143, + "learning_rate": 0.00021561272489727974, + "loss": 0.82386827, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.75244141, + "step": 3648, + "time_per_iteration": 2.4180006980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145708, + "balance_loss_mlp": 1.07036817, + "epoch": 0.7020007695267411, + "flos": 528833590272.0, + "grad_norm": 0.03433939193961528, + "language_loss": 0.86265445, + "learning_rate": 0.0002153565386022199, + "loss": 0.87411153, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.75195312, + "step": 3649, + "time_per_iteration": 2.6287925243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146273, + "balance_loss_mlp": 1.07093239, + "epoch": 0.7021931512120047, + "flos": 691372369920.0, + "grad_norm": 0.0338942783378883, + "language_loss": 0.87374359, + "learning_rate": 0.00021510046281555262, + "loss": 0.88520634, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.75195312, + "step": 3650, + "time_per_iteration": 2.8249292373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.0704776, + "epoch": 0.7023855328972681, + "flos": 640925869056.0, + "grad_norm": 0.04142301274986203, + "language_loss": 0.87215114, + "learning_rate": 0.0002148444976366949, + "loss": 0.88360929, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.75195312, + "step": 3651, + "time_per_iteration": 2.7713325023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148886, + "balance_loss_mlp": 1.07368851, + "epoch": 0.7025779145825317, + "flos": 562006451712.0, + "grad_norm": 0.03240472166532918, + "language_loss": 0.87441784, + "learning_rate": 0.00021458864316502136, + "loss": 0.8859067, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.75048828, + "step": 3652, + "time_per_iteration": 2.729938268661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147969, + "balance_loss_mlp": 1.07267606, + "epoch": 0.7027702962677953, + "flos": 448370568192.0, + "grad_norm": 0.03662771353243768, + "language_loss": 0.92350411, + "learning_rate": 0.0002143328994998634, + "loss": 0.93498379, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.75146484, + "step": 3653, + "time_per_iteration": 2.4846644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147539, + "balance_loss_mlp": 1.07210338, + "epoch": 0.7029626779530589, + "flos": 623713354752.0, + "grad_norm": 0.03664764199554111, + "language_loss": 0.83479095, + "learning_rate": 0.00021407726674050982, + "loss": 0.84626639, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.75292969, + "step": 3654, + "time_per_iteration": 2.850576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145188, + "balance_loss_mlp": 1.07003856, + "epoch": 0.7031550596383225, + "flos": 630733710336.0, + "grad_norm": 0.030002783226809063, + "language_loss": 0.91781414, + "learning_rate": 0.0002138217449862061, + "loss": 0.92926598, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.75, + "step": 3655, + "time_per_iteration": 2.7412569522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145204, + "balance_loss_mlp": 1.07000697, + "epoch": 0.703347441323586, + "flos": 531859674624.0, + "grad_norm": 0.03278089952227313, + "language_loss": 0.82951868, + "learning_rate": 0.00021356633433615403, + "loss": 0.84097064, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.75048828, + "step": 3656, + "time_per_iteration": 2.6387276649475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144911, + "balance_loss_mlp": 1.06971395, + "epoch": 0.7035398230088495, + "flos": 694915474944.0, + "grad_norm": 0.029068288031651398, + "language_loss": 0.87720138, + "learning_rate": 0.0002133110348895133, + "loss": 0.88865048, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.75048828, + "step": 3657, + "time_per_iteration": 2.993046998977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146816, + "balance_loss_mlp": 1.07152295, + "epoch": 0.7037322046941131, + "flos": 969666055680.0, + "grad_norm": 0.030671197457474774, + "language_loss": 0.89195395, + "learning_rate": 0.0002130558467453999, + "loss": 0.90342212, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.75146484, + "step": 3658, + "time_per_iteration": 3.3705010414123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146747, + "balance_loss_mlp": 1.07131183, + "epoch": 0.7039245863793767, + "flos": 503925245952.0, + "grad_norm": 0.03300080382210099, + "language_loss": 0.88645768, + "learning_rate": 0.0002128007700028865, + "loss": 0.89792514, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.75292969, + "step": 3659, + "time_per_iteration": 2.734318256378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148886, + "balance_loss_mlp": 1.07368839, + "epoch": 0.7041169680646402, + "flos": 466938034176.0, + "grad_norm": 0.036833825821468186, + "language_loss": 0.89132273, + "learning_rate": 0.00021254580476100276, + "loss": 0.90281165, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.75048828, + "step": 3660, + "time_per_iteration": 2.5174009799957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149342, + "balance_loss_mlp": 1.07409692, + "epoch": 0.7043093497499038, + "flos": 633321363456.0, + "grad_norm": 0.04007789586728335, + "language_loss": 0.83207953, + "learning_rate": 0.00021229095111873497, + "loss": 0.84357297, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.75097656, + "step": 3661, + "time_per_iteration": 2.739220142364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.07466638, + "epoch": 0.7045017314351674, + "flos": 544094994432.0, + "grad_norm": 0.03298817995700549, + "language_loss": 0.90804625, + "learning_rate": 0.0002120362091750261, + "loss": 0.91954637, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.75195312, + "step": 3662, + "time_per_iteration": 2.7960565090179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146981, + "balance_loss_mlp": 1.07149768, + "epoch": 0.704694113120431, + "flos": 429141089280.0, + "grad_norm": 0.039212871672660514, + "language_loss": 0.92362261, + "learning_rate": 0.00021178157902877566, + "loss": 0.93509239, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.75341797, + "step": 3663, + "time_per_iteration": 2.4680960178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147972, + "balance_loss_mlp": 1.07263219, + "epoch": 0.7048864948056945, + "flos": 651712911360.0, + "grad_norm": 0.034682408130930084, + "language_loss": 0.9230448, + "learning_rate": 0.0002115270607788397, + "loss": 0.93452454, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.75195312, + "step": 3664, + "time_per_iteration": 2.775634288787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149534, + "balance_loss_mlp": 1.07414639, + "epoch": 0.705078876490958, + "flos": 413493646848.0, + "grad_norm": 0.03365445853786745, + "language_loss": 0.90348285, + "learning_rate": 0.00021127265452403133, + "loss": 0.91497815, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.75244141, + "step": 3665, + "time_per_iteration": 2.4944612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07958984, + "epoch": 0.7052712581762216, + "flos": 1423148255232.0, + "grad_norm": 0.008450912797082885, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85245037, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.73828125, + "step": 3666, + "time_per_iteration": 4.8742945194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147318, + "balance_loss_mlp": 1.07188284, + "epoch": 0.7054636398614852, + "flos": 494069460480.0, + "grad_norm": 0.03621564888049926, + "language_loss": 0.8791604, + "learning_rate": 0.00021076417839483065, + "loss": 0.89063358, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.75292969, + "step": 3667, + "time_per_iteration": 2.8080356121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145091, + "balance_loss_mlp": 1.06965578, + "epoch": 0.7056560215467488, + "flos": 451377186816.0, + "grad_norm": 0.031611332246536214, + "language_loss": 0.89408493, + "learning_rate": 0.00021051010871784589, + "loss": 0.90553588, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.75292969, + "step": 3668, + "time_per_iteration": 2.57733154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145739, + "balance_loss_mlp": 1.07039869, + "epoch": 0.7058484032320124, + "flos": 566817186816.0, + "grad_norm": 0.030127652842763482, + "language_loss": 0.83471566, + "learning_rate": 0.0002102561514308045, + "loss": 0.84617305, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.75195312, + "step": 3669, + "time_per_iteration": 2.742791175842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144151, + "balance_loss_mlp": 1.06881058, + "epoch": 0.7060407849172758, + "flos": 568102281216.0, + "grad_norm": 0.033895396428982545, + "language_loss": 0.87930894, + "learning_rate": 0.00021000230663230135, + "loss": 0.89075041, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.75195312, + "step": 3670, + "time_per_iteration": 2.667344331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143185, + "balance_loss_mlp": 1.06779695, + "epoch": 0.7062331666025394, + "flos": 469712338944.0, + "grad_norm": 0.03501215574939966, + "language_loss": 0.88139564, + "learning_rate": 0.00020974857442088762, + "loss": 0.89282751, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.75244141, + "step": 3671, + "time_per_iteration": 2.6410346031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143999, + "balance_loss_mlp": 1.06861079, + "epoch": 0.706425548287803, + "flos": 596416743936.0, + "grad_norm": 0.033800210787899305, + "language_loss": 0.93517375, + "learning_rate": 0.00020949495489507104, + "loss": 0.94661367, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.75244141, + "step": 3672, + "time_per_iteration": 2.750444173812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143883, + "balance_loss_mlp": 1.0685432, + "epoch": 0.7066179299730666, + "flos": 476813285376.0, + "grad_norm": 0.035802140613359776, + "language_loss": 0.90171611, + "learning_rate": 0.00020924144815331525, + "loss": 0.91315496, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.75195312, + "step": 3673, + "time_per_iteration": 2.553835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.0689013, + "epoch": 0.7068103116583301, + "flos": 507435423744.0, + "grad_norm": 0.037241628897294654, + "language_loss": 0.87898988, + "learning_rate": 0.00020898805429404044, + "loss": 0.8904314, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.75097656, + "step": 3674, + "time_per_iteration": 2.586620330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114411, + "balance_loss_mlp": 1.06905568, + "epoch": 0.7070026933435937, + "flos": 680574594048.0, + "grad_norm": 0.03737000823174173, + "language_loss": 0.83904374, + "learning_rate": 0.0002087347734156228, + "loss": 0.85048485, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.74902344, + "step": 3675, + "time_per_iteration": 2.882800579071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144169, + "balance_loss_mlp": 1.06906736, + "epoch": 0.7071950750288573, + "flos": 473166120960.0, + "grad_norm": 0.03475094948464188, + "language_loss": 0.84385908, + "learning_rate": 0.00020848160561639452, + "loss": 0.85530072, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.74951172, + "step": 3676, + "time_per_iteration": 2.6969666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149349, + "balance_loss_mlp": 1.07429469, + "epoch": 0.7073874567141208, + "flos": 474683529216.0, + "grad_norm": 0.03052777669540167, + "language_loss": 0.90233761, + "learning_rate": 0.0002082285509946445, + "loss": 0.91383111, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.74902344, + "step": 3677, + "time_per_iteration": 2.546494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152207, + "balance_loss_mlp": 1.07710516, + "epoch": 0.7075798383993844, + "flos": 547036485120.0, + "grad_norm": 0.03113462016358252, + "language_loss": 0.87627769, + "learning_rate": 0.00020797560964861683, + "loss": 0.88779974, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.74951172, + "step": 3678, + "time_per_iteration": 2.745973587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150766, + "balance_loss_mlp": 1.07585537, + "epoch": 0.7077722200846479, + "flos": 663390277632.0, + "grad_norm": 0.06964386826372344, + "language_loss": 0.85110044, + "learning_rate": 0.0002077227816765122, + "loss": 0.86260808, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.74755859, + "step": 3679, + "time_per_iteration": 2.982367753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115432, + "balance_loss_mlp": 1.08107758, + "epoch": 0.7079646017699115, + "flos": 1533300157440.0, + "grad_norm": 0.007004763795919161, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77602041, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.73242188, + "step": 3680, + "time_per_iteration": 4.8018670082092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147209, + "balance_loss_mlp": 1.07224989, + "epoch": 0.7081569834551751, + "flos": 622645111296.0, + "grad_norm": 0.030610109660701587, + "language_loss": 0.83047998, + "learning_rate": 0.00020721746624665383, + "loss": 0.84195209, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.74804688, + "step": 3681, + "time_per_iteration": 2.782902717590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147174, + "balance_loss_mlp": 1.07207251, + "epoch": 0.7083493651404387, + "flos": 796034059776.0, + "grad_norm": 0.03164783844829979, + "language_loss": 0.84436798, + "learning_rate": 0.00020696497898508114, + "loss": 0.85583979, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.74951172, + "step": 3682, + "time_per_iteration": 3.0583677291870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143785, + "balance_loss_mlp": 1.06882644, + "epoch": 0.7085417468257021, + "flos": 815161480704.0, + "grad_norm": 0.03682994028404894, + "language_loss": 0.82170761, + "learning_rate": 0.00020671260548979316, + "loss": 0.83314544, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.74804688, + "step": 3683, + "time_per_iteration": 2.987361192703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144169, + "balance_loss_mlp": 1.06911492, + "epoch": 0.7087341285109657, + "flos": 701796842496.0, + "grad_norm": 0.03866478361298153, + "language_loss": 0.90972751, + "learning_rate": 0.00020646034585876982, + "loss": 0.92116916, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.74902344, + "step": 3684, + "time_per_iteration": 2.810547351837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144506, + "balance_loss_mlp": 1.06954765, + "epoch": 0.7089265101962293, + "flos": 597734765568.0, + "grad_norm": 0.031076054714904006, + "language_loss": 0.88290167, + "learning_rate": 0.00020620820018994718, + "loss": 0.89434671, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.74804688, + "step": 3685, + "time_per_iteration": 2.822174310684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147475, + "balance_loss_mlp": 1.07246852, + "epoch": 0.7091188918814929, + "flos": 488167013376.0, + "grad_norm": 0.047855359590775554, + "language_loss": 0.88914609, + "learning_rate": 0.00020595616858121675, + "loss": 0.90062082, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.74853516, + "step": 3686, + "time_per_iteration": 2.7043378353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149664, + "balance_loss_mlp": 1.07470512, + "epoch": 0.7093112735667565, + "flos": 601255676928.0, + "grad_norm": 0.0443498852923524, + "language_loss": 0.85199845, + "learning_rate": 0.00020570425113042586, + "loss": 0.86349511, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.74804688, + "step": 3687, + "time_per_iteration": 2.702566623687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152357, + "balance_loss_mlp": 1.07754159, + "epoch": 0.70950365525202, + "flos": 506849272320.0, + "grad_norm": 0.040092967224601664, + "language_loss": 0.90721941, + "learning_rate": 0.0002054524479353776, + "loss": 0.91874295, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.74707031, + "step": 3688, + "time_per_iteration": 2.667358636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147763, + "balance_loss_mlp": 1.07294738, + "epoch": 0.7096960369372836, + "flos": 733424097792.0, + "grad_norm": 0.04032937797632071, + "language_loss": 0.86300701, + "learning_rate": 0.00020520075909383063, + "loss": 0.87448466, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.74707031, + "step": 3689, + "time_per_iteration": 2.829561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145291, + "balance_loss_mlp": 1.07033193, + "epoch": 0.7098884186225471, + "flos": 973651594752.0, + "grad_norm": 0.03422835744235037, + "language_loss": 0.85456049, + "learning_rate": 0.00020494918470349916, + "loss": 0.86601341, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.74804688, + "step": 3690, + "time_per_iteration": 3.2887604236602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147533, + "balance_loss_mlp": 1.0725745, + "epoch": 0.7100808003078107, + "flos": 505258003968.0, + "grad_norm": 0.040153245329332135, + "language_loss": 0.91447139, + "learning_rate": 0.00020469772486205297, + "loss": 0.92594671, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.74804688, + "step": 3691, + "time_per_iteration": 2.7245473861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148215, + "balance_loss_mlp": 1.07344735, + "epoch": 0.7102731819930742, + "flos": 541389820416.0, + "grad_norm": 0.03217926950478085, + "language_loss": 0.86047411, + "learning_rate": 0.0002044463796671177, + "loss": 0.87195623, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.74609375, + "step": 3692, + "time_per_iteration": 2.651794910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148314, + "balance_loss_mlp": 1.07330716, + "epoch": 0.7104655636783378, + "flos": 621627259392.0, + "grad_norm": 0.03360219211678542, + "language_loss": 0.85673523, + "learning_rate": 0.00020419514921627408, + "loss": 0.86821842, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.74853516, + "step": 3693, + "time_per_iteration": 2.933528184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147632, + "balance_loss_mlp": 1.07267368, + "epoch": 0.7106579453636014, + "flos": 558376751616.0, + "grad_norm": 0.03878231917046877, + "language_loss": 0.82689238, + "learning_rate": 0.00020394403360705855, + "loss": 0.83836865, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.74804688, + "step": 3694, + "time_per_iteration": 2.717163324356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114284, + "balance_loss_mlp": 1.06788099, + "epoch": 0.710850327048865, + "flos": 514063010304.0, + "grad_norm": 0.03670457803793717, + "language_loss": 0.93433875, + "learning_rate": 0.00020369303293696228, + "loss": 0.9457671, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.74804688, + "step": 3695, + "time_per_iteration": 2.591191053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144917, + "balance_loss_mlp": 1.06995821, + "epoch": 0.7110427087341286, + "flos": 424506272256.0, + "grad_norm": 0.04020330353774376, + "language_loss": 0.83559984, + "learning_rate": 0.00020344214730343304, + "loss": 0.847049, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.74804688, + "step": 3696, + "time_per_iteration": 2.591609001159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145099, + "balance_loss_mlp": 1.07014048, + "epoch": 0.711235090419392, + "flos": 578653006848.0, + "grad_norm": 0.02808433050647353, + "language_loss": 0.83313894, + "learning_rate": 0.00020319137680387296, + "loss": 0.84458989, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.74804688, + "step": 3697, + "time_per_iteration": 2.950737953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.07063317, + "epoch": 0.7114274721046556, + "flos": 448984917504.0, + "grad_norm": 0.03843897473466325, + "language_loss": 0.86332655, + "learning_rate": 0.0002029407215356398, + "loss": 0.8747834, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.74902344, + "step": 3698, + "time_per_iteration": 2.578458309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145595, + "balance_loss_mlp": 1.07063591, + "epoch": 0.7116198537899192, + "flos": 623092274688.0, + "grad_norm": 0.03606756354447633, + "language_loss": 0.88161683, + "learning_rate": 0.00020269018159604663, + "loss": 0.89307278, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.74804688, + "step": 3699, + "time_per_iteration": 2.7380590438842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145077, + "balance_loss_mlp": 1.07007015, + "epoch": 0.7118122354751828, + "flos": 499720128000.0, + "grad_norm": 0.030764308679153148, + "language_loss": 0.86152577, + "learning_rate": 0.00020243975708236162, + "loss": 0.87297654, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.74853516, + "step": 3700, + "time_per_iteration": 2.5728888511657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146347, + "balance_loss_mlp": 1.07134008, + "epoch": 0.7120046171604463, + "flos": 573844273152.0, + "grad_norm": 0.03285972243825597, + "language_loss": 0.90220731, + "learning_rate": 0.00020218944809180818, + "loss": 0.91367078, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.74853516, + "step": 3701, + "time_per_iteration": 2.684532880783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146223, + "balance_loss_mlp": 1.07116926, + "epoch": 0.7121969988457099, + "flos": 573770413056.0, + "grad_norm": 0.03115747571146437, + "language_loss": 0.89376664, + "learning_rate": 0.00020193925472156493, + "loss": 0.90522885, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.74902344, + "step": 3702, + "time_per_iteration": 2.6705996990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152153, + "balance_loss_mlp": 1.07910156, + "epoch": 0.7123893805309734, + "flos": 1526820291072.0, + "grad_norm": 0.004701938060017763, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75441325, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.73046875, + "step": 3703, + "time_per_iteration": 4.916099309921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154733, + "balance_loss_mlp": 1.07958353, + "epoch": 0.712581762216237, + "flos": 616413021696.0, + "grad_norm": 0.031775345220902064, + "language_loss": 0.87929761, + "learning_rate": 0.00020143921523049863, + "loss": 0.89084488, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.75, + "step": 3704, + "time_per_iteration": 2.913417339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115426, + "balance_loss_mlp": 1.07915783, + "epoch": 0.7127741439015006, + "flos": 598874141184.0, + "grad_norm": 0.035207007977916, + "language_loss": 0.88667476, + "learning_rate": 0.00020118936930380837, + "loss": 0.89821732, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.74951172, + "step": 3705, + "time_per_iteration": 2.7526493072509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144386, + "balance_loss_mlp": 1.06928408, + "epoch": 0.7129665255867641, + "flos": 538439597568.0, + "grad_norm": 0.036308279292938186, + "language_loss": 0.86138499, + "learning_rate": 0.0002009396393856932, + "loss": 0.87282884, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.74951172, + "step": 3706, + "time_per_iteration": 2.6750972270965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147628, + "balance_loss_mlp": 1.07243121, + "epoch": 0.7131589072720277, + "flos": 527520297984.0, + "grad_norm": 0.03563284623765711, + "language_loss": 0.87550783, + "learning_rate": 0.00020069002557310673, + "loss": 0.88698411, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.75048828, + "step": 3707, + "time_per_iteration": 2.6487066745758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149229, + "balance_loss_mlp": 1.0741272, + "epoch": 0.7133512889572913, + "flos": 532096717824.0, + "grad_norm": 0.031192275434881008, + "language_loss": 0.81347728, + "learning_rate": 0.00020044052796295807, + "loss": 0.82496965, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.74951172, + "step": 3708, + "time_per_iteration": 2.7782645225524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148063, + "balance_loss_mlp": 1.0728184, + "epoch": 0.7135436706425549, + "flos": 504550328832.0, + "grad_norm": 0.03157354031682846, + "language_loss": 0.86940277, + "learning_rate": 0.00020019114665211063, + "loss": 0.8808834, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.75097656, + "step": 3709, + "time_per_iteration": 2.6009671688079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147572, + "balance_loss_mlp": 1.07242227, + "epoch": 0.7137360523278183, + "flos": 516967570944.0, + "grad_norm": 0.03487007754085134, + "language_loss": 0.85992116, + "learning_rate": 0.00019994188173738276, + "loss": 0.8713969, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.75, + "step": 3710, + "time_per_iteration": 2.5438315868377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142507, + "balance_loss_mlp": 1.0673095, + "epoch": 0.7139284340130819, + "flos": 511536482304.0, + "grad_norm": 0.03607772040837418, + "language_loss": 0.85274506, + "learning_rate": 0.0001996927333155477, + "loss": 0.86417007, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.75048828, + "step": 3711, + "time_per_iteration": 2.7427854537963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139044, + "balance_loss_mlp": 1.06389427, + "epoch": 0.7141208156983455, + "flos": 891799418880.0, + "grad_norm": 0.0340111276626949, + "language_loss": 0.9025712, + "learning_rate": 0.00019944370148333346, + "loss": 0.91396165, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.75, + "step": 3712, + "time_per_iteration": 3.1386330127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113928, + "balance_loss_mlp": 1.0641309, + "epoch": 0.7143131973836091, + "flos": 536883257856.0, + "grad_norm": 0.03639718620252856, + "language_loss": 0.8407408, + "learning_rate": 0.00019919478633742278, + "loss": 0.85213363, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.75, + "step": 3713, + "time_per_iteration": 2.6460351943969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139962, + "balance_loss_mlp": 1.06486058, + "epoch": 0.7145055790688727, + "flos": 474627133440.0, + "grad_norm": 0.03673935987195594, + "language_loss": 0.91008997, + "learning_rate": 0.00019894598797445302, + "loss": 0.9214896, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.74951172, + "step": 3714, + "time_per_iteration": 2.5253968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139941, + "balance_loss_mlp": 1.06498206, + "epoch": 0.7146979607541362, + "flos": 571701782016.0, + "grad_norm": 0.032359519554933665, + "language_loss": 0.85796106, + "learning_rate": 0.00019869730649101615, + "loss": 0.86936045, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.74804688, + "step": 3715, + "time_per_iteration": 2.765871047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139754, + "balance_loss_mlp": 1.06489098, + "epoch": 0.7148903424393998, + "flos": 841138068480.0, + "grad_norm": 0.0393709778481749, + "language_loss": 0.77344263, + "learning_rate": 0.00019844874198365943, + "loss": 0.78484023, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.74707031, + "step": 3716, + "time_per_iteration": 3.0865817070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140443, + "balance_loss_mlp": 1.06562734, + "epoch": 0.7150827241246633, + "flos": 542879030784.0, + "grad_norm": 0.03442327137938287, + "language_loss": 0.88300014, + "learning_rate": 0.00019820029454888362, + "loss": 0.89440459, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.74658203, + "step": 3717, + "time_per_iteration": 2.7028956413269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145981, + "balance_loss_mlp": 1.07312012, + "epoch": 0.7152751058099269, + "flos": 1587187705344.0, + "grad_norm": 0.009338560105867444, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.7566725, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.72851562, + "step": 3718, + "time_per_iteration": 5.078125715255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142063, + "balance_loss_mlp": 1.06729496, + "epoch": 0.7154674874951905, + "flos": 518428583424.0, + "grad_norm": 0.038346473430325045, + "language_loss": 0.86008942, + "learning_rate": 0.0001977037512828529, + "loss": 0.87151003, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.74609375, + "step": 3719, + "time_per_iteration": 2.6236274242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141984, + "balance_loss_mlp": 1.0672158, + "epoch": 0.715659869180454, + "flos": 603639214080.0, + "grad_norm": 0.03183829156169413, + "language_loss": 0.90619719, + "learning_rate": 0.0001974556556443734, + "loss": 0.91761708, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.74609375, + "step": 3720, + "time_per_iteration": 2.7261006832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143131, + "balance_loss_mlp": 1.06836271, + "epoch": 0.7158522508657176, + "flos": 532769464320.0, + "grad_norm": 0.029220712652752532, + "language_loss": 0.93066287, + "learning_rate": 0.00019720767746402547, + "loss": 0.94209415, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.74609375, + "step": 3721, + "time_per_iteration": 2.730018377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144996, + "balance_loss_mlp": 1.06989455, + "epoch": 0.7160446325509812, + "flos": 558645995520.0, + "grad_norm": 0.03469516261194285, + "language_loss": 0.85035664, + "learning_rate": 0.00019695981683808222, + "loss": 0.86180663, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.74951172, + "step": 3722, + "time_per_iteration": 2.7371633052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152889, + "balance_loss_mlp": 1.07792997, + "epoch": 0.7162370142362448, + "flos": 692282159616.0, + "grad_norm": 0.032260484298275306, + "language_loss": 0.89382893, + "learning_rate": 0.00019671207386277225, + "loss": 0.90535784, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.74804688, + "step": 3723, + "time_per_iteration": 2.9425265789031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114829, + "balance_loss_mlp": 1.07333136, + "epoch": 0.7164293959215082, + "flos": 795458641920.0, + "grad_norm": 0.035931768652590186, + "language_loss": 0.83636975, + "learning_rate": 0.0001964644486342777, + "loss": 0.84785259, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.74804688, + "step": 3724, + "time_per_iteration": 2.9537875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147875, + "balance_loss_mlp": 1.07291591, + "epoch": 0.7166217776067718, + "flos": 495204833280.0, + "grad_norm": 0.03617438678608554, + "language_loss": 0.91026467, + "learning_rate": 0.00019621694124873524, + "loss": 0.92174339, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.74804688, + "step": 3725, + "time_per_iteration": 2.6945693492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146446, + "balance_loss_mlp": 1.07339478, + "epoch": 0.7168141592920354, + "flos": 1403961710592.0, + "grad_norm": 0.00968138139852001, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77686524, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.73046875, + "step": 3726, + "time_per_iteration": 4.849448919296265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142124, + "balance_loss_mlp": 1.06716549, + "epoch": 0.717006540977299, + "flos": 794599243776.0, + "grad_norm": 0.04056704618834382, + "language_loss": 0.81872368, + "learning_rate": 0.00019572228039082428, + "loss": 0.83014494, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.74804688, + "step": 3727, + "time_per_iteration": 3.045783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146498, + "balance_loss_mlp": 1.07153964, + "epoch": 0.7171989226625626, + "flos": 555963015168.0, + "grad_norm": 0.02715897729892971, + "language_loss": 0.87954736, + "learning_rate": 0.0001954751271105002, + "loss": 0.89101231, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.74804688, + "step": 3728, + "time_per_iteration": 2.7890095710754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145282, + "balance_loss_mlp": 1.07027578, + "epoch": 0.717391304347826, + "flos": 557061457920.0, + "grad_norm": 0.03346658539414039, + "language_loss": 0.86323428, + "learning_rate": 0.00019522809205721687, + "loss": 0.87468708, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.74853516, + "step": 3729, + "time_per_iteration": 2.7522380352020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140607, + "balance_loss_mlp": 1.06579113, + "epoch": 0.7175836860330896, + "flos": 539955004416.0, + "grad_norm": 0.0354578224226226, + "language_loss": 0.87126923, + "learning_rate": 0.0001949811753268816, + "loss": 0.88267529, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.74658203, + "step": 3730, + "time_per_iteration": 2.707690477371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141683, + "balance_loss_mlp": 1.06686759, + "epoch": 0.7177760677183532, + "flos": 516650663424.0, + "grad_norm": 0.04023163535665124, + "language_loss": 0.88339722, + "learning_rate": 0.00019473437701535634, + "loss": 0.89481401, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.74658203, + "step": 3731, + "time_per_iteration": 2.570448637008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114196, + "balance_loss_mlp": 1.06714427, + "epoch": 0.7179684494036168, + "flos": 675939777024.0, + "grad_norm": 0.03444896194332825, + "language_loss": 0.95062304, + "learning_rate": 0.00019448769721845677, + "loss": 0.96204257, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.74658203, + "step": 3732, + "time_per_iteration": 2.838884115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141126, + "balance_loss_mlp": 1.06635737, + "epoch": 0.7181608310888803, + "flos": 470875909632.0, + "grad_norm": 0.032659655773852006, + "language_loss": 0.9114489, + "learning_rate": 0.00019424113603195203, + "loss": 0.92286015, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.74609375, + "step": 3733, + "time_per_iteration": 2.540231704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142755, + "balance_loss_mlp": 1.06803441, + "epoch": 0.7183532127741439, + "flos": 595184042496.0, + "grad_norm": 0.0393108175728225, + "language_loss": 0.85483897, + "learning_rate": 0.0001939946935515657, + "loss": 0.86626649, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.74560547, + "step": 3734, + "time_per_iteration": 2.867018461227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.06774652, + "epoch": 0.7185455944594075, + "flos": 499915511808.0, + "grad_norm": 0.04034729202871447, + "language_loss": 0.85582328, + "learning_rate": 0.0001937483698729755, + "loss": 0.86724842, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.74609375, + "step": 3735, + "time_per_iteration": 2.5829944610595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142145, + "balance_loss_mlp": 1.06737685, + "epoch": 0.718737976144671, + "flos": 816307587072.0, + "grad_norm": 0.03271819913976636, + "language_loss": 0.86010873, + "learning_rate": 0.0001935021650918128, + "loss": 0.87153018, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.74609375, + "step": 3736, + "time_per_iteration": 3.0105531215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_mlp": 1.06795025, + "epoch": 0.7189303578299346, + "flos": 439239922176.0, + "grad_norm": 0.03678550720791007, + "language_loss": 0.92134023, + "learning_rate": 0.0001932560793036625, + "loss": 0.93276739, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.74609375, + "step": 3737, + "time_per_iteration": 2.4854748249053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142524, + "balance_loss_mlp": 1.06775641, + "epoch": 0.7191227395151981, + "flos": 550446606336.0, + "grad_norm": 0.04145641408022902, + "language_loss": 0.92745817, + "learning_rate": 0.00019301011260406382, + "loss": 0.93888342, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.74609375, + "step": 3738, + "time_per_iteration": 2.6645443439483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114754, + "balance_loss_mlp": 1.07258117, + "epoch": 0.7193151212004617, + "flos": 628080929280.0, + "grad_norm": 0.039328087285967164, + "language_loss": 0.84679413, + "learning_rate": 0.00019276426508850936, + "loss": 0.85826951, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.74804688, + "step": 3739, + "time_per_iteration": 2.7071337699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148446, + "balance_loss_mlp": 1.07343948, + "epoch": 0.7195075028857253, + "flos": 742439950848.0, + "grad_norm": 0.030419377075742837, + "language_loss": 0.84898889, + "learning_rate": 0.00019251853685244564, + "loss": 0.86047333, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.74853516, + "step": 3740, + "time_per_iteration": 3.0168538093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114834, + "balance_loss_mlp": 1.07328558, + "epoch": 0.7196998845709889, + "flos": 804289844736.0, + "grad_norm": 0.05763766751245881, + "language_loss": 0.86089444, + "learning_rate": 0.00019227292799127283, + "loss": 0.87237775, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.74902344, + "step": 3741, + "time_per_iteration": 3.0083675384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144489, + "balance_loss_mlp": 1.06957746, + "epoch": 0.7198922662562524, + "flos": 926776396800.0, + "grad_norm": 0.03639396960725551, + "language_loss": 0.83974087, + "learning_rate": 0.00019202743860034454, + "loss": 0.8511858, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.74755859, + "step": 3742, + "time_per_iteration": 3.2506234645843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144029, + "balance_loss_mlp": 1.06907046, + "epoch": 0.7200846479415159, + "flos": 581207732736.0, + "grad_norm": 0.03405610584059509, + "language_loss": 0.88730514, + "learning_rate": 0.00019178206877496873, + "loss": 0.89874554, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.74804688, + "step": 3743, + "time_per_iteration": 2.6837918758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144783, + "balance_loss_mlp": 1.0700146, + "epoch": 0.7202770296267795, + "flos": 558839377920.0, + "grad_norm": 0.02830338825493349, + "language_loss": 0.89031184, + "learning_rate": 0.0001915368186104059, + "loss": 0.90175974, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.74609375, + "step": 3744, + "time_per_iteration": 2.7329940795898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143898, + "balance_loss_mlp": 1.06912982, + "epoch": 0.7204694113120431, + "flos": 673771089408.0, + "grad_norm": 0.03331544271841085, + "language_loss": 0.85722578, + "learning_rate": 0.0001912916882018706, + "loss": 0.86866474, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.74609375, + "step": 3745, + "time_per_iteration": 2.7906653881073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145353, + "balance_loss_mlp": 1.0706327, + "epoch": 0.7206617929973067, + "flos": 800595016704.0, + "grad_norm": 0.03936960108018568, + "language_loss": 0.85040343, + "learning_rate": 0.00019104667764453125, + "loss": 0.861857, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.74560547, + "step": 3746, + "time_per_iteration": 3.025996685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149239, + "balance_loss_mlp": 1.07437599, + "epoch": 0.7208541746825702, + "flos": 532938651648.0, + "grad_norm": 0.0387374733160612, + "language_loss": 0.85314423, + "learning_rate": 0.00019080178703350926, + "loss": 0.86463666, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.74707031, + "step": 3747, + "time_per_iteration": 2.640810251235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149934, + "balance_loss_mlp": 1.07502282, + "epoch": 0.7210465563678338, + "flos": 536168851968.0, + "grad_norm": 0.035199314592541234, + "language_loss": 0.8746413, + "learning_rate": 0.00019055701646387952, + "loss": 0.88614064, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.74755859, + "step": 3748, + "time_per_iteration": 2.6518776416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155716, + "balance_loss_mlp": 1.08266449, + "epoch": 0.7212389380530974, + "flos": 1537246765056.0, + "grad_norm": 0.009534270530490536, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81628406, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.73046875, + "step": 3749, + "time_per_iteration": 4.76072096824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.07664847, + "epoch": 0.7214313197383609, + "flos": 462452938752.0, + "grad_norm": 0.03323767151214544, + "language_loss": 0.92055959, + "learning_rate": 0.00019006783582886368, + "loss": 0.93207377, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.74609375, + "step": 3750, + "time_per_iteration": 2.536107301712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147507, + "balance_loss_mlp": 1.0724529, + "epoch": 0.7216237014236244, + "flos": 1038912336384.0, + "grad_norm": 0.03471978227212596, + "language_loss": 0.8780399, + "learning_rate": 0.00018982342595339437, + "loss": 0.88951492, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.74902344, + "step": 3751, + "time_per_iteration": 3.496842622756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146824, + "balance_loss_mlp": 1.07181787, + "epoch": 0.721816083108888, + "flos": 897450086400.0, + "grad_norm": 0.03786430970431107, + "language_loss": 0.87491071, + "learning_rate": 0.00018957913649915076, + "loss": 0.88637894, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.74853516, + "step": 3752, + "time_per_iteration": 3.1817660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145034, + "balance_loss_mlp": 1.07002771, + "epoch": 0.7220084647941516, + "flos": 524311564800.0, + "grad_norm": 0.03715970514443419, + "language_loss": 0.85220444, + "learning_rate": 0.00018933496756097428, + "loss": 0.86365485, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.74853516, + "step": 3753, + "time_per_iteration": 2.6647567749023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147456, + "balance_loss_mlp": 1.07244956, + "epoch": 0.7222008464794152, + "flos": 817471157760.0, + "grad_norm": 0.038995714903637436, + "language_loss": 0.86141288, + "learning_rate": 0.0001890909192336603, + "loss": 0.87288737, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.74853516, + "step": 3754, + "time_per_iteration": 3.0344350337982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146781, + "balance_loss_mlp": 1.07172728, + "epoch": 0.7223932281646788, + "flos": 750372097536.0, + "grad_norm": 0.03457656786821505, + "language_loss": 0.74980754, + "learning_rate": 0.00018884699161195623, + "loss": 0.76127535, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.74902344, + "step": 3755, + "time_per_iteration": 2.9410288333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146383, + "balance_loss_mlp": 1.07137632, + "epoch": 0.7225856098499422, + "flos": 746988172800.0, + "grad_norm": 0.03312890727657128, + "language_loss": 0.82509679, + "learning_rate": 0.00018860318479056327, + "loss": 0.83656067, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.74853516, + "step": 3756, + "time_per_iteration": 3.1337335109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144277, + "balance_loss_mlp": 1.0693661, + "epoch": 0.7227779915352058, + "flos": 548434371072.0, + "grad_norm": 0.030530532653655316, + "language_loss": 0.88339114, + "learning_rate": 0.00018835949886413555, + "loss": 0.89483386, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.74755859, + "step": 3757, + "time_per_iteration": 2.6933181285858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146399, + "balance_loss_mlp": 1.07158351, + "epoch": 0.7229703732204694, + "flos": 531505837056.0, + "grad_norm": 0.03838754790834608, + "language_loss": 0.84470987, + "learning_rate": 0.0001881159339272806, + "loss": 0.85617381, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.74658203, + "step": 3758, + "time_per_iteration": 2.6401891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147602, + "balance_loss_mlp": 1.07273877, + "epoch": 0.723162754905733, + "flos": 529365347328.0, + "grad_norm": 0.035007648752716856, + "language_loss": 0.83889484, + "learning_rate": 0.00018787249007455858, + "loss": 0.85037082, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.74707031, + "step": 3759, + "time_per_iteration": 2.605527400970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147364, + "balance_loss_mlp": 1.07250082, + "epoch": 0.7233551365909965, + "flos": 656059018752.0, + "grad_norm": 0.034978512511305425, + "language_loss": 0.76976448, + "learning_rate": 0.00018762916740048302, + "loss": 0.78123814, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.74707031, + "step": 3760, + "time_per_iteration": 2.8233485221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.081882, + "epoch": 0.7235475182762601, + "flos": 523443434496.0, + "grad_norm": 0.03185291769452338, + "language_loss": 0.9024173, + "learning_rate": 0.0001873859659995195, + "loss": 0.91398567, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.74804688, + "step": 3761, + "time_per_iteration": 2.7312240600585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159221, + "balance_loss_mlp": 1.08440578, + "epoch": 0.7237398999615237, + "flos": 610321195008.0, + "grad_norm": 0.03629534298697415, + "language_loss": 0.88241446, + "learning_rate": 0.0001871428859660878, + "loss": 0.89400673, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.74658203, + "step": 3762, + "time_per_iteration": 2.7550981044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158067, + "balance_loss_mlp": 1.08329916, + "epoch": 0.7239322816467872, + "flos": 660281601024.0, + "grad_norm": 0.02929996085025788, + "language_loss": 0.86564827, + "learning_rate": 0.00018689992739455975, + "loss": 0.87722898, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.74609375, + "step": 3763, + "time_per_iteration": 2.925534963607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152585, + "balance_loss_mlp": 1.07767427, + "epoch": 0.7241246633320508, + "flos": 970940416512.0, + "grad_norm": 0.028975317515326986, + "language_loss": 0.89523166, + "learning_rate": 0.00018665709037926027, + "loss": 0.90675747, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.74755859, + "step": 3764, + "time_per_iteration": 3.3454575538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149589, + "balance_loss_mlp": 1.0751071, + "epoch": 0.7243170450173143, + "flos": 515999384064.0, + "grad_norm": 0.03578449562727673, + "language_loss": 0.88854849, + "learning_rate": 0.00018641437501446694, + "loss": 0.90004438, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.74414062, + "step": 3765, + "time_per_iteration": 2.5862903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149356, + "balance_loss_mlp": 1.07463598, + "epoch": 0.7245094267025779, + "flos": 560805950976.0, + "grad_norm": 0.04055976430378051, + "language_loss": 0.87262148, + "learning_rate": 0.0001861717813944104, + "loss": 0.88411504, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.74560547, + "step": 3766, + "time_per_iteration": 2.6999149322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145931, + "balance_loss_mlp": 1.07111502, + "epoch": 0.7247018083878415, + "flos": 613774977024.0, + "grad_norm": 0.03434162187139979, + "language_loss": 0.84787124, + "learning_rate": 0.00018592930961327365, + "loss": 0.85933053, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.74658203, + "step": 3767, + "time_per_iteration": 2.7380406856536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145503, + "balance_loss_mlp": 1.07068777, + "epoch": 0.7248941900731051, + "flos": 635870085120.0, + "grad_norm": 0.03338829446413619, + "language_loss": 0.92739952, + "learning_rate": 0.00018568695976519273, + "loss": 0.93885458, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.74658203, + "step": 3768, + "time_per_iteration": 2.7908759117126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145327, + "balance_loss_mlp": 1.07036865, + "epoch": 0.7250865717583687, + "flos": 425837028864.0, + "grad_norm": 0.039339840772426415, + "language_loss": 0.85823148, + "learning_rate": 0.00018544473194425593, + "loss": 0.86968476, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.74804688, + "step": 3769, + "time_per_iteration": 2.493539810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114564, + "balance_loss_mlp": 1.0706811, + "epoch": 0.7252789534436321, + "flos": 636397839360.0, + "grad_norm": 0.0351272666064589, + "language_loss": 0.83947301, + "learning_rate": 0.00018520262624450485, + "loss": 0.85092938, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.74804688, + "step": 3770, + "time_per_iteration": 2.8556978702545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145039, + "balance_loss_mlp": 1.07017529, + "epoch": 0.7254713351288957, + "flos": 618353398272.0, + "grad_norm": 0.031209053717976155, + "language_loss": 0.91200709, + "learning_rate": 0.00018496064275993324, + "loss": 0.9234575, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.74707031, + "step": 3771, + "time_per_iteration": 2.7326061725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114546, + "balance_loss_mlp": 1.07050157, + "epoch": 0.7256637168141593, + "flos": 768290285568.0, + "grad_norm": 0.04607963634377255, + "language_loss": 0.87999386, + "learning_rate": 0.00018471878158448686, + "loss": 0.89144844, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.74804688, + "step": 3772, + "time_per_iteration": 2.945519208908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011453, + "balance_loss_mlp": 1.07038903, + "epoch": 0.7258560984994229, + "flos": 496726970880.0, + "grad_norm": 0.029552123260588873, + "language_loss": 0.88148075, + "learning_rate": 0.00018447704281206512, + "loss": 0.89293379, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.74755859, + "step": 3773, + "time_per_iteration": 2.8680005073547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114455, + "balance_loss_mlp": 1.06963933, + "epoch": 0.7260484801846864, + "flos": 531141265920.0, + "grad_norm": 0.03674222243829071, + "language_loss": 0.87786865, + "learning_rate": 0.0001842354265365191, + "loss": 0.88931417, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.74755859, + "step": 3774, + "time_per_iteration": 2.724771499633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114502, + "balance_loss_mlp": 1.0701561, + "epoch": 0.72624086186995, + "flos": 626107625472.0, + "grad_norm": 0.03805272317803873, + "language_loss": 0.85790277, + "learning_rate": 0.0001839939328516526, + "loss": 0.869353, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.74707031, + "step": 3775, + "time_per_iteration": 2.7149298191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114524, + "balance_loss_mlp": 1.07037675, + "epoch": 0.7264332435552135, + "flos": 717804853248.0, + "grad_norm": 0.035296918768569004, + "language_loss": 0.86455274, + "learning_rate": 0.0001837525618512218, + "loss": 0.87600511, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.74707031, + "step": 3776, + "time_per_iteration": 2.8749477863311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145925, + "balance_loss_mlp": 1.07129955, + "epoch": 0.7266256252404771, + "flos": 682241723904.0, + "grad_norm": 0.03797985367726647, + "language_loss": 0.88141412, + "learning_rate": 0.00018351131362893519, + "loss": 0.89287341, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.74462891, + "step": 3777, + "time_per_iteration": 2.7961273193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146331, + "balance_loss_mlp": 1.07156312, + "epoch": 0.7268180069257407, + "flos": 519917793792.0, + "grad_norm": 0.04046507418804878, + "language_loss": 0.86727178, + "learning_rate": 0.00018327018827845364, + "loss": 0.87873513, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.74609375, + "step": 3778, + "time_per_iteration": 2.6734490394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147275, + "balance_loss_mlp": 1.07265031, + "epoch": 0.7270103886110042, + "flos": 513672242688.0, + "grad_norm": 0.03480448253150256, + "language_loss": 0.91087776, + "learning_rate": 0.00018302918589339036, + "loss": 0.92235053, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.74462891, + "step": 3779, + "time_per_iteration": 2.693053722381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144842, + "balance_loss_mlp": 1.07012212, + "epoch": 0.7272027702962678, + "flos": 547691767296.0, + "grad_norm": 0.037628889327950436, + "language_loss": 0.94755363, + "learning_rate": 0.00018278830656731054, + "loss": 0.95900208, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.74560547, + "step": 3780, + "time_per_iteration": 2.7247214317321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143177, + "balance_loss_mlp": 1.06831324, + "epoch": 0.7273951519815314, + "flos": 594154730496.0, + "grad_norm": 0.032307622186086855, + "language_loss": 0.90543699, + "learning_rate": 0.00018254755039373222, + "loss": 0.91686875, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.74707031, + "step": 3781, + "time_per_iteration": 2.7543249130249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139617, + "balance_loss_mlp": 1.06480122, + "epoch": 0.727587533666795, + "flos": 607138658304.0, + "grad_norm": 0.037695022521252085, + "language_loss": 0.89343524, + "learning_rate": 0.0001823069174661252, + "loss": 0.90483147, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.74658203, + "step": 3782, + "time_per_iteration": 2.7875726222991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140015, + "balance_loss_mlp": 1.06524646, + "epoch": 0.7277799153520584, + "flos": 514026080256.0, + "grad_norm": 0.034513244238831585, + "language_loss": 0.83396327, + "learning_rate": 0.00018206640787791112, + "loss": 0.84536338, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.74609375, + "step": 3783, + "time_per_iteration": 2.672685146331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142732, + "balance_loss_mlp": 1.06782138, + "epoch": 0.727972297037322, + "flos": 538793435136.0, + "grad_norm": 0.03888167743908025, + "language_loss": 0.90142006, + "learning_rate": 0.00018182602172246416, + "loss": 0.9128474, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.74755859, + "step": 3784, + "time_per_iteration": 2.637195110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142638, + "balance_loss_mlp": 1.06767881, + "epoch": 0.7281646787225856, + "flos": 536075526144.0, + "grad_norm": 0.03379285978086118, + "language_loss": 0.81641448, + "learning_rate": 0.00018158575909311075, + "loss": 0.82784092, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.74804688, + "step": 3785, + "time_per_iteration": 2.6302285194396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143773, + "balance_loss_mlp": 1.0688144, + "epoch": 0.7283570604078492, + "flos": 626209683456.0, + "grad_norm": 0.034294613815109176, + "language_loss": 0.84919262, + "learning_rate": 0.000181345620083129, + "loss": 0.86063033, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.74804688, + "step": 3786, + "time_per_iteration": 2.826655626296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143839, + "balance_loss_mlp": 1.06887996, + "epoch": 0.7285494420931128, + "flos": 535255059456.0, + "grad_norm": 0.03289848846312583, + "language_loss": 0.91744298, + "learning_rate": 0.00018110560478574927, + "loss": 0.92888141, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.74804688, + "step": 3787, + "time_per_iteration": 2.6760616302490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011439, + "balance_loss_mlp": 1.06889331, + "epoch": 0.7287418237783763, + "flos": 667740387840.0, + "grad_norm": 0.04379753934602124, + "language_loss": 0.86934447, + "learning_rate": 0.0001808657132941533, + "loss": 0.88078344, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.74853516, + "step": 3788, + "time_per_iteration": 2.8172109127044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143441, + "balance_loss_mlp": 1.0684824, + "epoch": 0.7289342054636399, + "flos": 551638374912.0, + "grad_norm": 0.03930499856080985, + "language_loss": 0.87319398, + "learning_rate": 0.00018062594570147572, + "loss": 0.88462842, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.74804688, + "step": 3789, + "time_per_iteration": 2.6159238815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146043, + "balance_loss_mlp": 1.07103622, + "epoch": 0.7291265871489034, + "flos": 689138554368.0, + "grad_norm": 0.030589467753511134, + "language_loss": 0.89662123, + "learning_rate": 0.00018038630210080243, + "loss": 0.90808165, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.74853516, + "step": 3790, + "time_per_iteration": 2.8022711277008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147306, + "balance_loss_mlp": 1.07234764, + "epoch": 0.729318968834167, + "flos": 573770413056.0, + "grad_norm": 0.03374595172498584, + "language_loss": 0.89270401, + "learning_rate": 0.0001801467825851712, + "loss": 0.90417707, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.74804688, + "step": 3791, + "time_per_iteration": 2.724628210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147876, + "balance_loss_mlp": 1.07310832, + "epoch": 0.7295113505194305, + "flos": 587164574208.0, + "grad_norm": 0.035766234040923994, + "language_loss": 0.83940732, + "learning_rate": 0.00017990738724757172, + "loss": 0.85088611, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.74609375, + "step": 3792, + "time_per_iteration": 2.842078924179077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161985, + "balance_loss_mlp": 1.08716917, + "epoch": 0.7297037322046941, + "flos": 708441893376.0, + "grad_norm": 0.03365089778951548, + "language_loss": 0.86588967, + "learning_rate": 0.00017966811618094598, + "loss": 0.87750953, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.74658203, + "step": 3793, + "time_per_iteration": 2.9457900524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151295, + "balance_loss_mlp": 1.07643151, + "epoch": 0.7298961138899577, + "flos": 488308002816.0, + "grad_norm": 0.03933165170986372, + "language_loss": 0.90208626, + "learning_rate": 0.00017942896947818664, + "loss": 0.91359925, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.74707031, + "step": 3794, + "time_per_iteration": 2.5673389434814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155838, + "balance_loss_mlp": 1.08297729, + "epoch": 0.7300884955752213, + "flos": 1368622162944.0, + "grad_norm": 0.012202680830239692, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.7598089, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.72851562, + "step": 3795, + "time_per_iteration": 4.860522985458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150098, + "balance_loss_mlp": 1.07523441, + "epoch": 0.7302808772604849, + "flos": 532836593664.0, + "grad_norm": 0.03730166344512247, + "language_loss": 0.91110396, + "learning_rate": 0.00017895104953559947, + "loss": 0.92260492, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.74707031, + "step": 3796, + "time_per_iteration": 2.58555269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148597, + "balance_loss_mlp": 1.07378125, + "epoch": 0.7304732589457483, + "flos": 437062502400.0, + "grad_norm": 0.03959489131470051, + "language_loss": 0.95557475, + "learning_rate": 0.00017871227648131672, + "loss": 0.96706069, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.74658203, + "step": 3797, + "time_per_iteration": 2.464853048324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148137, + "balance_loss_mlp": 1.07332122, + "epoch": 0.7306656406310119, + "flos": 452603884032.0, + "grad_norm": 0.03192912066727366, + "language_loss": 0.87151992, + "learning_rate": 0.0001784736281619907, + "loss": 0.88300121, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.74658203, + "step": 3798, + "time_per_iteration": 2.582390785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146414, + "balance_loss_mlp": 1.07155061, + "epoch": 0.7308580223162755, + "flos": 513029695488.0, + "grad_norm": 0.051326436791091785, + "language_loss": 0.79766852, + "learning_rate": 0.00017823510467027232, + "loss": 0.80913264, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.74707031, + "step": 3799, + "time_per_iteration": 2.75164794921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114555, + "balance_loss_mlp": 1.07078159, + "epoch": 0.7310504040015391, + "flos": 376282853376.0, + "grad_norm": 0.04144001955179666, + "language_loss": 0.8475759, + "learning_rate": 0.00017799670609876516, + "loss": 0.85903138, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.74609375, + "step": 3800, + "time_per_iteration": 2.5519416332244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114588, + "balance_loss_mlp": 1.07106447, + "epoch": 0.7312427856868026, + "flos": 550381478400.0, + "grad_norm": 0.03386508062276854, + "language_loss": 0.93402916, + "learning_rate": 0.00017775843254002366, + "loss": 0.94548798, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.74658203, + "step": 3801, + "time_per_iteration": 4.189229965209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144132, + "balance_loss_mlp": 1.06917357, + "epoch": 0.7314351673720662, + "flos": 768677050368.0, + "grad_norm": 0.03513626967715429, + "language_loss": 0.89011091, + "learning_rate": 0.00017752028408655367, + "loss": 0.9015522, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.74804688, + "step": 3802, + "time_per_iteration": 3.0296835899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114212, + "balance_loss_mlp": 1.06716144, + "epoch": 0.7316275490573297, + "flos": 487704387072.0, + "grad_norm": 0.036348088487259234, + "language_loss": 0.90090084, + "learning_rate": 0.00017728226083081272, + "loss": 0.91232204, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.74804688, + "step": 3803, + "time_per_iteration": 2.5504109859466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142386, + "balance_loss_mlp": 1.06742704, + "epoch": 0.7318199307425933, + "flos": 474412283904.0, + "grad_norm": 0.03547640994648555, + "language_loss": 0.86963499, + "learning_rate": 0.00017704436286520965, + "loss": 0.88105881, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.74804688, + "step": 3804, + "time_per_iteration": 2.5794951915740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141426, + "balance_loss_mlp": 1.06665754, + "epoch": 0.7320123124278569, + "flos": 550511734272.0, + "grad_norm": 0.04039315575901835, + "language_loss": 0.89054638, + "learning_rate": 0.0001768065902821046, + "loss": 0.90196061, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.74609375, + "step": 3805, + "time_per_iteration": 2.684680700302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141527, + "balance_loss_mlp": 1.06675947, + "epoch": 0.7322046941131204, + "flos": 571899167232.0, + "grad_norm": 0.036858739394668875, + "language_loss": 0.87521064, + "learning_rate": 0.00017656894317380907, + "loss": 0.88662589, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.74609375, + "step": 3806, + "time_per_iteration": 2.7203333377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147461, + "balance_loss_mlp": 1.07460022, + "epoch": 0.732397075798384, + "flos": 1472501042688.0, + "grad_norm": 0.00876082834102495, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77178729, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.72851562, + "step": 3807, + "time_per_iteration": 4.985222816467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143568, + "balance_loss_mlp": 1.06884801, + "epoch": 0.7325894574836476, + "flos": 465830859264.0, + "grad_norm": 0.03431257016679264, + "language_loss": 0.883228, + "learning_rate": 0.00017609402575064875, + "loss": 0.89466369, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.74560547, + "step": 3808, + "time_per_iteration": 2.5505616664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150323, + "balance_loss_mlp": 1.07560253, + "epoch": 0.7327818391689112, + "flos": 496481195520.0, + "grad_norm": 0.036747437689303115, + "language_loss": 0.86707413, + "learning_rate": 0.00017585675562016367, + "loss": 0.87857741, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.74560547, + "step": 3809, + "time_per_iteration": 2.566805362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148865, + "balance_loss_mlp": 1.07600403, + "epoch": 0.7329742208541746, + "flos": 1436679403008.0, + "grad_norm": 0.008652563544013954, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78361714, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.72851562, + "step": 3810, + "time_per_iteration": 4.843864440917969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143684, + "balance_loss_mlp": 1.06910706, + "epoch": 0.7331666025394382, + "flos": 497868347904.0, + "grad_norm": 0.0400416063155724, + "language_loss": 0.90367377, + "learning_rate": 0.00017538259298196474, + "loss": 0.91511071, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.74414062, + "step": 3811, + "time_per_iteration": 2.573604106903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146365, + "balance_loss_mlp": 1.07174027, + "epoch": 0.7333589842247018, + "flos": 539638096896.0, + "grad_norm": 0.03197642151293291, + "language_loss": 0.86813134, + "learning_rate": 0.00017514570065833745, + "loss": 0.87959504, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.74462891, + "step": 3812, + "time_per_iteration": 2.6921682357788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146575, + "balance_loss_mlp": 1.0719502, + "epoch": 0.7335513659099654, + "flos": 492041762304.0, + "grad_norm": 0.0378422764823117, + "language_loss": 0.86487865, + "learning_rate": 0.00017490893445433426, + "loss": 0.87634438, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.74462891, + "step": 3813, + "time_per_iteration": 2.634765148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146185, + "balance_loss_mlp": 1.07160771, + "epoch": 0.733743747595229, + "flos": 563252614656.0, + "grad_norm": 0.03359115001415202, + "language_loss": 0.86180258, + "learning_rate": 0.00017467229446187587, + "loss": 0.87326443, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.74414062, + "step": 3814, + "time_per_iteration": 2.6770167350769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146242, + "balance_loss_mlp": 1.07166481, + "epoch": 0.7339361292804925, + "flos": 539648830464.0, + "grad_norm": 0.03482367170061421, + "language_loss": 0.86801744, + "learning_rate": 0.00017443578077283424, + "loss": 0.87947989, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.74414062, + "step": 3815, + "time_per_iteration": 2.6352267265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144002, + "balance_loss_mlp": 1.06937671, + "epoch": 0.734128510965756, + "flos": 549561011712.0, + "grad_norm": 0.030322366631391387, + "language_loss": 0.89759493, + "learning_rate": 0.0001741993934790319, + "loss": 0.90903497, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.74462891, + "step": 3816, + "time_per_iteration": 2.793721914291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142717, + "balance_loss_mlp": 1.06799662, + "epoch": 0.7343208926510196, + "flos": 541201167360.0, + "grad_norm": 0.038181865946918005, + "language_loss": 0.887739, + "learning_rate": 0.00017396313267224273, + "loss": 0.89916623, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.74560547, + "step": 3817, + "time_per_iteration": 2.773219347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145721, + "balance_loss_mlp": 1.07090569, + "epoch": 0.7345132743362832, + "flos": 572170412544.0, + "grad_norm": 0.036498541155499, + "language_loss": 0.93785435, + "learning_rate": 0.0001737269984441912, + "loss": 0.94931155, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.74658203, + "step": 3818, + "time_per_iteration": 2.6538641452789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.06592357, + "epoch": 0.7347056560215467, + "flos": 546480532992.0, + "grad_norm": 0.03219237397324587, + "language_loss": 0.8964963, + "learning_rate": 0.00017349099088655263, + "loss": 0.90790182, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.74462891, + "step": 3819, + "time_per_iteration": 2.7040135860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140645, + "balance_loss_mlp": 1.06606805, + "epoch": 0.7348980377068103, + "flos": 597076755456.0, + "grad_norm": 0.033091718107472336, + "language_loss": 0.85581368, + "learning_rate": 0.00017325511009095375, + "loss": 0.86722016, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.74414062, + "step": 3820, + "time_per_iteration": 4.160353183746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142615, + "balance_loss_mlp": 1.06798947, + "epoch": 0.7350904193920739, + "flos": 539611900416.0, + "grad_norm": 0.031456925706235525, + "language_loss": 0.88030791, + "learning_rate": 0.00017301935614897113, + "loss": 0.89173406, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.74462891, + "step": 3821, + "time_per_iteration": 2.6948046684265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.06789804, + "epoch": 0.7352828010773375, + "flos": 514061008896.0, + "grad_norm": 0.030574399918046426, + "language_loss": 0.85837513, + "learning_rate": 0.00017278372915213274, + "loss": 0.86979991, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.74414062, + "step": 3822, + "time_per_iteration": 2.6384036540985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146408, + "balance_loss_mlp": 1.07354736, + "epoch": 0.735475182762601, + "flos": 1557255777792.0, + "grad_norm": 0.0051515936537080845, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.81040251, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.72851562, + "step": 3823, + "time_per_iteration": 6.475368976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140808, + "balance_loss_mlp": 1.06618333, + "epoch": 0.7356675644478645, + "flos": 682611024384.0, + "grad_norm": 0.03514206822018316, + "language_loss": 0.85822678, + "learning_rate": 0.00017231285635975314, + "loss": 0.86963487, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.74462891, + "step": 3824, + "time_per_iteration": 2.881985664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140396, + "balance_loss_mlp": 1.0657233, + "epoch": 0.7358599461331281, + "flos": 516231697920.0, + "grad_norm": 0.03601426366769367, + "language_loss": 0.88078141, + "learning_rate": 0.00017207761074702115, + "loss": 0.89218545, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.74511719, + "step": 3825, + "time_per_iteration": 2.588801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142954, + "balance_loss_mlp": 1.06818557, + "epoch": 0.7360523278183917, + "flos": 444916786176.0, + "grad_norm": 0.029137218094429037, + "language_loss": 0.87851697, + "learning_rate": 0.0001718424924450514, + "loss": 0.88994652, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.74609375, + "step": 3826, + "time_per_iteration": 2.596510410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145079, + "balance_loss_mlp": 1.07050133, + "epoch": 0.7362447095036553, + "flos": 604551005184.0, + "grad_norm": 0.02824128078517694, + "language_loss": 0.89933646, + "learning_rate": 0.00017160750154512482, + "loss": 0.91078722, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.74414062, + "step": 3827, + "time_per_iteration": 2.737093687057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139545, + "balance_loss_mlp": 1.06496727, + "epoch": 0.7364370911889189, + "flos": 554250223104.0, + "grad_norm": 0.030336693640123275, + "language_loss": 0.87611473, + "learning_rate": 0.0001713726381384731, + "loss": 0.88751018, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.74414062, + "step": 3828, + "time_per_iteration": 2.7642135620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.06553614, + "epoch": 0.7366294728741823, + "flos": 449990034432.0, + "grad_norm": 0.03985156313807423, + "language_loss": 0.86582565, + "learning_rate": 0.00017113790231627812, + "loss": 0.87722576, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.74365234, + "step": 3829, + "time_per_iteration": 2.471085786819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144356, + "balance_loss_mlp": 1.07168579, + "epoch": 0.7368218545594459, + "flos": 1538703048192.0, + "grad_norm": 0.005233117744578673, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80402577, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.7265625, + "step": 3830, + "time_per_iteration": 4.7661731243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146943, + "balance_loss_mlp": 1.072366, + "epoch": 0.7370142362447095, + "flos": 516472743936.0, + "grad_norm": 0.03645785594600137, + "language_loss": 0.87339807, + "learning_rate": 0.00017066881378973936, + "loss": 0.88486743, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.74414062, + "step": 3831, + "time_per_iteration": 2.6248505115509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146898, + "balance_loss_mlp": 1.0723207, + "epoch": 0.7372066179299731, + "flos": 501904278528.0, + "grad_norm": 0.03165196577405493, + "language_loss": 0.87413478, + "learning_rate": 0.00017043446126751189, + "loss": 0.88560379, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.74414062, + "step": 3832, + "time_per_iteration": 2.6783525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144804, + "balance_loss_mlp": 1.07022643, + "epoch": 0.7373989996152366, + "flos": 559167019008.0, + "grad_norm": 0.037114015277278894, + "language_loss": 0.82006979, + "learning_rate": 0.00017020023669397376, + "loss": 0.83151782, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.74414062, + "step": 3833, + "time_per_iteration": 2.6736700534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142842, + "balance_loss_mlp": 1.06816959, + "epoch": 0.7375913813005002, + "flos": 507780529152.0, + "grad_norm": 0.035309103887572656, + "language_loss": 0.88040781, + "learning_rate": 0.0001699661401600589, + "loss": 0.89183623, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.74511719, + "step": 3834, + "time_per_iteration": 2.566554069519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114318, + "balance_loss_mlp": 1.06860292, + "epoch": 0.7377837629857638, + "flos": 487155165696.0, + "grad_norm": 0.03517908569874834, + "language_loss": 0.83206999, + "learning_rate": 0.00016973217175665205, + "loss": 0.84350181, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.74414062, + "step": 3835, + "time_per_iteration": 2.5718719959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144836, + "balance_loss_mlp": 1.07197571, + "epoch": 0.7379761446710273, + "flos": 1417877621760.0, + "grad_norm": 0.005454955067060188, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82310998, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.72851562, + "step": 3836, + "time_per_iteration": 4.927332401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113978, + "balance_loss_mlp": 1.065346, + "epoch": 0.7381685263562909, + "flos": 630909628416.0, + "grad_norm": 0.03248613748529956, + "language_loss": 0.88913381, + "learning_rate": 0.00016926461970465047, + "loss": 0.90053165, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.74316406, + "step": 3837, + "time_per_iteration": 2.775867462158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140207, + "balance_loss_mlp": 1.06591558, + "epoch": 0.7383609080415544, + "flos": 740651297280.0, + "grad_norm": 0.029601422195490622, + "language_loss": 0.88803387, + "learning_rate": 0.00016903103623757516, + "loss": 0.89943594, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.7421875, + "step": 3838, + "time_per_iteration": 3.0490381717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114028, + "balance_loss_mlp": 1.0659889, + "epoch": 0.738553289726818, + "flos": 551256339456.0, + "grad_norm": 0.036589238474362976, + "language_loss": 0.84502995, + "learning_rate": 0.00016879758126404738, + "loss": 0.85643274, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.7421875, + "step": 3839, + "time_per_iteration": 2.7638185024261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140469, + "balance_loss_mlp": 1.06598663, + "epoch": 0.7387456714120816, + "flos": 911775504384.0, + "grad_norm": 0.03874838451291343, + "language_loss": 0.85589796, + "learning_rate": 0.00016856425487470216, + "loss": 0.86730266, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.74316406, + "step": 3840, + "time_per_iteration": 3.1033904552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139827, + "balance_loss_mlp": 1.06548798, + "epoch": 0.7389380530973452, + "flos": 854195856384.0, + "grad_norm": 0.035495854767005654, + "language_loss": 0.84398341, + "learning_rate": 0.00016833105716012486, + "loss": 0.85538161, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.7421875, + "step": 3841, + "time_per_iteration": 3.1338374614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011399, + "balance_loss_mlp": 1.06551313, + "epoch": 0.7391304347826086, + "flos": 818419878912.0, + "grad_norm": 0.034862132205022836, + "language_loss": 0.89572388, + "learning_rate": 0.00016809798821085088, + "loss": 0.90712291, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.74267578, + "step": 3842, + "time_per_iteration": 2.980786085128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140329, + "balance_loss_mlp": 1.06622851, + "epoch": 0.7393228164678722, + "flos": 573937598976.0, + "grad_norm": 0.03111800184883808, + "language_loss": 0.93200815, + "learning_rate": 0.00016786504811736565, + "loss": 0.94341135, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.74072266, + "step": 3843, + "time_per_iteration": 2.669473171234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140191, + "balance_loss_mlp": 1.06618571, + "epoch": 0.7395151981531358, + "flos": 686575096320.0, + "grad_norm": 0.030093907505068344, + "language_loss": 0.86420381, + "learning_rate": 0.00016763223697010442, + "loss": 0.8756057, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.74023438, + "step": 3844, + "time_per_iteration": 2.99284291267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140327, + "balance_loss_mlp": 1.06632161, + "epoch": 0.7397075798383994, + "flos": 557454226944.0, + "grad_norm": 0.030952263508457714, + "language_loss": 0.88928902, + "learning_rate": 0.00016739955485945256, + "loss": 0.90069234, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.74023438, + "step": 3845, + "time_per_iteration": 2.7834365367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143729, + "balance_loss_mlp": 1.06972384, + "epoch": 0.739899961523663, + "flos": 547822023168.0, + "grad_norm": 0.0384067269834895, + "language_loss": 0.91738451, + "learning_rate": 0.00016716700187574513, + "loss": 0.9288218, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.74023438, + "step": 3846, + "time_per_iteration": 2.686281681060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142346, + "balance_loss_mlp": 1.06824505, + "epoch": 0.7400923432089265, + "flos": 610303730688.0, + "grad_norm": 0.03341447658559241, + "language_loss": 0.87943906, + "learning_rate": 0.0001669345781092675, + "loss": 0.89086246, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.74072266, + "step": 3847, + "time_per_iteration": 2.7001636028289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146926, + "balance_loss_mlp": 1.07258725, + "epoch": 0.7402847248941901, + "flos": 592179425280.0, + "grad_norm": 0.03705340018944972, + "language_loss": 0.92317855, + "learning_rate": 0.0001667022836502546, + "loss": 0.9346478, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.74169922, + "step": 3848, + "time_per_iteration": 2.7301111221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.07263577, + "epoch": 0.7404771065794536, + "flos": 478304497152.0, + "grad_norm": 0.03758678291398601, + "language_loss": 0.88680065, + "learning_rate": 0.00016647011858889077, + "loss": 0.89827085, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.7421875, + "step": 3849, + "time_per_iteration": 2.5619609355926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145959, + "balance_loss_mlp": 1.07152426, + "epoch": 0.7406694882647172, + "flos": 497466846720.0, + "grad_norm": 0.035398733472562116, + "language_loss": 0.90902388, + "learning_rate": 0.00016623808301531056, + "loss": 0.92048347, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.74267578, + "step": 3850, + "time_per_iteration": 2.6344494819641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.07042766, + "epoch": 0.7408618699499807, + "flos": 563326474752.0, + "grad_norm": 0.04248736642040007, + "language_loss": 0.8449176, + "learning_rate": 0.00016600617701959842, + "loss": 0.85636574, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.7421875, + "step": 3851, + "time_per_iteration": 2.764845609664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152382, + "balance_loss_mlp": 1.07971191, + "epoch": 0.7410542516352443, + "flos": 1391469333504.0, + "grad_norm": 0.006017952028820176, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79996192, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.7265625, + "step": 3852, + "time_per_iteration": 4.992438316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143972, + "balance_loss_mlp": 1.06968081, + "epoch": 0.7412466333205079, + "flos": 671211634176.0, + "grad_norm": 0.03177898311172259, + "language_loss": 0.86077726, + "learning_rate": 0.00016554275412186315, + "loss": 0.872217, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.74169922, + "step": 3853, + "time_per_iteration": 2.809633731842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143877, + "balance_loss_mlp": 1.0695858, + "epoch": 0.7414390150057715, + "flos": 490318236672.0, + "grad_norm": 0.037394191958696615, + "language_loss": 0.85646808, + "learning_rate": 0.0001653112373997568, + "loss": 0.86790681, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.74169922, + "step": 3854, + "time_per_iteration": 2.6653616428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144328, + "balance_loss_mlp": 1.07013178, + "epoch": 0.7416313966910351, + "flos": 600493607424.0, + "grad_norm": 0.037760188692200464, + "language_loss": 0.80141521, + "learning_rate": 0.0001650798506153517, + "loss": 0.81285852, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.74072266, + "step": 3855, + "time_per_iteration": 2.6987767219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143504, + "balance_loss_mlp": 1.06921279, + "epoch": 0.7418237783762985, + "flos": 543586705920.0, + "grad_norm": 0.04363259370366351, + "language_loss": 0.89603698, + "learning_rate": 0.00016484859385848023, + "loss": 0.90747201, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.74121094, + "step": 3856, + "time_per_iteration": 2.6623427867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143237, + "balance_loss_mlp": 1.06889808, + "epoch": 0.7420161600615621, + "flos": 545223636480.0, + "grad_norm": 0.03643329679811027, + "language_loss": 0.82348394, + "learning_rate": 0.0001646174672189243, + "loss": 0.83491635, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.74169922, + "step": 3857, + "time_per_iteration": 2.663518190383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143563, + "balance_loss_mlp": 1.0692718, + "epoch": 0.7422085417468257, + "flos": 528210508800.0, + "grad_norm": 0.03811276290038686, + "language_loss": 0.85172391, + "learning_rate": 0.00016438647078641488, + "loss": 0.86315954, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.74121094, + "step": 3858, + "time_per_iteration": 2.5988457202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145341, + "balance_loss_mlp": 1.07133579, + "epoch": 0.7424009234320893, + "flos": 509760563712.0, + "grad_norm": 0.034205456810992727, + "language_loss": 0.87813514, + "learning_rate": 0.00016415560465063344, + "loss": 0.88958859, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.73925781, + "step": 3859, + "time_per_iteration": 2.7205588817596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145531, + "balance_loss_mlp": 1.07138264, + "epoch": 0.7425933051173528, + "flos": 513607114752.0, + "grad_norm": 0.03574871107412609, + "language_loss": 0.83894295, + "learning_rate": 0.0001639248689012095, + "loss": 0.85039824, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.74023438, + "step": 3860, + "time_per_iteration": 2.604342460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145572, + "balance_loss_mlp": 1.07142365, + "epoch": 0.7427856868026164, + "flos": 459377189376.0, + "grad_norm": 0.03221086554930489, + "language_loss": 0.91824234, + "learning_rate": 0.00016369426362772271, + "loss": 0.92969811, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.74023438, + "step": 3861, + "time_per_iteration": 2.787710189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140907, + "balance_loss_mlp": 1.06666386, + "epoch": 0.74297806848788, + "flos": 606187935744.0, + "grad_norm": 0.034095856542736835, + "language_loss": 0.84967786, + "learning_rate": 0.00016346378891970233, + "loss": 0.86108696, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.74072266, + "step": 3862, + "time_per_iteration": 2.791630744934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140095, + "balance_loss_mlp": 1.06594658, + "epoch": 0.7431704501731435, + "flos": 893069776896.0, + "grad_norm": 0.035970776867332244, + "language_loss": 0.86936057, + "learning_rate": 0.00016323344486662633, + "loss": 0.8807615, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.74023438, + "step": 3863, + "time_per_iteration": 3.3644163608551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140007, + "balance_loss_mlp": 1.06562018, + "epoch": 0.7433628318584071, + "flos": 593351728128.0, + "grad_norm": 0.03309073679941976, + "language_loss": 0.8318609, + "learning_rate": 0.00016300323155792247, + "loss": 0.84326088, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.7421875, + "step": 3864, + "time_per_iteration": 2.9201974868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140802, + "balance_loss_mlp": 1.06655836, + "epoch": 0.7435552135436706, + "flos": 478189704192.0, + "grad_norm": 0.032691738541971056, + "language_loss": 0.93297988, + "learning_rate": 0.00016277314908296687, + "loss": 0.94438791, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.74072266, + "step": 3865, + "time_per_iteration": 2.662276268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140447, + "balance_loss_mlp": 1.06606066, + "epoch": 0.7437475952289342, + "flos": 674431100928.0, + "grad_norm": 0.04227589537607751, + "language_loss": 0.82037443, + "learning_rate": 0.00016254319753108604, + "loss": 0.83177888, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.7421875, + "step": 3866, + "time_per_iteration": 2.818756341934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140124, + "balance_loss_mlp": 1.06573772, + "epoch": 0.7439399769141978, + "flos": 771770264064.0, + "grad_norm": 0.04121075784978914, + "language_loss": 0.82100695, + "learning_rate": 0.00016231337699155492, + "loss": 0.83240819, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.7421875, + "step": 3867, + "time_per_iteration": 2.9714555740356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139588, + "balance_loss_mlp": 1.06539237, + "epoch": 0.7441323585994614, + "flos": 649038663168.0, + "grad_norm": 0.03532933640628425, + "language_loss": 0.82657182, + "learning_rate": 0.0001620836875535977, + "loss": 0.83796769, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.74023438, + "step": 3868, + "time_per_iteration": 2.849938154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139487, + "balance_loss_mlp": 1.06548178, + "epoch": 0.7443247402847248, + "flos": 566500279296.0, + "grad_norm": 0.031528263247616775, + "language_loss": 0.85388362, + "learning_rate": 0.00016185412930638766, + "loss": 0.86527848, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.73925781, + "step": 3869, + "time_per_iteration": 2.7786920070648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139674, + "balance_loss_mlp": 1.06547797, + "epoch": 0.7445171219699884, + "flos": 579679590912.0, + "grad_norm": 0.0366739337080916, + "language_loss": 0.87914336, + "learning_rate": 0.00016162470233904765, + "loss": 0.89054006, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.74023438, + "step": 3870, + "time_per_iteration": 2.705364465713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147326, + "balance_loss_mlp": 1.07351112, + "epoch": 0.744709503655252, + "flos": 620029260288.0, + "grad_norm": 0.03364023309307919, + "language_loss": 0.86704087, + "learning_rate": 0.00016139540674064856, + "loss": 0.87851417, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.73828125, + "step": 3871, + "time_per_iteration": 2.727344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147794, + "balance_loss_mlp": 1.07388413, + "epoch": 0.7449018853405156, + "flos": 529680253440.0, + "grad_norm": 0.03265362950694584, + "language_loss": 0.82158148, + "learning_rate": 0.00016116624260021113, + "loss": 0.83305943, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.73876953, + "step": 3872, + "time_per_iteration": 2.733447551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147438, + "balance_loss_mlp": 1.0736239, + "epoch": 0.7450942670257792, + "flos": 434223069696.0, + "grad_norm": 0.03568420204032938, + "language_loss": 0.89293343, + "learning_rate": 0.0001609372100067046, + "loss": 0.90440786, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.73828125, + "step": 3873, + "time_per_iteration": 2.5226526260375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141021, + "balance_loss_mlp": 1.06682503, + "epoch": 0.7452866487110427, + "flos": 698165140992.0, + "grad_norm": 0.04021816698405521, + "language_loss": 0.90011704, + "learning_rate": 0.0001607083090490475, + "loss": 0.91152722, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.74023438, + "step": 3874, + "time_per_iteration": 2.897472381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138845, + "balance_loss_mlp": 1.06464863, + "epoch": 0.7454790303963063, + "flos": 513279473664.0, + "grad_norm": 0.03827241503421356, + "language_loss": 0.86578858, + "learning_rate": 0.00016047953981610714, + "loss": 0.877177, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.74023438, + "step": 3875, + "time_per_iteration": 2.7049574851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.08171082, + "epoch": 0.7456714120815698, + "flos": 1328874107904.0, + "grad_norm": 0.014146468768439814, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.8088364, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.72460938, + "step": 3876, + "time_per_iteration": 4.997116804122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147349, + "balance_loss_mlp": 1.0731051, + "epoch": 0.7458637937668334, + "flos": 722971427328.0, + "grad_norm": 0.03963419785288614, + "language_loss": 0.8521378, + "learning_rate": 0.0001600223968795889, + "loss": 0.86361128, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.74072266, + "step": 3877, + "time_per_iteration": 2.8971540927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147774, + "balance_loss_mlp": 1.07548523, + "epoch": 0.746056175452097, + "flos": 1504866172416.0, + "grad_norm": 0.01288298570823651, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76843846, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.72460938, + "step": 3878, + "time_per_iteration": 4.937422275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144499, + "balance_loss_mlp": 1.07025564, + "epoch": 0.7462485571373605, + "flos": 521294212608.0, + "grad_norm": 0.03493161366736204, + "language_loss": 0.85764599, + "learning_rate": 0.00015956578190706483, + "loss": 0.86909091, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.74072266, + "step": 3879, + "time_per_iteration": 2.68503737449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144, + "balance_loss_mlp": 1.06980455, + "epoch": 0.7464409388226241, + "flos": 482166511104.0, + "grad_norm": 0.03362253888482968, + "language_loss": 0.79837132, + "learning_rate": 0.00015933767262892468, + "loss": 0.80981129, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.74072266, + "step": 3880, + "time_per_iteration": 2.693495988845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144861, + "balance_loss_mlp": 1.07071245, + "epoch": 0.7466333205078877, + "flos": 487741317120.0, + "grad_norm": 0.04222777509687144, + "language_loss": 0.88058239, + "learning_rate": 0.00015910969560762927, + "loss": 0.89203095, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.74023438, + "step": 3881, + "time_per_iteration": 2.562688112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.07416224, + "epoch": 0.7468257021931513, + "flos": 612407290368.0, + "grad_norm": 0.034328627776477647, + "language_loss": 0.8732987, + "learning_rate": 0.00015888185093168727, + "loss": 0.88478327, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.74121094, + "step": 3882, + "time_per_iteration": 2.718461036682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.072142, + "epoch": 0.7470180838784147, + "flos": 534484257792.0, + "grad_norm": 0.03431059853024658, + "language_loss": 0.85983026, + "learning_rate": 0.00015865413868955581, + "loss": 0.87129557, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.7421875, + "step": 3883, + "time_per_iteration": 2.6472575664520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146306, + "balance_loss_mlp": 1.07225311, + "epoch": 0.7472104655636783, + "flos": 740672764416.0, + "grad_norm": 0.030267060700337457, + "language_loss": 0.87475348, + "learning_rate": 0.00015842655896964054, + "loss": 0.88621652, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.73974609, + "step": 3884, + "time_per_iteration": 3.015573024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145315, + "balance_loss_mlp": 1.07107127, + "epoch": 0.7474028472489419, + "flos": 641501286912.0, + "grad_norm": 0.03713221878515122, + "language_loss": 0.79442894, + "learning_rate": 0.00015819911186029567, + "loss": 0.8058821, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.74121094, + "step": 3885, + "time_per_iteration": 2.7972114086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145173, + "balance_loss_mlp": 1.07078624, + "epoch": 0.7475952289342055, + "flos": 591326031360.0, + "grad_norm": 0.035996478944381224, + "language_loss": 0.90933514, + "learning_rate": 0.00015797179744982443, + "loss": 0.92078686, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.7421875, + "step": 3886, + "time_per_iteration": 2.699364185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145253, + "balance_loss_mlp": 1.07100964, + "epoch": 0.7477876106194691, + "flos": 489219793920.0, + "grad_norm": 0.03742232117847866, + "language_loss": 0.83403462, + "learning_rate": 0.00015774461582647765, + "loss": 0.84548712, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.74121094, + "step": 3887, + "time_per_iteration": 2.6602365970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146333, + "balance_loss_mlp": 1.07199454, + "epoch": 0.7479799923047326, + "flos": 555789098496.0, + "grad_norm": 0.03709849655597122, + "language_loss": 0.85774076, + "learning_rate": 0.00015751756707845505, + "loss": 0.86920416, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.74169922, + "step": 3888, + "time_per_iteration": 2.6497113704681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145173, + "balance_loss_mlp": 1.07097745, + "epoch": 0.7481723739899961, + "flos": 768789841920.0, + "grad_norm": 0.0326002931336663, + "language_loss": 0.92530739, + "learning_rate": 0.00015729065129390502, + "loss": 0.93675911, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.74121094, + "step": 3889, + "time_per_iteration": 3.0129857063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145589, + "balance_loss_mlp": 1.07129776, + "epoch": 0.7483647556752597, + "flos": 497160672768.0, + "grad_norm": 0.03921764888683204, + "language_loss": 0.87742007, + "learning_rate": 0.0001570638685609241, + "loss": 0.88887596, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.74169922, + "step": 3890, + "time_per_iteration": 2.6674981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145557, + "balance_loss_mlp": 1.07126558, + "epoch": 0.7485571373605233, + "flos": 473826132480.0, + "grad_norm": 0.036715319135455414, + "language_loss": 0.85719097, + "learning_rate": 0.00015683721896755693, + "loss": 0.8686465, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.74169922, + "step": 3891, + "time_per_iteration": 2.524322271347046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153778, + "balance_loss_mlp": 1.0816803, + "epoch": 0.7487495190457868, + "flos": 1557898324992.0, + "grad_norm": 0.009583293732515121, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83364266, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.72265625, + "step": 3892, + "time_per_iteration": 4.967085361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114376, + "balance_loss_mlp": 1.06980217, + "epoch": 0.7489419007310504, + "flos": 582966187008.0, + "grad_norm": 0.03314224500682494, + "language_loss": 0.89740062, + "learning_rate": 0.00015638431955158528, + "loss": 0.90883827, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.73974609, + "step": 3893, + "time_per_iteration": 2.7170591354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143436, + "balance_loss_mlp": 1.06952667, + "epoch": 0.749134282416314, + "flos": 568697164800.0, + "grad_norm": 0.032778698573620556, + "language_loss": 0.85919845, + "learning_rate": 0.00015615806990481186, + "loss": 0.87063277, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.73925781, + "step": 3894, + "time_per_iteration": 2.6996026039123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143061, + "balance_loss_mlp": 1.06915176, + "epoch": 0.7493266641015776, + "flos": 534165348864.0, + "grad_norm": 0.030394188724740954, + "language_loss": 0.88159597, + "learning_rate": 0.00015593195374931452, + "loss": 0.89302653, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.73876953, + "step": 3895, + "time_per_iteration": 2.7341361045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146619, + "balance_loss_mlp": 1.0727098, + "epoch": 0.7495190457868411, + "flos": 524717795328.0, + "grad_norm": 0.03863238275082747, + "language_loss": 0.84834325, + "learning_rate": 0.00015570597117287922, + "loss": 0.8598094, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.73925781, + "step": 3896, + "time_per_iteration": 2.659959077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144958, + "balance_loss_mlp": 1.07123923, + "epoch": 0.7497114274721046, + "flos": 515189650944.0, + "grad_norm": 0.036153955885896226, + "language_loss": 0.83024484, + "learning_rate": 0.0001554801222632406, + "loss": 0.84169447, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.73730469, + "step": 3897, + "time_per_iteration": 2.5906412601470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145811, + "balance_loss_mlp": 1.07199693, + "epoch": 0.7499038091573682, + "flos": 495997102080.0, + "grad_norm": 0.03335147628193477, + "language_loss": 0.89782715, + "learning_rate": 0.00015525440710808052, + "loss": 0.90928525, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.73828125, + "step": 3898, + "time_per_iteration": 2.615407705307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145951, + "balance_loss_mlp": 1.07199407, + "epoch": 0.7500961908426318, + "flos": 738988170240.0, + "grad_norm": 0.03474247339269188, + "language_loss": 0.84343684, + "learning_rate": 0.00015502882579502953, + "loss": 0.85489637, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.73925781, + "step": 3899, + "time_per_iteration": 3.010974645614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114743, + "balance_loss_mlp": 1.07361519, + "epoch": 0.7502885725278954, + "flos": 534536650752.0, + "grad_norm": 0.03268230414324022, + "language_loss": 0.88787687, + "learning_rate": 0.00015480337841166592, + "loss": 0.89935118, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.73828125, + "step": 3900, + "time_per_iteration": 2.7430782318115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147759, + "balance_loss_mlp": 1.07399249, + "epoch": 0.7504809542131589, + "flos": 590557957632.0, + "grad_norm": 0.04375512425984308, + "language_loss": 0.87710261, + "learning_rate": 0.00015457806504551647, + "loss": 0.8885802, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.73779297, + "step": 3901, + "time_per_iteration": 2.8651504516601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.0741967, + "epoch": 0.7506733358984224, + "flos": 512582532096.0, + "grad_norm": 0.0332649439615325, + "language_loss": 0.82646012, + "learning_rate": 0.0001543528857840554, + "loss": 0.83794028, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.73828125, + "step": 3902, + "time_per_iteration": 2.6909492015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144161, + "balance_loss_mlp": 1.07025158, + "epoch": 0.750865717583686, + "flos": 540382702080.0, + "grad_norm": 0.03600709682352738, + "language_loss": 0.85171556, + "learning_rate": 0.000154127840714705, + "loss": 0.86315715, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.73925781, + "step": 3903, + "time_per_iteration": 2.7624754905700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144936, + "balance_loss_mlp": 1.0707401, + "epoch": 0.7510580992689496, + "flos": 477540426240.0, + "grad_norm": 0.045315321448851864, + "language_loss": 0.87899154, + "learning_rate": 0.00015390292992483557, + "loss": 0.89044094, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.74072266, + "step": 3904, + "time_per_iteration": 2.512664794921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141177, + "balance_loss_mlp": 1.06707633, + "epoch": 0.7512504809542132, + "flos": 580200614400.0, + "grad_norm": 0.0336140335329932, + "language_loss": 0.89387548, + "learning_rate": 0.00015367815350176523, + "loss": 0.90528727, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.74072266, + "step": 3905, + "time_per_iteration": 2.743971824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139798, + "balance_loss_mlp": 1.06550705, + "epoch": 0.7514428626394767, + "flos": 419563279872.0, + "grad_norm": 0.033015406559801515, + "language_loss": 0.88140541, + "learning_rate": 0.00015345351153275987, + "loss": 0.89280337, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.74169922, + "step": 3906, + "time_per_iteration": 2.5664329528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137169, + "balance_loss_mlp": 1.06335413, + "epoch": 0.7516352443247403, + "flos": 642254624256.0, + "grad_norm": 0.03633245053817903, + "language_loss": 0.85467315, + "learning_rate": 0.00015322900410503332, + "loss": 0.86604482, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.73828125, + "step": 3907, + "time_per_iteration": 2.797030210494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139178, + "balance_loss_mlp": 1.0650295, + "epoch": 0.7518276260100039, + "flos": 582191382528.0, + "grad_norm": 0.03436736061108426, + "language_loss": 0.8251732, + "learning_rate": 0.00015300463130574703, + "loss": 0.83656502, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.74023438, + "step": 3908, + "time_per_iteration": 2.8524422645568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139345, + "balance_loss_mlp": 1.06524479, + "epoch": 0.7520200076952674, + "flos": 688615529472.0, + "grad_norm": 0.03139939166900202, + "language_loss": 0.85847479, + "learning_rate": 0.00015278039322201033, + "loss": 0.86986822, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.73974609, + "step": 3909, + "time_per_iteration": 2.9437077045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113959, + "balance_loss_mlp": 1.0656805, + "epoch": 0.7522123893805309, + "flos": 487415677440.0, + "grad_norm": 0.04345489019259924, + "language_loss": 0.85063672, + "learning_rate": 0.00015255628994088004, + "loss": 0.86203265, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.73876953, + "step": 3910, + "time_per_iteration": 2.5493288040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139511, + "balance_loss_mlp": 1.0655055, + "epoch": 0.7524047710657945, + "flos": 820591294464.0, + "grad_norm": 0.035053470769469915, + "language_loss": 0.79975402, + "learning_rate": 0.00015233232154936082, + "loss": 0.81114912, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.73925781, + "step": 3911, + "time_per_iteration": 3.2801201343536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136453, + "balance_loss_mlp": 1.06259108, + "epoch": 0.7525971527510581, + "flos": 700780992000.0, + "grad_norm": 0.03701963339686214, + "language_loss": 0.80987895, + "learning_rate": 0.0001521084881344048, + "loss": 0.82124352, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.73876953, + "step": 3912, + "time_per_iteration": 2.864623785018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136423, + "balance_loss_mlp": 1.06260836, + "epoch": 0.7527895344363217, + "flos": 634949561856.0, + "grad_norm": 0.03193238845442204, + "language_loss": 0.90964454, + "learning_rate": 0.00015188478978291208, + "loss": 0.92100877, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.73828125, + "step": 3913, + "time_per_iteration": 2.817735433578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_mlp": 1.06423438, + "epoch": 0.7529819161215853, + "flos": 563932091904.0, + "grad_norm": 0.03160281710037872, + "language_loss": 0.90830052, + "learning_rate": 0.00015166122658173014, + "loss": 0.91968054, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.73779297, + "step": 3914, + "time_per_iteration": 2.769164562225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143642, + "balance_loss_mlp": 1.06992257, + "epoch": 0.7531742978068487, + "flos": 691956519936.0, + "grad_norm": 0.03347021027562271, + "language_loss": 0.9305917, + "learning_rate": 0.00015143779861765332, + "loss": 0.94202816, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.73730469, + "step": 3915, + "time_per_iteration": 2.8637077808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143643, + "balance_loss_mlp": 1.07001936, + "epoch": 0.7533666794921123, + "flos": 682306851840.0, + "grad_norm": 0.03059680855463854, + "language_loss": 0.85590506, + "learning_rate": 0.00015121450597742458, + "loss": 0.86734146, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.73632812, + "step": 3916, + "time_per_iteration": 2.822169065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143917, + "balance_loss_mlp": 1.0701977, + "epoch": 0.7535590611773759, + "flos": 624813798912.0, + "grad_norm": 0.03788604820756776, + "language_loss": 0.84024751, + "learning_rate": 0.00015099134874773369, + "loss": 0.85168672, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.73730469, + "step": 3917, + "time_per_iteration": 2.739708185195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143474, + "balance_loss_mlp": 1.06975508, + "epoch": 0.7537514428626395, + "flos": 520493211648.0, + "grad_norm": 0.03128503546806215, + "language_loss": 0.84470636, + "learning_rate": 0.00015076832701521793, + "loss": 0.85614109, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.73730469, + "step": 3918, + "time_per_iteration": 2.7321834564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143927, + "balance_loss_mlp": 1.07016027, + "epoch": 0.753943824547903, + "flos": 725034054144.0, + "grad_norm": 0.04314682819864583, + "language_loss": 0.87482226, + "learning_rate": 0.000150545440866462, + "loss": 0.88626158, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.73779297, + "step": 3919, + "time_per_iteration": 2.9775331020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138634, + "balance_loss_mlp": 1.06486762, + "epoch": 0.7541362062331666, + "flos": 438467119104.0, + "grad_norm": 0.052938940004614674, + "language_loss": 0.83896869, + "learning_rate": 0.000150322690387998, + "loss": 0.85035503, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.73779297, + "step": 3920, + "time_per_iteration": 2.49090576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137452, + "balance_loss_mlp": 1.06363773, + "epoch": 0.7543285879184302, + "flos": 566343826944.0, + "grad_norm": 0.033797104064901606, + "language_loss": 0.79905725, + "learning_rate": 0.00015010007566630535, + "loss": 0.81043172, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.73828125, + "step": 3921, + "time_per_iteration": 2.731271266937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136388, + "balance_loss_mlp": 1.06257319, + "epoch": 0.7545209696036937, + "flos": 522058283520.0, + "grad_norm": 0.038458937044939336, + "language_loss": 0.86757135, + "learning_rate": 0.00014987759678781077, + "loss": 0.87893528, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.73828125, + "step": 3922, + "time_per_iteration": 2.6090140342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137282, + "balance_loss_mlp": 1.06356251, + "epoch": 0.7547133512889573, + "flos": 617209293312.0, + "grad_norm": 0.03880443282291728, + "language_loss": 0.87359434, + "learning_rate": 0.00014965525383888795, + "loss": 0.88496715, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.73730469, + "step": 3923, + "time_per_iteration": 2.7862982749938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142684, + "balance_loss_mlp": 1.06867838, + "epoch": 0.7549057329742208, + "flos": 752141285376.0, + "grad_norm": 0.034394345643830246, + "language_loss": 0.76875985, + "learning_rate": 0.00014943304690585851, + "loss": 0.78018677, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.73876953, + "step": 3924, + "time_per_iteration": 2.910545825958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143742, + "balance_loss_mlp": 1.06964111, + "epoch": 0.7550981146594844, + "flos": 515450162688.0, + "grad_norm": 0.03861308320303695, + "language_loss": 0.84874004, + "learning_rate": 0.0001492109760749908, + "loss": 0.8601774, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.73925781, + "step": 3925, + "time_per_iteration": 2.6297590732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114885, + "balance_loss_mlp": 1.07503557, + "epoch": 0.755290496344748, + "flos": 523026470400.0, + "grad_norm": 0.03619284623478051, + "language_loss": 0.84284902, + "learning_rate": 0.00014898904143250002, + "loss": 0.85433757, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.73828125, + "step": 3926, + "time_per_iteration": 2.6899092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155189, + "balance_loss_mlp": 1.082901, + "epoch": 0.7554828780300116, + "flos": 1417703705088.0, + "grad_norm": 0.01325688578051584, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76910388, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.72460938, + "step": 3927, + "time_per_iteration": 4.904372692108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141123, + "balance_loss_mlp": 1.06683159, + "epoch": 0.7556752597152752, + "flos": 557985984000.0, + "grad_norm": 0.031943357844755736, + "language_loss": 0.84718072, + "learning_rate": 0.0001485455810572474, + "loss": 0.85859191, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.74121094, + "step": 3928, + "time_per_iteration": 2.6653287410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139674, + "balance_loss_mlp": 1.06519186, + "epoch": 0.7558676414005386, + "flos": 564741825024.0, + "grad_norm": 0.03222629584019241, + "language_loss": 0.88709021, + "learning_rate": 0.00014832405549665236, + "loss": 0.89848697, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.74316406, + "step": 3929, + "time_per_iteration": 2.69524884223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114176, + "balance_loss_mlp": 1.0672785, + "epoch": 0.7560600230858022, + "flos": 562534205952.0, + "grad_norm": 0.03584285097744866, + "language_loss": 0.82973742, + "learning_rate": 0.00014810266646876746, + "loss": 0.84115505, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.74316406, + "step": 3930, + "time_per_iteration": 2.781097888946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141215, + "balance_loss_mlp": 1.06663764, + "epoch": 0.7562524047710658, + "flos": 720957190656.0, + "grad_norm": 0.038983110262219116, + "language_loss": 0.82315147, + "learning_rate": 0.00014788141405954364, + "loss": 0.83456367, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.74414062, + "step": 3931, + "time_per_iteration": 2.9991354942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140296, + "balance_loss_mlp": 1.06571853, + "epoch": 0.7564447864563294, + "flos": 544396439040.0, + "grad_norm": 0.037101319530533854, + "language_loss": 0.90224212, + "learning_rate": 0.00014766029835487865, + "loss": 0.91364509, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.74414062, + "step": 3932, + "time_per_iteration": 2.692891836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144662, + "balance_loss_mlp": 1.07008481, + "epoch": 0.7566371681415929, + "flos": 727093953024.0, + "grad_norm": 0.03778072998608002, + "language_loss": 0.86007833, + "learning_rate": 0.0001474393194406173, + "loss": 0.87152493, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.74414062, + "step": 3933, + "time_per_iteration": 2.891930341720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146005, + "balance_loss_mlp": 1.07142723, + "epoch": 0.7568295498268565, + "flos": 577806343680.0, + "grad_norm": 0.03260015867991467, + "language_loss": 0.84333152, + "learning_rate": 0.00014721847740255112, + "loss": 0.85479152, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.74414062, + "step": 3934, + "time_per_iteration": 2.799757242202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151863, + "balance_loss_mlp": 1.07919312, + "epoch": 0.75702193151212, + "flos": 1523216060928.0, + "grad_norm": 0.00897818069303787, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.75063783, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.7265625, + "step": 3935, + "time_per_iteration": 4.575445175170898 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146571, + "balance_loss_mlp": 1.07199419, + "epoch": 0.7572143131973836, + "flos": 526488984576.0, + "grad_norm": 0.039044960519486104, + "language_loss": 0.83207357, + "learning_rate": 0.00014677720429790526, + "loss": 0.8435393, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.74414062, + "step": 3936, + "time_per_iteration": 2.6141350269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143608, + "balance_loss_mlp": 1.06917346, + "epoch": 0.7574066948826472, + "flos": 551823025152.0, + "grad_norm": 0.030693904946920876, + "language_loss": 0.88398033, + "learning_rate": 0.0001465567734026429, + "loss": 0.89541638, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.74267578, + "step": 3937, + "time_per_iteration": 2.738377571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136961, + "balance_loss_mlp": 1.06219339, + "epoch": 0.7575990765679107, + "flos": 396769228800.0, + "grad_norm": 0.04103098357371863, + "language_loss": 0.88068545, + "learning_rate": 0.00014633647972621034, + "loss": 0.89205503, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.74609375, + "step": 3938, + "time_per_iteration": 2.4616434574127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138132, + "balance_loss_mlp": 1.06336367, + "epoch": 0.7577914582531743, + "flos": 586185653760.0, + "grad_norm": 0.030008665391221847, + "language_loss": 0.90353823, + "learning_rate": 0.00014611632335413354, + "loss": 0.91491956, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.74609375, + "step": 3939, + "time_per_iteration": 2.775031805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113606, + "balance_loss_mlp": 1.06143546, + "epoch": 0.7579838399384379, + "flos": 822484007424.0, + "grad_norm": 0.031088983596600554, + "language_loss": 0.87266111, + "learning_rate": 0.00014589630437188456, + "loss": 0.8840217, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.74462891, + "step": 3940, + "time_per_iteration": 3.1587963104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.06187153, + "epoch": 0.7581762216237015, + "flos": 444805996032.0, + "grad_norm": 0.04449780821151478, + "language_loss": 0.84434611, + "learning_rate": 0.00014567642286488253, + "loss": 0.85571206, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.74560547, + "step": 3941, + "time_per_iteration": 2.541396141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146151, + "balance_loss_mlp": 1.07143092, + "epoch": 0.7583686033089649, + "flos": 541939041792.0, + "grad_norm": 0.045311193933261745, + "language_loss": 0.84473586, + "learning_rate": 0.00014545667891849258, + "loss": 0.85619736, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.74560547, + "step": 3942, + "time_per_iteration": 2.653228998184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146078, + "balance_loss_mlp": 1.07150042, + "epoch": 0.7585609849942285, + "flos": 523612621824.0, + "grad_norm": 0.032810068859795746, + "language_loss": 0.87606031, + "learning_rate": 0.00014523707261802733, + "loss": 0.88752109, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.74414062, + "step": 3943, + "time_per_iteration": 2.6271109580993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145321, + "balance_loss_mlp": 1.07064807, + "epoch": 0.7587533666794921, + "flos": 542907228672.0, + "grad_norm": 0.03968141925916535, + "language_loss": 0.87281996, + "learning_rate": 0.00014501760404874527, + "loss": 0.88427311, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.74511719, + "step": 3944, + "time_per_iteration": 2.696624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143644, + "balance_loss_mlp": 1.06921005, + "epoch": 0.7589457483647557, + "flos": 607520693760.0, + "grad_norm": 0.03527343203685723, + "language_loss": 0.909307, + "learning_rate": 0.00014479827329585176, + "loss": 0.92074347, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.74267578, + "step": 3945, + "time_per_iteration": 2.7308402061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141632, + "balance_loss_mlp": 1.06724524, + "epoch": 0.7591381300500193, + "flos": 556251724800.0, + "grad_norm": 0.03227407382042984, + "language_loss": 0.88668191, + "learning_rate": 0.00014457908044449846, + "loss": 0.89809817, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.7421875, + "step": 3946, + "time_per_iteration": 2.723604917526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145154, + "balance_loss_mlp": 1.07076728, + "epoch": 0.7593305117352828, + "flos": 530813624832.0, + "grad_norm": 0.032659275008273744, + "language_loss": 0.87264967, + "learning_rate": 0.00014436002557978371, + "loss": 0.88410115, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.7421875, + "step": 3947, + "time_per_iteration": 2.7849090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151436, + "balance_loss_mlp": 1.07876587, + "epoch": 0.7595228934205464, + "flos": 1505922955776.0, + "grad_norm": 0.01242422674418897, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77794582, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.7265625, + "step": 3948, + "time_per_iteration": 4.869319200515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141564, + "balance_loss_mlp": 1.06717777, + "epoch": 0.7597152751058099, + "flos": 456467899392.0, + "grad_norm": 0.03330137470124234, + "language_loss": 0.84041482, + "learning_rate": 0.0001439223301503945, + "loss": 0.85183042, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.7421875, + "step": 3949, + "time_per_iteration": 2.511057138442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141649, + "balance_loss_mlp": 1.06721532, + "epoch": 0.7599076567910735, + "flos": 686798678016.0, + "grad_norm": 0.040114283676211684, + "language_loss": 0.80981869, + "learning_rate": 0.00014370368975564834, + "loss": 0.82123518, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.74267578, + "step": 3950, + "time_per_iteration": 3.0096349716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144078, + "balance_loss_mlp": 1.06973898, + "epoch": 0.760100038476337, + "flos": 533494603776.0, + "grad_norm": 0.03798147365213374, + "language_loss": 0.88830221, + "learning_rate": 0.00014348518768739766, + "loss": 0.89974296, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.74169922, + "step": 3951, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146828, + "balance_loss_mlp": 1.07415771, + "epoch": 0.7602924201616006, + "flos": 1474916780544.0, + "grad_norm": 0.005782127135677509, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77874869, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.7265625, + "step": 3952, + "time_per_iteration": 4.8369224071502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142903, + "balance_loss_mlp": 1.06875467, + "epoch": 0.7604848018468642, + "flos": 776040509952.0, + "grad_norm": 0.03364559855712782, + "language_loss": 0.90537649, + "learning_rate": 0.00014304859886964867, + "loss": 0.91680551, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.74072266, + "step": 3953, + "time_per_iteration": 2.9843015670776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_mlp": 1.06871259, + "epoch": 0.7606771835321278, + "flos": 559260344832.0, + "grad_norm": 0.034495919290042885, + "language_loss": 0.88372874, + "learning_rate": 0.00014283051228964878, + "loss": 0.89515591, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.74023438, + "step": 3954, + "time_per_iteration": 2.6971194744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.06912816, + "epoch": 0.7608695652173914, + "flos": 526432588800.0, + "grad_norm": 0.03600141615552244, + "language_loss": 0.87487853, + "learning_rate": 0.00014261256437514197, + "loss": 0.88630933, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.73974609, + "step": 3955, + "time_per_iteration": 2.641023635864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143325, + "balance_loss_mlp": 1.06932008, + "epoch": 0.7610619469026548, + "flos": 616167246336.0, + "grad_norm": 0.03384728426849952, + "language_loss": 0.87191808, + "learning_rate": 0.0001423947552107428, + "loss": 0.88335133, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.73974609, + "step": 3956, + "time_per_iteration": 2.7422232627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143723, + "balance_loss_mlp": 1.06981361, + "epoch": 0.7612543285879184, + "flos": 864817714176.0, + "grad_norm": 0.03496249839254083, + "language_loss": 0.82073259, + "learning_rate": 0.00014217708488101243, + "loss": 0.83216989, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.73925781, + "step": 3957, + "time_per_iteration": 3.1032650470733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142422, + "balance_loss_mlp": 1.06822646, + "epoch": 0.761446710273182, + "flos": 554727585792.0, + "grad_norm": 0.03657356062959036, + "language_loss": 0.82088828, + "learning_rate": 0.0001419595534704579, + "loss": 0.83231246, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.74121094, + "step": 3958, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145496, + "balance_loss_mlp": 1.07149136, + "epoch": 0.7616390919584456, + "flos": 468325186560.0, + "grad_norm": 0.0357245127474846, + "language_loss": 0.85904223, + "learning_rate": 0.00014174216106353237, + "loss": 0.87049717, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.73974609, + "step": 3959, + "time_per_iteration": 2.595851421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143762, + "balance_loss_mlp": 1.06966209, + "epoch": 0.7618314736437091, + "flos": 499431418368.0, + "grad_norm": 0.03393548471878093, + "language_loss": 0.81279588, + "learning_rate": 0.00014152490774463512, + "loss": 0.82423347, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.73974609, + "step": 3960, + "time_per_iteration": 2.589545488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143507, + "balance_loss_mlp": 1.06931114, + "epoch": 0.7620238553289727, + "flos": 435451768320.0, + "grad_norm": 0.03935121424248522, + "language_loss": 0.92124438, + "learning_rate": 0.00014130779359811135, + "loss": 0.93267947, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.74072266, + "step": 3961, + "time_per_iteration": 2.455334424972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114569, + "balance_loss_mlp": 1.07144618, + "epoch": 0.7622162370142362, + "flos": 665541500928.0, + "grad_norm": 0.033439971209903066, + "language_loss": 0.90740561, + "learning_rate": 0.0001410908187082521, + "loss": 0.91886252, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.74072266, + "step": 3962, + "time_per_iteration": 2.849613904953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145492, + "balance_loss_mlp": 1.07105827, + "epoch": 0.7624086186994998, + "flos": 559028030976.0, + "grad_norm": 0.03941593540167477, + "language_loss": 0.90269017, + "learning_rate": 0.0001408739831592949, + "loss": 0.91414511, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.74267578, + "step": 3963, + "time_per_iteration": 2.638357639312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114573, + "balance_loss_mlp": 1.07134342, + "epoch": 0.7626010003847634, + "flos": 630286546944.0, + "grad_norm": 0.03652031952844941, + "language_loss": 0.82416636, + "learning_rate": 0.0001406572870354224, + "loss": 0.83562368, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.7421875, + "step": 3964, + "time_per_iteration": 2.8123042583465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145859, + "balance_loss_mlp": 1.07142508, + "epoch": 0.7627933820700269, + "flos": 438849154560.0, + "grad_norm": 0.03432760394377559, + "language_loss": 0.91489524, + "learning_rate": 0.00014044073042076337, + "loss": 0.92635381, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.74267578, + "step": 3965, + "time_per_iteration": 2.536203145980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146519, + "balance_loss_mlp": 1.0722276, + "epoch": 0.7629857637552905, + "flos": 533794046976.0, + "grad_norm": 0.02784014268631594, + "language_loss": 0.9243055, + "learning_rate": 0.00014022431339939302, + "loss": 0.93577063, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.74121094, + "step": 3966, + "time_per_iteration": 2.6469874382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145692, + "balance_loss_mlp": 1.07135272, + "epoch": 0.7631781454405541, + "flos": 681236606976.0, + "grad_norm": 0.04013351668688065, + "language_loss": 0.82884651, + "learning_rate": 0.00014000803605533163, + "loss": 0.84030342, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.74169922, + "step": 3967, + "time_per_iteration": 2.802208185195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145081, + "balance_loss_mlp": 1.07074177, + "epoch": 0.7633705271258177, + "flos": 508488204288.0, + "grad_norm": 0.04349575646472503, + "language_loss": 0.88445222, + "learning_rate": 0.00013979189847254553, + "loss": 0.89590299, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.74169922, + "step": 3968, + "time_per_iteration": 2.5820798873901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145241, + "balance_loss_mlp": 1.07085466, + "epoch": 0.7635629088110811, + "flos": 620037992448.0, + "grad_norm": 0.0345033477005795, + "language_loss": 0.85449362, + "learning_rate": 0.00013957590073494674, + "loss": 0.86594605, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.7421875, + "step": 3969, + "time_per_iteration": 2.7904934883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139507, + "balance_loss_mlp": 1.0648824, + "epoch": 0.7637552904963447, + "flos": 639566914560.0, + "grad_norm": 0.03972116820389674, + "language_loss": 0.84200621, + "learning_rate": 0.0001393600429263931, + "loss": 0.8534013, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.74462891, + "step": 3970, + "time_per_iteration": 2.7333059310913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145393, + "balance_loss_mlp": 1.07272339, + "epoch": 0.7639476721816083, + "flos": 1566683865600.0, + "grad_norm": 0.008603454608039083, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75890285, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.7265625, + "step": 3971, + "time_per_iteration": 4.924766302108765 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139229, + "balance_loss_mlp": 1.06484199, + "epoch": 0.7641400538668719, + "flos": 497019683328.0, + "grad_norm": 0.0358458499629568, + "language_loss": 0.86623794, + "learning_rate": 0.0001389287474315804, + "loss": 0.87763023, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.7421875, + "step": 3972, + "time_per_iteration": 2.6104958057403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139206, + "balance_loss_mlp": 1.06481898, + "epoch": 0.7643324355521355, + "flos": 579514406400.0, + "grad_norm": 0.02970253105840928, + "language_loss": 0.84359801, + "learning_rate": 0.00013871330991276505, + "loss": 0.85499001, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.7421875, + "step": 3973, + "time_per_iteration": 2.7183613777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145413, + "balance_loss_mlp": 1.07102644, + "epoch": 0.764524817237399, + "flos": 786232668672.0, + "grad_norm": 0.038742643805220495, + "language_loss": 0.85575706, + "learning_rate": 0.00013849801265788247, + "loss": 0.86721122, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.7421875, + "step": 3974, + "time_per_iteration": 3.0245180130004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145329, + "balance_loss_mlp": 1.07094204, + "epoch": 0.7647171989226625, + "flos": 527298717696.0, + "grad_norm": 0.0343294309098999, + "language_loss": 0.88214505, + "learning_rate": 0.00013828285575051818, + "loss": 0.89359832, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.7421875, + "step": 3975, + "time_per_iteration": 2.6501829624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143749, + "balance_loss_mlp": 1.06964874, + "epoch": 0.7649095806079261, + "flos": 556028143104.0, + "grad_norm": 0.034577120087892245, + "language_loss": 0.88279045, + "learning_rate": 0.0001380678392742035, + "loss": 0.89422792, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.74072266, + "step": 3976, + "time_per_iteration": 2.717852830886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143601, + "balance_loss_mlp": 1.06921458, + "epoch": 0.7651019622931897, + "flos": 650388885504.0, + "grad_norm": 0.0329487622471132, + "language_loss": 0.89186555, + "learning_rate": 0.00013785296331241526, + "loss": 0.90330154, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.7421875, + "step": 3977, + "time_per_iteration": 2.877988576889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113775, + "balance_loss_mlp": 1.06336296, + "epoch": 0.7652943439784533, + "flos": 1048112113152.0, + "grad_norm": 0.034644421756337376, + "language_loss": 0.92511564, + "learning_rate": 0.00013763822794857583, + "loss": 0.9364931, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.7421875, + "step": 3978, + "time_per_iteration": 3.3197543621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113835, + "balance_loss_mlp": 1.06386817, + "epoch": 0.7654867256637168, + "flos": 505414456320.0, + "grad_norm": 0.032056341535250436, + "language_loss": 0.94870603, + "learning_rate": 0.00013742363326605278, + "loss": 0.96008945, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.74316406, + "step": 3979, + "time_per_iteration": 2.714352607727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.06330967, + "epoch": 0.7656791073489804, + "flos": 575863239168.0, + "grad_norm": 0.03156054452878063, + "language_loss": 0.82591552, + "learning_rate": 0.00013720917934815935, + "loss": 0.83729297, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.74267578, + "step": 3980, + "time_per_iteration": 2.717848300933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.06394827, + "epoch": 0.765871489034244, + "flos": 493791484416.0, + "grad_norm": 0.0408766328487834, + "language_loss": 0.88351345, + "learning_rate": 0.00013699486627815344, + "loss": 0.89489782, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.74316406, + "step": 3981, + "time_per_iteration": 2.570958137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114649, + "balance_loss_mlp": 1.07215071, + "epoch": 0.7660638707195075, + "flos": 487051106304.0, + "grad_norm": 0.03334801499225344, + "language_loss": 0.87230325, + "learning_rate": 0.00013678069413923928, + "loss": 0.8837682, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.74169922, + "step": 3982, + "time_per_iteration": 2.59192156791687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.07168806, + "epoch": 0.766256252404771, + "flos": 445242425856.0, + "grad_norm": 0.033038982399311745, + "language_loss": 0.86065191, + "learning_rate": 0.00013656666301456555, + "loss": 0.8721112, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.74121094, + "step": 3983, + "time_per_iteration": 2.5096640586853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139926, + "balance_loss_mlp": 1.06568277, + "epoch": 0.7664486340900346, + "flos": 486213175296.0, + "grad_norm": 0.0343473148612919, + "language_loss": 0.88720405, + "learning_rate": 0.0001363527729872267, + "loss": 0.89860332, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.74072266, + "step": 3984, + "time_per_iteration": 2.652386426925659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138175, + "balance_loss_mlp": 1.06359744, + "epoch": 0.7666410157752982, + "flos": 647384268288.0, + "grad_norm": 0.033932927272579565, + "language_loss": 0.81177199, + "learning_rate": 0.00013613902414026207, + "loss": 0.82315373, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.74414062, + "step": 3985, + "time_per_iteration": 2.785083055496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138176, + "balance_loss_mlp": 1.06359911, + "epoch": 0.7668333974605618, + "flos": 775660475904.0, + "grad_norm": 0.03599596212719163, + "language_loss": 0.86968917, + "learning_rate": 0.00013592541655665642, + "loss": 0.88107091, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.74414062, + "step": 3986, + "time_per_iteration": 3.013932704925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144262, + "balance_loss_mlp": 1.06987572, + "epoch": 0.7670257791458254, + "flos": 614512851456.0, + "grad_norm": 0.036460289004419034, + "language_loss": 0.90080905, + "learning_rate": 0.00013571195031933947, + "loss": 0.91225165, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.7421875, + "step": 3987, + "time_per_iteration": 2.6782960891723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114978, + "balance_loss_mlp": 1.0776825, + "epoch": 0.7672181608310888, + "flos": 1488362608128.0, + "grad_norm": 0.008503355118198302, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81631124, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.72265625, + "step": 3988, + "time_per_iteration": 4.697616338729858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135719, + "balance_loss_mlp": 1.06128454, + "epoch": 0.7674105425163524, + "flos": 611866801152.0, + "grad_norm": 0.03376269838630617, + "language_loss": 0.9032138, + "learning_rate": 0.00013528544221501655, + "loss": 0.91457105, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.74267578, + "step": 3989, + "time_per_iteration": 2.731600284576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135719, + "balance_loss_mlp": 1.06118917, + "epoch": 0.767602924201616, + "flos": 846604085760.0, + "grad_norm": 0.0353786451651817, + "language_loss": 0.86480021, + "learning_rate": 0.00013507240051359586, + "loss": 0.8761574, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.74365234, + "step": 3990, + "time_per_iteration": 3.0497024059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135952, + "balance_loss_mlp": 1.06156516, + "epoch": 0.7677953058868796, + "flos": 528145380864.0, + "grad_norm": 0.040368948500693246, + "language_loss": 0.91154569, + "learning_rate": 0.00013485950048963425, + "loss": 0.92290527, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.7421875, + "step": 3991, + "time_per_iteration": 2.596708059310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135888, + "balance_loss_mlp": 1.06145394, + "epoch": 0.7679876875721431, + "flos": 925111268352.0, + "grad_norm": 0.05870608675269832, + "language_loss": 0.88347316, + "learning_rate": 0.00013464674222578643, + "loss": 0.89483202, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.74267578, + "step": 3992, + "time_per_iteration": 3.1901588439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114079, + "balance_loss_mlp": 1.06640303, + "epoch": 0.7681800692574067, + "flos": 459018622464.0, + "grad_norm": 0.03723022902665057, + "language_loss": 0.87956703, + "learning_rate": 0.00013443412580465292, + "loss": 0.89097494, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.7421875, + "step": 3993, + "time_per_iteration": 2.603252649307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141634, + "balance_loss_mlp": 1.06724763, + "epoch": 0.7683724509426703, + "flos": 659732379648.0, + "grad_norm": 0.0341053080993109, + "language_loss": 0.8901087, + "learning_rate": 0.00013422165130877857, + "loss": 0.90152502, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.7421875, + "step": 3994, + "time_per_iteration": 2.911731004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142658, + "balance_loss_mlp": 1.06827152, + "epoch": 0.7685648326279338, + "flos": 556338319872.0, + "grad_norm": 0.037345354137488074, + "language_loss": 0.84750074, + "learning_rate": 0.00013400931882065327, + "loss": 0.85892731, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.7421875, + "step": 3995, + "time_per_iteration": 2.6689093112945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142291, + "balance_loss_mlp": 1.06790483, + "epoch": 0.7687572143131974, + "flos": 688743783936.0, + "grad_norm": 0.03341807173983279, + "language_loss": 0.85686117, + "learning_rate": 0.0001337971284227118, + "loss": 0.86828411, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.7421875, + "step": 3996, + "time_per_iteration": 3.0353329181671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.07644653, + "epoch": 0.7689495959984609, + "flos": 1492665781248.0, + "grad_norm": 0.006288320283860005, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77266961, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.72265625, + "step": 3997, + "time_per_iteration": 4.911880731582642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144884, + "balance_loss_mlp": 1.07049692, + "epoch": 0.7691419776837245, + "flos": 571499667456.0, + "grad_norm": 0.031757425540639796, + "language_loss": 0.84642863, + "learning_rate": 0.0001333731742268438, + "loss": 0.85787749, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.7421875, + "step": 3998, + "time_per_iteration": 2.6962177753448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145361, + "balance_loss_mlp": 1.07097435, + "epoch": 0.7693343593689881, + "flos": 521190153216.0, + "grad_norm": 0.03369214696754818, + "language_loss": 0.89708233, + "learning_rate": 0.0001331614105935109, + "loss": 0.9085359, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.7421875, + "step": 3999, + "time_per_iteration": 2.6809701919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114508, + "balance_loss_mlp": 1.07074106, + "epoch": 0.7695267410542517, + "flos": 661551232512.0, + "grad_norm": 0.03371243854874441, + "language_loss": 0.88376063, + "learning_rate": 0.00013294978937954883, + "loss": 0.8952114, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.74169922, + "step": 4000, + "time_per_iteration": 2.867079973220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114193, + "balance_loss_mlp": 1.06754363, + "epoch": 0.7697191227395151, + "flos": 547858953216.0, + "grad_norm": 0.037308762350110276, + "language_loss": 0.89336216, + "learning_rate": 0.00013273831066711655, + "loss": 0.90478146, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.7421875, + "step": 4001, + "time_per_iteration": 2.5953049659729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141038, + "balance_loss_mlp": 1.06684196, + "epoch": 0.7699115044247787, + "flos": 541695994368.0, + "grad_norm": 0.03259494083798661, + "language_loss": 0.84480441, + "learning_rate": 0.00013252697453831747, + "loss": 0.85621476, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.74121094, + "step": 4002, + "time_per_iteration": 2.685664653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140964, + "balance_loss_mlp": 1.06686342, + "epoch": 0.7701038861100423, + "flos": 564142938624.0, + "grad_norm": 0.03879527633270508, + "language_loss": 0.87191802, + "learning_rate": 0.00013231578107519916, + "loss": 0.8833276, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.74072266, + "step": 4003, + "time_per_iteration": 2.8707611560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142721, + "balance_loss_mlp": 1.06843019, + "epoch": 0.7702962677953059, + "flos": 482733196800.0, + "grad_norm": 0.03964954780213044, + "language_loss": 0.87790287, + "learning_rate": 0.00013210473035975422, + "loss": 0.88933003, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.74169922, + "step": 4004, + "time_per_iteration": 2.577669143676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137199, + "balance_loss_mlp": 1.06266928, + "epoch": 0.7704886494805695, + "flos": 771805192704.0, + "grad_norm": 0.03541890764411222, + "language_loss": 0.90018678, + "learning_rate": 0.0001318938224739201, + "loss": 0.91155875, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.74365234, + "step": 4005, + "time_per_iteration": 3.054161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138331, + "balance_loss_mlp": 1.06384909, + "epoch": 0.770681031165833, + "flos": 602317189632.0, + "grad_norm": 0.032853196947195275, + "language_loss": 0.87994003, + "learning_rate": 0.00013168305749957843, + "loss": 0.89132333, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.74316406, + "step": 4006, + "time_per_iteration": 2.742284059524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139142, + "balance_loss_mlp": 1.06461227, + "epoch": 0.7708734128510966, + "flos": 497095544832.0, + "grad_norm": 0.034737097331234285, + "language_loss": 0.87459195, + "learning_rate": 0.00013147243551855532, + "loss": 0.88598335, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.74365234, + "step": 4007, + "time_per_iteration": 2.565561532974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138441, + "balance_loss_mlp": 1.06400645, + "epoch": 0.7710657945363601, + "flos": 568454117376.0, + "grad_norm": 0.028865688800901353, + "language_loss": 0.84292293, + "learning_rate": 0.00013126195661262148, + "loss": 0.85430735, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.74267578, + "step": 4008, + "time_per_iteration": 2.76387357711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143893, + "balance_loss_mlp": 1.06969726, + "epoch": 0.7712581762216237, + "flos": 605749504512.0, + "grad_norm": 0.03137791389810697, + "language_loss": 0.90203846, + "learning_rate": 0.00013105162086349216, + "loss": 0.91347742, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.74121094, + "step": 4009, + "time_per_iteration": 2.8172740936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144917, + "balance_loss_mlp": 1.07057822, + "epoch": 0.7714505579068872, + "flos": 531996661248.0, + "grad_norm": 0.03056437231076115, + "language_loss": 0.89419609, + "learning_rate": 0.00013084142835282687, + "loss": 0.90564525, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.74169922, + "step": 4010, + "time_per_iteration": 2.7165045738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150368, + "balance_loss_mlp": 1.07769775, + "epoch": 0.7716429395921508, + "flos": 1425380069376.0, + "grad_norm": 0.007418114590999428, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.81034732, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.7265625, + "step": 4011, + "time_per_iteration": 4.772608757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143697, + "balance_loss_mlp": 1.06978679, + "epoch": 0.7718353212774144, + "flos": 579586265088.0, + "grad_norm": 0.032910193378974356, + "language_loss": 0.94427228, + "learning_rate": 0.0001304214733732485, + "loss": 0.95570928, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.73925781, + "step": 4012, + "time_per_iteration": 2.789973258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143696, + "balance_loss_mlp": 1.06969118, + "epoch": 0.772027702962678, + "flos": 511772798976.0, + "grad_norm": 0.03524437980359451, + "language_loss": 0.87796986, + "learning_rate": 0.00013021171106737672, + "loss": 0.8894068, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.74023438, + "step": 4013, + "time_per_iteration": 2.71975040435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113924, + "balance_loss_mlp": 1.06499684, + "epoch": 0.7722200846479416, + "flos": 526747494912.0, + "grad_norm": 0.030121234112763372, + "language_loss": 0.84496903, + "learning_rate": 0.00013000209232605071, + "loss": 0.85636145, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.74121094, + "step": 4014, + "time_per_iteration": 2.6892056465148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139469, + "balance_loss_mlp": 1.06508267, + "epoch": 0.772412466333205, + "flos": 480601439232.0, + "grad_norm": 0.03460224041299985, + "language_loss": 0.83357382, + "learning_rate": 0.0001297926172306519, + "loss": 0.84496856, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.7421875, + "step": 4015, + "time_per_iteration": 2.6161460876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138641, + "balance_loss_mlp": 1.06449294, + "epoch": 0.7726048480184686, + "flos": 907312602624.0, + "grad_norm": 0.03829273799260643, + "language_loss": 0.83440059, + "learning_rate": 0.0001295832858625055, + "loss": 0.84578699, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.74023438, + "step": 4016, + "time_per_iteration": 3.286180019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137589, + "balance_loss_mlp": 1.06329787, + "epoch": 0.7727972297037322, + "flos": 632566024704.0, + "grad_norm": 0.037636726324715264, + "language_loss": 0.7551474, + "learning_rate": 0.00012937409830288154, + "loss": 0.7665233, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.74121094, + "step": 4017, + "time_per_iteration": 2.8370349407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142791, + "balance_loss_mlp": 1.0688808, + "epoch": 0.7729896113889958, + "flos": 415673068032.0, + "grad_norm": 0.038209347580389144, + "language_loss": 0.9001559, + "learning_rate": 0.00012916505463299362, + "loss": 0.91158378, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.73925781, + "step": 4018, + "time_per_iteration": 2.519319772720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141641, + "balance_loss_mlp": 1.06754065, + "epoch": 0.7731819930742593, + "flos": 670104459264.0, + "grad_norm": 0.03754903876157777, + "language_loss": 0.83159339, + "learning_rate": 0.00012895615493399972, + "loss": 0.84300983, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.74072266, + "step": 4019, + "time_per_iteration": 2.8084754943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136472, + "balance_loss_mlp": 1.06203771, + "epoch": 0.7733743747595229, + "flos": 490858725888.0, + "grad_norm": 0.052975326566308774, + "language_loss": 0.88814008, + "learning_rate": 0.00012874739928700192, + "loss": 0.89950484, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.74267578, + "step": 4020, + "time_per_iteration": 2.6240487098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113737, + "balance_loss_mlp": 1.06307888, + "epoch": 0.7735667564447865, + "flos": 660887218176.0, + "grad_norm": 0.04201046633060088, + "language_loss": 0.84696388, + "learning_rate": 0.00012853878777304624, + "loss": 0.85833752, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.74121094, + "step": 4021, + "time_per_iteration": 2.873288154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135595, + "balance_loss_mlp": 1.06120825, + "epoch": 0.77375913813005, + "flos": 534490988544.0, + "grad_norm": 0.02933243833596509, + "language_loss": 0.88221383, + "learning_rate": 0.000128330320473123, + "loss": 0.89356983, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.7421875, + "step": 4022, + "time_per_iteration": 2.6959497928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138573, + "balance_loss_mlp": 1.06590271, + "epoch": 0.7739515198153136, + "flos": 1523379244032.0, + "grad_norm": 0.005476553783658496, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79470468, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.7265625, + "step": 4023, + "time_per_iteration": 4.908393621444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136949, + "balance_loss_mlp": 1.06256282, + "epoch": 0.7741439015005771, + "flos": 641251508736.0, + "grad_norm": 0.0388161486580036, + "language_loss": 0.86722291, + "learning_rate": 0.0001279138188390543, + "loss": 0.87859237, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.7421875, + "step": 4024, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142263, + "balance_loss_mlp": 1.06835282, + "epoch": 0.7743362831858407, + "flos": 667023980544.0, + "grad_norm": 0.03451580070650428, + "language_loss": 0.90432525, + "learning_rate": 0.00012770578466660915, + "loss": 0.91574788, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.73925781, + "step": 4025, + "time_per_iteration": 2.862123489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142172, + "balance_loss_mlp": 1.06807196, + "epoch": 0.7745286648711043, + "flos": 563993217024.0, + "grad_norm": 0.03283033762939225, + "language_loss": 0.85806942, + "learning_rate": 0.0001274978950315968, + "loss": 0.86949122, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.74072266, + "step": 4026, + "time_per_iteration": 2.802757501602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137274, + "balance_loss_mlp": 1.06288695, + "epoch": 0.7747210465563679, + "flos": 517961954304.0, + "grad_norm": 0.042128094380904035, + "language_loss": 0.87673521, + "learning_rate": 0.00012729015001472716, + "loss": 0.88810796, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.7421875, + "step": 4027, + "time_per_iteration": 2.6692821979522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137346, + "balance_loss_mlp": 1.06295931, + "epoch": 0.7749134282416313, + "flos": 635368527360.0, + "grad_norm": 0.03931555017475162, + "language_loss": 0.86517704, + "learning_rate": 0.00012708254969665418, + "loss": 0.87655056, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.7421875, + "step": 4028, + "time_per_iteration": 2.7921457290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138509, + "balance_loss_mlp": 1.0641222, + "epoch": 0.7751058099268949, + "flos": 496350939648.0, + "grad_norm": 0.04579390573234304, + "language_loss": 0.889467, + "learning_rate": 0.00012687509415797526, + "loss": 0.90085208, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.7421875, + "step": 4029, + "time_per_iteration": 2.5587246417999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137293, + "balance_loss_mlp": 1.06304908, + "epoch": 0.7752981916121585, + "flos": 511362565632.0, + "grad_norm": 0.03685004486441248, + "language_loss": 0.85761744, + "learning_rate": 0.00012666778347923208, + "loss": 0.86899036, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.74072266, + "step": 4030, + "time_per_iteration": 2.6332554817199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143646, + "balance_loss_mlp": 1.06978357, + "epoch": 0.7754905732974221, + "flos": 498565289472.0, + "grad_norm": 0.03255854062300405, + "language_loss": 0.87846529, + "learning_rate": 0.0001264606177409092, + "loss": 0.88990176, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.73876953, + "step": 4031, + "time_per_iteration": 2.6323087215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139185, + "balance_loss_mlp": 1.06498873, + "epoch": 0.7756829549826857, + "flos": 481782474240.0, + "grad_norm": 0.03677638670321597, + "language_loss": 0.90051126, + "learning_rate": 0.00012625359702343609, + "loss": 0.91190314, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.74023438, + "step": 4032, + "time_per_iteration": 2.764946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136066, + "balance_loss_mlp": 1.06186974, + "epoch": 0.7758753366679492, + "flos": 553685538816.0, + "grad_norm": 0.03552074396287166, + "language_loss": 0.89551866, + "learning_rate": 0.00012604672140718504, + "loss": 0.90687937, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.74072266, + "step": 4033, + "time_per_iteration": 2.616276741027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136731, + "balance_loss_mlp": 1.06243956, + "epoch": 0.7760677183532128, + "flos": 705063972864.0, + "grad_norm": 0.03368756555440988, + "language_loss": 0.82777321, + "learning_rate": 0.00012583999097247233, + "loss": 0.83914053, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.74121094, + "step": 4034, + "time_per_iteration": 2.8126814365386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136753, + "balance_loss_mlp": 1.06255746, + "epoch": 0.7762601000384763, + "flos": 524478750720.0, + "grad_norm": 0.036921944541312396, + "language_loss": 0.85384995, + "learning_rate": 0.0001256334057995578, + "loss": 0.86521751, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.74072266, + "step": 4035, + "time_per_iteration": 2.6846728324890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138277, + "balance_loss_mlp": 1.0641768, + "epoch": 0.7764524817237399, + "flos": 558617797632.0, + "grad_norm": 0.033254007354158545, + "language_loss": 0.89694679, + "learning_rate": 0.000125426965968645, + "loss": 0.90832961, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.73974609, + "step": 4036, + "time_per_iteration": 2.747835636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144007, + "balance_loss_mlp": 1.07009733, + "epoch": 0.7766448634090035, + "flos": 580816965120.0, + "grad_norm": 0.036524717116784906, + "language_loss": 0.87124515, + "learning_rate": 0.00012522067155988092, + "loss": 0.88268518, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.73925781, + "step": 4037, + "time_per_iteration": 2.7287211418151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011441, + "balance_loss_mlp": 1.07028544, + "epoch": 0.776837245094267, + "flos": 636818806272.0, + "grad_norm": 0.04076227552668926, + "language_loss": 0.80187047, + "learning_rate": 0.00012501452265335617, + "loss": 0.81331146, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.73828125, + "step": 4038, + "time_per_iteration": 2.811866283416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138752, + "balance_loss_mlp": 1.06455588, + "epoch": 0.7770296267795306, + "flos": 615813408768.0, + "grad_norm": 0.0355390445236554, + "language_loss": 0.87746716, + "learning_rate": 0.0001248085193291047, + "loss": 0.88885468, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.74023438, + "step": 4039, + "time_per_iteration": 2.734161853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137901, + "balance_loss_mlp": 1.06380022, + "epoch": 0.7772220084647942, + "flos": 880295969280.0, + "grad_norm": 0.030150697576870535, + "language_loss": 0.86369264, + "learning_rate": 0.00012460266166710443, + "loss": 0.87507164, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.73974609, + "step": 4040, + "time_per_iteration": 3.137223243713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146543, + "balance_loss_mlp": 1.07215619, + "epoch": 0.7774143901500578, + "flos": 841038011904.0, + "grad_norm": 0.03809465045400834, + "language_loss": 0.82413107, + "learning_rate": 0.00012439694974727633, + "loss": 0.8355965, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.7421875, + "step": 4041, + "time_per_iteration": 3.0596840381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146741, + "balance_loss_mlp": 1.07225895, + "epoch": 0.7776067718353212, + "flos": 569228921856.0, + "grad_norm": 0.03500635055952716, + "language_loss": 0.84672141, + "learning_rate": 0.00012419138364948458, + "loss": 0.85818887, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.74316406, + "step": 4042, + "time_per_iteration": 2.697154998779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153394, + "balance_loss_mlp": 1.07919836, + "epoch": 0.7777991535205848, + "flos": 747209026560.0, + "grad_norm": 0.038117976475530245, + "language_loss": 0.87011731, + "learning_rate": 0.00012398596345353702, + "loss": 0.88165122, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.74072266, + "step": 4043, + "time_per_iteration": 2.903593063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145251, + "balance_loss_mlp": 1.07086432, + "epoch": 0.7779915352058484, + "flos": 539182201344.0, + "grad_norm": 0.034270473867383876, + "language_loss": 0.87845659, + "learning_rate": 0.0001237806892391851, + "loss": 0.88990903, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.7421875, + "step": 4044, + "time_per_iteration": 2.713480234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145012, + "balance_loss_mlp": 1.0706259, + "epoch": 0.778183916891112, + "flos": 635954678784.0, + "grad_norm": 0.03512178084580865, + "language_loss": 0.85495478, + "learning_rate": 0.0001235755610861233, + "loss": 0.86640489, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.7421875, + "step": 4045, + "time_per_iteration": 2.732534170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141546, + "balance_loss_mlp": 1.06711173, + "epoch": 0.7783762985763756, + "flos": 589789157376.0, + "grad_norm": 0.036702613640591464, + "language_loss": 0.89351201, + "learning_rate": 0.0001233705790739893, + "loss": 0.90492749, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.74267578, + "step": 4046, + "time_per_iteration": 2.7078564167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139744, + "balance_loss_mlp": 1.06535733, + "epoch": 0.7785686802616391, + "flos": 932240412672.0, + "grad_norm": 0.03647485158303252, + "language_loss": 0.79245514, + "learning_rate": 0.0001231657432823643, + "loss": 0.80385262, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.7421875, + "step": 4047, + "time_per_iteration": 3.204200029373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114146, + "balance_loss_mlp": 1.06707299, + "epoch": 0.7787610619469026, + "flos": 498956057088.0, + "grad_norm": 0.04086385671919431, + "language_loss": 0.84949565, + "learning_rate": 0.0001229610537907725, + "loss": 0.86091024, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.7421875, + "step": 4048, + "time_per_iteration": 2.587411403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139179, + "balance_loss_mlp": 1.06483984, + "epoch": 0.7789534436321662, + "flos": 516650663424.0, + "grad_norm": 0.0370984959952915, + "language_loss": 0.95913208, + "learning_rate": 0.00012275651067868143, + "loss": 0.97052377, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.74169922, + "step": 4049, + "time_per_iteration": 2.6297829151153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145054, + "balance_loss_mlp": 1.07095397, + "epoch": 0.7791458253174298, + "flos": 990061106688.0, + "grad_norm": 0.049766868205719794, + "language_loss": 0.84448528, + "learning_rate": 0.00012255211402550182, + "loss": 0.85593581, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.74072266, + "step": 4050, + "time_per_iteration": 3.2185845375061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138393, + "balance_loss_mlp": 1.06400621, + "epoch": 0.7793382070026933, + "flos": 630184488960.0, + "grad_norm": 0.041629514228615855, + "language_loss": 0.82138163, + "learning_rate": 0.00012234786391058727, + "loss": 0.83276558, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.7421875, + "step": 4051, + "time_per_iteration": 2.7984745502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114015, + "balance_loss_mlp": 1.06590664, + "epoch": 0.7795305886879569, + "flos": 532762733568.0, + "grad_norm": 0.042901247751836985, + "language_loss": 0.90027404, + "learning_rate": 0.0001221437604132352, + "loss": 0.91167557, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.74072266, + "step": 4052, + "time_per_iteration": 2.6062204837799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139339, + "balance_loss_mlp": 1.06490481, + "epoch": 0.7797229703732205, + "flos": 613141161984.0, + "grad_norm": 0.0426206226565264, + "language_loss": 0.86529624, + "learning_rate": 0.0001219398036126852, + "loss": 0.87668967, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.74267578, + "step": 4053, + "time_per_iteration": 2.7453675270080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137791, + "balance_loss_mlp": 1.06340408, + "epoch": 0.7799153520584841, + "flos": 873794635776.0, + "grad_norm": 0.03320369943222444, + "language_loss": 0.82415718, + "learning_rate": 0.00012173599358812027, + "loss": 0.83553505, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.7421875, + "step": 4054, + "time_per_iteration": 3.2739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137317, + "balance_loss_mlp": 1.06293011, + "epoch": 0.7801077337437476, + "flos": 584744107008.0, + "grad_norm": 0.03804124847596099, + "language_loss": 0.87714571, + "learning_rate": 0.0001215323304186668, + "loss": 0.88851887, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.7421875, + "step": 4055, + "time_per_iteration": 2.7659378051757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_mlp": 1.06319451, + "epoch": 0.7803001154290111, + "flos": 602280259584.0, + "grad_norm": 0.03158827116137511, + "language_loss": 0.91988087, + "learning_rate": 0.00012132881418339364, + "loss": 0.93125427, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.74072266, + "step": 4056, + "time_per_iteration": 2.7168469429016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114492, + "balance_loss_mlp": 1.07263184, + "epoch": 0.7804924971142747, + "flos": 1482925515264.0, + "grad_norm": 0.005095674237873183, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78662485, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.72460938, + "step": 4057, + "time_per_iteration": 4.8585734367370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113894, + "balance_loss_mlp": 1.06460154, + "epoch": 0.7806848787995383, + "flos": 631515245568.0, + "grad_norm": 0.03359665860494396, + "language_loss": 0.81806797, + "learning_rate": 0.00012092222283137944, + "loss": 0.8294574, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.74169922, + "step": 4058, + "time_per_iteration": 2.757882595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115152, + "balance_loss_mlp": 1.079422, + "epoch": 0.7808772604848019, + "flos": 1420745252352.0, + "grad_norm": 0.008112478231263178, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.8005783, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.72265625, + "step": 4059, + "time_per_iteration": 4.779797315597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011384, + "balance_loss_mlp": 1.06406116, + "epoch": 0.7810696421700654, + "flos": 733103187456.0, + "grad_norm": 0.03176373649090862, + "language_loss": 0.88107026, + "learning_rate": 0.00012051622016348856, + "loss": 0.89245427, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.74169922, + "step": 4060, + "time_per_iteration": 3.0269150733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138414, + "balance_loss_mlp": 1.06412303, + "epoch": 0.781262023855329, + "flos": 425837028864.0, + "grad_norm": 0.038145388321841694, + "language_loss": 0.90811419, + "learning_rate": 0.00012031343978315539, + "loss": 0.91949832, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.74121094, + "step": 4061, + "time_per_iteration": 2.459432363510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136363, + "balance_loss_mlp": 1.06197631, + "epoch": 0.7814544055405925, + "flos": 502073465856.0, + "grad_norm": 0.03753829813607959, + "language_loss": 0.87161046, + "learning_rate": 0.00012011080681021774, + "loss": 0.88297415, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.7421875, + "step": 4062, + "time_per_iteration": 2.691654920578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136384, + "balance_loss_mlp": 1.06204486, + "epoch": 0.7816467872258561, + "flos": 463392927744.0, + "grad_norm": 0.03545714253981061, + "language_loss": 0.90689021, + "learning_rate": 0.00011990832132334512, + "loss": 0.91825402, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.74169922, + "step": 4063, + "time_per_iteration": 2.501356363296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011365, + "balance_loss_mlp": 1.06211364, + "epoch": 0.7818391689111197, + "flos": 742107580416.0, + "grad_norm": 0.03646375779692072, + "language_loss": 0.8761006, + "learning_rate": 0.00011970598340114897, + "loss": 0.8874656, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.7421875, + "step": 4064, + "time_per_iteration": 2.9211695194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138234, + "balance_loss_mlp": 1.06389523, + "epoch": 0.7820315505963832, + "flos": 548805672960.0, + "grad_norm": 0.037373767627345386, + "language_loss": 0.88286138, + "learning_rate": 0.00011950379312218396, + "loss": 0.89424372, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.74169922, + "step": 4065, + "time_per_iteration": 2.7662761211395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139451, + "balance_loss_mlp": 1.06511247, + "epoch": 0.7822239322816468, + "flos": 730259025408.0, + "grad_norm": 0.031688812892368586, + "language_loss": 0.90089023, + "learning_rate": 0.00011930175056494719, + "loss": 0.91228467, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.74169922, + "step": 4066, + "time_per_iteration": 2.8510522842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145428, + "balance_loss_mlp": 1.07137561, + "epoch": 0.7824163139669104, + "flos": 452985919488.0, + "grad_norm": 0.030648314991386538, + "language_loss": 0.79762566, + "learning_rate": 0.00011909985580787885, + "loss": 0.80907995, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.73974609, + "step": 4067, + "time_per_iteration": 2.6272332668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144706, + "balance_loss_mlp": 1.07074893, + "epoch": 0.782608695652174, + "flos": 541620132864.0, + "grad_norm": 0.030654260562385374, + "language_loss": 0.85639668, + "learning_rate": 0.00011889810892936137, + "loss": 0.86784375, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.73974609, + "step": 4068, + "time_per_iteration": 2.7750964164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114329, + "balance_loss_mlp": 1.06899869, + "epoch": 0.7828010773374374, + "flos": 501428917248.0, + "grad_norm": 0.03582388212815207, + "language_loss": 0.82907784, + "learning_rate": 0.00011869651000771959, + "loss": 0.84051073, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.74169922, + "step": 4069, + "time_per_iteration": 2.8643925189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138233, + "balance_loss_mlp": 1.06389439, + "epoch": 0.782993459022701, + "flos": 601917689856.0, + "grad_norm": 0.03429166344261292, + "language_loss": 0.87759733, + "learning_rate": 0.00011849505912122117, + "loss": 0.88897967, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.74169922, + "step": 4070, + "time_per_iteration": 2.6959619522094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138061, + "balance_loss_mlp": 1.06377029, + "epoch": 0.7831858407079646, + "flos": 811475384832.0, + "grad_norm": 0.039746496548432604, + "language_loss": 0.82642615, + "learning_rate": 0.00011829375634807654, + "loss": 0.8378067, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.74121094, + "step": 4071, + "time_per_iteration": 3.0114569664001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136715, + "balance_loss_mlp": 1.06247175, + "epoch": 0.7833782223932282, + "flos": 808012870656.0, + "grad_norm": 0.03273964905208881, + "language_loss": 0.857427, + "learning_rate": 0.00011809260176643821, + "loss": 0.86879414, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.74121094, + "step": 4072, + "time_per_iteration": 3.0994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06206274, + "epoch": 0.7835706040784918, + "flos": 521899829760.0, + "grad_norm": 0.04024817722432492, + "language_loss": 0.88959461, + "learning_rate": 0.00011789159545440131, + "loss": 0.9009558, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.74023438, + "step": 4073, + "time_per_iteration": 2.644077777862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138181, + "balance_loss_mlp": 1.06398499, + "epoch": 0.7837629857637552, + "flos": 506743211520.0, + "grad_norm": 0.03009333087268268, + "language_loss": 0.86380607, + "learning_rate": 0.00011769073749000348, + "loss": 0.87518787, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.74023438, + "step": 4074, + "time_per_iteration": 2.7675211429595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138086, + "balance_loss_mlp": 1.06384242, + "epoch": 0.7839553674490188, + "flos": 517134756864.0, + "grad_norm": 0.03603773685865746, + "language_loss": 0.81149113, + "learning_rate": 0.0001174900279512246, + "loss": 0.82287204, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.74072266, + "step": 4075, + "time_per_iteration": 2.559067964553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138281, + "balance_loss_mlp": 1.06418085, + "epoch": 0.7841477491342824, + "flos": 507650273280.0, + "grad_norm": 0.04900023922641464, + "language_loss": 0.86111671, + "learning_rate": 0.00011728946691598707, + "loss": 0.87249947, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.73974609, + "step": 4076, + "time_per_iteration": 2.601316213607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139089, + "balance_loss_mlp": 1.06498837, + "epoch": 0.784340130819546, + "flos": 720904797696.0, + "grad_norm": 0.037946042945582265, + "language_loss": 0.81358349, + "learning_rate": 0.00011708905446215561, + "loss": 0.82497436, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.73974609, + "step": 4077, + "time_per_iteration": 2.8491528034210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138777, + "balance_loss_mlp": 1.06477141, + "epoch": 0.7845325125048095, + "flos": 515513289216.0, + "grad_norm": 0.03152801605769719, + "language_loss": 0.84297472, + "learning_rate": 0.00011688879066753711, + "loss": 0.85436249, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.73925781, + "step": 4078, + "time_per_iteration": 2.649890184402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139298, + "balance_loss_mlp": 1.06529319, + "epoch": 0.7847248941900731, + "flos": 467050825728.0, + "grad_norm": 0.04544253460314975, + "language_loss": 0.92901659, + "learning_rate": 0.00011668867560988122, + "loss": 0.9404096, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.73925781, + "step": 4079, + "time_per_iteration": 2.583395004272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137742, + "balance_loss_mlp": 1.06383276, + "epoch": 0.7849172758753367, + "flos": 504083699712.0, + "grad_norm": 0.03256844135977144, + "language_loss": 0.89159727, + "learning_rate": 0.00011648870936687916, + "loss": 0.90297467, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.73876953, + "step": 4080, + "time_per_iteration": 2.729670524597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137567, + "balance_loss_mlp": 1.06375289, + "epoch": 0.7851096575606002, + "flos": 533031977472.0, + "grad_norm": 0.038157171447079044, + "language_loss": 0.83702409, + "learning_rate": 0.00011628889201616461, + "loss": 0.84839982, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.73828125, + "step": 4081, + "time_per_iteration": 2.6109676361083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139939, + "balance_loss_mlp": 1.06602943, + "epoch": 0.7853020392458638, + "flos": 571043771904.0, + "grad_norm": 0.03751217922846888, + "language_loss": 0.86986727, + "learning_rate": 0.00011608922363531393, + "loss": 0.88126665, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.73876953, + "step": 4082, + "time_per_iteration": 2.6544032096862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.06686151, + "epoch": 0.7854944209311273, + "flos": 833991459840.0, + "grad_norm": 0.051644606704595315, + "language_loss": 0.88386512, + "learning_rate": 0.00011588970430184504, + "loss": 0.8952738, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.73925781, + "step": 4083, + "time_per_iteration": 3.0330986976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137232, + "balance_loss_mlp": 1.06332254, + "epoch": 0.7856868026163909, + "flos": 561010066944.0, + "grad_norm": 0.028770858152958077, + "language_loss": 0.85727829, + "learning_rate": 0.00011569033409321822, + "loss": 0.86865062, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.73876953, + "step": 4084, + "time_per_iteration": 2.678072452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.0635246, + "epoch": 0.7858791843016545, + "flos": 546267684864.0, + "grad_norm": 0.036494926225622726, + "language_loss": 0.77694023, + "learning_rate": 0.00011549111308683591, + "loss": 0.78831363, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.73828125, + "step": 4085, + "time_per_iteration": 2.67767596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137399, + "balance_loss_mlp": 1.06339443, + "epoch": 0.7860715659869181, + "flos": 381840195072.0, + "grad_norm": 0.03798884187272388, + "language_loss": 0.86288953, + "learning_rate": 0.00011529204136004251, + "loss": 0.87426353, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.73925781, + "step": 4086, + "time_per_iteration": 2.533773422241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143071, + "balance_loss_mlp": 1.068923, + "epoch": 0.7862639476721817, + "flos": 568512514560.0, + "grad_norm": 0.030679232207270264, + "language_loss": 0.87964737, + "learning_rate": 0.00011509311899012459, + "loss": 0.89107811, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.73974609, + "step": 4087, + "time_per_iteration": 2.76526141166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143134, + "balance_loss_mlp": 1.06903315, + "epoch": 0.7864563293574451, + "flos": 546322079232.0, + "grad_norm": 0.04187466244210811, + "language_loss": 0.83333945, + "learning_rate": 0.00011489434605431053, + "loss": 0.84477079, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.73925781, + "step": 4088, + "time_per_iteration": 2.6215317249298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113809, + "balance_loss_mlp": 1.06408453, + "epoch": 0.7866487110427087, + "flos": 564648499200.0, + "grad_norm": 0.03663955414764931, + "language_loss": 0.861283, + "learning_rate": 0.0001146957226297708, + "loss": 0.87266392, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.73925781, + "step": 4089, + "time_per_iteration": 2.673021078109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137912, + "balance_loss_mlp": 1.06381154, + "epoch": 0.7868410927279723, + "flos": 729558081024.0, + "grad_norm": 0.03607616248061006, + "language_loss": 0.80388957, + "learning_rate": 0.00011449724879361827, + "loss": 0.8152687, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.73974609, + "step": 4090, + "time_per_iteration": 2.9554953575134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138064, + "balance_loss_mlp": 1.06410635, + "epoch": 0.7870334744132359, + "flos": 522447049728.0, + "grad_norm": 0.04384771027998422, + "language_loss": 0.79606628, + "learning_rate": 0.00011429892462290687, + "loss": 0.80744684, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.73925781, + "step": 4091, + "time_per_iteration": 2.663344383239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137849, + "balance_loss_mlp": 1.06360543, + "epoch": 0.7872258560984994, + "flos": 452362838016.0, + "grad_norm": 0.03444063676499776, + "language_loss": 0.88160485, + "learning_rate": 0.00011410075019463295, + "loss": 0.89298332, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.74072266, + "step": 4092, + "time_per_iteration": 2.6327311992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137383, + "balance_loss_mlp": 1.06323516, + "epoch": 0.787418237783763, + "flos": 516249162240.0, + "grad_norm": 0.03476027857253962, + "language_loss": 0.84398365, + "learning_rate": 0.00011390272558573461, + "loss": 0.85535741, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.74023438, + "step": 4093, + "time_per_iteration": 2.675528049468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137221, + "balance_loss_mlp": 1.06316793, + "epoch": 0.7876106194690266, + "flos": 486056722944.0, + "grad_norm": 0.030632947109506273, + "language_loss": 0.84047627, + "learning_rate": 0.00011370485087309202, + "loss": 0.85184848, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.73974609, + "step": 4094, + "time_per_iteration": 2.6260645389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138901, + "balance_loss_mlp": 1.06465769, + "epoch": 0.7878030011542901, + "flos": 543929809920.0, + "grad_norm": 0.0372748045886788, + "language_loss": 0.83189571, + "learning_rate": 0.00011350712613352688, + "loss": 0.84328461, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.74072266, + "step": 4095, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138645, + "balance_loss_mlp": 1.06440127, + "epoch": 0.7879953828395537, + "flos": 517749106176.0, + "grad_norm": 0.04715116302825024, + "language_loss": 0.85976934, + "learning_rate": 0.00011330955144380283, + "loss": 0.87115586, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.74072266, + "step": 4096, + "time_per_iteration": 2.599391222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138884, + "balance_loss_mlp": 1.06464028, + "epoch": 0.7881877645248172, + "flos": 583376420352.0, + "grad_norm": 0.03608757830250762, + "language_loss": 0.90583527, + "learning_rate": 0.00011311212688062483, + "loss": 0.91722411, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.74072266, + "step": 4097, + "time_per_iteration": 2.7737503051757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141606, + "balance_loss_mlp": 1.06741059, + "epoch": 0.7883801462100808, + "flos": 590327645184.0, + "grad_norm": 0.09861102268280594, + "language_loss": 0.83454096, + "learning_rate": 0.0001129148525206402, + "loss": 0.84595704, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.74023438, + "step": 4098, + "time_per_iteration": 2.8053319454193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114196, + "balance_loss_mlp": 1.06766832, + "epoch": 0.7885725278953444, + "flos": 482741928960.0, + "grad_norm": 0.039263204911434944, + "language_loss": 0.9157722, + "learning_rate": 0.00011271772844043759, + "loss": 0.92719185, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.74121094, + "step": 4099, + "time_per_iteration": 2.6722400188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.06440175, + "epoch": 0.788764909580608, + "flos": 758098126848.0, + "grad_norm": 0.0423984319236596, + "language_loss": 0.81897676, + "learning_rate": 0.00011252075471654727, + "loss": 0.83036232, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.74023438, + "step": 4100, + "time_per_iteration": 2.941443920135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138656, + "balance_loss_mlp": 1.0645076, + "epoch": 0.7889572912658714, + "flos": 703878935040.0, + "grad_norm": 0.03307179261397765, + "language_loss": 0.82702905, + "learning_rate": 0.00011232393142544133, + "loss": 0.83841556, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.74023438, + "step": 4101, + "time_per_iteration": 2.9557137489318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138555, + "balance_loss_mlp": 1.06435919, + "epoch": 0.789149672951135, + "flos": 737840062464.0, + "grad_norm": 0.034454067220804824, + "language_loss": 0.87124509, + "learning_rate": 0.00011212725864353323, + "loss": 0.88263059, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.74023438, + "step": 4102, + "time_per_iteration": 3.0640292167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145164, + "balance_loss_mlp": 1.07287598, + "epoch": 0.7893420546363986, + "flos": 1484487859200.0, + "grad_norm": 0.005768368046383886, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77481097, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.72460938, + "step": 4103, + "time_per_iteration": 4.858243227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140406, + "balance_loss_mlp": 1.06620967, + "epoch": 0.7895344363216622, + "flos": 510079472640.0, + "grad_norm": 0.047695984740599745, + "language_loss": 0.81464952, + "learning_rate": 0.00011173436491267291, + "loss": 0.82605356, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.74023438, + "step": 4104, + "time_per_iteration": 2.6253249645233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137981, + "balance_loss_mlp": 1.06378555, + "epoch": 0.7897268180069258, + "flos": 543037484544.0, + "grad_norm": 0.03504267179198509, + "language_loss": 0.86698043, + "learning_rate": 0.0001115381441162554, + "loss": 0.87836027, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.74023438, + "step": 4105, + "time_per_iteration": 2.644268274307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143089, + "balance_loss_mlp": 1.07080078, + "epoch": 0.7899191996921893, + "flos": 1415749867008.0, + "grad_norm": 0.006312961233255799, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.7472682, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.72460938, + "step": 4106, + "time_per_iteration": 4.874951601028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_mlp": 1.06486893, + "epoch": 0.7901115813774529, + "flos": 624021530112.0, + "grad_norm": 0.035685278807963586, + "language_loss": 0.89252567, + "learning_rate": 0.00011114615504234465, + "loss": 0.90391827, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.7421875, + "step": 4107, + "time_per_iteration": 2.759730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139038, + "balance_loss_mlp": 1.06488955, + "epoch": 0.7903039630627164, + "flos": 646804847616.0, + "grad_norm": 0.03564605308593673, + "language_loss": 0.86189628, + "learning_rate": 0.00011095038691703468, + "loss": 0.87328672, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.74023438, + "step": 4108, + "time_per_iteration": 2.8478689193725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141249, + "balance_loss_mlp": 1.0670532, + "epoch": 0.79049634474798, + "flos": 595611740160.0, + "grad_norm": 0.03583745426638565, + "language_loss": 0.86790907, + "learning_rate": 0.00011075476983417998, + "loss": 0.87932158, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.74072266, + "step": 4109, + "time_per_iteration": 2.8335795402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139823, + "balance_loss_mlp": 1.0655793, + "epoch": 0.7906887264332435, + "flos": 717331493376.0, + "grad_norm": 0.038905447121572734, + "language_loss": 0.82716894, + "learning_rate": 0.00011055930386972579, + "loss": 0.83856714, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.74072266, + "step": 4110, + "time_per_iteration": 2.871617555618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06271601, + "epoch": 0.7908811081185071, + "flos": 791260254720.0, + "grad_norm": 0.03420948770513602, + "language_loss": 0.82615238, + "learning_rate": 0.00011036398909955863, + "loss": 0.8375206, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.74023438, + "step": 4111, + "time_per_iteration": 3.035374402999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137149, + "balance_loss_mlp": 1.06304824, + "epoch": 0.7910734898037707, + "flos": 643075090944.0, + "grad_norm": 0.03464769838403225, + "language_loss": 0.85694349, + "learning_rate": 0.00011016882559950648, + "loss": 0.86831492, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.73974609, + "step": 4112, + "time_per_iteration": 2.809424877166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136751, + "balance_loss_mlp": 1.06284177, + "epoch": 0.7912658714890343, + "flos": 670560354816.0, + "grad_norm": 0.03852457437308278, + "language_loss": 0.85799241, + "learning_rate": 0.00010997381344533853, + "loss": 0.86935997, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.73876953, + "step": 4113, + "time_per_iteration": 2.7723140716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139368, + "balance_loss_mlp": 1.06512499, + "epoch": 0.7914582531742979, + "flos": 558887041536.0, + "grad_norm": 0.03351504494890856, + "language_loss": 0.84678841, + "learning_rate": 0.00010977895271276517, + "loss": 0.85818207, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.74072266, + "step": 4114, + "time_per_iteration": 2.6767303943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138954, + "balance_loss_mlp": 1.06494868, + "epoch": 0.7916506348595613, + "flos": 571191492096.0, + "grad_norm": 0.04313250317632895, + "language_loss": 0.84584868, + "learning_rate": 0.00010958424347743807, + "loss": 0.85723823, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.73925781, + "step": 4115, + "time_per_iteration": 2.7286806106567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136476, + "balance_loss_mlp": 1.06266189, + "epoch": 0.7918430165448249, + "flos": 719645899776.0, + "grad_norm": 0.03512595532684894, + "language_loss": 0.8494817, + "learning_rate": 0.00010938968581494991, + "loss": 0.8608464, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.73828125, + "step": 4116, + "time_per_iteration": 2.9482476711273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.06277156, + "epoch": 0.7920353982300885, + "flos": 554736317952.0, + "grad_norm": 0.04228851157339113, + "language_loss": 0.83485335, + "learning_rate": 0.000109195279800835, + "loss": 0.84622014, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.73876953, + "step": 4117, + "time_per_iteration": 2.69572114944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139513, + "balance_loss_mlp": 1.06555605, + "epoch": 0.7922277799153521, + "flos": 811540512768.0, + "grad_norm": 0.03903964409517225, + "language_loss": 0.81738925, + "learning_rate": 0.00010900102551056834, + "loss": 0.82878435, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.73876953, + "step": 4118, + "time_per_iteration": 3.021683692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139717, + "balance_loss_mlp": 1.06580722, + "epoch": 0.7924201616006156, + "flos": 422244258816.0, + "grad_norm": 0.03704274036887823, + "language_loss": 0.89204621, + "learning_rate": 0.00010880692301956601, + "loss": 0.90344346, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.73876953, + "step": 4119, + "time_per_iteration": 2.509284019470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.06238043, + "epoch": 0.7926125432858792, + "flos": 619104734208.0, + "grad_norm": 0.032195482380303, + "language_loss": 0.90015543, + "learning_rate": 0.00010861297240318518, + "loss": 0.91151732, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.73828125, + "step": 4120, + "time_per_iteration": 2.835418939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136735, + "balance_loss_mlp": 1.0630163, + "epoch": 0.7928049249711427, + "flos": 603611016192.0, + "grad_norm": 0.031028055346739136, + "language_loss": 0.90660435, + "learning_rate": 0.00010841917373672444, + "loss": 0.91797173, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.73730469, + "step": 4121, + "time_per_iteration": 2.7115211486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136879, + "balance_loss_mlp": 1.06306481, + "epoch": 0.7929973066564063, + "flos": 657231321600.0, + "grad_norm": 0.03886819591939463, + "language_loss": 0.83054501, + "learning_rate": 0.00010822552709542293, + "loss": 0.84191382, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.73828125, + "step": 4122, + "time_per_iteration": 2.811147928237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137962, + "balance_loss_mlp": 1.0642904, + "epoch": 0.7931896883416699, + "flos": 537434480640.0, + "grad_norm": 0.03139044095393014, + "language_loss": 0.90324616, + "learning_rate": 0.0001080320325544612, + "loss": 0.91462576, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.73681641, + "step": 4123, + "time_per_iteration": 2.6880621910095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.06381249, + "epoch": 0.7933820700269334, + "flos": 499068848640.0, + "grad_norm": 0.03512735769346207, + "language_loss": 0.87548339, + "learning_rate": 0.00010783869018895997, + "loss": 0.8868587, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.73730469, + "step": 4124, + "time_per_iteration": 2.6342406272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138023, + "balance_loss_mlp": 1.06425595, + "epoch": 0.793574451712197, + "flos": 538495993344.0, + "grad_norm": 0.03751622303181437, + "language_loss": 0.88749498, + "learning_rate": 0.00010764550007398189, + "loss": 0.89887518, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.73779297, + "step": 4125, + "time_per_iteration": 2.6272289752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.0640254, + "epoch": 0.7937668333974606, + "flos": 489258725376.0, + "grad_norm": 0.034933857523794375, + "language_loss": 0.85822791, + "learning_rate": 0.00010745246228452982, + "loss": 0.86960542, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.73730469, + "step": 4126, + "time_per_iteration": 2.5639169216156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.06358075, + "epoch": 0.7939592150827242, + "flos": 528479752704.0, + "grad_norm": 0.034679171376522114, + "language_loss": 0.86079615, + "learning_rate": 0.00010725957689554771, + "loss": 0.87216961, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.73779297, + "step": 4127, + "time_per_iteration": 2.7611310482025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137731, + "balance_loss_mlp": 1.06391644, + "epoch": 0.7941515967679876, + "flos": 542803169280.0, + "grad_norm": 0.03824880137917062, + "language_loss": 0.88766754, + "learning_rate": 0.00010706684398192013, + "loss": 0.89904475, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.73828125, + "step": 4128, + "time_per_iteration": 2.7266509532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138133, + "balance_loss_mlp": 1.06436622, + "epoch": 0.7943439784532512, + "flos": 519523023360.0, + "grad_norm": 0.040169030809423835, + "language_loss": 0.87296367, + "learning_rate": 0.00010687426361847313, + "loss": 0.88434494, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.73779297, + "step": 4129, + "time_per_iteration": 2.7299461364746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137822, + "balance_loss_mlp": 1.06405497, + "epoch": 0.7945363601385148, + "flos": 510060006912.0, + "grad_norm": 0.03365010231466857, + "language_loss": 0.9038803, + "learning_rate": 0.00010668183587997254, + "loss": 0.91525853, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.73779297, + "step": 4130, + "time_per_iteration": 2.5838053226470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137059, + "balance_loss_mlp": 1.06343496, + "epoch": 0.7947287418237784, + "flos": 652401120768.0, + "grad_norm": 0.02856230138733652, + "language_loss": 0.8155334, + "learning_rate": 0.0001064895608411256, + "loss": 0.826904, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.73632812, + "step": 4131, + "time_per_iteration": 2.855571746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140645, + "balance_loss_mlp": 1.0668304, + "epoch": 0.794921123509042, + "flos": 697372872192.0, + "grad_norm": 0.03566888341568189, + "language_loss": 0.84410554, + "learning_rate": 0.00010629743857657998, + "loss": 0.85551202, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.73828125, + "step": 4132, + "time_per_iteration": 2.8950796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149963, + "balance_loss_mlp": 1.07805634, + "epoch": 0.7951135051943055, + "flos": 1406076730368.0, + "grad_norm": 0.009945360443955307, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71748632, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.72070312, + "step": 4133, + "time_per_iteration": 4.6428234577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137714, + "balance_loss_mlp": 1.06399536, + "epoch": 0.795305886879569, + "flos": 811449188352.0, + "grad_norm": 0.03756536523282242, + "language_loss": 0.86775541, + "learning_rate": 0.00010591365266868802, + "loss": 0.87913251, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.73730469, + "step": 4134, + "time_per_iteration": 2.9570915699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143425, + "balance_loss_mlp": 1.07132721, + "epoch": 0.7954982685648326, + "flos": 1429213885440.0, + "grad_norm": 0.0062941693525409926, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76655209, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.72265625, + "step": 4135, + "time_per_iteration": 4.914888143539429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_mlp": 1.06404912, + "epoch": 0.7956906502500962, + "flos": 390747259392.0, + "grad_norm": 0.0392560850681974, + "language_loss": 0.85252422, + "learning_rate": 0.00010553047875229166, + "loss": 0.86390382, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.73876953, + "step": 4136, + "time_per_iteration": 2.5757832527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137394, + "balance_loss_mlp": 1.06362712, + "epoch": 0.7958830319353598, + "flos": 516585535488.0, + "grad_norm": 0.03073809129555248, + "language_loss": 0.8796097, + "learning_rate": 0.00010533912147689328, + "loss": 0.89098364, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.73779297, + "step": 4137, + "time_per_iteration": 2.6300714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137078, + "balance_loss_mlp": 1.06335866, + "epoch": 0.7960754136206233, + "flos": 494926857216.0, + "grad_norm": 0.033442699276882225, + "language_loss": 0.87293124, + "learning_rate": 0.00010514791742243656, + "loss": 0.88430202, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.73730469, + "step": 4138, + "time_per_iteration": 2.5906717777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136999, + "balance_loss_mlp": 1.06323278, + "epoch": 0.7962677953058869, + "flos": 657005738496.0, + "grad_norm": 0.03903943901806541, + "language_loss": 0.87440938, + "learning_rate": 0.00010495686666315341, + "loss": 0.88577938, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.73779297, + "step": 4139, + "time_per_iteration": 2.909572124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113797, + "balance_loss_mlp": 1.06401289, + "epoch": 0.7964601769911505, + "flos": 543419520000.0, + "grad_norm": 0.08585465629101555, + "language_loss": 0.81986225, + "learning_rate": 0.00010476596927321635, + "loss": 0.83124197, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.73876953, + "step": 4140, + "time_per_iteration": 2.5994365215301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137712, + "balance_loss_mlp": 1.06389797, + "epoch": 0.796652558676414, + "flos": 538826362368.0, + "grad_norm": 0.03248172590146644, + "language_loss": 0.84015322, + "learning_rate": 0.00010457522532673835, + "loss": 0.85153031, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.73828125, + "step": 4141, + "time_per_iteration": 2.851498603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137565, + "balance_loss_mlp": 1.06375015, + "epoch": 0.7968449403616775, + "flos": 476051215872.0, + "grad_norm": 0.03503840732668985, + "language_loss": 0.8857249, + "learning_rate": 0.00010438463489777272, + "loss": 0.89710057, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.73828125, + "step": 4142, + "time_per_iteration": 2.56007981300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137015, + "balance_loss_mlp": 1.06320024, + "epoch": 0.7970373220469411, + "flos": 568725362688.0, + "grad_norm": 0.0411728476443369, + "language_loss": 0.82051033, + "learning_rate": 0.00010419419806031316, + "loss": 0.83188045, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.73828125, + "step": 4143, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.0646373, + "epoch": 0.7972297037322047, + "flos": 557350167552.0, + "grad_norm": 0.048021721616636356, + "language_loss": 0.88371974, + "learning_rate": 0.00010400391488829403, + "loss": 0.89510334, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.73730469, + "step": 4144, + "time_per_iteration": 2.764263153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137941, + "balance_loss_mlp": 1.06412661, + "epoch": 0.7974220854174683, + "flos": 577306787328.0, + "grad_norm": 0.030349731756734208, + "language_loss": 0.90217054, + "learning_rate": 0.00010381378545558984, + "loss": 0.9135499, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.73828125, + "step": 4145, + "time_per_iteration": 2.694387197494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139239, + "balance_loss_mlp": 1.06552041, + "epoch": 0.7976144671027319, + "flos": 484055221248.0, + "grad_norm": 0.04602586335086132, + "language_loss": 0.89352703, + "learning_rate": 0.00010362380983601505, + "loss": 0.90491945, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.73730469, + "step": 4146, + "time_per_iteration": 2.5373778343200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.06528461, + "epoch": 0.7978068487879953, + "flos": 1079652773376.0, + "grad_norm": 0.026886472634432064, + "language_loss": 0.83036357, + "learning_rate": 0.00010343398810332477, + "loss": 0.84175408, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.73779297, + "step": 4147, + "time_per_iteration": 3.465343952178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137553, + "balance_loss_mlp": 1.06383419, + "epoch": 0.7979992304732589, + "flos": 735015366144.0, + "grad_norm": 0.0386131750052721, + "language_loss": 0.89394611, + "learning_rate": 0.00010324432033121467, + "loss": 0.9053216, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.73730469, + "step": 4148, + "time_per_iteration": 2.95272159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137647, + "balance_loss_mlp": 1.06397593, + "epoch": 0.7981916121585225, + "flos": 416750043648.0, + "grad_norm": 0.03182767294568272, + "language_loss": 0.87920535, + "learning_rate": 0.00010305480659332005, + "loss": 0.89058185, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.73681641, + "step": 4149, + "time_per_iteration": 2.6444265842437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113765, + "balance_loss_mlp": 1.0638833, + "epoch": 0.7983839938437861, + "flos": 466212894720.0, + "grad_norm": 0.047857965738547205, + "language_loss": 0.88751274, + "learning_rate": 0.00010286544696321682, + "loss": 0.89888918, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.73779297, + "step": 4150, + "time_per_iteration": 2.5789239406585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138352, + "balance_loss_mlp": 1.06472826, + "epoch": 0.7985763755290496, + "flos": 511623077376.0, + "grad_norm": 0.03835001072611694, + "language_loss": 0.83638573, + "learning_rate": 0.00010267624151442073, + "loss": 0.84776926, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.73632812, + "step": 4151, + "time_per_iteration": 2.670612096786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137657, + "balance_loss_mlp": 1.06408083, + "epoch": 0.7987687572143132, + "flos": 1012277738496.0, + "grad_norm": 0.03249576548614517, + "language_loss": 0.85286856, + "learning_rate": 0.000102487190320388, + "loss": 0.86424506, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.73583984, + "step": 4152, + "time_per_iteration": 3.3122832775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138879, + "balance_loss_mlp": 1.06520724, + "epoch": 0.7989611388995768, + "flos": 1022747873280.0, + "grad_norm": 0.03976712139414911, + "language_loss": 0.85336626, + "learning_rate": 0.00010229829345451475, + "loss": 0.86475503, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.73681641, + "step": 4153, + "time_per_iteration": 3.3512771129608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138777, + "balance_loss_mlp": 1.0651536, + "epoch": 0.7991535205848403, + "flos": 1103036978688.0, + "grad_norm": 0.04036200779620281, + "language_loss": 0.83784497, + "learning_rate": 0.00010210955099013724, + "loss": 0.84923279, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.73632812, + "step": 4154, + "time_per_iteration": 3.352534532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138505, + "balance_loss_mlp": 1.06492949, + "epoch": 0.7993459022701039, + "flos": 836279669760.0, + "grad_norm": 0.04342364986110735, + "language_loss": 0.81863582, + "learning_rate": 0.00010192096300053167, + "loss": 0.83002084, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.73583984, + "step": 4155, + "time_per_iteration": 3.055297374725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140351, + "balance_loss_mlp": 1.06672716, + "epoch": 0.7995382839553674, + "flos": 523769074176.0, + "grad_norm": 0.02922915705008151, + "language_loss": 0.89245528, + "learning_rate": 0.00010173252955891477, + "loss": 0.90385878, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.73632812, + "step": 4156, + "time_per_iteration": 2.741558790206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141174, + "balance_loss_mlp": 1.0675503, + "epoch": 0.799730665640631, + "flos": 538858563072.0, + "grad_norm": 0.03668807577756746, + "language_loss": 0.78405279, + "learning_rate": 0.00010154425073844253, + "loss": 0.79546452, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.73632812, + "step": 4157, + "time_per_iteration": 2.6747748851776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141717, + "balance_loss_mlp": 1.0680933, + "epoch": 0.7999230473258946, + "flos": 506067737088.0, + "grad_norm": 0.03089804381419182, + "language_loss": 0.86340404, + "learning_rate": 0.00010135612661221138, + "loss": 0.87482131, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.73632812, + "step": 4158, + "time_per_iteration": 2.565213680267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144064, + "balance_loss_mlp": 1.07034528, + "epoch": 0.8001154290111582, + "flos": 1028975960064.0, + "grad_norm": 0.0395229836188532, + "language_loss": 0.87076604, + "learning_rate": 0.00010116815725325751, + "loss": 0.88220668, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.73681641, + "step": 4159, + "time_per_iteration": 3.3038952350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142065, + "balance_loss_mlp": 1.06834638, + "epoch": 0.8003078106964217, + "flos": 752269539840.0, + "grad_norm": 0.03606815133795925, + "language_loss": 0.85251313, + "learning_rate": 0.00010098034273455725, + "loss": 0.8639338, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.73681641, + "step": 4160, + "time_per_iteration": 2.9671449661254883 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 344944048, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9407542803955712.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/training_args.bin b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dec1b7e0db130318069c72434f32c2789119b732 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c077e5103b778b39b648e3a5a2e73e36256d052f444290e14e15f87c36156cb +size 7992 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/zero_to_fp32.py b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-4160/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..987150c78c9255ac53c0408588036e10466fc436 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_perturbed", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/generation_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a3308b18c42751b6c336cb7defbd92d3a5a96d2 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd74a314a9ae0b57fca0f33adfb65a7d3b980832688915090744b1a695b3ba5 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..125f03463325db977aca3c55f8588d86c3908c87 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e14f649ecb665ba9d14ba8c9ac180ab33697addf5c627e3de17b83db359a13 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6414127a6031c400230667d0e940168ff0bd69e --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce50dccfa17dd5b1031c2a944a7343476d697f8ebf510c1aa68165137ab0fa1e +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a4d399e61bb1bf6f03c69f3eb7ebceab8e0186b --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:503fa8e455a1e596b85ec603bc818af20f7a3aae22554d093a8ce8f679ee8ed5 +size 396609872 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..009bc032e937a301a186ac6525c4f29f5364ba48 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a45385b9d82806b345addc6109272663b79a05322a1414dca125d3c2ed9e5bd +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..775f4ef10d9d77a1b493ed7d109542f6e2c0bf5a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aab448c32acc1245692eaaeb277f58127218103d928bab49ab28464addecc2e +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a5a82c3f8af9b13c86f8031c4d311de762cfc35 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ef06393f6d940e252d972cff6f9fb1ad1b560cf054dd98dc4536a91d580b15e +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67f5920bbb3a24748ef3e05b1f8e13d913371eab --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5335b7ba8c8cf7961a97fc6ad949b806798373ccd4751b023d2c3947eaf3de8c +size 2117322436 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/latest b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4128b762c290a8c7cb6627a17f8505d154b92d4d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f860cdc59cd5cce9d24bbc4d9e72e861be5a033df830dd664ddc3ec6244d240 +size 3759043888 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..01fe755c95da02467d97df3e39228dbbb26b065f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,674 @@ +{ + "metadata": { + "total_size": 8731443232 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..264a7a7f8f4d871987a71ac41e6cc34505e060f2 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/trainer_state.json @@ -0,0 +1,78003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02574398, + "balance_loss_mlp": 1.85189414, + "epoch": 0.00019238168526356292, + "flos": 471022176768.0, + "grad_norm": 12.86455737221305, + "language_loss": 2.79777646, + "learning_rate": 0.0, + "loss": 1.8614465, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 7.2109375, + "step": 1, + "time_per_iteration": 21.83068585395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02254613, + "balance_loss_mlp": 1.76785779, + "epoch": 0.00038476337052712584, + "flos": 505537981440.0, + "grad_norm": 51.581369656319104, + "language_loss": 12.34714699, + "learning_rate": 0.00013726078121135892, + "loss": 12.3696928, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 4.875, + "step": 2, + "time_per_iteration": 2.6192572116851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235864, + "balance_loss_mlp": 1.75177932, + "epoch": 0.0005771450557906887, + "flos": 600333152256.0, + "grad_norm": 53.41660983156924, + "language_loss": 12.32898235, + "learning_rate": 0.00021755319103969496, + "loss": 12.35134125, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 4.84765625, + "step": 3, + "time_per_iteration": 2.887979030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02281771, + "balance_loss_mlp": 1.79577887, + "epoch": 0.0007695267410542517, + "flos": 581496442368.0, + "grad_norm": 15.812083363335244, + "language_loss": 9.24414825, + "learning_rate": 0.00027452156242271784, + "loss": 9.26696682, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 4.8671875, + "step": 4, + "time_per_iteration": 2.6792547702789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02454864, + "balance_loss_mlp": 1.95551991, + "epoch": 0.0009619084263178145, + "flos": 487153164288.0, + "grad_norm": 10.3691594005885, + "language_loss": 9.1886158, + "learning_rate": 0.0003187096642208417, + "loss": 9.21316433, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 4.98828125, + "step": 5, + "time_per_iteration": 2.627883195877075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0247156, + "balance_loss_mlp": 1.97450531, + "epoch": 0.0011542901115813775, + "flos": 561166519296.0, + "grad_norm": 9.061082825397735, + "language_loss": 9.31672573, + "learning_rate": 0.0003548139722510539, + "loss": 9.34144115, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 4.96875, + "step": 6, + "time_per_iteration": 2.697327136993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02496704, + "balance_loss_mlp": 1.9977417, + "epoch": 0.0013466717968449403, + "flos": 534950886912.0, + "grad_norm": 5.1401213461899875, + "language_loss": 8.45638084, + "learning_rate": 0.00038533972973918044, + "loss": 8.48134804, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 4.984375, + "step": 7, + "time_per_iteration": 2.6605119705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02367166, + "balance_loss_mlp": 1.8800292, + "epoch": 0.0015390534821085034, + "flos": 493333587456.0, + "grad_norm": 4.765795170053606, + "language_loss": 7.86978722, + "learning_rate": 0.0004117823436340768, + "loss": 7.89345884, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 4.87890625, + "step": 8, + "time_per_iteration": 2.60813570022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02377529, + "balance_loss_mlp": 1.89153647, + "epoch": 0.0017314351673720662, + "flos": 565775139840.0, + "grad_norm": 2.6394105736579268, + "language_loss": 7.60834789, + "learning_rate": 0.00043510638207938993, + "loss": 7.63212299, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 4.8671875, + "step": 9, + "time_per_iteration": 2.871943712234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0239868, + "balance_loss_mlp": 1.91802776, + "epoch": 0.001923816852635629, + "flos": 594508568064.0, + "grad_norm": 2.7082435786924752, + "language_loss": 7.06748104, + "learning_rate": 0.00045597044543220066, + "loss": 7.09146786, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 4.8125, + "step": 10, + "time_per_iteration": 2.671294689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02381293, + "balance_loss_mlp": 1.90254807, + "epoch": 0.002116198537899192, + "flos": 610894611456.0, + "grad_norm": 2.113301815517677, + "language_loss": 6.83692646, + "learning_rate": 0.00047484428652143135, + "loss": 6.86073971, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.79296875, + "step": 11, + "time_per_iteration": 2.885416269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02427226, + "balance_loss_mlp": 1.95687437, + "epoch": 0.002308580223162755, + "flos": 546174359040.0, + "grad_norm": 1.7416212933802626, + "language_loss": 6.4295001, + "learning_rate": 0.0004920747534624128, + "loss": 6.45377207, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.70703125, + "step": 12, + "time_per_iteration": 2.6201112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503769, + "balance_loss_mlp": 2.03265429, + "epoch": 0.002500961908426318, + "flos": 645923255808.0, + "grad_norm": 2.43618245016211, + "language_loss": 6.0048914, + "learning_rate": 0.0005079252465375872, + "loss": 6.02992916, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.71484375, + "step": 13, + "time_per_iteration": 2.852263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02634854, + "balance_loss_mlp": 2.15916157, + "epoch": 0.0026933435936898806, + "flos": 488848492032.0, + "grad_norm": 4.143842376760835, + "language_loss": 5.42230844, + "learning_rate": 0.0005226005109505393, + "loss": 5.44865704, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 4.76171875, + "step": 14, + "time_per_iteration": 2.5524611473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02844198, + "balance_loss_mlp": 2.3646903, + "epoch": 0.0028857252789534437, + "flos": 435525628416.0, + "grad_norm": 5.672862092220106, + "language_loss": 4.15845776, + "learning_rate": 0.0005362628552605367, + "loss": 4.18689966, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 4.80078125, + "step": 15, + "time_per_iteration": 2.7353649139404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03208902, + "balance_loss_mlp": 2.72252893, + "epoch": 0.0030781069642170067, + "flos": 597840826368.0, + "grad_norm": 3.947061509829782, + "language_loss": 2.26971245, + "learning_rate": 0.0005490431248454357, + "loss": 2.30180168, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 4.87109375, + "step": 16, + "time_per_iteration": 2.676703929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03601284, + "balance_loss_mlp": 3.10232162, + "epoch": 0.0032704886494805694, + "flos": 1541510280192.0, + "grad_norm": 0.6213816402988768, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.793064, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 5.0, + "step": 17, + "time_per_iteration": 6.1610119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334326, + "balance_loss_mlp": 2.85841203, + "epoch": 0.0034628703347441324, + "flos": 474970237440.0, + "grad_norm": 2.8341915883282045, + "language_loss": 1.71282685, + "learning_rate": 0.0005723671632907488, + "loss": 1.74625945, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 4.85546875, + "step": 18, + "time_per_iteration": 2.638371467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02881518, + "balance_loss_mlp": 2.39934015, + "epoch": 0.0036552520200076955, + "flos": 449477743104.0, + "grad_norm": 2.8867361132515086, + "language_loss": 1.68530536, + "learning_rate": 0.0005830738490244919, + "loss": 1.71412063, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.828125, + "step": 19, + "time_per_iteration": 2.56374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02402526, + "balance_loss_mlp": 1.92301893, + "epoch": 0.003847633705271258, + "flos": 637350563328.0, + "grad_norm": 0.6925173808128176, + "language_loss": 1.38203168, + "learning_rate": 0.0005932312266435596, + "loss": 1.406057, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.80078125, + "step": 20, + "time_per_iteration": 2.763998508453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02421171, + "balance_loss_mlp": 1.94814897, + "epoch": 0.004040015390534821, + "flos": 590590158336.0, + "grad_norm": 1.6265477944222306, + "language_loss": 1.40919662, + "learning_rate": 0.0006028929207788754, + "loss": 1.43340826, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.734375, + "step": 21, + "time_per_iteration": 2.746016502380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575294, + "balance_loss_mlp": 2.10036469, + "epoch": 0.004232397075798384, + "flos": 757865812992.0, + "grad_norm": 1.576079326940489, + "language_loss": 1.40810275, + "learning_rate": 0.0006121050677327902, + "loss": 1.43385565, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.75390625, + "step": 22, + "time_per_iteration": 2.9607386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550906, + "balance_loss_mlp": 2.07025433, + "epoch": 0.004424778761061947, + "flos": 527726415360.0, + "grad_norm": 0.6323448080178445, + "language_loss": 1.22419024, + "learning_rate": 0.0006209076479463684, + "loss": 1.24969923, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.8125, + "step": 23, + "time_per_iteration": 2.5966527462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02511897, + "balance_loss_mlp": 2.02285314, + "epoch": 0.00461716044632551, + "flos": 549217907712.0, + "grad_norm": 0.22573529074246063, + "language_loss": 1.26396596, + "learning_rate": 0.0006293355346737718, + "loss": 1.28908491, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.8984375, + "step": 24, + "time_per_iteration": 2.672264575958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02557217, + "balance_loss_mlp": 2.05978036, + "epoch": 0.004809542131589073, + "flos": 568751559168.0, + "grad_norm": 0.10471299124135865, + "language_loss": 1.20974565, + "learning_rate": 0.0006374193284416834, + "loss": 1.23531783, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.96875, + "step": 25, + "time_per_iteration": 2.7392375469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02658191, + "balance_loss_mlp": 2.15503263, + "epoch": 0.005001923816852636, + "flos": 471583584768.0, + "grad_norm": 0.16888144752152706, + "language_loss": 1.20314312, + "learning_rate": 0.0006451860277489461, + "loss": 1.22972512, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.02734375, + "step": 26, + "time_per_iteration": 2.6047253608703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02722422, + "balance_loss_mlp": 2.21582985, + "epoch": 0.005194305502116198, + "flos": 416380743168.0, + "grad_norm": 0.22424567034217777, + "language_loss": 1.28844571, + "learning_rate": 0.0006526595731190848, + "loss": 1.31566989, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.0625, + "step": 27, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02743244, + "balance_loss_mlp": 2.2351265, + "epoch": 0.005386687187379761, + "flos": 629995835904.0, + "grad_norm": 0.15642653525507078, + "language_loss": 1.18914986, + "learning_rate": 0.0006598612921618983, + "loss": 1.2165823, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.078125, + "step": 28, + "time_per_iteration": 2.8519153594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02748247, + "balance_loss_mlp": 2.24051118, + "epoch": 0.005579068872643324, + "flos": 888019997184.0, + "grad_norm": 0.1209301216257677, + "language_loss": 1.12191987, + "learning_rate": 0.0006668102665011454, + "loss": 1.14940238, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.07421875, + "step": 29, + "time_per_iteration": 3.2244889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02691091, + "balance_loss_mlp": 2.18411779, + "epoch": 0.005771450557906887, + "flos": 548657952768.0, + "grad_norm": 0.1098895199150706, + "language_loss": 1.21368051, + "learning_rate": 0.0006735236364718957, + "loss": 1.24059153, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.06640625, + "step": 30, + "time_per_iteration": 2.642730474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02653145, + "balance_loss_mlp": 2.14769816, + "epoch": 0.00596383224317045, + "flos": 533068907520.0, + "grad_norm": 0.11046596793449442, + "language_loss": 1.1970098, + "learning_rate": 0.0006800168558381346, + "loss": 1.22354114, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.05078125, + "step": 31, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0257592, + "balance_loss_mlp": 2.07123542, + "epoch": 0.0061562139284340135, + "flos": 590162460672.0, + "grad_norm": 0.10949645130098669, + "language_loss": 1.22987807, + "learning_rate": 0.0006863039060567947, + "loss": 1.25563729, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.04296875, + "step": 32, + "time_per_iteration": 2.733224868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505923, + "balance_loss_mlp": 2.00390816, + "epoch": 0.006348595613697576, + "flos": 619441107456.0, + "grad_norm": 0.0835016489973258, + "language_loss": 1.14437437, + "learning_rate": 0.0006923974775611263, + "loss": 1.16943359, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.015625, + "step": 33, + "time_per_iteration": 2.820788621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02482464, + "balance_loss_mlp": 1.98159432, + "epoch": 0.006540977298961139, + "flos": 779298908160.0, + "grad_norm": 0.08776573315434787, + "language_loss": 1.10869515, + "learning_rate": 0.0006983091239737814, + "loss": 1.13351965, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.00390625, + "step": 34, + "time_per_iteration": 2.9917590618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02373805, + "balance_loss_mlp": 1.87636864, + "epoch": 0.006733358984224702, + "flos": 668372201472.0, + "grad_norm": 0.0744368555221442, + "language_loss": 1.09626412, + "learning_rate": 0.0007040493939600222, + "loss": 1.12000227, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 4.96875, + "step": 35, + "time_per_iteration": 2.813040256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308046, + "balance_loss_mlp": 1.81175399, + "epoch": 0.006925740669488265, + "flos": 565495162368.0, + "grad_norm": 0.06560236116646054, + "language_loss": 1.0974791, + "learning_rate": 0.0007096279445021078, + "loss": 1.12055957, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 4.95703125, + "step": 36, + "time_per_iteration": 2.715013027191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02240602, + "balance_loss_mlp": 1.74888754, + "epoch": 0.007118122354751828, + "flos": 551111347200.0, + "grad_norm": 0.05581405617561486, + "language_loss": 1.16120386, + "learning_rate": 0.0007150536386503726, + "loss": 1.18360972, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.91015625, + "step": 37, + "time_per_iteration": 2.8262643814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218804, + "balance_loss_mlp": 1.7293781, + "epoch": 0.007310504040015391, + "flos": 703813807104.0, + "grad_norm": 0.06412720029508237, + "language_loss": 1.08394384, + "learning_rate": 0.0007203346302358509, + "loss": 1.10613179, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.890625, + "step": 38, + "time_per_iteration": 2.9149320125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0220325, + "balance_loss_mlp": 1.71954608, + "epoch": 0.007502885725278953, + "flos": 600500338176.0, + "grad_norm": 0.08018675586540955, + "language_loss": 1.13587177, + "learning_rate": 0.000725478437577282, + "loss": 1.15790427, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.84375, + "step": 39, + "time_per_iteration": 2.7649383544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194939, + "balance_loss_mlp": 1.71237946, + "epoch": 0.007695267410542516, + "flos": 561427031040.0, + "grad_norm": 0.11080304178085185, + "language_loss": 1.08546591, + "learning_rate": 0.0007304920078549186, + "loss": 1.10741532, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.83203125, + "step": 40, + "time_per_iteration": 2.7245187759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02164234, + "balance_loss_mlp": 1.68548942, + "epoch": 0.007887649095806078, + "flos": 509230808064.0, + "grad_norm": 0.12864951336881933, + "language_loss": 1.10053396, + "learning_rate": 0.0007353817735343603, + "loss": 1.12217629, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.79296875, + "step": 41, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02109951, + "balance_loss_mlp": 1.63425827, + "epoch": 0.008080030781069641, + "flos": 504904166400.0, + "grad_norm": 0.0888118324595499, + "language_loss": 1.05816543, + "learning_rate": 0.0007401537019902344, + "loss": 1.07926488, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.76171875, + "step": 42, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065976, + "balance_loss_mlp": 1.59219027, + "epoch": 0.008272412466333205, + "flos": 519106059264.0, + "grad_norm": 0.08974821197730459, + "language_loss": 1.0785954, + "learning_rate": 0.0007448133392900729, + "loss": 1.09925508, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.7421875, + "step": 43, + "time_per_iteration": 2.677175998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955434, + "balance_loss_mlp": 1.4839375, + "epoch": 0.008464794151596768, + "flos": 609183820800.0, + "grad_norm": 0.06237767914218564, + "language_loss": 1.03785229, + "learning_rate": 0.0007493658489441491, + "loss": 1.05740666, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.71875, + "step": 44, + "time_per_iteration": 2.8553237915039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01864539, + "balance_loss_mlp": 1.39800107, + "epoch": 0.00865717583686033, + "flos": 539006283264.0, + "grad_norm": 0.049849947719683325, + "language_loss": 1.08088911, + "learning_rate": 0.0007538160463002316, + "loss": 1.09953451, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.66796875, + "step": 45, + "time_per_iteration": 2.637796640396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01780353, + "balance_loss_mlp": 1.31572247, + "epoch": 0.008849557522123894, + "flos": 509009227776.0, + "grad_norm": 0.046919324832442044, + "language_loss": 1.11748755, + "learning_rate": 0.0007581684291577274, + "loss": 1.1352911, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.6484375, + "step": 46, + "time_per_iteration": 2.5655901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764453, + "balance_loss_mlp": 1.30211222, + "epoch": 0.009041939207387457, + "flos": 626507125248.0, + "grad_norm": 0.05937298040562763, + "language_loss": 1.13580585, + "learning_rate": 0.0007624272050891776, + "loss": 1.15345049, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.625, + "step": 47, + "time_per_iteration": 2.804643392562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776852, + "balance_loss_mlp": 1.31908798, + "epoch": 0.00923432089265102, + "flos": 550609789440.0, + "grad_norm": 0.07500714899038924, + "language_loss": 1.03489327, + "learning_rate": 0.0007665963158851307, + "loss": 1.05266178, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.578125, + "step": 48, + "time_per_iteration": 2.781435489654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01771411, + "balance_loss_mlp": 1.3170805, + "epoch": 0.009426702577914583, + "flos": 563678310912.0, + "grad_norm": 0.07921486390615404, + "language_loss": 1.12758589, + "learning_rate": 0.0007706794594783609, + "loss": 1.14529991, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.54296875, + "step": 49, + "time_per_iteration": 2.739976644515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017484, + "balance_loss_mlp": 1.29483247, + "epoch": 0.009619084263178146, + "flos": 617925700608.0, + "grad_norm": 0.05671895540127436, + "language_loss": 1.10915053, + "learning_rate": 0.0007746801096530423, + "loss": 1.12663448, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.53515625, + "step": 50, + "time_per_iteration": 2.7333760261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01715641, + "balance_loss_mlp": 1.2616924, + "epoch": 0.009811465948441709, + "flos": 542488263168.0, + "grad_norm": 0.04785443300923319, + "language_loss": 1.16231108, + "learning_rate": 0.0007786015338021173, + "loss": 1.17946756, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.5390625, + "step": 51, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01700387, + "balance_loss_mlp": 1.24720073, + "epoch": 0.010003847633705272, + "flos": 536976583680.0, + "grad_norm": 0.04536583817216675, + "language_loss": 1.08076, + "learning_rate": 0.0007824468089603051, + "loss": 1.0977639, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.53125, + "step": 52, + "time_per_iteration": 2.6839513778686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01675834, + "balance_loss_mlp": 1.2218852, + "epoch": 0.010196229318968833, + "flos": 910805316096.0, + "grad_norm": 0.04374839581732082, + "language_loss": 1.0833261, + "learning_rate": 0.0007862188363098669, + "loss": 1.10008454, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.5390625, + "step": 53, + "time_per_iteration": 3.1748838424682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01650634, + "balance_loss_mlp": 1.19477725, + "epoch": 0.010388611004232396, + "flos": 586969190400.0, + "grad_norm": 0.045477377455174536, + "language_loss": 1.08262885, + "learning_rate": 0.0007899203543304438, + "loss": 1.09913516, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.55859375, + "step": 54, + "time_per_iteration": 2.7011117935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01588572, + "balance_loss_mlp": 1.13195276, + "epoch": 0.01058099268949596, + "flos": 503471351808.0, + "grad_norm": 0.05216939031034974, + "language_loss": 1.22650576, + "learning_rate": 0.0007935539507422731, + "loss": 1.24239147, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.56640625, + "step": 55, + "time_per_iteration": 2.6142656803131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553155, + "balance_loss_mlp": 1.09462798, + "epoch": 0.010773374374759523, + "flos": 545558008320.0, + "grad_norm": 0.04278176221573414, + "language_loss": 1.12836909, + "learning_rate": 0.0007971220733732573, + "loss": 1.14390063, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.5859375, + "step": 56, + "time_per_iteration": 2.718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586959, + "balance_loss_mlp": 1.1318655, + "epoch": 0.010965756060023086, + "flos": 527285982720.0, + "grad_norm": 0.06958617519474361, + "language_loss": 1.08844507, + "learning_rate": 0.0008006270400641869, + "loss": 1.10431468, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.55078125, + "step": 57, + "time_per_iteration": 2.702324628829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01576177, + "balance_loss_mlp": 1.12375367, + "epoch": 0.011158137745286649, + "flos": 578097054720.0, + "grad_norm": 0.08376433329063605, + "language_loss": 1.09231043, + "learning_rate": 0.0008040710477125043, + "loss": 1.10807228, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.5234375, + "step": 58, + "time_per_iteration": 2.733733892440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587306, + "balance_loss_mlp": 1.13793492, + "epoch": 0.011350519430550212, + "flos": 530314068480.0, + "grad_norm": 0.056261163559927586, + "language_loss": 1.098104, + "learning_rate": 0.0008074561805429771, + "loss": 1.11397719, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.4921875, + "step": 59, + "time_per_iteration": 2.604173183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_mlp": 1.0886867, + "epoch": 0.011542901115813775, + "flos": 556970133504.0, + "grad_norm": 0.07546157909609297, + "language_loss": 1.07214928, + "learning_rate": 0.0008107844176832545, + "loss": 1.08748412, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.45703125, + "step": 60, + "time_per_iteration": 2.670180082321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01515203, + "balance_loss_mlp": 1.07155395, + "epoch": 0.011735282801077338, + "flos": 573175529472.0, + "grad_norm": 0.06932920743779293, + "language_loss": 1.09267807, + "learning_rate": 0.0008140576401132568, + "loss": 1.10783005, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.44921875, + "step": 61, + "time_per_iteration": 2.635917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01537914, + "balance_loss_mlp": 1.0965538, + "epoch": 0.0119276644863409, + "flos": 616716467712.0, + "grad_norm": 0.056166475672555005, + "language_loss": 1.10548615, + "learning_rate": 0.0008172776370494935, + "loss": 1.12086535, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.42578125, + "step": 62, + "time_per_iteration": 2.709764242172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.015397, + "balance_loss_mlp": 1.10024714, + "epoch": 0.012120046171604464, + "flos": 502084199424.0, + "grad_norm": 0.046962065793300374, + "language_loss": 1.17909575, + "learning_rate": 0.0008204461118185703, + "loss": 1.19449282, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.40625, + "step": 63, + "time_per_iteration": 2.5971004962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545078, + "balance_loss_mlp": 1.10943925, + "epoch": 0.012312427856868027, + "flos": 474301493760.0, + "grad_norm": 0.04671162143151921, + "language_loss": 1.07277906, + "learning_rate": 0.0008235646872681536, + "loss": 1.08822989, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 4.3671875, + "step": 64, + "time_per_iteration": 2.567622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01534227, + "balance_loss_mlp": 1.10240316, + "epoch": 0.012504809542131588, + "flos": 539470910976.0, + "grad_norm": 0.04435006978162803, + "language_loss": 1.0673492, + "learning_rate": 0.0008266349107584288, + "loss": 1.08269131, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 4.328125, + "step": 65, + "time_per_iteration": 2.6833384037017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149994, + "balance_loss_mlp": 1.07345641, + "epoch": 0.012697191227395151, + "flos": 609856567296.0, + "grad_norm": 0.04524096047594039, + "language_loss": 1.09403265, + "learning_rate": 0.0008296582587724851, + "loss": 1.10903215, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 4.2734375, + "step": 66, + "time_per_iteration": 2.692337989807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01482262, + "balance_loss_mlp": 1.05806744, + "epoch": 0.012889572912658714, + "flos": 769397460480.0, + "grad_norm": 0.04198159389490698, + "language_loss": 1.06809163, + "learning_rate": 0.0008326361411800136, + "loss": 1.08291411, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 4.25, + "step": 67, + "time_per_iteration": 2.923720598220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474655, + "balance_loss_mlp": 1.05503809, + "epoch": 0.013081954597922277, + "flos": 535020744192.0, + "grad_norm": 0.041919130945389606, + "language_loss": 1.07100165, + "learning_rate": 0.0008355699051851403, + "loss": 1.0857482, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 4.203125, + "step": 68, + "time_per_iteration": 2.7417044639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462817, + "balance_loss_mlp": 1.04701531, + "epoch": 0.01327433628318584, + "flos": 574180646400.0, + "grad_norm": 0.041322055356332446, + "language_loss": 1.14468551, + "learning_rate": 0.0008384608389860635, + "loss": 1.15931368, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 4.1640625, + "step": 69, + "time_per_iteration": 2.6545376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450151, + "balance_loss_mlp": 1.03930819, + "epoch": 0.013466717968449404, + "flos": 498259115520.0, + "grad_norm": 0.039605765449237204, + "language_loss": 1.04742777, + "learning_rate": 0.000841310175171381, + "loss": 1.06192923, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 4.11328125, + "step": 70, + "time_per_iteration": 2.5687999725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441096, + "balance_loss_mlp": 1.03101599, + "epoch": 0.013659099653712967, + "flos": 566621803008.0, + "grad_norm": 0.03646297128801074, + "language_loss": 1.03104186, + "learning_rate": 0.000844119093875517, + "loss": 1.04545283, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 4.1015625, + "step": 71, + "time_per_iteration": 2.698259115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433469, + "balance_loss_mlp": 1.02720368, + "epoch": 0.01385148133897653, + "flos": 574942715904.0, + "grad_norm": 0.02854119406997066, + "language_loss": 1.07372236, + "learning_rate": 0.0008468887257134666, + "loss": 1.08805704, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 4.06445312, + "step": 72, + "time_per_iteration": 2.7074387073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422625, + "balance_loss_mlp": 1.01941192, + "epoch": 0.014043863024240093, + "flos": 577958066688.0, + "grad_norm": 0.03113282173853564, + "language_loss": 1.10314119, + "learning_rate": 0.0008496201545131264, + "loss": 1.11736751, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 4.03515625, + "step": 73, + "time_per_iteration": 2.725660562515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425762, + "balance_loss_mlp": 1.02655351, + "epoch": 0.014236244709503656, + "flos": 940263883776.0, + "grad_norm": 0.033199488198319166, + "language_loss": 1.07624495, + "learning_rate": 0.0008523144198617317, + "loss": 1.0905025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.99414062, + "step": 74, + "time_per_iteration": 3.2577481269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437934, + "balance_loss_mlp": 1.04139662, + "epoch": 0.014428626394767219, + "flos": 529495603200.0, + "grad_norm": 0.03119178099318558, + "language_loss": 1.07016373, + "learning_rate": 0.0008549725194813783, + "loss": 1.08454299, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.96679688, + "step": 75, + "time_per_iteration": 2.727982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437754, + "balance_loss_mlp": 1.0446496, + "epoch": 0.014621008080030782, + "flos": 805282226688.0, + "grad_norm": 0.02968258762679391, + "language_loss": 1.06415534, + "learning_rate": 0.0008575954114472099, + "loss": 1.07853293, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.93164062, + "step": 76, + "time_per_iteration": 3.172807455062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143975, + "balance_loss_mlp": 1.04950643, + "epoch": 0.014813389765294343, + "flos": 698356521984.0, + "grad_norm": 0.031905123056971844, + "language_loss": 1.03629625, + "learning_rate": 0.0008601840162606118, + "loss": 1.05069387, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.90234375, + "step": 77, + "time_per_iteration": 3.029114007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438585, + "balance_loss_mlp": 1.05158365, + "epoch": 0.015005771450557906, + "flos": 598164464640.0, + "grad_norm": 0.026994348673938514, + "language_loss": 1.09661531, + "learning_rate": 0.000862739218788641, + "loss": 1.11100101, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.86914062, + "step": 78, + "time_per_iteration": 2.795952320098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440626, + "balance_loss_mlp": 1.05705774, + "epoch": 0.01519815313582147, + "flos": 550492268544.0, + "grad_norm": 0.029495859587709627, + "language_loss": 1.07574832, + "learning_rate": 0.0008652618700799138, + "loss": 1.09015465, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.83789062, + "step": 79, + "time_per_iteration": 2.6552224159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430975, + "balance_loss_mlp": 1.05084014, + "epoch": 0.015390534821085032, + "flos": 431440032768.0, + "grad_norm": 0.037998818197719206, + "language_loss": 1.07206631, + "learning_rate": 0.0008677527890662774, + "loss": 1.08637595, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.80664062, + "step": 80, + "time_per_iteration": 2.530073881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424927, + "balance_loss_mlp": 1.04727161, + "epoch": 0.015582916506348595, + "flos": 525184424448.0, + "grad_norm": 0.03521308344632083, + "language_loss": 1.08168781, + "learning_rate": 0.0008702127641587799, + "loss": 1.09593713, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.78125, + "step": 81, + "time_per_iteration": 2.6248533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01426595, + "balance_loss_mlp": 1.05141926, + "epoch": 0.015775298191612157, + "flos": 576616576512.0, + "grad_norm": 0.026523126631237747, + "language_loss": 1.036394, + "learning_rate": 0.0008726425547457192, + "loss": 1.05065989, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.75585938, + "step": 82, + "time_per_iteration": 2.759159564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424967, + "balance_loss_mlp": 1.05303442, + "epoch": 0.01596767987687572, + "flos": 611439103488.0, + "grad_norm": 0.03656915183129864, + "language_loss": 1.03032446, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457414, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.72265625, + "step": 83, + "time_per_iteration": 2.739105224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431924, + "balance_loss_mlp": 1.06151688, + "epoch": 0.016160061562139283, + "flos": 568232537088.0, + "grad_norm": 0.03323001720600938, + "language_loss": 1.08511543, + "learning_rate": 0.0008774144832015932, + "loss": 1.09943461, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.70703125, + "step": 84, + "time_per_iteration": 2.7144806385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02085876, + "balance_loss_mlp": 1.68762207, + "epoch": 0.016352443247402846, + "flos": 1414499701248.0, + "grad_norm": 0.1388747380481991, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76860189, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.984375, + "step": 85, + "time_per_iteration": 4.569611072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450774, + "balance_loss_mlp": 1.08532572, + "epoch": 0.01654482493266641, + "flos": 731785165824.0, + "grad_norm": 0.04601998260491519, + "language_loss": 1.03772068, + "learning_rate": 0.0008820741205014318, + "loss": 1.05222845, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.65625, + "step": 86, + "time_per_iteration": 2.8604419231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014606, + "balance_loss_mlp": 1.09744096, + "epoch": 0.016737206617929972, + "flos": 537404281344.0, + "grad_norm": 0.03433335749497543, + "language_loss": 1.05140662, + "learning_rate": 0.0008843634575408404, + "loss": 1.06601262, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.62695312, + "step": 87, + "time_per_iteration": 2.677731513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145769, + "balance_loss_mlp": 1.09777355, + "epoch": 0.016929588303193535, + "flos": 538129420800.0, + "grad_norm": 0.05036212092144492, + "language_loss": 1.06815004, + "learning_rate": 0.0008866266301555082, + "loss": 1.08272696, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.59765625, + "step": 88, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145347, + "balance_loss_mlp": 1.09622347, + "epoch": 0.017121969988457098, + "flos": 527791543296.0, + "grad_norm": 0.030252065691096418, + "language_loss": 1.07441962, + "learning_rate": 0.0008888642296509615, + "loss": 1.08895445, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.56445312, + "step": 89, + "time_per_iteration": 2.590280771255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145473, + "balance_loss_mlp": 1.10034442, + "epoch": 0.01731435167372066, + "flos": 626767636992.0, + "grad_norm": 0.041554939890322294, + "language_loss": 1.12743318, + "learning_rate": 0.0008910768275115906, + "loss": 1.14198053, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.54101562, + "step": 90, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145373, + "balance_loss_mlp": 1.10220587, + "epoch": 0.017506733358984224, + "flos": 497384254464.0, + "grad_norm": 0.05646737130307679, + "language_loss": 1.07978606, + "learning_rate": 0.0008932649762767675, + "loss": 1.0943234, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.50976562, + "step": 91, + "time_per_iteration": 2.5964808464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01457202, + "balance_loss_mlp": 1.10911036, + "epoch": 0.017699115044247787, + "flos": 747217758720.0, + "grad_norm": 0.04050166442287704, + "language_loss": 1.1018101, + "learning_rate": 0.0008954292103690864, + "loss": 1.11638212, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.47851562, + "step": 92, + "time_per_iteration": 2.9288997650146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01459372, + "balance_loss_mlp": 1.11395121, + "epoch": 0.01789149672951135, + "flos": 516520407552.0, + "grad_norm": 0.054281950557984966, + "language_loss": 1.12496912, + "learning_rate": 0.0008975700468778296, + "loss": 1.13956285, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.45117188, + "step": 93, + "time_per_iteration": 2.5800487995147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462727, + "balance_loss_mlp": 1.11978543, + "epoch": 0.018083878414774913, + "flos": 587229702144.0, + "grad_norm": 0.04557553976021738, + "language_loss": 1.05795836, + "learning_rate": 0.0008996879863005366, + "loss": 1.07258558, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.42578125, + "step": 94, + "time_per_iteration": 2.6668198108673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146929, + "balance_loss_mlp": 1.12882805, + "epoch": 0.018276260100038477, + "flos": 498369905664.0, + "grad_norm": 0.055406629054909326, + "language_loss": 1.06168532, + "learning_rate": 0.0009017835132453337, + "loss": 1.07637823, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.40234375, + "step": 95, + "time_per_iteration": 2.588728904724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146889, + "balance_loss_mlp": 1.1312896, + "epoch": 0.01846864178530204, + "flos": 641232043008.0, + "grad_norm": 0.04012691806662063, + "language_loss": 1.05874133, + "learning_rate": 0.0009038570970964896, + "loss": 1.0734303, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.37890625, + "step": 96, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464817, + "balance_loss_mlp": 1.12912345, + "epoch": 0.018661023470565603, + "flos": 512667125760.0, + "grad_norm": 0.027884025705687265, + "language_loss": 1.03269148, + "learning_rate": 0.0009059091926454854, + "loss": 1.04733968, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.359375, + "step": 97, + "time_per_iteration": 2.6100950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470726, + "balance_loss_mlp": 1.13694024, + "epoch": 0.018853405155829166, + "flos": 932696308224.0, + "grad_norm": 0.03936003805775877, + "language_loss": 1.02435613, + "learning_rate": 0.0009079402406897198, + "loss": 1.03906357, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.33984375, + "step": 98, + "time_per_iteration": 3.2489542961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467854, + "balance_loss_mlp": 1.13616598, + "epoch": 0.01904578684109273, + "flos": 577586764800.0, + "grad_norm": 0.036005296184057074, + "language_loss": 1.04073858, + "learning_rate": 0.0009099506686008212, + "loss": 1.05541718, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.31835938, + "step": 99, + "time_per_iteration": 2.7905051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467812, + "balance_loss_mlp": 1.13822246, + "epoch": 0.019238168526356292, + "flos": 559520856576.0, + "grad_norm": 0.02696843746399884, + "language_loss": 1.07409596, + "learning_rate": 0.0009119408908644013, + "loss": 1.08877409, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.296875, + "step": 100, + "time_per_iteration": 2.7075607776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456893, + "balance_loss_mlp": 1.12882876, + "epoch": 0.019430550211619855, + "flos": 725103184896.0, + "grad_norm": 0.03304065923870771, + "language_loss": 1.12780023, + "learning_rate": 0.0009139113095929519, + "loss": 1.14236927, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.28125, + "step": 101, + "time_per_iteration": 2.86230731010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460167, + "balance_loss_mlp": 1.13439226, + "epoch": 0.019622931896883418, + "flos": 500456001024.0, + "grad_norm": 0.030619133870748612, + "language_loss": 1.06594038, + "learning_rate": 0.0009158623150134762, + "loss": 1.08054209, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 3.2578125, + "step": 102, + "time_per_iteration": 2.563690185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458611, + "balance_loss_mlp": 1.13569677, + "epoch": 0.01981531358214698, + "flos": 510281587200.0, + "grad_norm": 0.03276303076426602, + "language_loss": 1.06164801, + "learning_rate": 0.000917794285931332, + "loss": 1.0762341, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 3.22851562, + "step": 103, + "time_per_iteration": 2.6599903106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462945, + "balance_loss_mlp": 1.1421293, + "epoch": 0.020007695267410544, + "flos": 522392655360.0, + "grad_norm": 0.026505304013468463, + "language_loss": 0.98227251, + "learning_rate": 0.0009197075901716639, + "loss": 0.99690199, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 3.20703125, + "step": 104, + "time_per_iteration": 2.726245880126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469463, + "balance_loss_mlp": 1.14998221, + "epoch": 0.020200076952674107, + "flos": 534443324928.0, + "grad_norm": 0.029933884589862427, + "language_loss": 1.08736229, + "learning_rate": 0.0009216025849997171, + "loss": 1.10205698, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 3.19335938, + "step": 105, + "time_per_iteration": 2.8023486137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468836, + "balance_loss_mlp": 1.15221632, + "epoch": 0.020392458637937667, + "flos": 686082270720.0, + "grad_norm": 0.024520994280375335, + "language_loss": 1.03054178, + "learning_rate": 0.0009234796175212258, + "loss": 1.04523015, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 3.1640625, + "step": 106, + "time_per_iteration": 2.9396088123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469456, + "balance_loss_mlp": 1.15512502, + "epoch": 0.02058484032320123, + "flos": 703414307328.0, + "grad_norm": 0.02898567585615155, + "language_loss": 1.07201982, + "learning_rate": 0.000925339025064007, + "loss": 1.08671439, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 3.140625, + "step": 107, + "time_per_iteration": 2.9473297595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_mlp": 1.16001439, + "epoch": 0.020777222008464793, + "flos": 640326982656.0, + "grad_norm": 0.02770789473723963, + "language_loss": 0.99879742, + "learning_rate": 0.0009271811355418027, + "loss": 1.01352561, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 3.125, + "step": 108, + "time_per_iteration": 2.8551387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469504, + "balance_loss_mlp": 1.15803361, + "epoch": 0.020969603693728356, + "flos": 683320700928.0, + "grad_norm": 0.029161506766480293, + "language_loss": 1.06637371, + "learning_rate": 0.0009290062678013548, + "loss": 1.08106875, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 3.11132812, + "step": 109, + "time_per_iteration": 2.821951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468675, + "balance_loss_mlp": 1.15949392, + "epoch": 0.02116198537899192, + "flos": 534419129856.0, + "grad_norm": 0.03188637458086245, + "language_loss": 1.05070233, + "learning_rate": 0.0009308147319536321, + "loss": 1.06538928, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 3.08789062, + "step": 110, + "time_per_iteration": 2.6315042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469018, + "balance_loss_mlp": 1.16212535, + "epoch": 0.021354367064255482, + "flos": 718727377920.0, + "grad_norm": 0.030955966903197116, + "language_loss": 1.11490715, + "learning_rate": 0.0009326068296900676, + "loss": 1.12959719, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 3.06445312, + "step": 111, + "time_per_iteration": 2.8208162784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474326, + "balance_loss_mlp": 1.16934085, + "epoch": 0.021546748749519045, + "flos": 520623467520.0, + "grad_norm": 0.027870670355515197, + "language_loss": 1.02138007, + "learning_rate": 0.0009343828545846161, + "loss": 1.03612328, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 3.04492188, + "step": 112, + "time_per_iteration": 2.759277105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474098, + "balance_loss_mlp": 1.17063916, + "epoch": 0.021739130434782608, + "flos": 506161062912.0, + "grad_norm": 0.03372988233582904, + "language_loss": 1.06662297, + "learning_rate": 0.0009361430923823841, + "loss": 1.08136404, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 3.02929688, + "step": 113, + "time_per_iteration": 2.565107822418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471087, + "balance_loss_mlp": 1.1693449, + "epoch": 0.02193151212004617, + "flos": 464426242560.0, + "grad_norm": 0.03803370713592907, + "language_loss": 1.10115385, + "learning_rate": 0.0009378878212755459, + "loss": 1.11586463, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 3.01171875, + "step": 114, + "time_per_iteration": 2.491929292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471993, + "balance_loss_mlp": 1.17253923, + "epoch": 0.022123893805309734, + "flos": 553331701248.0, + "grad_norm": 0.029753755152528143, + "language_loss": 1.00006115, + "learning_rate": 0.0009396173121672103, + "loss": 1.014781, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.98828125, + "step": 115, + "time_per_iteration": 2.6869561672210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473585, + "balance_loss_mlp": 1.1754663, + "epoch": 0.022316275490573297, + "flos": 637378761216.0, + "grad_norm": 0.032022590728611564, + "language_loss": 1.0593642, + "learning_rate": 0.0009413318289238633, + "loss": 1.07410002, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.97460938, + "step": 116, + "time_per_iteration": 2.7639846801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474428, + "balance_loss_mlp": 1.17859828, + "epoch": 0.02250865717583686, + "flos": 800315039232.0, + "grad_norm": 0.032750944460810345, + "language_loss": 0.98115921, + "learning_rate": 0.0009430316286169771, + "loss": 0.99590349, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.95117188, + "step": 117, + "time_per_iteration": 3.020703077316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469481, + "balance_loss_mlp": 1.17536783, + "epoch": 0.022701038861100423, + "flos": 457062782976.0, + "grad_norm": 0.027209249322999743, + "language_loss": 1.0327785, + "learning_rate": 0.0009447169617543361, + "loss": 1.04747331, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.9375, + "step": 118, + "time_per_iteration": 2.5938501358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466386, + "balance_loss_mlp": 1.17437065, + "epoch": 0.022893420546363986, + "flos": 584186153472.0, + "grad_norm": 0.028075325054819567, + "language_loss": 1.10005641, + "learning_rate": 0.0009463880725016029, + "loss": 1.11472011, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.91992188, + "step": 119, + "time_per_iteration": 2.7082488536834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467196, + "balance_loss_mlp": 1.17861414, + "epoch": 0.02308580223162755, + "flos": 562477810176.0, + "grad_norm": 0.032360539397207934, + "language_loss": 1.05048943, + "learning_rate": 0.0009480451988946134, + "loss": 1.06516147, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.89257812, + "step": 120, + "time_per_iteration": 2.808687686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461098, + "balance_loss_mlp": 1.17423272, + "epoch": 0.023278183916891113, + "flos": 772645125120.0, + "grad_norm": 0.033180722862994706, + "language_loss": 1.06113267, + "learning_rate": 0.0009496885730428627, + "loss": 1.07574379, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.875, + "step": 121, + "time_per_iteration": 3.0043137073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466426, + "balance_loss_mlp": 1.18070555, + "epoch": 0.023470565602154676, + "flos": 554430144000.0, + "grad_norm": 0.030787275004595428, + "language_loss": 1.04567683, + "learning_rate": 0.0009513184213246156, + "loss": 1.06034112, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.86328125, + "step": 122, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462554, + "balance_loss_mlp": 1.17835939, + "epoch": 0.02366294728741824, + "flos": 561166519296.0, + "grad_norm": 0.030499039091632818, + "language_loss": 1.08099937, + "learning_rate": 0.0009529349645740552, + "loss": 1.09562504, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.84765625, + "step": 123, + "time_per_iteration": 2.69850492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460088, + "balance_loss_mlp": 1.17741883, + "epoch": 0.0238553289726818, + "flos": 469516955136.0, + "grad_norm": 0.026549221517309443, + "language_loss": 1.06623578, + "learning_rate": 0.0009545384182608524, + "loss": 1.08083653, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.83203125, + "step": 124, + "time_per_iteration": 2.5435874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462583, + "balance_loss_mlp": 1.18144011, + "epoch": 0.024047710657945365, + "flos": 561103392768.0, + "grad_norm": 0.03287811385355005, + "language_loss": 1.04055512, + "learning_rate": 0.0009561289926625252, + "loss": 1.05518079, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.81640625, + "step": 125, + "time_per_iteration": 2.6661720275878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464029, + "balance_loss_mlp": 1.18460226, + "epoch": 0.024240092343208928, + "flos": 505770295296.0, + "grad_norm": 0.030159442314643806, + "language_loss": 1.08985233, + "learning_rate": 0.0009577068930299292, + "loss": 1.10449266, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.79882812, + "step": 126, + "time_per_iteration": 2.596027135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456959, + "balance_loss_mlp": 1.17944014, + "epoch": 0.02443247402847249, + "flos": 436752325632.0, + "grad_norm": 0.03465787530540315, + "language_loss": 1.04454637, + "learning_rate": 0.0009592723197462087, + "loss": 1.05911589, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.77929688, + "step": 127, + "time_per_iteration": 2.6355836391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145855, + "balance_loss_mlp": 1.18236613, + "epoch": 0.024624855713736054, + "flos": 685068421632.0, + "grad_norm": 0.03103018628328697, + "language_loss": 1.00976562, + "learning_rate": 0.0009608254684795125, + "loss": 1.02435124, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.765625, + "step": 128, + "time_per_iteration": 2.956745147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01452077, + "balance_loss_mlp": 1.17741859, + "epoch": 0.024817237398999614, + "flos": 526113679872.0, + "grad_norm": 0.03378324138815482, + "language_loss": 1.03947771, + "learning_rate": 0.0009623665303297678, + "loss": 1.05399847, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.75, + "step": 129, + "time_per_iteration": 2.762612819671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145448, + "balance_loss_mlp": 1.18115723, + "epoch": 0.025009619084263177, + "flos": 656886216192.0, + "grad_norm": 0.03318348770393379, + "language_loss": 1.08023834, + "learning_rate": 0.0009638956919697878, + "loss": 1.09478307, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.73339844, + "step": 130, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453293, + "balance_loss_mlp": 1.18130565, + "epoch": 0.02520200076952674, + "flos": 455369456640.0, + "grad_norm": 0.028803226470227133, + "language_loss": 1.00211501, + "learning_rate": 0.0009654131357809714, + "loss": 1.01664793, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.71875, + "step": 131, + "time_per_iteration": 2.593409776687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454951, + "balance_loss_mlp": 1.18534708, + "epoch": 0.025394382454790303, + "flos": 841268324352.0, + "grad_norm": 0.035993676074610494, + "language_loss": 1.09494662, + "learning_rate": 0.0009669190399838441, + "loss": 1.10949612, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.69824219, + "step": 132, + "time_per_iteration": 3.1307294368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454062, + "balance_loss_mlp": 1.18588877, + "epoch": 0.025586764140053866, + "flos": 582228312576.0, + "grad_norm": 0.03305283337163912, + "language_loss": 1.02299893, + "learning_rate": 0.0009684135787636724, + "loss": 1.03753948, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.68359375, + "step": 133, + "time_per_iteration": 2.8118627071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454726, + "balance_loss_mlp": 1.18798327, + "epoch": 0.02577914582531743, + "flos": 791677218816.0, + "grad_norm": 0.03011124606519955, + "language_loss": 1.06380379, + "learning_rate": 0.0009698969223913726, + "loss": 1.07835102, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.66894531, + "step": 134, + "time_per_iteration": 3.0371806621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450237, + "balance_loss_mlp": 1.18454385, + "epoch": 0.025971527510580992, + "flos": 596062906368.0, + "grad_norm": 0.030569012833979448, + "language_loss": 1.08986592, + "learning_rate": 0.0009713692373399265, + "loss": 1.10436833, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.65820312, + "step": 135, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01684837, + "balance_loss_mlp": 1.39873505, + "epoch": 0.026163909195844555, + "flos": 1581074411520.0, + "grad_norm": 0.08870187959024729, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81141067, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.8671875, + "step": 136, + "time_per_iteration": 5.94019627571106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0161422, + "balance_loss_mlp": 1.33116913, + "epoch": 0.026356290881108118, + "flos": 1505160886272.0, + "grad_norm": 0.07212137850421584, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79425257, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.8359375, + "step": 137, + "time_per_iteration": 4.865153074264526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469938, + "balance_loss_mlp": 1.20901299, + "epoch": 0.02654867256637168, + "flos": 598340382720.0, + "grad_norm": 0.040535745966457745, + "language_loss": 1.01652551, + "learning_rate": 0.0009757216201974225, + "loss": 1.03122485, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.609375, + "step": 138, + "time_per_iteration": 2.8955435752868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487517, + "balance_loss_mlp": 1.22802222, + "epoch": 0.026741054251635244, + "flos": 546135427584.0, + "grad_norm": 0.04340470282065083, + "language_loss": 1.06732666, + "learning_rate": 0.0009771514130396581, + "loss": 1.08220184, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.59472656, + "step": 139, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01498511, + "balance_loss_mlp": 1.24044681, + "epoch": 0.026933435936898807, + "flos": 507845657088.0, + "grad_norm": 0.04879945782970011, + "language_loss": 1.07520163, + "learning_rate": 0.00097857095638274, + "loss": 1.09018672, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.58007812, + "step": 140, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01492411, + "balance_loss_mlp": 1.23558652, + "epoch": 0.02712581762216237, + "flos": 742253299200.0, + "grad_norm": 0.043929969627725114, + "language_loss": 0.98754954, + "learning_rate": 0.0009799803961288726, + "loss": 1.00247359, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.5703125, + "step": 141, + "time_per_iteration": 3.008998394012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470778, + "balance_loss_mlp": 1.21567059, + "epoch": 0.027318199307425933, + "flos": 849777890304.0, + "grad_norm": 0.03716164217421175, + "language_loss": 1.04960537, + "learning_rate": 0.000981379875086876, + "loss": 1.06431305, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.55371094, + "step": 142, + "time_per_iteration": 3.057098865509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469037, + "balance_loss_mlp": 1.21535933, + "epoch": 0.027510580992689496, + "flos": 576638043648.0, + "grad_norm": 0.03712962317624948, + "language_loss": 1.00046849, + "learning_rate": 0.0009827695330590185, + "loss": 1.01515889, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.5390625, + "step": 143, + "time_per_iteration": 2.638338327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450228, + "balance_loss_mlp": 1.19750416, + "epoch": 0.02770296267795306, + "flos": 773789230080.0, + "grad_norm": 0.030455330453953735, + "language_loss": 0.99027133, + "learning_rate": 0.0009841495069248256, + "loss": 1.00477362, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.52929688, + "step": 144, + "time_per_iteration": 2.981438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441391, + "balance_loss_mlp": 1.19009781, + "epoch": 0.027895344363216622, + "flos": 570448888320.0, + "grad_norm": 0.031624263879455494, + "language_loss": 0.98723662, + "learning_rate": 0.0009855199307219871, + "loss": 1.00165045, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.51464844, + "step": 145, + "time_per_iteration": 2.6923046112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440125, + "balance_loss_mlp": 1.1903578, + "epoch": 0.028087726048480186, + "flos": 548408174592.0, + "grad_norm": 0.029995844711875903, + "language_loss": 1.00586843, + "learning_rate": 0.0009868809357244854, + "loss": 1.02026975, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.49902344, + "step": 146, + "time_per_iteration": 2.6284868717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01436833, + "balance_loss_mlp": 1.18782902, + "epoch": 0.02828010773374375, + "flos": 525872633856.0, + "grad_norm": 0.03288909570778387, + "language_loss": 1.05042541, + "learning_rate": 0.0009882326505180556, + "loss": 1.06479371, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.49121094, + "step": 147, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425728, + "balance_loss_mlp": 1.1783452, + "epoch": 0.02847248941900731, + "flos": 773771765760.0, + "grad_norm": 0.031738987003727674, + "language_loss": 1.02499485, + "learning_rate": 0.0009895752010730906, + "loss": 1.03925204, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.47460938, + "step": 148, + "time_per_iteration": 2.9316182136535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424571, + "balance_loss_mlp": 1.17785549, + "epoch": 0.028664871104270875, + "flos": 535469908992.0, + "grad_norm": 0.028294299214345536, + "language_loss": 1.0900923, + "learning_rate": 0.0009909087108150867, + "loss": 1.10433793, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.46777344, + "step": 149, + "time_per_iteration": 2.697423219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014274, + "balance_loss_mlp": 1.18182933, + "epoch": 0.028857252789534438, + "flos": 368604487680.0, + "grad_norm": 0.03525963963400797, + "language_loss": 1.09753942, + "learning_rate": 0.0009922333006927371, + "loss": 1.11181331, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.45605469, + "step": 150, + "time_per_iteration": 2.483644723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433542, + "balance_loss_mlp": 1.18911529, + "epoch": 0.029049634474798, + "flos": 516483477504.0, + "grad_norm": 0.03341635886009217, + "language_loss": 1.03220332, + "learning_rate": 0.0009935490892437632, + "loss": 1.04653883, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.44433594, + "step": 151, + "time_per_iteration": 2.604599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438911, + "balance_loss_mlp": 1.19553363, + "epoch": 0.029242016160061564, + "flos": 589348724736.0, + "grad_norm": 0.030166761621646727, + "language_loss": 1.01782072, + "learning_rate": 0.0009948561926585687, + "loss": 1.03220987, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.43359375, + "step": 152, + "time_per_iteration": 2.7724709510803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445258, + "balance_loss_mlp": 1.20350146, + "epoch": 0.029434397845325123, + "flos": 553136317440.0, + "grad_norm": 0.030739210798008048, + "language_loss": 1.05873716, + "learning_rate": 0.0009961547248418122, + "loss": 1.07318974, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.41699219, + "step": 153, + "time_per_iteration": 2.6247737407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440878, + "balance_loss_mlp": 1.19988418, + "epoch": 0.029626779530588686, + "flos": 604607400960.0, + "grad_norm": 0.030186385343499288, + "language_loss": 1.02632022, + "learning_rate": 0.0009974447974719707, + "loss": 1.04072905, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.40917969, + "step": 154, + "time_per_iteration": 2.730053663253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431891, + "balance_loss_mlp": 1.19194651, + "epoch": 0.02981916121585225, + "flos": 622217413632.0, + "grad_norm": 0.02801027733601246, + "language_loss": 1.04305005, + "learning_rate": 0.0009987265200589763, + "loss": 1.05736899, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.3984375, + "step": 155, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423605, + "balance_loss_mlp": 1.18537688, + "epoch": 0.030011542901115813, + "flos": 662879987712.0, + "grad_norm": 0.0349007823819893, + "language_loss": 1.04218483, + "learning_rate": 0.001, + "loss": 1.05642092, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.38085938, + "step": 156, + "time_per_iteration": 2.8801028728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420835, + "balance_loss_mlp": 1.18289316, + "epoch": 0.030203924586379376, + "flos": 652818084864.0, + "grad_norm": 0.029403473562715665, + "language_loss": 1.01930022, + "learning_rate": 0.0009999999029413921, + "loss": 1.03350854, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.37792969, + "step": 157, + "time_per_iteration": 2.8549368381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415444, + "balance_loss_mlp": 1.17921925, + "epoch": 0.03039630627164294, + "flos": 532443824640.0, + "grad_norm": 0.03295212675068383, + "language_loss": 1.02716291, + "learning_rate": 0.0009999996117656068, + "loss": 1.04131734, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.36035156, + "step": 158, + "time_per_iteration": 2.6989729404449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_mlp": 1.17530584, + "epoch": 0.030588687956906502, + "flos": 587294830080.0, + "grad_norm": 0.0291076208082698, + "language_loss": 0.96305156, + "learning_rate": 0.0009999991264727564, + "loss": 0.97715545, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.34863281, + "step": 159, + "time_per_iteration": 2.7609338760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140999, + "balance_loss_mlp": 1.1752907, + "epoch": 0.030781069642170065, + "flos": 514286592000.0, + "grad_norm": 0.030494101007586163, + "language_loss": 1.0725081, + "learning_rate": 0.0009999984470630296, + "loss": 1.08660805, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.34472656, + "step": 160, + "time_per_iteration": 2.5805158615112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410287, + "balance_loss_mlp": 1.17711365, + "epoch": 0.030973451327433628, + "flos": 719559304704.0, + "grad_norm": 0.025032822394785544, + "language_loss": 0.95934659, + "learning_rate": 0.0009999975735366902, + "loss": 0.97344947, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.32910156, + "step": 161, + "time_per_iteration": 3.078343629837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409543, + "balance_loss_mlp": 1.17675149, + "epoch": 0.03116583301269719, + "flos": 1111614400512.0, + "grad_norm": 0.029903967107167622, + "language_loss": 0.98009437, + "learning_rate": 0.0009999965058940775, + "loss": 0.99418974, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.32519531, + "step": 162, + "time_per_iteration": 3.49137544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_mlp": 1.17689729, + "epoch": 0.031358214697960754, + "flos": 451833082368.0, + "grad_norm": 0.11336845133687022, + "language_loss": 1.0463953, + "learning_rate": 0.0009999952441356057, + "loss": 1.06047678, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.30957031, + "step": 163, + "time_per_iteration": 2.531280755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406979, + "balance_loss_mlp": 1.17676246, + "epoch": 0.031550596383224314, + "flos": 1257085658112.0, + "grad_norm": 0.03183858769064714, + "language_loss": 1.05248928, + "learning_rate": 0.000999993788261765, + "loss": 1.06655908, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.30078125, + "step": 164, + "time_per_iteration": 3.5714328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408503, + "balance_loss_mlp": 1.17943025, + "epoch": 0.03174297806848788, + "flos": 669322924032.0, + "grad_norm": 0.03191781964215587, + "language_loss": 1.06263065, + "learning_rate": 0.00099999213827312, + "loss": 1.07671571, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.29101562, + "step": 165, + "time_per_iteration": 2.7947938442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409995, + "balance_loss_mlp": 1.18101788, + "epoch": 0.03193535975375144, + "flos": 552363514368.0, + "grad_norm": 0.03891580789868065, + "language_loss": 1.01044345, + "learning_rate": 0.000999990294170312, + "loss": 1.0245434, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.29003906, + "step": 166, + "time_per_iteration": 2.6462574005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140342, + "balance_loss_mlp": 1.17577803, + "epoch": 0.032127741439015006, + "flos": 544739543040.0, + "grad_norm": 0.03757156138401865, + "language_loss": 1.05309296, + "learning_rate": 0.0009999882559540566, + "loss": 1.06712723, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.27636719, + "step": 167, + "time_per_iteration": 2.629549503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140941, + "balance_loss_mlp": 1.18234003, + "epoch": 0.032320123124278566, + "flos": 549513348096.0, + "grad_norm": 0.028659149555752484, + "language_loss": 1.01791751, + "learning_rate": 0.000999986023625145, + "loss": 1.03201175, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.27050781, + "step": 168, + "time_per_iteration": 2.7051401138305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01589355, + "balance_loss_mlp": 1.35360718, + "epoch": 0.03251250480954213, + "flos": 1308815430144.0, + "grad_norm": 0.08201951270186027, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80513763, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.35546875, + "step": 169, + "time_per_iteration": 4.9428627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407645, + "balance_loss_mlp": 1.18257797, + "epoch": 0.03270488649480569, + "flos": 562201835520.0, + "grad_norm": 0.03970113019311383, + "language_loss": 1.02863848, + "learning_rate": 0.0009999809766328958, + "loss": 1.04271495, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.25, + "step": 170, + "time_per_iteration": 2.675811529159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415662, + "balance_loss_mlp": 1.19193029, + "epoch": 0.03289726818006926, + "flos": 483338813952.0, + "grad_norm": 0.03325277263778645, + "language_loss": 1.04760146, + "learning_rate": 0.0009999781619715177, + "loss": 1.06175804, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.23632812, + "step": 171, + "time_per_iteration": 2.5431392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01419714, + "balance_loss_mlp": 1.1972214, + "epoch": 0.03308964986533282, + "flos": 675820254720.0, + "grad_norm": 0.02950894161591202, + "language_loss": 1.04164565, + "learning_rate": 0.000999975153201402, + "loss": 1.05584288, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.22363281, + "step": 172, + "time_per_iteration": 2.812837600708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422366, + "balance_loss_mlp": 1.20044637, + "epoch": 0.033282031550596385, + "flos": 610340660736.0, + "grad_norm": 0.03086814843966846, + "language_loss": 1.02532911, + "learning_rate": 0.0009999719503237174, + "loss": 1.03955269, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 2.21777344, + "step": 173, + "time_per_iteration": 2.755462646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416936, + "balance_loss_mlp": 1.1959697, + "epoch": 0.033474413235859944, + "flos": 468995931648.0, + "grad_norm": 0.048603642070708566, + "language_loss": 1.1131072, + "learning_rate": 0.0009999685533397073, + "loss": 1.12727666, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 2.20800781, + "step": 174, + "time_per_iteration": 2.566751003265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01414495, + "balance_loss_mlp": 1.19438744, + "epoch": 0.03366679492112351, + "flos": 580714907136.0, + "grad_norm": 0.03243683176756354, + "language_loss": 1.02908182, + "learning_rate": 0.00099996496225069, + "loss": 1.04322672, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 2.19921875, + "step": 175, + "time_per_iteration": 2.67861008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407523, + "balance_loss_mlp": 1.1883682, + "epoch": 0.03385917660638707, + "flos": 638885435904.0, + "grad_norm": 0.029120554083078395, + "language_loss": 1.05784094, + "learning_rate": 0.0009999611770580604, + "loss": 1.0719161, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 2.18945312, + "step": 176, + "time_per_iteration": 2.8410942554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401607, + "balance_loss_mlp": 1.18302441, + "epoch": 0.03405155829165064, + "flos": 442739366400.0, + "grad_norm": 0.031490867136515936, + "language_loss": 1.04703283, + "learning_rate": 0.0009999571977632876, + "loss": 1.06104875, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 2.18359375, + "step": 177, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399051, + "balance_loss_mlp": 1.1813277, + "epoch": 0.034243939976914196, + "flos": 467274407424.0, + "grad_norm": 0.029366691437037535, + "language_loss": 1.0724479, + "learning_rate": 0.0009999530243679166, + "loss": 1.08643842, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 2.17480469, + "step": 178, + "time_per_iteration": 2.5423247814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01392432, + "balance_loss_mlp": 1.17556691, + "epoch": 0.03443632166217776, + "flos": 780712257024.0, + "grad_norm": 0.02507202069561695, + "language_loss": 1.01653552, + "learning_rate": 0.0009999486568735675, + "loss": 1.03045988, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 2.16601562, + "step": 179, + "time_per_iteration": 3.111632823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381684, + "balance_loss_mlp": 1.16567647, + "epoch": 0.03462870334744132, + "flos": 1265758407168.0, + "grad_norm": 0.027829136834509844, + "language_loss": 1.02053452, + "learning_rate": 0.0009999440952819362, + "loss": 1.03435147, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 2.15722656, + "step": 180, + "time_per_iteration": 3.6354756355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375883, + "balance_loss_mlp": 1.16035271, + "epoch": 0.03482108503270489, + "flos": 608302228992.0, + "grad_norm": 0.033531921209289, + "language_loss": 1.02966988, + "learning_rate": 0.0009999393395947935, + "loss": 1.04342866, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 2.15234375, + "step": 181, + "time_per_iteration": 2.8509652614593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372611, + "balance_loss_mlp": 1.15774834, + "epoch": 0.03501346671796845, + "flos": 539314458624.0, + "grad_norm": 0.029990628161131794, + "language_loss": 1.05946589, + "learning_rate": 0.0009999343898139858, + "loss": 1.07319212, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 2.14550781, + "step": 182, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375908, + "balance_loss_mlp": 1.16161704, + "epoch": 0.035205848403232015, + "flos": 519498828288.0, + "grad_norm": 0.03419998284579487, + "language_loss": 1.04830694, + "learning_rate": 0.0009999292459414348, + "loss": 1.06206608, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 2.13964844, + "step": 183, + "time_per_iteration": 2.563997983932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386507, + "balance_loss_mlp": 1.17269289, + "epoch": 0.035398230088495575, + "flos": 473333306880.0, + "grad_norm": 0.03346089667402367, + "language_loss": 1.09292293, + "learning_rate": 0.0009999239079791374, + "loss": 1.10678792, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 2.13476562, + "step": 184, + "time_per_iteration": 2.5561137199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387981, + "balance_loss_mlp": 1.17512131, + "epoch": 0.03559061177375914, + "flos": 513094823424.0, + "grad_norm": 0.03551516541146116, + "language_loss": 1.01857162, + "learning_rate": 0.0009999183759291659, + "loss": 1.03245139, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 2.125, + "step": 185, + "time_per_iteration": 2.689763307571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383562, + "balance_loss_mlp": 1.17108345, + "epoch": 0.0357829934590227, + "flos": 478350159360.0, + "grad_norm": 0.03945465081959485, + "language_loss": 1.04534364, + "learning_rate": 0.0009999126497936682, + "loss": 1.05917931, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 2.12109375, + "step": 186, + "time_per_iteration": 2.5142176151275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375295, + "balance_loss_mlp": 1.16415167, + "epoch": 0.03597537514428627, + "flos": 645884324352.0, + "grad_norm": 0.029215470851159726, + "language_loss": 1.06864357, + "learning_rate": 0.0009999067295748676, + "loss": 1.08239663, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 2.10742188, + "step": 187, + "time_per_iteration": 2.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370561, + "balance_loss_mlp": 1.16056204, + "epoch": 0.03616775682954983, + "flos": 582269245440.0, + "grad_norm": 0.03159066859467708, + "language_loss": 1.0519886, + "learning_rate": 0.000999900615275062, + "loss": 1.06569433, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 2.09570312, + "step": 188, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01368603, + "balance_loss_mlp": 1.15898561, + "epoch": 0.03636013851481339, + "flos": 383264277504.0, + "grad_norm": 0.043734318168479426, + "language_loss": 1.10731864, + "learning_rate": 0.0009998943068966256, + "loss": 1.1210047, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 2.09179688, + "step": 189, + "time_per_iteration": 2.4394500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365543, + "balance_loss_mlp": 1.15668833, + "epoch": 0.03655252020007695, + "flos": 584307677184.0, + "grad_norm": 0.02577278402121573, + "language_loss": 1.05579162, + "learning_rate": 0.0009998878044420072, + "loss": 1.06944704, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 2.0859375, + "step": 190, + "time_per_iteration": 2.7022814750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365865, + "balance_loss_mlp": 1.15882242, + "epoch": 0.03674490188534051, + "flos": 472597433856.0, + "grad_norm": 0.03520388751206912, + "language_loss": 1.01277018, + "learning_rate": 0.0009998811079137318, + "loss": 1.02642882, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 2.07324219, + "step": 191, + "time_per_iteration": 2.5930585861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136447, + "balance_loss_mlp": 1.15742755, + "epoch": 0.03693728357060408, + "flos": 529411009536.0, + "grad_norm": 0.03125533686722731, + "language_loss": 1.02464271, + "learning_rate": 0.0009998742173143987, + "loss": 1.0382874, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 2.07324219, + "step": 192, + "time_per_iteration": 2.6235413551330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358793, + "balance_loss_mlp": 1.15222692, + "epoch": 0.03712966525586764, + "flos": 800345238528.0, + "grad_norm": 0.02848545485219292, + "language_loss": 1.02800548, + "learning_rate": 0.0009998671326466833, + "loss": 1.04159343, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 2.06835938, + "step": 193, + "time_per_iteration": 2.991110324859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351781, + "balance_loss_mlp": 1.1463598, + "epoch": 0.037322046941131205, + "flos": 831358144512.0, + "grad_norm": 0.03513998418582105, + "language_loss": 1.0392077, + "learning_rate": 0.0009998598539133362, + "loss": 1.05272543, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 2.05664062, + "step": 194, + "time_per_iteration": 3.0204203128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349328, + "balance_loss_mlp": 1.14371598, + "epoch": 0.037514428626394765, + "flos": 438588642816.0, + "grad_norm": 0.028816536284039847, + "language_loss": 1.04176903, + "learning_rate": 0.0009998523811171828, + "loss": 1.05526221, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 2.05859375, + "step": 195, + "time_per_iteration": 2.5615782737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345129, + "balance_loss_mlp": 1.14047015, + "epoch": 0.03770681031165833, + "flos": 512638927872.0, + "grad_norm": 0.030721230574493993, + "language_loss": 1.05052435, + "learning_rate": 0.0009998447142611248, + "loss": 1.06397557, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 2.04882812, + "step": 196, + "time_per_iteration": 2.6310269832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347072, + "balance_loss_mlp": 1.14289033, + "epoch": 0.03789919199692189, + "flos": 808842069504.0, + "grad_norm": 0.024329502455983587, + "language_loss": 0.97805226, + "learning_rate": 0.0009998368533481387, + "loss": 0.99152303, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 2.04394531, + "step": 197, + "time_per_iteration": 3.0467066764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344143, + "balance_loss_mlp": 1.14043784, + "epoch": 0.03809157368218546, + "flos": 691791335424.0, + "grad_norm": 0.028391473090668865, + "language_loss": 1.00891113, + "learning_rate": 0.0009998287983812762, + "loss": 1.0223527, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 2.0390625, + "step": 198, + "time_per_iteration": 2.8457672595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342798, + "balance_loss_mlp": 1.14023721, + "epoch": 0.03828395536744902, + "flos": 519004001280.0, + "grad_norm": 0.02890411668538335, + "language_loss": 1.07749867, + "learning_rate": 0.0009998205493636646, + "loss": 1.09092665, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 2.02734375, + "step": 199, + "time_per_iteration": 2.66135573387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336213, + "balance_loss_mlp": 1.13432038, + "epoch": 0.038476337052712584, + "flos": 582762071040.0, + "grad_norm": 0.025165239757241963, + "language_loss": 0.99723649, + "learning_rate": 0.0009998121062985063, + "loss": 1.01059866, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 2.02050781, + "step": 200, + "time_per_iteration": 2.70021915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340101, + "balance_loss_mlp": 1.13868463, + "epoch": 0.03866871873797614, + "flos": 578272972800.0, + "grad_norm": 0.025940014565947116, + "language_loss": 1.01401794, + "learning_rate": 0.0009998034691890794, + "loss": 1.02741897, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 2.015625, + "step": 201, + "time_per_iteration": 2.7596118450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134112, + "balance_loss_mlp": 1.14018106, + "epoch": 0.03886110042323971, + "flos": 541771855872.0, + "grad_norm": 0.03045868040347491, + "language_loss": 1.06763899, + "learning_rate": 0.0009997946380387369, + "loss": 1.08105016, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 2.01074219, + "step": 202, + "time_per_iteration": 2.6249613761901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341912, + "balance_loss_mlp": 1.14192665, + "epoch": 0.03905348210850327, + "flos": 719239669248.0, + "grad_norm": 0.02826530469295273, + "language_loss": 1.09111357, + "learning_rate": 0.0009997856128509076, + "loss": 1.1045326, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 2.00097656, + "step": 203, + "time_per_iteration": 2.8254761695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336015, + "balance_loss_mlp": 1.13660145, + "epoch": 0.039245863793766836, + "flos": 428396484096.0, + "grad_norm": 0.028264614074004907, + "language_loss": 1.0366801, + "learning_rate": 0.0009997763936290952, + "loss": 1.05004025, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.99511719, + "step": 204, + "time_per_iteration": 2.4907312393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334897, + "balance_loss_mlp": 1.13624632, + "epoch": 0.039438245479030395, + "flos": 664269141504.0, + "grad_norm": 0.0294297584821439, + "language_loss": 1.09143519, + "learning_rate": 0.0009997669803768789, + "loss": 1.10478401, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.98730469, + "step": 205, + "time_per_iteration": 2.787046194076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332958, + "balance_loss_mlp": 1.13497555, + "epoch": 0.03963062716429396, + "flos": 636495168000.0, + "grad_norm": 0.025164669035445293, + "language_loss": 1.04324186, + "learning_rate": 0.0009997573730979134, + "loss": 1.05657148, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.98242188, + "step": 206, + "time_per_iteration": 2.744339942932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388672, + "balance_loss_mlp": 1.18687439, + "epoch": 0.03982300884955752, + "flos": 1421587186176.0, + "grad_norm": 0.04225268457123109, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80581868, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 2.01953125, + "step": 207, + "time_per_iteration": 4.62822699546814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338974, + "balance_loss_mlp": 1.14251721, + "epoch": 0.04001539053482109, + "flos": 690519702528.0, + "grad_norm": 0.029734692172116686, + "language_loss": 1.02667236, + "learning_rate": 0.0009997375764747294, + "loss": 1.04006195, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.96875, + "step": 208, + "time_per_iteration": 3.0006470680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332342, + "balance_loss_mlp": 1.1360755, + "epoch": 0.04020777222008465, + "flos": 534751500288.0, + "grad_norm": 0.02521302149444487, + "language_loss": 1.00535607, + "learning_rate": 0.0009997273871381967, + "loss": 1.01867938, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.96679688, + "step": 209, + "time_per_iteration": 2.6790220737457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01335368, + "balance_loss_mlp": 1.14005554, + "epoch": 0.040400153905348214, + "flos": 568996608000.0, + "grad_norm": 0.04055154679799505, + "language_loss": 1.05331016, + "learning_rate": 0.0009997170037902862, + "loss": 1.06666374, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.95703125, + "step": 210, + "time_per_iteration": 2.748340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331596, + "balance_loss_mlp": 1.13647389, + "epoch": 0.040592535590611774, + "flos": 714678712320.0, + "grad_norm": 0.0276705792773584, + "language_loss": 1.07916689, + "learning_rate": 0.0009997064264350292, + "loss": 1.09248281, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.95507812, + "step": 211, + "time_per_iteration": 2.8284339904785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332545, + "balance_loss_mlp": 1.13761449, + "epoch": 0.04078491727587533, + "flos": 579206231040.0, + "grad_norm": 0.026753366885260317, + "language_loss": 1.01893198, + "learning_rate": 0.0009996956550765317, + "loss": 1.03225756, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.953125, + "step": 212, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_mlp": 1.13668597, + "epoch": 0.0409772989611389, + "flos": 553368631296.0, + "grad_norm": 0.03340351088011317, + "language_loss": 0.96620274, + "learning_rate": 0.0009996846897189762, + "loss": 0.97951126, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.9453125, + "step": 213, + "time_per_iteration": 2.62785005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327396, + "balance_loss_mlp": 1.13332307, + "epoch": 0.04116968064640246, + "flos": 556764016128.0, + "grad_norm": 0.026256493309422244, + "language_loss": 1.0283711, + "learning_rate": 0.0009996735303666193, + "loss": 1.04164505, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.94433594, + "step": 214, + "time_per_iteration": 2.745412588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324547, + "balance_loss_mlp": 1.13152313, + "epoch": 0.041362062331666026, + "flos": 579651393024.0, + "grad_norm": 0.025801807715809106, + "language_loss": 1.04973316, + "learning_rate": 0.0009996621770237937, + "loss": 1.06297863, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.93359375, + "step": 215, + "time_per_iteration": 2.7359023094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_mlp": 1.12657344, + "epoch": 0.041554444016929586, + "flos": 612700729344.0, + "grad_norm": 0.027594527286323677, + "language_loss": 1.00985026, + "learning_rate": 0.0009996506296949073, + "loss": 1.02304435, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.93164062, + "step": 216, + "time_per_iteration": 2.860781669616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320461, + "balance_loss_mlp": 1.12781918, + "epoch": 0.04174682570219315, + "flos": 529150497792.0, + "grad_norm": 0.030561981852332186, + "language_loss": 1.01172602, + "learning_rate": 0.0009996388883844428, + "loss": 1.02493072, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.9296875, + "step": 217, + "time_per_iteration": 2.614837169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315002, + "balance_loss_mlp": 1.12255037, + "epoch": 0.04193920738745671, + "flos": 512499939840.0, + "grad_norm": 0.024235201889365978, + "language_loss": 1.04092622, + "learning_rate": 0.0009996269530969588, + "loss": 1.05407631, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.92773438, + "step": 218, + "time_per_iteration": 2.5777087211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317845, + "balance_loss_mlp": 1.1255846, + "epoch": 0.04213158907272028, + "flos": 572552448000.0, + "grad_norm": 0.03618883866707401, + "language_loss": 1.04623246, + "learning_rate": 0.0009996148238370888, + "loss": 1.05941105, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.92578125, + "step": 219, + "time_per_iteration": 2.723344564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319419, + "balance_loss_mlp": 1.12830234, + "epoch": 0.04232397075798384, + "flos": 965904098304.0, + "grad_norm": 0.02808123492922437, + "language_loss": 0.99962145, + "learning_rate": 0.0009996025006095421, + "loss": 1.01281559, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.9140625, + "step": 220, + "time_per_iteration": 3.297567844390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355408, + "balance_loss_mlp": 1.16314697, + "epoch": 0.042516352443247404, + "flos": 1472730628608.0, + "grad_norm": 0.031119874656221472, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.79138547, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.92578125, + "step": 221, + "time_per_iteration": 5.484851837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132056, + "balance_loss_mlp": 1.13039756, + "epoch": 0.042708734128510964, + "flos": 655891832832.0, + "grad_norm": 0.027306518139410985, + "language_loss": 0.99887031, + "learning_rate": 0.0009995772722706307, + "loss": 1.0120759, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.90429688, + "step": 222, + "time_per_iteration": 2.801955461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324867, + "balance_loss_mlp": 1.13518083, + "epoch": 0.04290111581377453, + "flos": 432733859328.0, + "grad_norm": 0.025166076900031344, + "language_loss": 1.13987851, + "learning_rate": 0.0009995643671690604, + "loss": 1.15312719, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.89941406, + "step": 223, + "time_per_iteration": 2.4589195251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320058, + "balance_loss_mlp": 1.13142133, + "epoch": 0.04309349749903809, + "flos": 645866860032.0, + "grad_norm": 0.02470776233740571, + "language_loss": 1.01624262, + "learning_rate": 0.0009995512681194023, + "loss": 1.02944326, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.88867188, + "step": 224, + "time_per_iteration": 2.854653835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319788, + "balance_loss_mlp": 1.13124692, + "epoch": 0.04328587918430166, + "flos": 832895745024.0, + "grad_norm": 0.02898896961022835, + "language_loss": 0.98942387, + "learning_rate": 0.0009995379751267417, + "loss": 1.00262189, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.88769531, + "step": 225, + "time_per_iteration": 3.260105609893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317885, + "balance_loss_mlp": 1.12943935, + "epoch": 0.043478260869565216, + "flos": 526115681280.0, + "grad_norm": 0.02601835272599882, + "language_loss": 1.00718379, + "learning_rate": 0.0009995244881962398, + "loss": 1.02036262, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.88671875, + "step": 226, + "time_per_iteration": 2.631685495376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320396, + "balance_loss_mlp": 1.13204539, + "epoch": 0.04367064255482878, + "flos": 440412225024.0, + "grad_norm": 0.02740546356326938, + "language_loss": 1.02089393, + "learning_rate": 0.0009995108073331323, + "loss": 1.03409791, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.88574219, + "step": 227, + "time_per_iteration": 2.6414895057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308962, + "balance_loss_mlp": 1.12156498, + "epoch": 0.04386302424009234, + "flos": 508466737152.0, + "grad_norm": 0.023646446246452554, + "language_loss": 1.04017711, + "learning_rate": 0.0009994969325427309, + "loss": 1.05326676, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.87597656, + "step": 228, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130646, + "balance_loss_mlp": 1.11906338, + "epoch": 0.04405540592535591, + "flos": 541743657984.0, + "grad_norm": 0.02642836262436834, + "language_loss": 1.00691068, + "learning_rate": 0.0009994828638304218, + "loss": 1.0199753, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.87597656, + "step": 229, + "time_per_iteration": 2.604616165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305226, + "balance_loss_mlp": 1.11792421, + "epoch": 0.04424778761061947, + "flos": 447309055488.0, + "grad_norm": 0.039218098968292335, + "language_loss": 1.07079852, + "learning_rate": 0.0009994686012016675, + "loss": 1.08385086, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.875, + "step": 230, + "time_per_iteration": 2.568608045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130487, + "balance_loss_mlp": 1.1187129, + "epoch": 0.044440169295883035, + "flos": 701981492736.0, + "grad_norm": 0.02721662483758601, + "language_loss": 1.06240797, + "learning_rate": 0.000999454144662005, + "loss": 1.07545662, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.86328125, + "step": 231, + "time_per_iteration": 2.9104526042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295735, + "balance_loss_mlp": 1.10957813, + "epoch": 0.044632550981146595, + "flos": 589426587648.0, + "grad_norm": 0.02817980914561194, + "language_loss": 1.003865, + "learning_rate": 0.0009994394942170468, + "loss": 1.01682234, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.86328125, + "step": 232, + "time_per_iteration": 2.674896001815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302928, + "balance_loss_mlp": 1.11667526, + "epoch": 0.04482493266641016, + "flos": 555854226432.0, + "grad_norm": 0.029144066951330677, + "language_loss": 0.98161608, + "learning_rate": 0.0009994246498724808, + "loss": 0.99464536, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.86425781, + "step": 233, + "time_per_iteration": 2.674178123474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302597, + "balance_loss_mlp": 1.11682117, + "epoch": 0.04501731435167372, + "flos": 724069870080.0, + "grad_norm": 0.027038299766394356, + "language_loss": 1.00722432, + "learning_rate": 0.00099940961163407, + "loss": 1.02025032, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.859375, + "step": 234, + "time_per_iteration": 2.8427939414978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301098, + "balance_loss_mlp": 1.11608493, + "epoch": 0.04520969603693728, + "flos": 512797381632.0, + "grad_norm": 0.027022139799708383, + "language_loss": 1.02586675, + "learning_rate": 0.0009993943795076528, + "loss": 1.03887773, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.8515625, + "step": 235, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295671, + "balance_loss_mlp": 1.11094403, + "epoch": 0.04540207772220085, + "flos": 365877846528.0, + "grad_norm": 0.03212133053651388, + "language_loss": 1.0562067, + "learning_rate": 0.0009993789534991427, + "loss": 1.06916356, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.84863281, + "step": 236, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294151, + "balance_loss_mlp": 1.1095196, + "epoch": 0.045594459407464406, + "flos": 523723411968.0, + "grad_norm": 0.029471400038435007, + "language_loss": 1.00276268, + "learning_rate": 0.0009993633336145287, + "loss": 1.01570415, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.84765625, + "step": 237, + "time_per_iteration": 2.6279234886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_mlp": 1.11284053, + "epoch": 0.04578684109272797, + "flos": 673115807232.0, + "grad_norm": 0.032189822363292264, + "language_loss": 1.04537559, + "learning_rate": 0.0009993475198598752, + "loss": 1.05834174, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.83886719, + "step": 238, + "time_per_iteration": 2.98264741897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294154, + "balance_loss_mlp": 1.11047626, + "epoch": 0.04597922277799153, + "flos": 542620520448.0, + "grad_norm": 0.025834809881005002, + "language_loss": 1.01282692, + "learning_rate": 0.0009993315122413212, + "loss": 1.02576852, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.83789062, + "step": 239, + "time_per_iteration": 2.5969364643096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297016, + "balance_loss_mlp": 1.11333883, + "epoch": 0.0461716044632551, + "flos": 459993540096.0, + "grad_norm": 0.025301515003642434, + "language_loss": 1.01210213, + "learning_rate": 0.0009993153107650818, + "loss": 1.02507234, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.83789062, + "step": 240, + "time_per_iteration": 2.590198278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297188, + "balance_loss_mlp": 1.11360526, + "epoch": 0.04636398614851866, + "flos": 456170457600.0, + "grad_norm": 0.0338801607583888, + "language_loss": 1.01026332, + "learning_rate": 0.0009992989154374468, + "loss": 1.0232352, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.83691406, + "step": 241, + "time_per_iteration": 2.5699570178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012963, + "balance_loss_mlp": 1.11271763, + "epoch": 0.046556367833782225, + "flos": 557901390336.0, + "grad_norm": 0.02656657647638049, + "language_loss": 1.0757494, + "learning_rate": 0.0009992823262647817, + "loss": 1.08871233, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.83691406, + "step": 242, + "time_per_iteration": 2.6949496269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293965, + "balance_loss_mlp": 1.11047852, + "epoch": 0.046748749519045785, + "flos": 594087601152.0, + "grad_norm": 0.02772781005565529, + "language_loss": 1.02479577, + "learning_rate": 0.0009992655432535264, + "loss": 1.03773546, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.8359375, + "step": 243, + "time_per_iteration": 2.7783396244049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286985, + "balance_loss_mlp": 1.10454702, + "epoch": 0.04694113120430935, + "flos": 570941713920.0, + "grad_norm": 0.021337056529223342, + "language_loss": 1.01771712, + "learning_rate": 0.0009992485664101973, + "loss": 1.03058696, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.82519531, + "step": 244, + "time_per_iteration": 2.679227590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286082, + "balance_loss_mlp": 1.10364425, + "epoch": 0.04713351288957291, + "flos": 865245411840.0, + "grad_norm": 0.03170954338904746, + "language_loss": 1.04355013, + "learning_rate": 0.000999231395741385, + "loss": 1.05641103, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.82519531, + "step": 245, + "time_per_iteration": 3.0976788997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287082, + "balance_loss_mlp": 1.10473943, + "epoch": 0.04732589457483648, + "flos": 538235481600.0, + "grad_norm": 0.02353809889700427, + "language_loss": 1.02393425, + "learning_rate": 0.0009992140312537557, + "loss": 1.03680515, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.82421875, + "step": 246, + "time_per_iteration": 2.6005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_mlp": 1.1048938, + "epoch": 0.04751827626010004, + "flos": 763271431680.0, + "grad_norm": 0.021903859990429042, + "language_loss": 0.96665001, + "learning_rate": 0.000999196472954051, + "loss": 0.97951376, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.81542969, + "step": 247, + "time_per_iteration": 2.95379638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319153, + "balance_loss_mlp": 1.13833618, + "epoch": 0.0477106579453636, + "flos": 1583125578240.0, + "grad_norm": 0.034344144576267104, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80744004, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.80859375, + "step": 248, + "time_per_iteration": 6.070216655731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286412, + "balance_loss_mlp": 1.10521388, + "epoch": 0.04790303963062716, + "flos": 458692982784.0, + "grad_norm": 0.024476775577385278, + "language_loss": 1.04631317, + "learning_rate": 0.0009991607749457578, + "loss": 1.05917728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.8125, + "step": 249, + "time_per_iteration": 2.5741825103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128623, + "balance_loss_mlp": 1.10503209, + "epoch": 0.04809542131589073, + "flos": 783786004992.0, + "grad_norm": 0.021665977114244464, + "language_loss": 1.0235486, + "learning_rate": 0.0009991426352510286, + "loss": 1.03641105, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.81152344, + "step": 250, + "time_per_iteration": 3.004519462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_mlp": 1.10648286, + "epoch": 0.04828780300115429, + "flos": 560321857536.0, + "grad_norm": 0.028059326531900755, + "language_loss": 1.04456568, + "learning_rate": 0.0009991243017719422, + "loss": 1.05743682, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.8046875, + "step": 251, + "time_per_iteration": 2.666212320327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283793, + "balance_loss_mlp": 1.10364354, + "epoch": 0.048480184686417856, + "flos": 502922130432.0, + "grad_norm": 0.02282661348297379, + "language_loss": 0.985008, + "learning_rate": 0.0009991057745156165, + "loss": 0.99784589, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.80078125, + "step": 252, + "time_per_iteration": 2.6053824424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291534, + "balance_loss_mlp": 1.11186218, + "epoch": 0.048672566371681415, + "flos": 1539469120512.0, + "grad_norm": 0.022804524860740846, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83202517, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.796875, + "step": 253, + "time_per_iteration": 5.005317449569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285445, + "balance_loss_mlp": 1.10500991, + "epoch": 0.04886494805694498, + "flos": 538951888896.0, + "grad_norm": 0.028242285238858512, + "language_loss": 1.06865251, + "learning_rate": 0.0009990681387000943, + "loss": 1.08150697, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.80371094, + "step": 254, + "time_per_iteration": 2.743307590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283321, + "balance_loss_mlp": 1.10317183, + "epoch": 0.04905732974220854, + "flos": 681484383744.0, + "grad_norm": 0.028658365214850164, + "language_loss": 1.02065015, + "learning_rate": 0.0009990490301555093, + "loss": 1.03348327, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.80126953, + "step": 255, + "time_per_iteration": 2.989856719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291977, + "balance_loss_mlp": 1.1134491, + "epoch": 0.04924971142747211, + "flos": 1424274895872.0, + "grad_norm": 0.01325206916769545, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80507129, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.78515625, + "step": 256, + "time_per_iteration": 4.888273477554321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281082, + "balance_loss_mlp": 1.10255432, + "epoch": 0.04944209311273567, + "flos": 1561236587520.0, + "grad_norm": 0.00993410716153638, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80523825, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.78515625, + "step": 257, + "time_per_iteration": 4.983605623245239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_mlp": 1.10786438, + "epoch": 0.04963447479799923, + "flos": 1574170850304.0, + "grad_norm": 0.014798835308040135, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71261322, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.77539062, + "step": 258, + "time_per_iteration": 4.888776540756226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_mlp": 1.10310864, + "epoch": 0.049826856483262794, + "flos": 626498393088.0, + "grad_norm": 0.032236291487241595, + "language_loss": 0.9680413, + "learning_rate": 0.0009989706585723202, + "loss": 0.98086333, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.79003906, + "step": 259, + "time_per_iteration": 2.776397705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280186, + "balance_loss_mlp": 1.10175359, + "epoch": 0.05001923816852635, + "flos": 505155945984.0, + "grad_norm": 0.03442249770662494, + "language_loss": 1.03026366, + "learning_rate": 0.0009989505813633442, + "loss": 1.04306555, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.78271484, + "step": 260, + "time_per_iteration": 2.651773691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281097, + "balance_loss_mlp": 1.10295069, + "epoch": 0.05021161985378992, + "flos": 588467132928.0, + "grad_norm": 0.024781843968885862, + "language_loss": 1.02880228, + "learning_rate": 0.000998930310444573, + "loss": 1.04161322, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.78125, + "step": 261, + "time_per_iteration": 2.730717420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_mlp": 1.08796966, + "epoch": 0.05040400153905348, + "flos": 634402341888.0, + "grad_norm": 0.028473185138455738, + "language_loss": 1.01351452, + "learning_rate": 0.0009989098458238765, + "loss": 1.02617574, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.77929688, + "step": 262, + "time_per_iteration": 2.7717010974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272128, + "balance_loss_mlp": 1.09407711, + "epoch": 0.050596383224317046, + "flos": 554808176640.0, + "grad_norm": 0.03464065468219783, + "language_loss": 1.00597906, + "learning_rate": 0.0009988891875091998, + "loss": 1.01870036, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.77880859, + "step": 263, + "time_per_iteration": 2.8842556476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012725, + "balance_loss_mlp": 1.09444928, + "epoch": 0.050788764909580605, + "flos": 550761512448.0, + "grad_norm": 0.02541343292713684, + "language_loss": 0.95014787, + "learning_rate": 0.0009988683355085636, + "loss": 0.96287298, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.77880859, + "step": 264, + "time_per_iteration": 2.7466378211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_mlp": 1.09527469, + "epoch": 0.05098114659484417, + "flos": 606344388096.0, + "grad_norm": 0.02024934595994547, + "language_loss": 1.03858495, + "learning_rate": 0.000998847289830063, + "loss": 1.05131388, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.77587891, + "step": 265, + "time_per_iteration": 2.821997880935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285574, + "balance_loss_mlp": 1.10761857, + "epoch": 0.05117352828010773, + "flos": 439472236032.0, + "grad_norm": 0.026937538773041583, + "language_loss": 0.97004128, + "learning_rate": 0.0009988260504818682, + "loss": 0.98289704, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.77832031, + "step": 266, + "time_per_iteration": 2.557830333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277028, + "balance_loss_mlp": 1.09907281, + "epoch": 0.0513659099653713, + "flos": 506030807040.0, + "grad_norm": 0.02494960853942852, + "language_loss": 1.03986156, + "learning_rate": 0.000998804617472226, + "loss": 1.05263186, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.77832031, + "step": 267, + "time_per_iteration": 2.644099235534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_mlp": 1.09254682, + "epoch": 0.05155829165063486, + "flos": 696714862080.0, + "grad_norm": 0.027664306986101984, + "language_loss": 0.98796493, + "learning_rate": 0.0009987829908094568, + "loss": 1.00066042, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.76953125, + "step": 268, + "time_per_iteration": 2.8291003704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_mlp": 1.08817983, + "epoch": 0.051750673335898424, + "flos": 1350300294144.0, + "grad_norm": 0.03385083640642466, + "language_loss": 1.06218576, + "learning_rate": 0.0009987611705019569, + "loss": 1.07483661, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.76855469, + "step": 269, + "time_per_iteration": 4.150776624679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264769, + "balance_loss_mlp": 1.08795822, + "epoch": 0.051943055021161984, + "flos": 490589481984.0, + "grad_norm": 0.028250493976035247, + "language_loss": 1.04104686, + "learning_rate": 0.0009987391565581978, + "loss": 1.05369449, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.76757812, + "step": 270, + "time_per_iteration": 2.5921454429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266977, + "balance_loss_mlp": 1.09092879, + "epoch": 0.05213543670642555, + "flos": 546880032768.0, + "grad_norm": 0.026669721507250346, + "language_loss": 0.96455419, + "learning_rate": 0.000998716948986726, + "loss": 0.97722399, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.75976562, + "step": 271, + "time_per_iteration": 2.7835500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_mlp": 1.09264266, + "epoch": 0.05232781839168911, + "flos": 604672528896.0, + "grad_norm": 0.03568520247936263, + "language_loss": 0.99334317, + "learning_rate": 0.0009986945477961633, + "loss": 1.00602722, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.75683594, + "step": 272, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_mlp": 1.0953902, + "epoch": 0.052520200076952676, + "flos": 539655561216.0, + "grad_norm": 0.02343402151836954, + "language_loss": 1.0317328, + "learning_rate": 0.0009986719529952066, + "loss": 1.04444528, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.7578125, + "step": 273, + "time_per_iteration": 2.908298969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_mlp": 1.09266984, + "epoch": 0.052712581762216236, + "flos": 464332916736.0, + "grad_norm": 0.028493663433316604, + "language_loss": 1.03350449, + "learning_rate": 0.000998649164592628, + "loss": 1.0461911, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.75927734, + "step": 274, + "time_per_iteration": 2.5805718898773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_mlp": 1.08735609, + "epoch": 0.0529049634474798, + "flos": 549105116160.0, + "grad_norm": 0.024462560446863554, + "language_loss": 1.01155043, + "learning_rate": 0.0009986261825972748, + "loss": 1.02418458, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.75976562, + "step": 275, + "time_per_iteration": 2.675705909729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_mlp": 1.09334803, + "epoch": 0.05309734513274336, + "flos": 619200061440.0, + "grad_norm": 0.026443817532743642, + "language_loss": 1.03055406, + "learning_rate": 0.000998603007018069, + "loss": 1.04324436, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.75585938, + "step": 276, + "time_per_iteration": 2.77298903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264217, + "balance_loss_mlp": 1.08893192, + "epoch": 0.05328972681800693, + "flos": 606617634816.0, + "grad_norm": 0.022439827576013177, + "language_loss": 1.00613213, + "learning_rate": 0.0009985796378640089, + "loss": 1.01877427, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.75195312, + "step": 277, + "time_per_iteration": 2.693049669265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_mlp": 1.08963549, + "epoch": 0.05348210850327049, + "flos": 605730038784.0, + "grad_norm": 0.02549683888178727, + "language_loss": 1.01102281, + "learning_rate": 0.0009985560751441665, + "loss": 1.02366924, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.74902344, + "step": 278, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262716, + "balance_loss_mlp": 1.08757329, + "epoch": 0.053674490188534055, + "flos": 631997337600.0, + "grad_norm": 0.025192100126554, + "language_loss": 1.03316271, + "learning_rate": 0.00099853231886769, + "loss": 1.04578984, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.75048828, + "step": 279, + "time_per_iteration": 2.8228564262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262121, + "balance_loss_mlp": 1.08712184, + "epoch": 0.053866871873797614, + "flos": 480173741568.0, + "grad_norm": 0.02583251996588833, + "language_loss": 1.02629757, + "learning_rate": 0.0009985083690438024, + "loss": 1.03891873, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.74902344, + "step": 280, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260843, + "balance_loss_mlp": 1.08655906, + "epoch": 0.054059253559061174, + "flos": 789489065472.0, + "grad_norm": 0.023704628566171972, + "language_loss": 0.9340027, + "learning_rate": 0.0009984842256818016, + "loss": 0.94661117, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.74169922, + "step": 281, + "time_per_iteration": 3.084801435470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257985, + "balance_loss_mlp": 1.08379591, + "epoch": 0.05425163524432474, + "flos": 629505011712.0, + "grad_norm": 0.027462270528210347, + "language_loss": 1.04308844, + "learning_rate": 0.0009984598887910613, + "loss": 1.05566835, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.74072266, + "step": 282, + "time_per_iteration": 2.729063034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_mlp": 1.08855665, + "epoch": 0.0544440169295883, + "flos": 616992442368.0, + "grad_norm": 0.02580860229759897, + "language_loss": 0.99945354, + "learning_rate": 0.0009984353583810297, + "loss": 1.01208091, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.74072266, + "step": 283, + "time_per_iteration": 2.812309741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258383, + "balance_loss_mlp": 1.08433735, + "epoch": 0.05463639861485187, + "flos": 648929874432.0, + "grad_norm": 0.0290705298354334, + "language_loss": 1.01989841, + "learning_rate": 0.0009984106344612302, + "loss": 1.03248215, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.73925781, + "step": 284, + "time_per_iteration": 2.785377264022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_mlp": 1.0907625, + "epoch": 0.054828780300115426, + "flos": 798584782848.0, + "grad_norm": 0.03167011835004719, + "language_loss": 0.97435868, + "learning_rate": 0.0009983857170412615, + "loss": 0.9869982, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.73046875, + "step": 285, + "time_per_iteration": 2.9822604656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258353, + "balance_loss_mlp": 1.08511817, + "epoch": 0.05502116198537899, + "flos": 550798442496.0, + "grad_norm": 0.02077828299254123, + "language_loss": 0.96197385, + "learning_rate": 0.000998360606130798, + "loss": 0.9745574, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.73095703, + "step": 286, + "time_per_iteration": 2.8340489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_mlp": 1.09461975, + "epoch": 0.05521354367064255, + "flos": 1410906931200.0, + "grad_norm": 0.010589673029146669, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70339394, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.71484375, + "step": 287, + "time_per_iteration": 4.893908500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126535, + "balance_loss_mlp": 1.09235394, + "epoch": 0.05540592535590612, + "flos": 646611465216.0, + "grad_norm": 0.04031113274469801, + "language_loss": 1.02544129, + "learning_rate": 0.0009983098038774552, + "loss": 1.03809476, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.72851562, + "step": 288, + "time_per_iteration": 2.800687551498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_mlp": 1.08712769, + "epoch": 0.05559830704116968, + "flos": 1514315727360.0, + "grad_norm": 0.011752943348929798, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79428822, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.71289062, + "step": 289, + "time_per_iteration": 4.802466630935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_mlp": 1.08869088, + "epoch": 0.055790688726433245, + "flos": 509334867456.0, + "grad_norm": 0.03460900762027919, + "language_loss": 1.00913107, + "learning_rate": 0.0009982582277800948, + "loss": 1.02174735, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.72802734, + "step": 290, + "time_per_iteration": 2.574007749557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255451, + "balance_loss_mlp": 1.08326483, + "epoch": 0.055983070411696804, + "flos": 659074369536.0, + "grad_norm": 0.03439417592421578, + "language_loss": 1.07703924, + "learning_rate": 0.0009982321495648908, + "loss": 1.08959377, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.72021484, + "step": 291, + "time_per_iteration": 2.8004326820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257264, + "balance_loss_mlp": 1.08503067, + "epoch": 0.05617545209696037, + "flos": 588475865088.0, + "grad_norm": 0.024241847728240208, + "language_loss": 0.9905349, + "learning_rate": 0.0009982058779188115, + "loss": 1.00310755, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.72070312, + "step": 292, + "time_per_iteration": 2.763096570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257503, + "balance_loss_mlp": 1.0853169, + "epoch": 0.05636783378222393, + "flos": 612787324416.0, + "grad_norm": 0.027188079674348095, + "language_loss": 1.06693649, + "learning_rate": 0.0009981794128520567, + "loss": 1.07951164, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.72021484, + "step": 293, + "time_per_iteration": 2.7630960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253426, + "balance_loss_mlp": 1.08123958, + "epoch": 0.0565602154674875, + "flos": 669422980608.0, + "grad_norm": 0.030197403892147204, + "language_loss": 1.03523457, + "learning_rate": 0.000998152754374901, + "loss": 1.04776871, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.72021484, + "step": 294, + "time_per_iteration": 2.8583314418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249713, + "balance_loss_mlp": 1.07743168, + "epoch": 0.05675259715275106, + "flos": 618364131840.0, + "grad_norm": 0.026289358543143387, + "language_loss": 0.99071473, + "learning_rate": 0.0009981259024976943, + "loss": 1.00321186, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.72119141, + "step": 295, + "time_per_iteration": 2.719881534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250566, + "balance_loss_mlp": 1.07814193, + "epoch": 0.05694497883801462, + "flos": 753153133056.0, + "grad_norm": 0.03148267511857758, + "language_loss": 0.97962338, + "learning_rate": 0.0009980988572308612, + "loss": 0.99212909, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.72265625, + "step": 296, + "time_per_iteration": 2.9828195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250905, + "balance_loss_mlp": 1.0789572, + "epoch": 0.05713736052327818, + "flos": 713380882944.0, + "grad_norm": 0.02524811137395651, + "language_loss": 1.00250125, + "learning_rate": 0.0009980716185849015, + "loss": 1.01501024, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.71777344, + "step": 297, + "time_per_iteration": 2.9749252796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251066, + "balance_loss_mlp": 1.07959557, + "epoch": 0.05732974220854175, + "flos": 469935920640.0, + "grad_norm": 0.024054663695119705, + "language_loss": 0.96916056, + "learning_rate": 0.0009980441865703904, + "loss": 0.98167121, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.71289062, + "step": 298, + "time_per_iteration": 2.598325252532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250911, + "balance_loss_mlp": 1.07939255, + "epoch": 0.05752212389380531, + "flos": 602540771328.0, + "grad_norm": 0.025930022992042723, + "language_loss": 1.05563986, + "learning_rate": 0.000998016561197978, + "loss": 1.06814897, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.71337891, + "step": 299, + "time_per_iteration": 2.690300703048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250529, + "balance_loss_mlp": 1.07924938, + "epoch": 0.057714505579068875, + "flos": 679949511168.0, + "grad_norm": 0.025847674874905035, + "language_loss": 0.97115421, + "learning_rate": 0.0009979887424783895, + "loss": 0.98365951, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.7109375, + "step": 300, + "time_per_iteration": 2.863856554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249286, + "balance_loss_mlp": 1.07810116, + "epoch": 0.057906887264332435, + "flos": 597011627520.0, + "grad_norm": 0.02594453351976595, + "language_loss": 0.96475613, + "learning_rate": 0.0009979607304224248, + "loss": 0.97724897, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.70996094, + "step": 301, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248659, + "balance_loss_mlp": 1.0772841, + "epoch": 0.058099268949596, + "flos": 553164515328.0, + "grad_norm": 0.024492956239426298, + "language_loss": 1.0387162, + "learning_rate": 0.000997932525040959, + "loss": 1.05120289, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.71191406, + "step": 302, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252345, + "balance_loss_mlp": 1.08111238, + "epoch": 0.05829165063485956, + "flos": 509230808064.0, + "grad_norm": 0.038324718957869854, + "language_loss": 1.05616117, + "learning_rate": 0.000997904126344943, + "loss": 1.06868458, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.71044922, + "step": 303, + "time_per_iteration": 2.611621141433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125091, + "balance_loss_mlp": 1.080441, + "epoch": 0.05848403232012313, + "flos": 616362630144.0, + "grad_norm": 0.028818083574726525, + "language_loss": 1.02425826, + "learning_rate": 0.0009978755343454018, + "loss": 1.03676736, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.70263672, + "step": 304, + "time_per_iteration": 2.750213384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245805, + "balance_loss_mlp": 1.07490659, + "epoch": 0.05867641400538669, + "flos": 501079082496.0, + "grad_norm": 0.025195073137535502, + "language_loss": 1.02874422, + "learning_rate": 0.0009978467490534355, + "loss": 1.04120219, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.70703125, + "step": 305, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124905, + "balance_loss_mlp": 1.07853293, + "epoch": 0.05886879569065025, + "flos": 532378696704.0, + "grad_norm": 0.026491629776715375, + "language_loss": 0.99473399, + "learning_rate": 0.00099781777048022, + "loss": 1.00722456, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.703125, + "step": 306, + "time_per_iteration": 2.731084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012482, + "balance_loss_mlp": 1.07782638, + "epoch": 0.05906117737591381, + "flos": 490040260608.0, + "grad_norm": 0.025118942729794178, + "language_loss": 1.01122224, + "learning_rate": 0.0009977885986370057, + "loss": 1.02370417, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.70166016, + "step": 307, + "time_per_iteration": 2.548307418823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247075, + "balance_loss_mlp": 1.0766536, + "epoch": 0.05925355906117737, + "flos": 592709180928.0, + "grad_norm": 0.029001286226925486, + "language_loss": 0.96780527, + "learning_rate": 0.000997759233535118, + "loss": 0.98027599, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.70214844, + "step": 308, + "time_per_iteration": 2.7876322269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247056, + "balance_loss_mlp": 1.07668173, + "epoch": 0.05944594074644094, + "flos": 564787487232.0, + "grad_norm": 0.026648157056946717, + "language_loss": 1.03345561, + "learning_rate": 0.0009977296751859576, + "loss": 1.04592621, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.70166016, + "step": 309, + "time_per_iteration": 2.71488094329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_mlp": 1.07958508, + "epoch": 0.0596383224317045, + "flos": 539807284224.0, + "grad_norm": 0.023775477335694146, + "language_loss": 1.04459929, + "learning_rate": 0.0009976999236009998, + "loss": 1.05709469, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.69726562, + "step": 310, + "time_per_iteration": 2.7919182777404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_mlp": 1.08618629, + "epoch": 0.059830704116968066, + "flos": 562052113920.0, + "grad_norm": 0.02942700961653022, + "language_loss": 1.06853497, + "learning_rate": 0.0009976699787917955, + "loss": 1.08109009, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.69091797, + "step": 311, + "time_per_iteration": 2.6729257106781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012565, + "balance_loss_mlp": 1.08789062, + "epoch": 0.060023085802231625, + "flos": 1574047325184.0, + "grad_norm": 0.029063497479097016, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74699497, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.68359375, + "step": 312, + "time_per_iteration": 4.972649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249775, + "balance_loss_mlp": 1.08021212, + "epoch": 0.06021546748749519, + "flos": 483627523584.0, + "grad_norm": 0.0314235925459163, + "language_loss": 0.98280072, + "learning_rate": 0.0009976095095472243, + "loss": 0.9952985, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.69335938, + "step": 313, + "time_per_iteration": 2.5644209384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125234, + "balance_loss_mlp": 1.08287179, + "epoch": 0.06040784917275875, + "flos": 621423143424.0, + "grad_norm": 0.030123719928355924, + "language_loss": 0.99538821, + "learning_rate": 0.0009975789851353334, + "loss": 1.00791156, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.69238281, + "step": 314, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256592, + "balance_loss_mlp": 1.08741045, + "epoch": 0.06060023085802232, + "flos": 484602441216.0, + "grad_norm": 0.026992074473858402, + "language_loss": 1.01683283, + "learning_rate": 0.0009975482675461487, + "loss": 1.02939868, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.68945312, + "step": 315, + "time_per_iteration": 2.67146897315979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_mlp": 1.08054566, + "epoch": 0.06079261254328588, + "flos": 582985652736.0, + "grad_norm": 0.0292304668639163, + "language_loss": 0.99909455, + "learning_rate": 0.0009975173567915952, + "loss": 1.01158559, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.68310547, + "step": 316, + "time_per_iteration": 2.693526268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124983, + "balance_loss_mlp": 1.08131599, + "epoch": 0.060984994228549444, + "flos": 689008298496.0, + "grad_norm": 0.03272213432041067, + "language_loss": 0.93868685, + "learning_rate": 0.000997486252883674, + "loss": 0.95118511, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.68261719, + "step": 317, + "time_per_iteration": 2.837315082550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252509, + "balance_loss_mlp": 1.08399427, + "epoch": 0.061177375913813004, + "flos": 1316747398656.0, + "grad_norm": 0.031012352820614663, + "language_loss": 0.98949343, + "learning_rate": 0.0009974549558344602, + "loss": 1.00201845, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.68261719, + "step": 318, + "time_per_iteration": 3.686920166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_mlp": 1.08321846, + "epoch": 0.06136975759907657, + "flos": 575400612864.0, + "grad_norm": 0.027925836735275204, + "language_loss": 1.08640313, + "learning_rate": 0.000997423465656105, + "loss": 1.09892082, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.68310547, + "step": 319, + "time_per_iteration": 2.7691538333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250553, + "balance_loss_mlp": 1.08218133, + "epoch": 0.06156213928434013, + "flos": 528564346368.0, + "grad_norm": 0.033042319608268485, + "language_loss": 1.06051123, + "learning_rate": 0.0009973917823608335, + "loss": 1.07301688, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.68115234, + "step": 320, + "time_per_iteration": 2.583859443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251303, + "balance_loss_mlp": 1.08364725, + "epoch": 0.061754520969603696, + "flos": 496589984256.0, + "grad_norm": 0.025351519610416894, + "language_loss": 0.99929821, + "learning_rate": 0.0009973599059609462, + "loss": 1.01181126, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.67382812, + "step": 321, + "time_per_iteration": 2.7139415740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246641, + "balance_loss_mlp": 1.07893777, + "epoch": 0.061946902654867256, + "flos": 441044038656.0, + "grad_norm": 0.025867704850659153, + "language_loss": 0.98033404, + "learning_rate": 0.000997327836468819, + "loss": 0.99280047, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.67431641, + "step": 322, + "time_per_iteration": 2.598400831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250362, + "balance_loss_mlp": 1.08280182, + "epoch": 0.06213928434013082, + "flos": 600042441216.0, + "grad_norm": 0.02535167136018297, + "language_loss": 1.01516175, + "learning_rate": 0.000997295573896902, + "loss": 1.02766538, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.67285156, + "step": 323, + "time_per_iteration": 2.8295648097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125071, + "balance_loss_mlp": 1.0847702, + "epoch": 0.06233166602539438, + "flos": 1453114384896.0, + "grad_norm": 0.012451454042686489, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82446748, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.65625, + "step": 324, + "time_per_iteration": 4.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244164, + "balance_loss_mlp": 1.07803345, + "epoch": 0.06252404771065795, + "flos": 1466628794880.0, + "grad_norm": 0.009026829376029815, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79816103, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.65820312, + "step": 325, + "time_per_iteration": 4.859014272689819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252677, + "balance_loss_mlp": 1.08535445, + "epoch": 0.06271642939592151, + "flos": 465235975680.0, + "grad_norm": 0.02899330239765154, + "language_loss": 0.95714885, + "learning_rate": 0.000997197627828043, + "loss": 0.96967566, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.67041016, + "step": 326, + "time_per_iteration": 2.5137081146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250284, + "balance_loss_mlp": 1.08343852, + "epoch": 0.06290881108118507, + "flos": 533431477248.0, + "grad_norm": 0.02712212536791958, + "language_loss": 0.90827119, + "learning_rate": 0.0009971645930629716, + "loss": 0.92077404, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.66552734, + "step": 327, + "time_per_iteration": 2.6867988109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_mlp": 1.08260453, + "epoch": 0.06310119276644863, + "flos": 674767474176.0, + "grad_norm": 0.026247049513885422, + "language_loss": 1.04735494, + "learning_rate": 0.0009971313652814872, + "loss": 1.0598489, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.66503906, + "step": 328, + "time_per_iteration": 2.845618724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245995, + "balance_loss_mlp": 1.07924485, + "epoch": 0.0632935744517122, + "flos": 772050241536.0, + "grad_norm": 0.03020034978800923, + "language_loss": 1.02482498, + "learning_rate": 0.0009970979444964903, + "loss": 1.03728485, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.66455078, + "step": 329, + "time_per_iteration": 2.967315196990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249674, + "balance_loss_mlp": 1.08316231, + "epoch": 0.06348595613697576, + "flos": 562974638592.0, + "grad_norm": 0.027434293654228625, + "language_loss": 1.03562641, + "learning_rate": 0.0009970643307209556, + "loss": 1.04812312, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.66210938, + "step": 330, + "time_per_iteration": 2.7991747856140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_mlp": 1.0814544, + "epoch": 0.06367833782223932, + "flos": 677383325184.0, + "grad_norm": 0.030236705728133754, + "language_loss": 1.00163436, + "learning_rate": 0.0009970305239679334, + "loss": 1.01411343, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.66162109, + "step": 331, + "time_per_iteration": 2.8012547492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243208, + "balance_loss_mlp": 1.07669675, + "epoch": 0.06387071950750288, + "flos": 496348938240.0, + "grad_norm": 0.029279450628507057, + "language_loss": 1.04491925, + "learning_rate": 0.0009969965242505483, + "loss": 1.05735123, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.66210938, + "step": 332, + "time_per_iteration": 2.658085584640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251001, + "balance_loss_mlp": 1.08463287, + "epoch": 0.06406310119276645, + "flos": 534556116480.0, + "grad_norm": 0.029350032940601952, + "language_loss": 1.00548685, + "learning_rate": 0.0009969623315820007, + "loss": 1.01799679, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.66064453, + "step": 333, + "time_per_iteration": 2.6670596599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238877, + "balance_loss_mlp": 1.07246125, + "epoch": 0.06425548287803001, + "flos": 457164840960.0, + "grad_norm": 0.03277849846880731, + "language_loss": 1.00979996, + "learning_rate": 0.000996927945975565, + "loss": 1.02218866, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 1.66113281, + "step": 334, + "time_per_iteration": 2.5448765754699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245409, + "balance_loss_mlp": 1.0792315, + "epoch": 0.06444786456329357, + "flos": 561122858496.0, + "grad_norm": 0.03573042475309631, + "language_loss": 0.98108363, + "learning_rate": 0.0009968933674445906, + "loss": 0.99353766, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 1.65869141, + "step": 335, + "time_per_iteration": 2.679093360900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242425, + "balance_loss_mlp": 1.07672429, + "epoch": 0.06464024624855713, + "flos": 667356350976.0, + "grad_norm": 0.0316377115871937, + "language_loss": 0.99817598, + "learning_rate": 0.0009968585960025028, + "loss": 1.01060021, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 1.65380859, + "step": 336, + "time_per_iteration": 2.9642832279205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246223, + "balance_loss_mlp": 1.08085632, + "epoch": 0.0648326279338207, + "flos": 1524555549696.0, + "grad_norm": 0.012731648189289846, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78899413, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.65039062, + "step": 337, + "time_per_iteration": 4.799122333526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_mlp": 1.07683408, + "epoch": 0.06502500961908426, + "flos": 1145214959616.0, + "grad_norm": 0.030168792806873873, + "language_loss": 0.98216963, + "learning_rate": 0.0009967884744390583, + "loss": 0.99459207, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 1.65087891, + "step": 338, + "time_per_iteration": 3.513155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243978, + "balance_loss_mlp": 1.07865858, + "epoch": 0.06521739130434782, + "flos": 583693327872.0, + "grad_norm": 0.025823410577593665, + "language_loss": 0.98998213, + "learning_rate": 0.0009967531243449256, + "loss": 1.00242186, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 1.64990234, + "step": 339, + "time_per_iteration": 2.6683707237243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_mlp": 1.07453787, + "epoch": 0.06540977298961138, + "flos": 498658615296.0, + "grad_norm": 0.02384437782241591, + "language_loss": 1.06067204, + "learning_rate": 0.000996717581394126, + "loss": 1.07306671, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 1.64599609, + "step": 340, + "time_per_iteration": 2.5471885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236124, + "balance_loss_mlp": 1.07171023, + "epoch": 0.06560215467487496, + "flos": 543903613440.0, + "grad_norm": 0.02318937955413124, + "language_loss": 1.0712086, + "learning_rate": 0.000996681845600459, + "loss": 1.08356977, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 1.640625, + "step": 341, + "time_per_iteration": 2.651742458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240028, + "balance_loss_mlp": 1.07575738, + "epoch": 0.06579453636013852, + "flos": 414351043584.0, + "grad_norm": 0.026316803994829763, + "language_loss": 0.99228215, + "learning_rate": 0.0009966459169777982, + "loss": 1.00468254, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 1.63916016, + "step": 342, + "time_per_iteration": 2.4996230602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244627, + "balance_loss_mlp": 1.08045232, + "epoch": 0.06598691804540208, + "flos": 561680812032.0, + "grad_norm": 0.03097158399986616, + "language_loss": 1.07124209, + "learning_rate": 0.0009966097955400924, + "loss": 1.08368838, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 1.63818359, + "step": 343, + "time_per_iteration": 2.7243080139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238691, + "balance_loss_mlp": 1.07451606, + "epoch": 0.06617929973066564, + "flos": 573301782528.0, + "grad_norm": 0.022915441754152527, + "language_loss": 1.00964892, + "learning_rate": 0.0009965734813013652, + "loss": 1.02203584, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 1.63818359, + "step": 344, + "time_per_iteration": 2.8087360858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237027, + "balance_loss_mlp": 1.07375824, + "epoch": 0.06637168141592921, + "flos": 491464343040.0, + "grad_norm": 0.024444849604151265, + "language_loss": 1.03758335, + "learning_rate": 0.0009965369742757151, + "loss": 1.04995358, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 1.62890625, + "step": 345, + "time_per_iteration": 2.5691587924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237907, + "balance_loss_mlp": 1.07459044, + "epoch": 0.06656406310119277, + "flos": 1081037924352.0, + "grad_norm": 0.024807678995847144, + "language_loss": 0.99529493, + "learning_rate": 0.0009965002744773152, + "loss": 1.00767398, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 1.62939453, + "step": 346, + "time_per_iteration": 3.507969856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239522, + "balance_loss_mlp": 1.07611036, + "epoch": 0.06675644478645633, + "flos": 514723021824.0, + "grad_norm": 0.02663627628784384, + "language_loss": 0.97097999, + "learning_rate": 0.0009964633819204139, + "loss": 0.98337519, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 1.63037109, + "step": 347, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_mlp": 1.09986115, + "epoch": 0.06694882647171989, + "flos": 1450534189056.0, + "grad_norm": 0.030948258254188146, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83063102, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 1.6171875, + "step": 348, + "time_per_iteration": 5.152506589889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236206, + "balance_loss_mlp": 1.07427216, + "epoch": 0.06714120815698346, + "flos": 1555397266944.0, + "grad_norm": 0.0077968992848742235, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76390088, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.61523438, + "step": 349, + "time_per_iteration": 4.909464120864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242005, + "balance_loss_mlp": 1.07873547, + "epoch": 0.06733358984224702, + "flos": 881615992320.0, + "grad_norm": 0.03432587789196913, + "language_loss": 0.97228402, + "learning_rate": 0.000996351547842304, + "loss": 0.98470408, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 1.62890625, + "step": 350, + "time_per_iteration": 3.1799545288085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240315, + "balance_loss_mlp": 1.0778569, + "epoch": 0.06752597152751058, + "flos": 519917793792.0, + "grad_norm": 0.030803186893757592, + "language_loss": 0.96182388, + "learning_rate": 0.0009963138843953744, + "loss": 0.97422707, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 1.62060547, + "step": 351, + "time_per_iteration": 2.5873348712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238163, + "balance_loss_mlp": 1.07565665, + "epoch": 0.06771835321277414, + "flos": 540882258432.0, + "grad_norm": 0.023778523337364334, + "language_loss": 0.99575555, + "learning_rate": 0.000996276028262306, + "loss": 1.00813723, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 1.62109375, + "step": 352, + "time_per_iteration": 2.7943532466888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238104, + "balance_loss_mlp": 1.07583654, + "epoch": 0.0679107348980377, + "flos": 461615007744.0, + "grad_norm": 0.02720743117278016, + "language_loss": 1.06749547, + "learning_rate": 0.0009962379794577964, + "loss": 1.07987642, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 1.61865234, + "step": 353, + "time_per_iteration": 2.589200973510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239427, + "balance_loss_mlp": 1.07711196, + "epoch": 0.06810311658330127, + "flos": 637207572480.0, + "grad_norm": 0.02321502152829773, + "language_loss": 0.95908678, + "learning_rate": 0.000996199737996617, + "loss": 0.97148108, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 1.61914062, + "step": 354, + "time_per_iteration": 2.8822708129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123871, + "balance_loss_mlp": 1.07687151, + "epoch": 0.06829549826856483, + "flos": 465626743296.0, + "grad_norm": 0.030894548658215056, + "language_loss": 1.05554581, + "learning_rate": 0.0009961613038936149, + "loss": 1.06793284, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 1.61425781, + "step": 355, + "time_per_iteration": 2.576930522918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.07456315, + "epoch": 0.06848787995382839, + "flos": 635896281600.0, + "grad_norm": 0.0286185110148739, + "language_loss": 0.9730283, + "learning_rate": 0.000996122677163711, + "loss": 0.98538941, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 1.61132812, + "step": 356, + "time_per_iteration": 2.850829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237686, + "balance_loss_mlp": 1.07637215, + "epoch": 0.06868026163909195, + "flos": 807780556800.0, + "grad_norm": 0.03078602082995562, + "language_loss": 1.03526855, + "learning_rate": 0.000996083857821902, + "loss": 1.04764557, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 1.60888672, + "step": 357, + "time_per_iteration": 3.124053716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237273, + "balance_loss_mlp": 1.07605469, + "epoch": 0.06887264332435553, + "flos": 440151713280.0, + "grad_norm": 0.02263887650004652, + "language_loss": 1.01701617, + "learning_rate": 0.0009960448458832588, + "loss": 1.0293889, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 1.60791016, + "step": 358, + "time_per_iteration": 2.6918816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242041, + "balance_loss_mlp": 1.08077514, + "epoch": 0.06906502500961909, + "flos": 485785477632.0, + "grad_norm": 0.021707311176365728, + "language_loss": 1.01897752, + "learning_rate": 0.000996005641362927, + "loss": 1.03139794, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 1.60839844, + "step": 359, + "time_per_iteration": 2.601358652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238764, + "balance_loss_mlp": 1.07725942, + "epoch": 0.06925740669488265, + "flos": 734885110272.0, + "grad_norm": 0.024380378407611886, + "language_loss": 1.04387617, + "learning_rate": 0.0009959662442761274, + "loss": 1.05626392, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 1.61083984, + "step": 360, + "time_per_iteration": 2.9404215812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236823, + "balance_loss_mlp": 1.07589066, + "epoch": 0.0694497883801462, + "flos": 553570745856.0, + "grad_norm": 0.023221163769242582, + "language_loss": 0.97943044, + "learning_rate": 0.000995926654638155, + "loss": 0.99179876, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 1.60498047, + "step": 361, + "time_per_iteration": 2.811624526977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234495, + "balance_loss_mlp": 1.07413495, + "epoch": 0.06964217006540978, + "flos": 679243837440.0, + "grad_norm": 0.025577226237571565, + "language_loss": 1.00741839, + "learning_rate": 0.00099588687246438, + "loss": 1.01976323, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 1.59912109, + "step": 362, + "time_per_iteration": 2.826204538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235331, + "balance_loss_mlp": 1.0749228, + "epoch": 0.06983455175067334, + "flos": 525260285952.0, + "grad_norm": 0.054619150892928216, + "language_loss": 1.0805161, + "learning_rate": 0.0009958468977702471, + "loss": 1.09286952, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 1.59960938, + "step": 363, + "time_per_iteration": 2.5742297172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_mlp": 1.11000061, + "epoch": 0.0700269334359369, + "flos": 1580173353984.0, + "grad_norm": 0.0347214045967213, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81004167, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.59179688, + "step": 364, + "time_per_iteration": 4.815373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234235, + "balance_loss_mlp": 1.07420838, + "epoch": 0.07021931512120046, + "flos": 1014856659456.0, + "grad_norm": 0.027565425727799023, + "language_loss": 0.95424879, + "learning_rate": 0.0009957663708830612, + "loss": 0.96659118, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 1.59667969, + "step": 365, + "time_per_iteration": 3.3032214641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238249, + "balance_loss_mlp": 1.07874703, + "epoch": 0.07041169680646403, + "flos": 824431114752.0, + "grad_norm": 0.03609893162101238, + "language_loss": 0.99641442, + "learning_rate": 0.0009957258187212714, + "loss": 1.00879693, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 1.59228516, + "step": 366, + "time_per_iteration": 3.143951654434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_mlp": 1.0748291, + "epoch": 0.07060407849172759, + "flos": 1417290743808.0, + "grad_norm": 0.015479474187128486, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80427808, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.578125, + "step": 367, + "time_per_iteration": 4.856614112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232866, + "balance_loss_mlp": 1.07417488, + "epoch": 0.07079646017699115, + "flos": 513941486592.0, + "grad_norm": 0.03158452537667852, + "language_loss": 0.9606331, + "learning_rate": 0.0009956441370400167, + "loss": 0.97296178, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 1.58398438, + "step": 368, + "time_per_iteration": 2.6471550464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231431, + "balance_loss_mlp": 1.07288289, + "epoch": 0.07098884186225471, + "flos": 541548274176.0, + "grad_norm": 0.03366854249700899, + "language_loss": 1.02536654, + "learning_rate": 0.0009956030075522636, + "loss": 1.03768086, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 1.58251953, + "step": 369, + "time_per_iteration": 2.764350175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_mlp": 1.07183695, + "epoch": 0.07118122354751828, + "flos": 549738931200.0, + "grad_norm": 0.025388205653796188, + "language_loss": 1.02520657, + "learning_rate": 0.0009955616856543587, + "loss": 1.03751087, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 1.58300781, + "step": 370, + "time_per_iteration": 2.6488449573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233332, + "balance_loss_mlp": 1.07483125, + "epoch": 0.07137360523278184, + "flos": 622076424192.0, + "grad_norm": 0.025131147277089937, + "language_loss": 0.94016552, + "learning_rate": 0.0009955201713623448, + "loss": 0.95249885, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 1.58203125, + "step": 371, + "time_per_iteration": 2.7475128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231201, + "balance_loss_mlp": 1.07594299, + "epoch": 0.0715659869180454, + "flos": 1505973347328.0, + "grad_norm": 0.011087848535678398, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77903926, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 1.55664062, + "step": 372, + "time_per_iteration": 4.930227518081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.0769937, + "epoch": 0.07175836860330896, + "flos": 496481195520.0, + "grad_norm": 0.02946804107059058, + "language_loss": 1.07406306, + "learning_rate": 0.0009954365656605333, + "loss": 1.08641148, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 1.57910156, + "step": 373, + "time_per_iteration": 2.5494606494903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235693, + "balance_loss_mlp": 1.07862246, + "epoch": 0.07195075028857253, + "flos": 787081333248.0, + "grad_norm": 0.030340412148976308, + "language_loss": 1.00769055, + "learning_rate": 0.0009953944742831947, + "loss": 1.02004743, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 1.57519531, + "step": 374, + "time_per_iteration": 2.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234053, + "balance_loss_mlp": 1.07707787, + "epoch": 0.0721431319738361, + "flos": 594346111488.0, + "grad_norm": 0.024760984543104554, + "language_loss": 1.04227853, + "learning_rate": 0.0009953521905766642, + "loss": 1.05461907, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 1.57421875, + "step": 375, + "time_per_iteration": 2.9470102787017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233349, + "balance_loss_mlp": 1.07642198, + "epoch": 0.07233551365909965, + "flos": 549328697856.0, + "grad_norm": 0.025099095391344205, + "language_loss": 1.02903581, + "learning_rate": 0.0009953097145573577, + "loss": 1.04136944, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 1.57373047, + "step": 376, + "time_per_iteration": 2.656438112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232315, + "balance_loss_mlp": 1.0754832, + "epoch": 0.07252789534436321, + "flos": 959167723008.0, + "grad_norm": 0.028756244795243427, + "language_loss": 1.01008701, + "learning_rate": 0.000995267046241766, + "loss": 1.02241015, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 1.57275391, + "step": 377, + "time_per_iteration": 3.2601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226098, + "balance_loss_mlp": 1.06931448, + "epoch": 0.07272027702962677, + "flos": 508655390208.0, + "grad_norm": 0.025279277167219092, + "language_loss": 1.00209188, + "learning_rate": 0.0009952241856464547, + "loss": 1.01435292, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 1.57226562, + "step": 378, + "time_per_iteration": 2.616483688354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228279, + "balance_loss_mlp": 1.07159042, + "epoch": 0.07291265871489035, + "flos": 613551395328.0, + "grad_norm": 0.025059419305224793, + "language_loss": 1.0761106, + "learning_rate": 0.0009951811327880632, + "loss": 1.08839345, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 1.57128906, + "step": 379, + "time_per_iteration": 2.7666382789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_mlp": 1.07063651, + "epoch": 0.0731050404001539, + "flos": 496741707264.0, + "grad_norm": 0.032880990240464036, + "language_loss": 1.00766444, + "learning_rate": 0.0009951378876833063, + "loss": 1.01993108, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 1.56445312, + "step": 380, + "time_per_iteration": 2.5504086017608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230504, + "balance_loss_mlp": 1.07433975, + "epoch": 0.07329742208541747, + "flos": 641129985024.0, + "grad_norm": 0.0343074889031262, + "language_loss": 1.0780232, + "learning_rate": 0.0009950944503489736, + "loss": 1.0903281, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 1.56591797, + "step": 381, + "time_per_iteration": 2.7695260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231248, + "balance_loss_mlp": 1.07537043, + "epoch": 0.07348980377068103, + "flos": 817740401664.0, + "grad_norm": 0.027198888726283066, + "language_loss": 1.01785743, + "learning_rate": 0.0009950508208019285, + "loss": 1.03016996, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 1.56298828, + "step": 382, + "time_per_iteration": 2.9918277263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227944, + "balance_loss_mlp": 1.07187521, + "epoch": 0.0736821854559446, + "flos": 509669239296.0, + "grad_norm": 0.03113985633155724, + "language_loss": 1.05612254, + "learning_rate": 0.0009950069990591096, + "loss": 1.06840205, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 1.56494141, + "step": 383, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_mlp": 1.09392548, + "epoch": 0.07387456714120816, + "flos": 1558048046592.0, + "grad_norm": 0.03338671968111017, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77649409, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 1.54492188, + "step": 384, + "time_per_iteration": 4.854166269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229749, + "balance_loss_mlp": 1.0736798, + "epoch": 0.07406694882647172, + "flos": 526643435520.0, + "grad_norm": 0.03274978311793036, + "language_loss": 0.98781282, + "learning_rate": 0.0009949187790542777, + "loss": 1.00011039, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 1.56494141, + "step": 385, + "time_per_iteration": 2.728701591491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123258, + "balance_loss_mlp": 1.07636821, + "epoch": 0.07425933051173528, + "flos": 498823799808.0, + "grad_norm": 0.026908846939264777, + "language_loss": 0.94723004, + "learning_rate": 0.0009948743808265148, + "loss": 0.95955586, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 1.56640625, + "step": 386, + "time_per_iteration": 2.6850693225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231135, + "balance_loss_mlp": 1.07511437, + "epoch": 0.07445171219699885, + "flos": 506057003520.0, + "grad_norm": 0.05633654869747302, + "language_loss": 1.04553366, + "learning_rate": 0.0009948297904714782, + "loss": 1.05784488, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 1.56445312, + "step": 387, + "time_per_iteration": 2.6746010780334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231627, + "balance_loss_mlp": 1.07555866, + "epoch": 0.07464409388226241, + "flos": 555116352000.0, + "grad_norm": 0.03450843374667126, + "language_loss": 0.9665134, + "learning_rate": 0.0009947850080064796, + "loss": 0.97882968, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 1.56494141, + "step": 388, + "time_per_iteration": 2.7839057445526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230193, + "balance_loss_mlp": 1.07431459, + "epoch": 0.07483647556752597, + "flos": 778274325504.0, + "grad_norm": 0.021592891008175935, + "language_loss": 1.01240289, + "learning_rate": 0.0009947400334489047, + "loss": 1.02470493, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 1.56298828, + "step": 389, + "time_per_iteration": 2.9945342540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_mlp": 1.07411718, + "epoch": 0.07502885725278953, + "flos": 613681651200.0, + "grad_norm": 0.023383004705128753, + "language_loss": 0.92341155, + "learning_rate": 0.0009946948668162145, + "loss": 0.93570244, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 1.55371094, + "step": 390, + "time_per_iteration": 2.7355024814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122989, + "balance_loss_mlp": 1.07496524, + "epoch": 0.0752212389380531, + "flos": 689854961664.0, + "grad_norm": 0.026752200694656208, + "language_loss": 0.97335494, + "learning_rate": 0.0009946495081259441, + "loss": 0.98565376, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 1.55322266, + "step": 391, + "time_per_iteration": 2.799938678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227768, + "balance_loss_mlp": 1.07303405, + "epoch": 0.07541362062331666, + "flos": 767050853376.0, + "grad_norm": 0.02596026064524479, + "language_loss": 1.01604676, + "learning_rate": 0.0009946039573957035, + "loss": 1.02832437, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 1.55126953, + "step": 392, + "time_per_iteration": 2.932504415512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123199, + "balance_loss_mlp": 1.07768571, + "epoch": 0.07560600230858022, + "flos": 589908679680.0, + "grad_norm": 0.028382748029943367, + "language_loss": 0.97495323, + "learning_rate": 0.000994558214643177, + "loss": 0.98727316, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.752694845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228178, + "balance_loss_mlp": 1.07425475, + "epoch": 0.07579838399384378, + "flos": 751144900608.0, + "grad_norm": 0.028291982513743617, + "language_loss": 0.99160051, + "learning_rate": 0.000994512279886123, + "loss": 1.00388229, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 1.54296875, + "step": 394, + "time_per_iteration": 3.06592059135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228894, + "balance_loss_mlp": 1.07530475, + "epoch": 0.07599076567910736, + "flos": 524550609408.0, + "grad_norm": 0.023352712612718218, + "language_loss": 0.98641121, + "learning_rate": 0.0009944661531423758, + "loss": 0.99870014, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 1.53955078, + "step": 395, + "time_per_iteration": 2.6720728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122919, + "balance_loss_mlp": 1.07555354, + "epoch": 0.07618314736437092, + "flos": 552185594880.0, + "grad_norm": 0.026216962171459895, + "language_loss": 0.97914684, + "learning_rate": 0.000994419834429843, + "loss": 0.99143875, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 1.54003906, + "step": 396, + "time_per_iteration": 2.6652910709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226861, + "balance_loss_mlp": 1.07308066, + "epoch": 0.07637552904963447, + "flos": 699432771072.0, + "grad_norm": 0.029361663168223213, + "language_loss": 1.03114796, + "learning_rate": 0.0009943733237665069, + "loss": 1.0434165, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 1.54150391, + "step": 397, + "time_per_iteration": 2.808711290359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227023, + "balance_loss_mlp": 1.07329071, + "epoch": 0.07656791073489803, + "flos": 580635042816.0, + "grad_norm": 0.02000560632750303, + "language_loss": 1.01598048, + "learning_rate": 0.0009943266211704248, + "loss": 1.02825069, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 1.54101562, + "step": 398, + "time_per_iteration": 2.9420461654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226854, + "balance_loss_mlp": 1.0732646, + "epoch": 0.0767602924201616, + "flos": 418037139456.0, + "grad_norm": 0.02425852476792673, + "language_loss": 1.03237891, + "learning_rate": 0.000994279726659728, + "loss": 1.04464746, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 1.53955078, + "step": 399, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230296, + "balance_loss_mlp": 1.07675469, + "epoch": 0.07695267410542517, + "flos": 483888035328.0, + "grad_norm": 0.030174375239475117, + "language_loss": 1.02145576, + "learning_rate": 0.0009942326402526231, + "loss": 1.03375876, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 1.5390625, + "step": 400, + "time_per_iteration": 2.5265390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224857, + "balance_loss_mlp": 1.07184029, + "epoch": 0.07714505579068873, + "flos": 532026860544.0, + "grad_norm": 0.024483465572707617, + "language_loss": 0.99344772, + "learning_rate": 0.0009941853619673902, + "loss": 1.0056963, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 1.53369141, + "step": 401, + "time_per_iteration": 2.660491704940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224912, + "balance_loss_mlp": 1.07218146, + "epoch": 0.07733743747595229, + "flos": 806439066624.0, + "grad_norm": 0.032921156451595594, + "language_loss": 1.03587961, + "learning_rate": 0.0009941378918223844, + "loss": 1.04812872, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 1.53076172, + "step": 402, + "time_per_iteration": 3.078272819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222316, + "balance_loss_mlp": 1.06972802, + "epoch": 0.07752981916121585, + "flos": 623613298176.0, + "grad_norm": 0.02596227047756477, + "language_loss": 0.96322513, + "learning_rate": 0.0009940902298360354, + "loss": 0.97544825, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 1.52929688, + "step": 403, + "time_per_iteration": 2.78222918510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224993, + "balance_loss_mlp": 1.07288182, + "epoch": 0.07772220084647942, + "flos": 729542618112.0, + "grad_norm": 0.031231063897144088, + "language_loss": 1.06544566, + "learning_rate": 0.0009940423760268473, + "loss": 1.07769561, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 1.52441406, + "step": 404, + "time_per_iteration": 2.8572018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226552, + "balance_loss_mlp": 1.07472658, + "epoch": 0.07791458253174298, + "flos": 556468575744.0, + "grad_norm": 0.029548764371286118, + "language_loss": 0.99639893, + "learning_rate": 0.0009939943304133982, + "loss": 1.00866449, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 1.52148438, + "step": 405, + "time_per_iteration": 2.607412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226106, + "balance_loss_mlp": 1.07409084, + "epoch": 0.07810696421700654, + "flos": 554234760192.0, + "grad_norm": 0.031141101296471768, + "language_loss": 1.06411445, + "learning_rate": 0.0009939460930143416, + "loss": 1.07637548, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 1.5234375, + "step": 406, + "time_per_iteration": 2.6132876873016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223027, + "balance_loss_mlp": 1.07120168, + "epoch": 0.0782993459022701, + "flos": 651878095872.0, + "grad_norm": 0.023437908852709077, + "language_loss": 1.00106847, + "learning_rate": 0.0009938976638484043, + "loss": 1.01329875, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 1.52148438, + "step": 407, + "time_per_iteration": 2.905681610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218745, + "balance_loss_mlp": 1.06691968, + "epoch": 0.07849172758753367, + "flos": 497160672768.0, + "grad_norm": 0.02891290096917658, + "language_loss": 0.99991584, + "learning_rate": 0.0009938490429343887, + "loss": 1.01210332, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 1.52148438, + "step": 408, + "time_per_iteration": 2.539567708969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222677, + "balance_loss_mlp": 1.07066166, + "epoch": 0.07868410927279723, + "flos": 579075975168.0, + "grad_norm": 0.030601656563413092, + "language_loss": 0.99965751, + "learning_rate": 0.0009938002302911709, + "loss": 1.01188421, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 1.5234375, + "step": 409, + "time_per_iteration": 2.732064962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220028, + "balance_loss_mlp": 1.0680126, + "epoch": 0.07887649095806079, + "flos": 524066515968.0, + "grad_norm": 0.03256443285635905, + "language_loss": 1.03146362, + "learning_rate": 0.0009937512259377015, + "loss": 1.04366398, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 1.5234375, + "step": 410, + "time_per_iteration": 2.6500303745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221864, + "balance_loss_mlp": 1.07013464, + "epoch": 0.07906887264332435, + "flos": 558437876736.0, + "grad_norm": 0.023780630120827737, + "language_loss": 1.01466393, + "learning_rate": 0.000993702029893006, + "loss": 1.02688265, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 1.52050781, + "step": 411, + "time_per_iteration": 2.7921671867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221791, + "balance_loss_mlp": 1.07010949, + "epoch": 0.07926125432858792, + "flos": 823362871296.0, + "grad_norm": 0.04077078343290612, + "language_loss": 1.01153946, + "learning_rate": 0.0009936526421761838, + "loss": 1.02375734, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 1.52001953, + "step": 412, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217861, + "balance_loss_mlp": 1.06632257, + "epoch": 0.07945363601385148, + "flos": 563393604096.0, + "grad_norm": 0.02717343044282308, + "language_loss": 1.04004121, + "learning_rate": 0.000993603062806409, + "loss": 1.05221987, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 1.51855469, + "step": 413, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219172, + "balance_loss_mlp": 1.06844354, + "epoch": 0.07964601769911504, + "flos": 518884478976.0, + "grad_norm": 0.031245789494761384, + "language_loss": 1.07179379, + "learning_rate": 0.0009935532918029298, + "loss": 1.08398533, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 1.51025391, + "step": 414, + "time_per_iteration": 2.668151617050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224604, + "balance_loss_mlp": 1.07387555, + "epoch": 0.0798383993843786, + "flos": 540300109824.0, + "grad_norm": 0.025221671350570463, + "language_loss": 0.99906069, + "learning_rate": 0.0009935033291850694, + "loss": 1.01130676, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 1.51025391, + "step": 415, + "time_per_iteration": 2.64747953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.06547058, + "epoch": 0.08003078106964218, + "flos": 486121850880.0, + "grad_norm": 0.027121462600521052, + "language_loss": 1.02766061, + "learning_rate": 0.0009934531749722247, + "loss": 1.03982067, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 1.50830078, + "step": 416, + "time_per_iteration": 2.5705764293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121625, + "balance_loss_mlp": 1.06576049, + "epoch": 0.08022316275490574, + "flos": 519275246592.0, + "grad_norm": 0.027391361962933233, + "language_loss": 1.00515926, + "learning_rate": 0.0009934028291838672, + "loss": 1.01732171, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 1.5078125, + "step": 417, + "time_per_iteration": 2.7232770919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219761, + "balance_loss_mlp": 1.0695101, + "epoch": 0.0804155444401693, + "flos": 495046379520.0, + "grad_norm": 0.028534904701295792, + "language_loss": 0.95904237, + "learning_rate": 0.0009933522918395433, + "loss": 0.97123998, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 1.50537109, + "step": 418, + "time_per_iteration": 2.670992374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_mlp": 1.11595154, + "epoch": 0.08060792612543285, + "flos": 1584853833216.0, + "grad_norm": 0.03473829356439328, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79516399, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 1.49609375, + "step": 419, + "time_per_iteration": 4.9051830768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222046, + "balance_loss_mlp": 1.07246244, + "epoch": 0.08080030781069643, + "flos": 526358728704.0, + "grad_norm": 0.03232182071246488, + "language_loss": 1.15746891, + "learning_rate": 0.000993250642561551, + "loss": 1.16968942, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 1.49853516, + "step": 420, + "time_per_iteration": 2.596930503845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224313, + "balance_loss_mlp": 1.07487273, + "epoch": 0.08099268949595999, + "flos": 547756895232.0, + "grad_norm": 0.03306568774928502, + "language_loss": 1.00193918, + "learning_rate": 0.0009931995306673466, + "loss": 1.01418233, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 1.49707031, + "step": 421, + "time_per_iteration": 2.704012155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223697, + "balance_loss_mlp": 1.0744468, + "epoch": 0.08118507118122355, + "flos": 511373299200.0, + "grad_norm": 0.026268861479682264, + "language_loss": 1.0597651, + "learning_rate": 0.000993148227296103, + "loss": 1.07200205, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 1.49511719, + "step": 422, + "time_per_iteration": 2.6110117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224578, + "balance_loss_mlp": 1.0751853, + "epoch": 0.08137745286648711, + "flos": 722001239040.0, + "grad_norm": 0.024088300997991936, + "language_loss": 0.92380643, + "learning_rate": 0.000993096732467738, + "loss": 0.9360522, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 1.49658203, + "step": 423, + "time_per_iteration": 2.9790220260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224383, + "balance_loss_mlp": 1.0753237, + "epoch": 0.08156983455175067, + "flos": 680817641472.0, + "grad_norm": 0.029818930066630327, + "language_loss": 1.0177561, + "learning_rate": 0.0009930450462022435, + "loss": 1.02999997, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 1.49316406, + "step": 424, + "time_per_iteration": 2.8023674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223, + "balance_loss_mlp": 1.07518005, + "epoch": 0.08176221623701424, + "flos": 1456588359168.0, + "grad_norm": 0.012435251357338771, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80412811, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.48046875, + "step": 425, + "time_per_iteration": 4.96533989906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219597, + "balance_loss_mlp": 1.0711571, + "epoch": 0.0819545979222778, + "flos": 1558883071488.0, + "grad_norm": 0.04204100969257126, + "language_loss": 1.00605047, + "learning_rate": 0.0009929410994402065, + "loss": 1.01824641, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 1.48681641, + "step": 426, + "time_per_iteration": 3.850475311279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220758, + "balance_loss_mlp": 1.07236588, + "epoch": 0.08214697960754136, + "flos": 513800497152.0, + "grad_norm": 0.03975912273964659, + "language_loss": 1.03955805, + "learning_rate": 0.0009928888389840196, + "loss": 1.05176568, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 1.48632812, + "step": 427, + "time_per_iteration": 2.6892385482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224824, + "balance_loss_mlp": 1.07633698, + "epoch": 0.08233936129280492, + "flos": 596221360128.0, + "grad_norm": 0.02633667259549893, + "language_loss": 1.0604248, + "learning_rate": 0.0009928363871714147, + "loss": 1.07267296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 1.48730469, + "step": 428, + "time_per_iteration": 2.666851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224039, + "balance_loss_mlp": 1.07550442, + "epoch": 0.08253174297806849, + "flos": 573164795904.0, + "grad_norm": 0.03052010415677114, + "language_loss": 0.99677718, + "learning_rate": 0.0009927837440227556, + "loss": 1.00901759, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 1.48779297, + "step": 429, + "time_per_iteration": 2.810197591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228416, + "balance_loss_mlp": 1.07992899, + "epoch": 0.08272412466333205, + "flos": 624642610176.0, + "grad_norm": 0.029909202440675912, + "language_loss": 0.93710327, + "learning_rate": 0.0009927309095584798, + "loss": 0.94938743, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 1.48730469, + "step": 430, + "time_per_iteration": 2.98052978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122165, + "balance_loss_mlp": 1.07316256, + "epoch": 0.08291650634859561, + "flos": 514994267136.0, + "grad_norm": 0.038201439099628094, + "language_loss": 1.07072532, + "learning_rate": 0.0009926778837991, + "loss": 1.08294177, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 1.48730469, + "step": 431, + "time_per_iteration": 2.613912582397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223506, + "balance_loss_mlp": 1.07516193, + "epoch": 0.08310888803385917, + "flos": 668541388800.0, + "grad_norm": 0.02618037233016902, + "language_loss": 1.04762018, + "learning_rate": 0.000992624666765202, + "loss": 1.05985522, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 1.48583984, + "step": 432, + "time_per_iteration": 2.785602331161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224029, + "balance_loss_mlp": 1.07659137, + "epoch": 0.08330126971912274, + "flos": 584490326016.0, + "grad_norm": 0.023129420064945467, + "language_loss": 1.02043724, + "learning_rate": 0.000992571258477447, + "loss": 1.03267753, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 1.4765625, + "step": 433, + "time_per_iteration": 2.7774012088775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225333, + "balance_loss_mlp": 1.07799041, + "epoch": 0.0834936514043863, + "flos": 562497275904.0, + "grad_norm": 0.02412369992445121, + "language_loss": 0.95710295, + "learning_rate": 0.0009925176589565695, + "loss": 0.9693563, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 1.47558594, + "step": 434, + "time_per_iteration": 2.7975149154663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224713, + "balance_loss_mlp": 1.07751381, + "epoch": 0.08368603308964986, + "flos": 495513008640.0, + "grad_norm": 0.023499028814372425, + "language_loss": 1.06310439, + "learning_rate": 0.0009924638682233791, + "loss": 1.07535148, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 1.47412109, + "step": 435, + "time_per_iteration": 2.5623626708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247864, + "balance_loss_mlp": 1.10328674, + "epoch": 0.08387841477491342, + "flos": 1391808983040.0, + "grad_norm": 0.0329185074425942, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80812454, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.44726562, + "step": 436, + "time_per_iteration": 4.5364601612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219037, + "balance_loss_mlp": 1.07174218, + "epoch": 0.084070796460177, + "flos": 800353970688.0, + "grad_norm": 0.025226905267595717, + "language_loss": 0.95941472, + "learning_rate": 0.0009923557132036668, + "loss": 0.97160506, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 1.47509766, + "step": 437, + "time_per_iteration": 3.031538963317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219746, + "balance_loss_mlp": 1.07226074, + "epoch": 0.08426317814544056, + "flos": 560096274432.0, + "grad_norm": 0.024291343012928023, + "language_loss": 0.99699497, + "learning_rate": 0.0009923013489591345, + "loss": 1.00919247, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 1.47705078, + "step": 438, + "time_per_iteration": 2.741021156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217749, + "balance_loss_mlp": 1.07073975, + "epoch": 0.08445555983070412, + "flos": 811883616768.0, + "grad_norm": 0.02787309358423107, + "language_loss": 0.97740996, + "learning_rate": 0.0009922467935862681, + "loss": 0.98958743, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 1.47216797, + "step": 439, + "time_per_iteration": 3.0727341175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215984, + "balance_loss_mlp": 1.06907046, + "epoch": 0.08464794151596768, + "flos": 511169183232.0, + "grad_norm": 0.02418736148641671, + "language_loss": 1.01547837, + "learning_rate": 0.0009921920471062478, + "loss": 1.0276382, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 1.47119141, + "step": 440, + "time_per_iteration": 2.5793957710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214442, + "balance_loss_mlp": 1.06805265, + "epoch": 0.08484032320123125, + "flos": 557473692672.0, + "grad_norm": 0.02549300900866748, + "language_loss": 0.99590349, + "learning_rate": 0.0009921371095403281, + "loss": 1.00804806, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 1.46582031, + "step": 441, + "time_per_iteration": 2.633976936340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215261, + "balance_loss_mlp": 1.06887233, + "epoch": 0.08503270488649481, + "flos": 528360230400.0, + "grad_norm": 0.023285649852896013, + "language_loss": 1.02823853, + "learning_rate": 0.0009920819809098379, + "loss": 1.04039121, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 1.46582031, + "step": 442, + "time_per_iteration": 2.5975728034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213611, + "balance_loss_mlp": 1.06722176, + "epoch": 0.08522508657175837, + "flos": 615385711104.0, + "grad_norm": 0.021771679570127336, + "language_loss": 0.97986722, + "learning_rate": 0.0009920266612361798, + "loss": 0.99200332, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 1.46582031, + "step": 443, + "time_per_iteration": 2.7284042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214332, + "balance_loss_mlp": 1.06803846, + "epoch": 0.08541746825702193, + "flos": 620986713600.0, + "grad_norm": 0.024601404202987703, + "language_loss": 0.97963679, + "learning_rate": 0.0009919711505408308, + "loss": 0.9917801, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 1.46484375, + "step": 444, + "time_per_iteration": 2.797030448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216522, + "balance_loss_mlp": 1.07051492, + "epoch": 0.08560984994228549, + "flos": 483888035328.0, + "grad_norm": 0.023417740932750293, + "language_loss": 0.96522343, + "learning_rate": 0.000991915448845342, + "loss": 0.97738856, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 1.46191406, + "step": 445, + "time_per_iteration": 2.544638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_mlp": 1.06945765, + "epoch": 0.08580223162754906, + "flos": 518176803840.0, + "grad_norm": 0.025018627604332305, + "language_loss": 1.05275297, + "learning_rate": 0.000991859556171339, + "loss": 1.0649066, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 1.4609375, + "step": 446, + "time_per_iteration": 2.5865097045898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214045, + "balance_loss_mlp": 1.06856191, + "epoch": 0.08599461331281262, + "flos": 532519686144.0, + "grad_norm": 0.025883227843611877, + "language_loss": 1.07190132, + "learning_rate": 0.000991803472540521, + "loss": 1.08404183, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 1.45654297, + "step": 447, + "time_per_iteration": 2.6001055240631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213992, + "balance_loss_mlp": 1.06879497, + "epoch": 0.08618699499807618, + "flos": 791633558016.0, + "grad_norm": 0.022461373320799196, + "language_loss": 1.02303076, + "learning_rate": 0.0009917471979746615, + "loss": 1.03517067, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 1.45361328, + "step": 448, + "time_per_iteration": 2.9621376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218395, + "balance_loss_mlp": 1.07300746, + "epoch": 0.08637937668333974, + "flos": 567114628608.0, + "grad_norm": 0.02449904215267775, + "language_loss": 1.00404847, + "learning_rate": 0.0009916907324956086, + "loss": 1.01623249, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 1.45556641, + "step": 449, + "time_per_iteration": 2.691150188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214944, + "balance_loss_mlp": 1.0697943, + "epoch": 0.08657175836860331, + "flos": 446117286912.0, + "grad_norm": 0.025714213043280993, + "language_loss": 0.97109705, + "learning_rate": 0.0009916340761252837, + "loss": 0.98324645, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 1.453125, + "step": 450, + "time_per_iteration": 2.6118698120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212599, + "balance_loss_mlp": 1.067307, + "epoch": 0.08676414005386687, + "flos": 845588235264.0, + "grad_norm": 0.02612794411743426, + "language_loss": 0.94540501, + "learning_rate": 0.0009915772288856832, + "loss": 0.95753098, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 1.45458984, + "step": 451, + "time_per_iteration": 3.0883219242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213701, + "balance_loss_mlp": 1.06926715, + "epoch": 0.08695652173913043, + "flos": 604483875840.0, + "grad_norm": 0.02003375948944636, + "language_loss": 0.95739877, + "learning_rate": 0.000991520190798877, + "loss": 0.96953583, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 1.44580078, + "step": 452, + "time_per_iteration": 2.8387818336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213572, + "balance_loss_mlp": 1.06928122, + "epoch": 0.08714890342439399, + "flos": 732000015360.0, + "grad_norm": 0.027770143088691506, + "language_loss": 1.06693339, + "learning_rate": 0.0009914629618870089, + "loss": 1.07906914, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 1.44433594, + "step": 453, + "time_per_iteration": 2.9403207302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_mlp": 1.0905838, + "epoch": 0.08734128510965757, + "flos": 1485454044672.0, + "grad_norm": 0.02536208637588336, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79910266, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.43945312, + "step": 454, + "time_per_iteration": 4.803662061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121994, + "balance_loss_mlp": 1.07631683, + "epoch": 0.08753366679492113, + "flos": 1526266340352.0, + "grad_norm": 0.01817690946373191, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82647902, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.4375, + "step": 455, + "time_per_iteration": 4.812621355056763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213204, + "balance_loss_mlp": 1.06919885, + "epoch": 0.08772604848018468, + "flos": 722524263936.0, + "grad_norm": 0.030160618436618963, + "language_loss": 0.98162878, + "learning_rate": 0.0009912901304235883, + "loss": 0.99376082, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 1.44140625, + "step": 456, + "time_per_iteration": 2.9147355556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217818, + "balance_loss_mlp": 1.07386112, + "epoch": 0.08791843016544824, + "flos": 709466476032.0, + "grad_norm": 0.03064824893295274, + "language_loss": 0.96399593, + "learning_rate": 0.000991232138434397, + "loss": 0.97617412, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 1.44091797, + "step": 457, + "time_per_iteration": 2.8735082149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121922, + "balance_loss_mlp": 1.07540572, + "epoch": 0.08811081185071182, + "flos": 474021516288.0, + "grad_norm": 0.03193385229896835, + "language_loss": 1.03185177, + "learning_rate": 0.000991173955731976, + "loss": 1.04404402, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 1.43945312, + "step": 458, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220724, + "balance_loss_mlp": 1.07762539, + "epoch": 0.08830319353597538, + "flos": 686314584576.0, + "grad_norm": 0.057581270182385194, + "language_loss": 1.06524456, + "learning_rate": 0.0009911155823389137, + "loss": 1.07745171, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 1.43212891, + "step": 459, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218235, + "balance_loss_mlp": 1.07513571, + "epoch": 0.08849557522123894, + "flos": 574608344064.0, + "grad_norm": 0.027044136096108284, + "language_loss": 1.01923048, + "learning_rate": 0.000991057018277873, + "loss": 1.03141284, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 1.43212891, + "step": 460, + "time_per_iteration": 2.746169090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212445, + "balance_loss_mlp": 1.0693934, + "epoch": 0.0886879569065025, + "flos": 565627419648.0, + "grad_norm": 0.031092379840733354, + "language_loss": 1.03267121, + "learning_rate": 0.0009909982635715898, + "loss": 1.04479575, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 1.43164062, + "step": 461, + "time_per_iteration": 2.6196396350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212854, + "balance_loss_mlp": 1.06956458, + "epoch": 0.08888033859176607, + "flos": 564956674560.0, + "grad_norm": 0.030181357689894217, + "language_loss": 1.02059078, + "learning_rate": 0.0009909393182428751, + "loss": 1.03271937, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 1.43408203, + "step": 462, + "time_per_iteration": 2.679793357849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216843, + "balance_loss_mlp": 1.07345808, + "epoch": 0.08907272027702963, + "flos": 466742650368.0, + "grad_norm": 0.029240136547664795, + "language_loss": 0.9639132, + "learning_rate": 0.000990880182314614, + "loss": 0.97608161, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 1.43505859, + "step": 463, + "time_per_iteration": 2.712097644805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212421, + "balance_loss_mlp": 1.06922734, + "epoch": 0.08926510196229319, + "flos": 682843338240.0, + "grad_norm": 0.026287763165510035, + "language_loss": 0.96174729, + "learning_rate": 0.0009908208558097643, + "loss": 0.97387147, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 1.43310547, + "step": 464, + "time_per_iteration": 2.906903028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217208, + "balance_loss_mlp": 1.07406175, + "epoch": 0.08945748364755675, + "flos": 597821360640.0, + "grad_norm": 0.024374741633963998, + "language_loss": 0.98668623, + "learning_rate": 0.000990761338751359, + "loss": 0.99885827, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 1.43261719, + "step": 465, + "time_per_iteration": 2.7994933128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225639, + "balance_loss_mlp": 1.08506775, + "epoch": 0.08964986533282032, + "flos": 1589340930048.0, + "grad_norm": 0.02575129149720033, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74885261, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.40625, + "step": 466, + "time_per_iteration": 4.9763429164886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221953, + "balance_loss_mlp": 1.07861578, + "epoch": 0.08984224701808388, + "flos": 534549385728.0, + "grad_norm": 0.024628184063577727, + "language_loss": 1.01551545, + "learning_rate": 0.0009906417330663815, + "loss": 1.02773499, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 1.43457031, + "step": 467, + "time_per_iteration": 2.614560842514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232523, + "balance_loss_mlp": 1.08994913, + "epoch": 0.09003462870334744, + "flos": 479850103296.0, + "grad_norm": 0.03230737833956583, + "language_loss": 0.98222148, + "learning_rate": 0.0009905816444862442, + "loss": 0.99454677, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 1.42675781, + "step": 468, + "time_per_iteration": 2.598146438598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223867, + "balance_loss_mlp": 1.08124495, + "epoch": 0.090227010388611, + "flos": 654902178816.0, + "grad_norm": 0.027522185030294237, + "language_loss": 0.95659769, + "learning_rate": 0.0009905213654454216, + "loss": 0.96883637, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 1.42724609, + "step": 469, + "time_per_iteration": 2.8876352310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219852, + "balance_loss_mlp": 1.07737279, + "epoch": 0.09041939207387456, + "flos": 619358515200.0, + "grad_norm": 0.023282407360439072, + "language_loss": 1.03878951, + "learning_rate": 0.0009904608959673158, + "loss": 1.0509882, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 1.42578125, + "step": 470, + "time_per_iteration": 2.7882330417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213781, + "balance_loss_mlp": 1.0718745, + "epoch": 0.09061177375913813, + "flos": 455295596544.0, + "grad_norm": 0.02882877970469751, + "language_loss": 1.04707062, + "learning_rate": 0.000990400236075403, + "loss": 1.05920839, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 1.41992188, + "step": 471, + "time_per_iteration": 2.5016987323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_mlp": 1.07574117, + "epoch": 0.0908041554444017, + "flos": 545308230144.0, + "grad_norm": 0.02444258884202674, + "language_loss": 1.01020849, + "learning_rate": 0.0009903393857932338, + "loss": 1.02238584, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 1.42089844, + "step": 472, + "time_per_iteration": 2.644397497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218613, + "balance_loss_mlp": 1.07732654, + "epoch": 0.09099653712966525, + "flos": 565466964480.0, + "grad_norm": 0.02685769494428931, + "language_loss": 0.99245131, + "learning_rate": 0.0009902783451444317, + "loss": 1.00463748, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 1.41357422, + "step": 473, + "time_per_iteration": 2.7087745666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214499, + "balance_loss_mlp": 1.07292593, + "epoch": 0.09118891881492881, + "flos": 475501994496.0, + "grad_norm": 0.029476649456104027, + "language_loss": 1.02896917, + "learning_rate": 0.0009902171141526956, + "loss": 1.04111421, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 1.41650391, + "step": 474, + "time_per_iteration": 2.5271990299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215154, + "balance_loss_mlp": 1.07410538, + "epoch": 0.09138130050019239, + "flos": 546990822912.0, + "grad_norm": 0.02490932279529465, + "language_loss": 0.89845926, + "learning_rate": 0.000990155692841797, + "loss": 0.9106108, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 1.41113281, + "step": 475, + "time_per_iteration": 2.958740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214039, + "balance_loss_mlp": 1.07303798, + "epoch": 0.09157368218545595, + "flos": 733973319168.0, + "grad_norm": 0.02740759839690251, + "language_loss": 1.01869047, + "learning_rate": 0.0009900940812355818, + "loss": 1.03083086, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 1.41064453, + "step": 476, + "time_per_iteration": 2.891787528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205639, + "balance_loss_mlp": 1.06478107, + "epoch": 0.0917660638707195, + "flos": 612072918528.0, + "grad_norm": 0.029261712768775452, + "language_loss": 0.99624813, + "learning_rate": 0.00099003227935797, + "loss": 1.0083046, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 1.40917969, + "step": 477, + "time_per_iteration": 2.7569031715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207057, + "balance_loss_mlp": 1.06605613, + "epoch": 0.09195844555598306, + "flos": 657018473472.0, + "grad_norm": 0.026965523070242428, + "language_loss": 1.02860427, + "learning_rate": 0.000989970287232955, + "loss": 1.04067481, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 1.41064453, + "step": 478, + "time_per_iteration": 2.7705225944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212938, + "balance_loss_mlp": 1.07212758, + "epoch": 0.09215082724124664, + "flos": 477540426240.0, + "grad_norm": 0.02578247385618595, + "language_loss": 0.99767786, + "learning_rate": 0.0009899081048846043, + "loss": 1.00980723, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 1.40869141, + "step": 479, + "time_per_iteration": 2.5488922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215229, + "balance_loss_mlp": 1.07437098, + "epoch": 0.0923432089265102, + "flos": 525325413888.0, + "grad_norm": 0.029009434883925433, + "language_loss": 1.05276799, + "learning_rate": 0.0009898457323370593, + "loss": 1.06492031, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 1.40917969, + "step": 480, + "time_per_iteration": 2.5628790855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213957, + "balance_loss_mlp": 1.07314658, + "epoch": 0.09253559061177376, + "flos": 546638986752.0, + "grad_norm": 0.030643020391807937, + "language_loss": 1.01694977, + "learning_rate": 0.000989783169614535, + "loss": 1.02908933, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 1.40869141, + "step": 481, + "time_per_iteration": 2.6431851387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206421, + "balance_loss_mlp": 1.06718445, + "epoch": 0.09272797229703732, + "flos": 1541334362112.0, + "grad_norm": 0.00793715508899474, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79959178, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.39257812, + "step": 482, + "time_per_iteration": 4.84259295463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211177, + "balance_loss_mlp": 1.07041514, + "epoch": 0.09292035398230089, + "flos": 691064194560.0, + "grad_norm": 0.029391602229229655, + "language_loss": 0.99036419, + "learning_rate": 0.000989657473741779, + "loss": 1.00247598, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 1.40820312, + "step": 483, + "time_per_iteration": 2.8193717002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210505, + "balance_loss_mlp": 1.06964695, + "epoch": 0.09311273566756445, + "flos": 510822076416.0, + "grad_norm": 0.026713621627667553, + "language_loss": 1.0060308, + "learning_rate": 0.0009895943406403465, + "loss": 1.01813591, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 1.40917969, + "step": 484, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210956, + "balance_loss_mlp": 1.07071841, + "epoch": 0.09330511735282801, + "flos": 660583045632.0, + "grad_norm": 0.02538483632370611, + "language_loss": 0.94170594, + "learning_rate": 0.0009895310174615338, + "loss": 0.95381546, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 1.40283203, + "step": 485, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210991, + "balance_loss_mlp": 1.0725174, + "epoch": 0.09349749903809157, + "flos": 1456021673472.0, + "grad_norm": 0.008074315810691821, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.7692951, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.38476562, + "step": 486, + "time_per_iteration": 4.652726888656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208546, + "balance_loss_mlp": 1.06868994, + "epoch": 0.09368988072335514, + "flos": 521899829760.0, + "grad_norm": 0.021962490795067104, + "language_loss": 0.97574425, + "learning_rate": 0.0009894038009701782, + "loss": 0.98782969, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 1.39892578, + "step": 487, + "time_per_iteration": 2.647747755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207771, + "balance_loss_mlp": 1.06786692, + "epoch": 0.0938822624086187, + "flos": 498751941120.0, + "grad_norm": 0.02403393711112831, + "language_loss": 1.01297927, + "learning_rate": 0.0009893399077070253, + "loss": 1.02505696, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 1.39941406, + "step": 488, + "time_per_iteration": 2.5559775829315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209251, + "balance_loss_mlp": 1.07006216, + "epoch": 0.09407464409388226, + "flos": 534223746048.0, + "grad_norm": 0.02465812888810929, + "language_loss": 0.94380867, + "learning_rate": 0.0009892758244652718, + "loss": 0.95590127, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 1.39208984, + "step": 489, + "time_per_iteration": 2.6696364879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203933, + "balance_loss_mlp": 1.06398153, + "epoch": 0.09426702577914582, + "flos": 587090714112.0, + "grad_norm": 0.02607881729553482, + "language_loss": 1.01920152, + "learning_rate": 0.0009892115512697968, + "loss": 1.03124094, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 1.39990234, + "step": 490, + "time_per_iteration": 2.645073652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205245, + "balance_loss_mlp": 1.06524527, + "epoch": 0.0944594074644094, + "flos": 504463733760.0, + "grad_norm": 0.02086232355550113, + "language_loss": 1.01703966, + "learning_rate": 0.0009891470881455537, + "loss": 1.02909207, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 1.40039062, + "step": 491, + "time_per_iteration": 2.669978618621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207443, + "balance_loss_mlp": 1.06777763, + "epoch": 0.09465178914967295, + "flos": 572114016768.0, + "grad_norm": 0.026976181820206353, + "language_loss": 1.00743008, + "learning_rate": 0.0009890824351175692, + "loss": 1.01950443, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 1.39697266, + "step": 492, + "time_per_iteration": 2.6572952270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207157, + "balance_loss_mlp": 1.06796801, + "epoch": 0.09484417083493651, + "flos": 550418408448.0, + "grad_norm": 0.023611014675858334, + "language_loss": 1.04079592, + "learning_rate": 0.0009890175922109435, + "loss": 1.05286753, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 1.39208984, + "step": 493, + "time_per_iteration": 2.622361183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120413, + "balance_loss_mlp": 1.06498933, + "epoch": 0.09503655252020007, + "flos": 825271047168.0, + "grad_norm": 0.02510100112233158, + "language_loss": 1.0275588, + "learning_rate": 0.0009889525594508513, + "loss": 1.03960025, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 1.39160156, + "step": 494, + "time_per_iteration": 3.0307581424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202477, + "balance_loss_mlp": 1.06333554, + "epoch": 0.09522893420546363, + "flos": 405517839360.0, + "grad_norm": 0.02234367718934989, + "language_loss": 0.96151906, + "learning_rate": 0.0009888873368625404, + "loss": 0.97354376, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 1.39160156, + "step": 495, + "time_per_iteration": 2.4793317317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205465, + "balance_loss_mlp": 1.06665742, + "epoch": 0.0954213158907272, + "flos": 692255963136.0, + "grad_norm": 0.025506351191757377, + "language_loss": 1.00908709, + "learning_rate": 0.0009888219244713326, + "loss": 1.02114165, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 1.38818359, + "step": 496, + "time_per_iteration": 2.865914821624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206499, + "balance_loss_mlp": 1.06773937, + "epoch": 0.09561369757599077, + "flos": 520074246144.0, + "grad_norm": 0.030124833611481355, + "language_loss": 1.02319717, + "learning_rate": 0.0009887563223026229, + "loss": 1.03526211, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 1.38671875, + "step": 497, + "time_per_iteration": 2.689708948135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210899, + "balance_loss_mlp": 1.07376099, + "epoch": 0.09580607926125433, + "flos": 1388781623808.0, + "grad_norm": 0.014650036919455408, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80279064, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 1.37109375, + "step": 498, + "time_per_iteration": 4.940208196640015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203477, + "balance_loss_mlp": 1.06476545, + "epoch": 0.09599846094651789, + "flos": 718825433088.0, + "grad_norm": 0.028840614245688557, + "language_loss": 0.98952407, + "learning_rate": 0.0009886245487346482, + "loss": 1.00155878, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 1.38427734, + "step": 499, + "time_per_iteration": 3.023056745529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205479, + "balance_loss_mlp": 1.06690967, + "epoch": 0.09619084263178146, + "flos": 386893977600.0, + "grad_norm": 0.031706482821381415, + "language_loss": 1.0340035, + "learning_rate": 0.0009885583773865422, + "loss": 1.04605842, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 1.38183594, + "step": 500, + "time_per_iteration": 2.422914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202787, + "balance_loss_mlp": 1.06479073, + "epoch": 0.09638322431704502, + "flos": 535172467200.0, + "grad_norm": 0.02878579188863982, + "language_loss": 0.99392897, + "learning_rate": 0.0009884920163632524, + "loss": 1.00595689, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 1.37988281, + "step": 501, + "time_per_iteration": 2.6820154190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203474, + "balance_loss_mlp": 1.0655731, + "epoch": 0.09657560600230858, + "flos": 501656501760.0, + "grad_norm": 0.02635733095705931, + "language_loss": 1.03128934, + "learning_rate": 0.000988425465690543, + "loss": 1.04332411, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 1.37890625, + "step": 502, + "time_per_iteration": 2.605536699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204627, + "balance_loss_mlp": 1.06677341, + "epoch": 0.09676798768757214, + "flos": 530331532800.0, + "grad_norm": 0.023374032620567947, + "language_loss": 1.00861204, + "learning_rate": 0.0009883587253942505, + "loss": 1.02065825, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 1.37841797, + "step": 503, + "time_per_iteration": 2.7548091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204765, + "balance_loss_mlp": 1.06686366, + "epoch": 0.09696036937283571, + "flos": 464556498432.0, + "grad_norm": 0.029206950172382878, + "language_loss": 1.0685035, + "learning_rate": 0.0009882917955002862, + "loss": 1.08055115, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 1.37890625, + "step": 504, + "time_per_iteration": 2.520970344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200777, + "balance_loss_mlp": 1.06297076, + "epoch": 0.09715275105809927, + "flos": 536010398208.0, + "grad_norm": 0.02484338661637091, + "language_loss": 0.9770751, + "learning_rate": 0.0009882246760346343, + "loss": 0.98908287, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 1.37695312, + "step": 505, + "time_per_iteration": 2.6314897537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204578, + "balance_loss_mlp": 1.06672478, + "epoch": 0.09734513274336283, + "flos": 455881747968.0, + "grad_norm": 0.02756591702740651, + "language_loss": 1.04990697, + "learning_rate": 0.0009881573670233533, + "loss": 1.06195283, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 1.37451172, + "step": 506, + "time_per_iteration": 2.492464780807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203948, + "balance_loss_mlp": 1.06619, + "epoch": 0.09753751442862639, + "flos": 509827693056.0, + "grad_norm": 0.02954706972608782, + "language_loss": 0.97619581, + "learning_rate": 0.0009880898684925747, + "loss": 0.98823535, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 1.37353516, + "step": 507, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120195, + "balance_loss_mlp": 1.06438243, + "epoch": 0.09772989611388996, + "flos": 485246989824.0, + "grad_norm": 0.02487380392257162, + "language_loss": 0.96617985, + "learning_rate": 0.0009880221804685037, + "loss": 0.97819936, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 1.37158203, + "step": 508, + "time_per_iteration": 2.5352439880371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209412, + "balance_loss_mlp": 1.0741806, + "epoch": 0.09792227779915352, + "flos": 1569316454400.0, + "grad_norm": 0.016823619827393988, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80553836, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.3515625, + "step": 509, + "time_per_iteration": 4.694217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205455, + "balance_loss_mlp": 1.06831706, + "epoch": 0.09811465948441708, + "flos": 588914296320.0, + "grad_norm": 0.032012577058462416, + "language_loss": 1.03636336, + "learning_rate": 0.0009878862360456733, + "loss": 1.04841793, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 1.37011719, + "step": 510, + "time_per_iteration": 2.73879337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208431, + "balance_loss_mlp": 1.07148337, + "epoch": 0.09830704116968064, + "flos": 614128814592.0, + "grad_norm": 0.028115444050206044, + "language_loss": 0.94855493, + "learning_rate": 0.0009878179796996922, + "loss": 0.96063924, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 1.36914062, + "step": 511, + "time_per_iteration": 2.6949734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207361, + "balance_loss_mlp": 1.07050836, + "epoch": 0.09849942285494422, + "flos": 539935538688.0, + "grad_norm": 0.022608937638108787, + "language_loss": 0.9790619, + "learning_rate": 0.0009877495339659754, + "loss": 0.99113548, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 1.36816406, + "step": 512, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214076, + "balance_loss_mlp": 1.0773195, + "epoch": 0.09869180454020778, + "flos": 621603064320.0, + "grad_norm": 0.029833187637910333, + "language_loss": 0.94261241, + "learning_rate": 0.000987680898871096, + "loss": 0.95475316, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 1.3671875, + "step": 513, + "time_per_iteration": 2.6975760459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120845, + "balance_loss_mlp": 1.07145417, + "epoch": 0.09888418622547133, + "flos": 813059922432.0, + "grad_norm": 0.032512892127392744, + "language_loss": 0.9726817, + "learning_rate": 0.0009876120744417, + "loss": 0.98476619, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 1.36767578, + "step": 514, + "time_per_iteration": 2.9514927864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214576, + "balance_loss_mlp": 1.07762837, + "epoch": 0.0990765679107349, + "flos": 536857061376.0, + "grad_norm": 0.028495408786163776, + "language_loss": 1.0346663, + "learning_rate": 0.0009875430607045078, + "loss": 1.04681206, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 1.36523438, + "step": 515, + "time_per_iteration": 2.669271230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209323, + "balance_loss_mlp": 1.07242322, + "epoch": 0.09926894959599845, + "flos": 588970692096.0, + "grad_norm": 0.026228231589839293, + "language_loss": 0.98752952, + "learning_rate": 0.000987473857686313, + "loss": 0.9996227, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 1.36474609, + "step": 516, + "time_per_iteration": 2.7055716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120601, + "balance_loss_mlp": 1.06934881, + "epoch": 0.09946133128126203, + "flos": 642386881536.0, + "grad_norm": 0.0302129460476142, + "language_loss": 1.04248524, + "learning_rate": 0.0009874044654139824, + "loss": 1.05454528, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 1.36230469, + "step": 517, + "time_per_iteration": 2.726618528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200307, + "balance_loss_mlp": 1.06340742, + "epoch": 0.09965371296652559, + "flos": 466725186048.0, + "grad_norm": 0.03251153136411229, + "language_loss": 1.02563679, + "learning_rate": 0.0009873348839144563, + "loss": 1.03763986, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 1.36474609, + "step": 518, + "time_per_iteration": 2.5855953693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200913, + "balance_loss_mlp": 1.06439471, + "epoch": 0.09984609465178915, + "flos": 484558780416.0, + "grad_norm": 0.029627125773621466, + "language_loss": 1.03352094, + "learning_rate": 0.000987265113214749, + "loss": 1.04552996, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 1.36279297, + "step": 519, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201703, + "balance_loss_mlp": 1.06566191, + "epoch": 0.1000384763370527, + "flos": 570095050752.0, + "grad_norm": 0.028931775658430137, + "language_loss": 1.07544637, + "learning_rate": 0.0009871951533419476, + "loss": 1.08746338, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 1.35986328, + "step": 520, + "time_per_iteration": 2.6423709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200484, + "balance_loss_mlp": 1.06439495, + "epoch": 0.10023085802231628, + "flos": 546925694976.0, + "grad_norm": 0.025491893219336172, + "language_loss": 0.95403761, + "learning_rate": 0.0009871250043232132, + "loss": 0.96604246, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 1.36035156, + "step": 521, + "time_per_iteration": 2.7604362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198813, + "balance_loss_mlp": 1.06205583, + "epoch": 0.10042323970757984, + "flos": 504439538688.0, + "grad_norm": 0.029888360913216814, + "language_loss": 0.96113187, + "learning_rate": 0.0009870546661857797, + "loss": 0.97311997, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 1.36328125, + "step": 522, + "time_per_iteration": 2.578458547592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195212, + "balance_loss_mlp": 1.05931365, + "epoch": 0.1006156213928434, + "flos": 771724601856.0, + "grad_norm": 0.029426081780707294, + "language_loss": 1.05752206, + "learning_rate": 0.0009869841389569553, + "loss": 1.0694741, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 1.35839844, + "step": 523, + "time_per_iteration": 2.958531618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.05846703, + "epoch": 0.10080800307810696, + "flos": 491008447488.0, + "grad_norm": 0.024593893632090205, + "language_loss": 0.96497846, + "learning_rate": 0.0009869134226641206, + "loss": 0.97692204, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 1.35839844, + "step": 524, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196113, + "balance_loss_mlp": 1.06030965, + "epoch": 0.10100038476337053, + "flos": 455712560640.0, + "grad_norm": 0.026556514945601337, + "language_loss": 0.98348475, + "learning_rate": 0.0009868425173347303, + "loss": 0.99544585, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 1.35742188, + "step": 525, + "time_per_iteration": 2.6460907459259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196515, + "balance_loss_mlp": 1.06099772, + "epoch": 0.10119276644863409, + "flos": 557573749248.0, + "grad_norm": 0.022458491608374247, + "language_loss": 1.03332829, + "learning_rate": 0.0009867714229963125, + "loss": 1.04529333, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 1.35449219, + "step": 526, + "time_per_iteration": 2.693362236022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119647, + "balance_loss_mlp": 1.0609529, + "epoch": 0.10138514813389765, + "flos": 517219350528.0, + "grad_norm": 0.028969258136437262, + "language_loss": 1.0161202, + "learning_rate": 0.000986700139676468, + "loss": 1.02808487, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 1.35449219, + "step": 527, + "time_per_iteration": 2.5826644897460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202893, + "balance_loss_mlp": 1.06742311, + "epoch": 0.10157752981916121, + "flos": 501563175936.0, + "grad_norm": 0.023004964960346017, + "language_loss": 0.98490077, + "learning_rate": 0.0009866286674028717, + "loss": 0.99692971, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 1.35400391, + "step": 528, + "time_per_iteration": 2.626595973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204326, + "balance_loss_mlp": 1.06876123, + "epoch": 0.10176991150442478, + "flos": 658093447680.0, + "grad_norm": 0.024381421822087013, + "language_loss": 0.95674849, + "learning_rate": 0.0009865570062032717, + "loss": 0.96879184, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 1.35498047, + "step": 529, + "time_per_iteration": 2.916924238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203456, + "balance_loss_mlp": 1.0680815, + "epoch": 0.10196229318968834, + "flos": 574402226688.0, + "grad_norm": 0.021344584600364362, + "language_loss": 0.99175954, + "learning_rate": 0.0009864851561054893, + "loss": 1.00379407, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 1.35302734, + "step": 530, + "time_per_iteration": 2.750075578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203649, + "balance_loss_mlp": 1.06856096, + "epoch": 0.1021546748749519, + "flos": 519255780864.0, + "grad_norm": 0.027896087186932737, + "language_loss": 0.99157, + "learning_rate": 0.0009864131171374191, + "loss": 1.00360656, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 1.35009766, + "step": 531, + "time_per_iteration": 2.6506359577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202329, + "balance_loss_mlp": 1.06728852, + "epoch": 0.10234705656021546, + "flos": 610953008640.0, + "grad_norm": 0.021304730024267197, + "language_loss": 0.98848057, + "learning_rate": 0.0009863408893270292, + "loss": 1.0005039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 1.34960938, + "step": 532, + "time_per_iteration": 2.827632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202805, + "balance_loss_mlp": 1.06776476, + "epoch": 0.10253943824547904, + "flos": 602912073216.0, + "grad_norm": 0.02650069508154076, + "language_loss": 0.95645475, + "learning_rate": 0.0009862684727023605, + "loss": 0.96848285, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 1.34960938, + "step": 533, + "time_per_iteration": 2.730771541595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206135, + "balance_loss_mlp": 1.07152414, + "epoch": 0.1027318199307426, + "flos": 664156349952.0, + "grad_norm": 0.02579556790717569, + "language_loss": 0.96718729, + "learning_rate": 0.0009861958672915283, + "loss": 0.97924864, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 1.34521484, + "step": 534, + "time_per_iteration": 2.825239419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202189, + "balance_loss_mlp": 1.06776834, + "epoch": 0.10292420161600616, + "flos": 684529933824.0, + "grad_norm": 0.02492376876437301, + "language_loss": 0.95656139, + "learning_rate": 0.0009861230731227201, + "loss": 0.96858335, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 1.34326172, + "step": 535, + "time_per_iteration": 2.858596086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203815, + "balance_loss_mlp": 1.06958508, + "epoch": 0.10311658330126972, + "flos": 491268959232.0, + "grad_norm": 0.02833674325523021, + "language_loss": 0.99709427, + "learning_rate": 0.0009860500902241973, + "loss": 1.00913239, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 1.34130859, + "step": 536, + "time_per_iteration": 2.5780303478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197149, + "balance_loss_mlp": 1.06291902, + "epoch": 0.10330896498653329, + "flos": 432686195712.0, + "grad_norm": 0.024484943889946764, + "language_loss": 1.03652823, + "learning_rate": 0.0009859769186242942, + "loss": 1.0484997, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 1.34130859, + "step": 537, + "time_per_iteration": 2.5104598999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119791, + "balance_loss_mlp": 1.06415713, + "epoch": 0.10350134667179685, + "flos": 550641990144.0, + "grad_norm": 0.0271300181774947, + "language_loss": 0.97886324, + "learning_rate": 0.0009859035583514187, + "loss": 0.99084234, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 1.33642578, + "step": 538, + "time_per_iteration": 2.6156880855560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197994, + "balance_loss_mlp": 1.06395507, + "epoch": 0.10369372835706041, + "flos": 641826926592.0, + "grad_norm": 0.024416305433678544, + "language_loss": 1.00991774, + "learning_rate": 0.0009858300094340517, + "loss": 1.02189767, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 1.33935547, + "step": 539, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198436, + "balance_loss_mlp": 1.06468332, + "epoch": 0.10388611004232397, + "flos": 522765958656.0, + "grad_norm": 0.025798430155835095, + "language_loss": 0.9342165, + "learning_rate": 0.0009857562719007473, + "loss": 0.94620085, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 1.33642578, + "step": 540, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204547, + "balance_loss_mlp": 1.07122386, + "epoch": 0.10407849172758753, + "flos": 703739947008.0, + "grad_norm": 0.023593197084580173, + "language_loss": 0.95331407, + "learning_rate": 0.0009856823457801331, + "loss": 0.96535957, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 1.33203125, + "step": 541, + "time_per_iteration": 2.889531373977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202711, + "balance_loss_mlp": 1.06924474, + "epoch": 0.1042708734128511, + "flos": 503944711680.0, + "grad_norm": 0.023957714626313076, + "language_loss": 1.02856565, + "learning_rate": 0.00098560823110091, + "loss": 1.04059267, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 1.33349609, + "step": 542, + "time_per_iteration": 2.6067047119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205134, + "balance_loss_mlp": 1.07185781, + "epoch": 0.10446325509811466, + "flos": 486640872960.0, + "grad_norm": 0.0231214260398276, + "language_loss": 1.01405394, + "learning_rate": 0.000985533927891851, + "loss": 1.02610517, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 1.33154297, + "step": 543, + "time_per_iteration": 2.6622776985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201388, + "balance_loss_mlp": 1.06820762, + "epoch": 0.10465563678337822, + "flos": 569713015296.0, + "grad_norm": 0.023482705287667723, + "language_loss": 1.01015687, + "learning_rate": 0.0009854594361818044, + "loss": 1.02217078, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 1.33056641, + "step": 544, + "time_per_iteration": 2.7061924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195672, + "balance_loss_mlp": 1.06244385, + "epoch": 0.10484801846864178, + "flos": 627242998272.0, + "grad_norm": 0.023194608242680787, + "language_loss": 0.99799937, + "learning_rate": 0.0009853847559996897, + "loss": 1.00995612, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 1.33105469, + "step": 545, + "time_per_iteration": 2.742445707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192128, + "balance_loss_mlp": 1.05885231, + "epoch": 0.10504040015390535, + "flos": 744812754432.0, + "grad_norm": 0.025865682249952955, + "language_loss": 0.99192667, + "learning_rate": 0.0009853098873745, + "loss": 1.00384796, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 1.33154297, + "step": 546, + "time_per_iteration": 3.0260400772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192867, + "balance_loss_mlp": 1.05997264, + "epoch": 0.10523278183916891, + "flos": 587842050048.0, + "grad_norm": 0.02599355243407578, + "language_loss": 0.98197657, + "learning_rate": 0.0009852348303353027, + "loss": 0.99390525, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 1.32763672, + "step": 547, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191481, + "balance_loss_mlp": 1.05844367, + "epoch": 0.10542516352443247, + "flos": 871145857536.0, + "grad_norm": 0.02495252935664815, + "language_loss": 0.91398883, + "learning_rate": 0.000985159584911237, + "loss": 0.92590368, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 1.32910156, + "step": 548, + "time_per_iteration": 3.1012043952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119193, + "balance_loss_mlp": 1.05913138, + "epoch": 0.10561754520969603, + "flos": 506412842496.0, + "grad_norm": 0.025955858684814606, + "language_loss": 0.9925828, + "learning_rate": 0.0009850841511315162, + "loss": 1.00450206, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 1.32666016, + "step": 549, + "time_per_iteration": 2.626220464706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192876, + "balance_loss_mlp": 1.06022012, + "epoch": 0.1058099268949596, + "flos": 561147053568.0, + "grad_norm": 0.02554357007654854, + "language_loss": 0.98952115, + "learning_rate": 0.0009850085290254256, + "loss": 1.00144982, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 1.32519531, + "step": 550, + "time_per_iteration": 2.7464635372161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.06161487, + "epoch": 0.10600230858022316, + "flos": 563159288832.0, + "grad_norm": 0.020736613501838204, + "language_loss": 0.9519307, + "learning_rate": 0.0009849327186223246, + "loss": 0.9638744, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 1.32617188, + "step": 551, + "time_per_iteration": 2.7678163051605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199655, + "balance_loss_mlp": 1.06728542, + "epoch": 0.10619469026548672, + "flos": 495317624832.0, + "grad_norm": 0.02236411826292933, + "language_loss": 1.02411103, + "learning_rate": 0.000984856719951646, + "loss": 1.03610754, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 1.32226562, + "step": 552, + "time_per_iteration": 2.5607285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.06404662, + "epoch": 0.10638707195075028, + "flos": 677463916032.0, + "grad_norm": 0.025808282690500464, + "language_loss": 1.00531495, + "learning_rate": 0.0009847805330428943, + "loss": 1.01727724, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 1.3203125, + "step": 553, + "time_per_iteration": 2.8748667240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190787, + "balance_loss_mlp": 1.05860806, + "epoch": 0.10657945363601386, + "flos": 489035143680.0, + "grad_norm": 0.02571681940882287, + "language_loss": 1.04715252, + "learning_rate": 0.0009847041579256481, + "loss": 1.05906045, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 1.3203125, + "step": 554, + "time_per_iteration": 2.56693696975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191519, + "balance_loss_mlp": 1.05948246, + "epoch": 0.10677183532127742, + "flos": 483970627584.0, + "grad_norm": 0.020874824601389917, + "language_loss": 1.01746583, + "learning_rate": 0.0009846275946295592, + "loss": 1.02938092, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 1.31884766, + "step": 555, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195781, + "balance_loss_mlp": 1.06369734, + "epoch": 0.10696421700654098, + "flos": 657581156352.0, + "grad_norm": 0.023085993180182653, + "language_loss": 0.93557143, + "learning_rate": 0.0009845508431843518, + "loss": 0.94752926, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 1.31933594, + "step": 556, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192823, + "balance_loss_mlp": 1.06088233, + "epoch": 0.10715659869180454, + "flos": 568792492032.0, + "grad_norm": 0.026087632201688016, + "language_loss": 0.9692713, + "learning_rate": 0.0009844739036198233, + "loss": 0.9811995, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 1.31787109, + "step": 557, + "time_per_iteration": 2.6583988666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192362, + "balance_loss_mlp": 1.06051683, + "epoch": 0.10734898037706811, + "flos": 541743657984.0, + "grad_norm": 0.02708275038302545, + "language_loss": 1.03564882, + "learning_rate": 0.0009843967759658448, + "loss": 1.04757237, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 1.31689453, + "step": 558, + "time_per_iteration": 2.6571173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209854, + "balance_loss_mlp": 1.07920074, + "epoch": 0.10754136206233167, + "flos": 1479731518464.0, + "grad_norm": 0.021017403581586082, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73977602, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.30664062, + "step": 559, + "time_per_iteration": 4.901749134063721 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191994, + "balance_loss_mlp": 1.06024349, + "epoch": 0.10773374374759523, + "flos": 513411730944.0, + "grad_norm": 0.02623387515623986, + "language_loss": 1.03025067, + "learning_rate": 0.000984241956509384, + "loss": 1.04217052, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 1.31591797, + "step": 560, + "time_per_iteration": 2.642380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011916, + "balance_loss_mlp": 1.06013584, + "epoch": 0.10792612543285879, + "flos": 497477580288.0, + "grad_norm": 0.029111560342126648, + "language_loss": 1.01683569, + "learning_rate": 0.0009841642647670078, + "loss": 1.02875161, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 1.31298828, + "step": 561, + "time_per_iteration": 2.5994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.06027901, + "epoch": 0.10811850711812235, + "flos": 736836946944.0, + "grad_norm": 0.027918527501713815, + "language_loss": 0.94711685, + "learning_rate": 0.0009840863850553944, + "loss": 0.95903373, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 1.3125, + "step": 562, + "time_per_iteration": 2.980377435684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193377, + "balance_loss_mlp": 1.06215191, + "epoch": 0.10831088880338592, + "flos": 612676534272.0, + "grad_norm": 0.025174626098757973, + "language_loss": 0.99795747, + "learning_rate": 0.0009840083174047782, + "loss": 1.00989127, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 1.31054688, + "step": 563, + "time_per_iteration": 2.7209153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194645, + "balance_loss_mlp": 1.0633713, + "epoch": 0.10850327048864948, + "flos": 557497887744.0, + "grad_norm": 0.021851565940339403, + "language_loss": 0.93414235, + "learning_rate": 0.0009839300618454685, + "loss": 0.94608879, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 1.31103516, + "step": 564, + "time_per_iteration": 2.833120584487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194873, + "balance_loss_mlp": 1.06402934, + "epoch": 0.10869565217391304, + "flos": 604436212224.0, + "grad_norm": 0.021697209366751603, + "language_loss": 0.98980927, + "learning_rate": 0.0009838516184078466, + "loss": 1.00175798, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 1.30664062, + "step": 565, + "time_per_iteration": 2.805722236633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193483, + "balance_loss_mlp": 1.06263876, + "epoch": 0.1088880338591766, + "flos": 527205391872.0, + "grad_norm": 0.024778377976546286, + "language_loss": 0.97356248, + "learning_rate": 0.0009837729871223669, + "loss": 0.98549736, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 1.30664062, + "step": 566, + "time_per_iteration": 2.652186155319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119656, + "balance_loss_mlp": 1.0658114, + "epoch": 0.10908041554444017, + "flos": 621416412672.0, + "grad_norm": 0.023487449334803984, + "language_loss": 0.99301046, + "learning_rate": 0.0009836941680195568, + "loss": 1.00497603, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 1.30566406, + "step": 567, + "time_per_iteration": 2.7732484340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.06144011, + "epoch": 0.10927279722970373, + "flos": 899673168384.0, + "grad_norm": 0.026216288845653656, + "language_loss": 0.95416081, + "learning_rate": 0.0009836151611300166, + "loss": 0.96608174, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 1.3046875, + "step": 568, + "time_per_iteration": 3.174981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.06049693, + "epoch": 0.10946517891496729, + "flos": 529699719168.0, + "grad_norm": 0.02336242427092275, + "language_loss": 1.03071296, + "learning_rate": 0.0009835359664844194, + "loss": 1.04262161, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 1.30273438, + "step": 569, + "time_per_iteration": 2.595041513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190102, + "balance_loss_mlp": 1.06173706, + "epoch": 0.10965756060023085, + "flos": 1563991426560.0, + "grad_norm": 0.006726678932110135, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82226908, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 4.911731719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193915, + "balance_loss_mlp": 1.0634526, + "epoch": 0.10984994228549443, + "flos": 514099940352.0, + "grad_norm": 0.027266515996607284, + "language_loss": 1.00165153, + "learning_rate": 0.0009833770140481118, + "loss": 1.01359057, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 1.30273438, + "step": 571, + "time_per_iteration": 2.6079747676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197777, + "balance_loss_mlp": 1.06741011, + "epoch": 0.11004232397075799, + "flos": 956273895936.0, + "grad_norm": 0.026548665437539986, + "language_loss": 0.90315044, + "learning_rate": 0.000983297256319112, + "loss": 0.91512823, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 1.30175781, + "step": 572, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_mlp": 1.05776477, + "epoch": 0.11023470565602154, + "flos": 489228526080.0, + "grad_norm": 0.026034490292812715, + "language_loss": 0.95817071, + "learning_rate": 0.000983217310957477, + "loss": 0.97005343, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 1.30322266, + "step": 573, + "time_per_iteration": 2.7447898387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190883, + "balance_loss_mlp": 1.06056309, + "epoch": 0.1104270873412851, + "flos": 656990275584.0, + "grad_norm": 0.026590820610190004, + "language_loss": 1.00224817, + "learning_rate": 0.000983137177994244, + "loss": 1.01415706, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 1.30126953, + "step": 574, + "time_per_iteration": 2.846140146255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185115, + "balance_loss_mlp": 1.0552249, + "epoch": 0.11061946902654868, + "flos": 724747345920.0, + "grad_norm": 0.019709272455133778, + "language_loss": 0.93286896, + "learning_rate": 0.0009830568574605235, + "loss": 0.94472009, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 1.29736328, + "step": 575, + "time_per_iteration": 2.922821044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185727, + "balance_loss_mlp": 1.05569339, + "epoch": 0.11081185071181224, + "flos": 836867822592.0, + "grad_norm": 0.025292755419638515, + "language_loss": 0.97880363, + "learning_rate": 0.0009829763493874992, + "loss": 0.99066085, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 1.29833984, + "step": 576, + "time_per_iteration": 3.022394895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183726, + "balance_loss_mlp": 1.05412149, + "epoch": 0.1110042323970758, + "flos": 610282263552.0, + "grad_norm": 0.023453623229808367, + "language_loss": 1.02838886, + "learning_rate": 0.0009828956538064264, + "loss": 1.04022622, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 1.29541016, + "step": 577, + "time_per_iteration": 2.817147970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182671, + "balance_loss_mlp": 1.05316234, + "epoch": 0.11119661408233936, + "flos": 597039825408.0, + "grad_norm": 0.025026186935027953, + "language_loss": 0.99076784, + "learning_rate": 0.0009828147707486344, + "loss": 1.00259459, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 1.29492188, + "step": 578, + "time_per_iteration": 2.6778078079223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186939, + "balance_loss_mlp": 1.05752516, + "epoch": 0.11138899576760293, + "flos": 556887541248.0, + "grad_norm": 0.027590262528076937, + "language_loss": 0.96720088, + "learning_rate": 0.0009827337002455245, + "loss": 0.97907031, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 1.29394531, + "step": 579, + "time_per_iteration": 2.6259562969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188781, + "balance_loss_mlp": 1.05951095, + "epoch": 0.11158137745286649, + "flos": 691062193152.0, + "grad_norm": 0.0223692175133054, + "language_loss": 0.94567806, + "learning_rate": 0.0009826524423285712, + "loss": 0.9575659, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 1.29150391, + "step": 580, + "time_per_iteration": 2.9144554138183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118328, + "balance_loss_mlp": 1.05386627, + "epoch": 0.11177375913813005, + "flos": 764306747904.0, + "grad_norm": 0.02877171771660235, + "language_loss": 0.97941083, + "learning_rate": 0.0009825709970293218, + "loss": 0.9912436, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 1.29296875, + "step": 581, + "time_per_iteration": 2.8999927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181128, + "balance_loss_mlp": 1.05223894, + "epoch": 0.11196614082339361, + "flos": 808030334976.0, + "grad_norm": 0.029325346048851512, + "language_loss": 1.03732872, + "learning_rate": 0.0009824893643793956, + "loss": 1.04913998, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 1.28857422, + "step": 582, + "time_per_iteration": 3.0697131156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.05731773, + "epoch": 0.11215852250865718, + "flos": 559724972544.0, + "grad_norm": 0.028740695003145394, + "language_loss": 0.98446089, + "learning_rate": 0.0009824075444104857, + "loss": 0.99632728, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 1.29150391, + "step": 583, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190407, + "balance_loss_mlp": 1.06147003, + "epoch": 0.11235090419392074, + "flos": 514575301632.0, + "grad_norm": 0.02293328270345756, + "language_loss": 1.02460003, + "learning_rate": 0.000982325537154357, + "loss": 1.03650403, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 1.28808594, + "step": 584, + "time_per_iteration": 2.590156078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188149, + "balance_loss_mlp": 1.05954635, + "epoch": 0.1125432858791843, + "flos": 492432529920.0, + "grad_norm": 0.028214107652977688, + "language_loss": 1.0381788, + "learning_rate": 0.0009822433426428484, + "loss": 1.05006027, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 1.28564453, + "step": 585, + "time_per_iteration": 2.566488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188321, + "balance_loss_mlp": 1.05957532, + "epoch": 0.11273566756444786, + "flos": 511727136768.0, + "grad_norm": 0.027438709113267498, + "language_loss": 0.95940274, + "learning_rate": 0.0009821609609078697, + "loss": 0.971286, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 1.28710938, + "step": 586, + "time_per_iteration": 2.6117701530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189545, + "balance_loss_mlp": 1.06098938, + "epoch": 0.11292804924971142, + "flos": 623639494656.0, + "grad_norm": 0.025949033694362005, + "language_loss": 0.97216725, + "learning_rate": 0.0009820783919814045, + "loss": 0.98406273, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 1.28515625, + "step": 587, + "time_per_iteration": 2.798182249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181783, + "balance_loss_mlp": 1.05360925, + "epoch": 0.113120430934975, + "flos": 479038368768.0, + "grad_norm": 0.03012596671256698, + "language_loss": 0.94172156, + "learning_rate": 0.0009819956358955095, + "loss": 0.95353937, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 1.28125, + "step": 588, + "time_per_iteration": 2.54179310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197707, + "balance_loss_mlp": 1.06905663, + "epoch": 0.11331281262023855, + "flos": 467990814720.0, + "grad_norm": 0.02502737191739997, + "language_loss": 0.9542653, + "learning_rate": 0.0009819126926823127, + "loss": 0.96624243, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 1.28613281, + "step": 589, + "time_per_iteration": 2.5262975692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191554, + "balance_loss_mlp": 1.06333208, + "epoch": 0.11350519430550211, + "flos": 651610853376.0, + "grad_norm": 0.023462259875113876, + "language_loss": 0.96713853, + "learning_rate": 0.000981829562374016, + "loss": 0.97905409, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 1.28173828, + "step": 590, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192039, + "balance_loss_mlp": 1.06415117, + "epoch": 0.11369757599076567, + "flos": 558860845056.0, + "grad_norm": 0.030341732837715945, + "language_loss": 1.07369685, + "learning_rate": 0.0009817462450028933, + "loss": 1.08561718, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 1.27832031, + "step": 591, + "time_per_iteration": 2.638333559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_mlp": 1.06215453, + "epoch": 0.11388995767602925, + "flos": 572305397760.0, + "grad_norm": 0.0238596111294556, + "language_loss": 0.94198918, + "learning_rate": 0.0009816627406012916, + "loss": 0.9538886, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 1.27734375, + "step": 592, + "time_per_iteration": 2.800842523574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191939, + "balance_loss_mlp": 1.06395626, + "epoch": 0.1140823393612928, + "flos": 741743009280.0, + "grad_norm": 0.025351621893671843, + "language_loss": 0.93787777, + "learning_rate": 0.0009815790492016295, + "loss": 0.94979715, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 1.27929688, + "step": 593, + "time_per_iteration": 2.9331579208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191026, + "balance_loss_mlp": 1.06337643, + "epoch": 0.11427472104655637, + "flos": 700251236352.0, + "grad_norm": 0.02689478502881467, + "language_loss": 0.96601468, + "learning_rate": 0.0009814951708363993, + "loss": 0.97792494, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 1.27587891, + "step": 594, + "time_per_iteration": 2.832094192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200218, + "balance_loss_mlp": 1.07414246, + "epoch": 0.11446710273181993, + "flos": 1480352598528.0, + "grad_norm": 0.020191453180706247, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79191208, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 1.25976562, + "step": 595, + "time_per_iteration": 4.752530574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187485, + "balance_loss_mlp": 1.06026483, + "epoch": 0.1146594844170835, + "flos": 495912508416.0, + "grad_norm": 0.02910362847653251, + "language_loss": 0.97498882, + "learning_rate": 0.0009813268533395648, + "loss": 0.98686367, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 1.27148438, + "step": 596, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187961, + "balance_loss_mlp": 1.06093144, + "epoch": 0.11485186610234706, + "flos": 475790704128.0, + "grad_norm": 0.02927093575191284, + "language_loss": 0.98108673, + "learning_rate": 0.0009812424142733073, + "loss": 0.99296629, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 1.26953125, + "step": 597, + "time_per_iteration": 2.5622098445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187255, + "balance_loss_mlp": 1.06046438, + "epoch": 0.11504424778761062, + "flos": 732619094016.0, + "grad_norm": 0.02047017320895946, + "language_loss": 0.92490959, + "learning_rate": 0.000981157788372175, + "loss": 0.93678212, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 1.26708984, + "step": 598, + "time_per_iteration": 3.017120599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185489, + "balance_loss_mlp": 1.05855536, + "epoch": 0.11523662947287418, + "flos": 546962625024.0, + "grad_norm": 0.02044602685826044, + "language_loss": 0.96609688, + "learning_rate": 0.0009810729756690223, + "loss": 0.97795177, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 1.26855469, + "step": 599, + "time_per_iteration": 2.7182610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190213, + "balance_loss_mlp": 1.06323159, + "epoch": 0.11542901115813775, + "flos": 776387616768.0, + "grad_norm": 0.023703305464208416, + "language_loss": 0.99939269, + "learning_rate": 0.0009809879761967766, + "loss": 1.01129484, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 1.26904297, + "step": 600, + "time_per_iteration": 2.9586148262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189892, + "balance_loss_mlp": 1.06319618, + "epoch": 0.11562139284340131, + "flos": 732212863488.0, + "grad_norm": 0.024193120208057816, + "language_loss": 0.99113685, + "learning_rate": 0.0009809027899884378, + "loss": 1.00303578, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 1.26611328, + "step": 601, + "time_per_iteration": 2.885070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183816, + "balance_loss_mlp": 1.05731082, + "epoch": 0.11581377452866487, + "flos": 537039710208.0, + "grad_norm": 0.022696091128935367, + "language_loss": 0.96568906, + "learning_rate": 0.0009808174170770779, + "loss": 0.97752714, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 1.26416016, + "step": 602, + "time_per_iteration": 2.7809743881225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191742, + "balance_loss_mlp": 1.0662384, + "epoch": 0.11600615621392843, + "flos": 1559211617280.0, + "grad_norm": 0.013792800863456836, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86089987, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 1.25390625, + "step": 603, + "time_per_iteration": 4.860181570053101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187966, + "balance_loss_mlp": 1.06169963, + "epoch": 0.116198537899192, + "flos": 538467795456.0, + "grad_norm": 0.022659628017063727, + "language_loss": 1.02766323, + "learning_rate": 0.0009806461112779462, + "loss": 1.03954291, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 1.26171875, + "step": 604, + "time_per_iteration": 2.614189863204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187324, + "balance_loss_mlp": 1.06091404, + "epoch": 0.11639091958445556, + "flos": 455137142784.0, + "grad_norm": 0.0301649070939891, + "language_loss": 1.00891566, + "learning_rate": 0.0009805601784566814, + "loss": 1.02078903, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 1.26318359, + "step": 605, + "time_per_iteration": 2.470878839492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119223, + "balance_loss_mlp": 1.06658351, + "epoch": 0.11658330126971912, + "flos": 556151668224.0, + "grad_norm": 0.025758302551065336, + "language_loss": 1.05099356, + "learning_rate": 0.0009804740590654089, + "loss": 1.0629158, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 1.25537109, + "step": 606, + "time_per_iteration": 2.631462812423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_mlp": 1.06588733, + "epoch": 0.11677568295498268, + "flos": 717600737280.0, + "grad_norm": 0.02545612001836415, + "language_loss": 0.99629396, + "learning_rate": 0.0009803877531375635, + "loss": 1.00821078, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 1.25683594, + "step": 607, + "time_per_iteration": 2.879645586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191881, + "balance_loss_mlp": 1.06613898, + "epoch": 0.11696806464024626, + "flos": 610898614272.0, + "grad_norm": 0.023619167708177922, + "language_loss": 0.99668628, + "learning_rate": 0.0009803012607066523, + "loss": 1.008605, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 1.25634766, + "step": 608, + "time_per_iteration": 2.717660427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189661, + "balance_loss_mlp": 1.06406212, + "epoch": 0.11716044632550981, + "flos": 521415736320.0, + "grad_norm": 0.023557070356346427, + "language_loss": 0.97414643, + "learning_rate": 0.0009802145818062543, + "loss": 0.98604298, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 1.25488281, + "step": 609, + "time_per_iteration": 2.7209720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190685, + "balance_loss_mlp": 1.064991, + "epoch": 0.11735282801077337, + "flos": 508488204288.0, + "grad_norm": 0.03039581956620226, + "language_loss": 1.01476204, + "learning_rate": 0.0009801277164700212, + "loss": 1.02666891, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 1.25585938, + "step": 610, + "time_per_iteration": 2.5900633335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190447, + "balance_loss_mlp": 1.06489623, + "epoch": 0.11754520969603693, + "flos": 687835995648.0, + "grad_norm": 0.028512829376260446, + "language_loss": 0.97853899, + "learning_rate": 0.0009800406647316776, + "loss": 0.99044347, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 1.25439453, + "step": 611, + "time_per_iteration": 2.8018290996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_mlp": 1.06088257, + "epoch": 0.1177375913813005, + "flos": 1545756331008.0, + "grad_norm": 0.00764509792440145, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78099126, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 1.24023438, + "step": 612, + "time_per_iteration": 4.767510175704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_mlp": 1.05974686, + "epoch": 0.11792997306656407, + "flos": 521537260032.0, + "grad_norm": 0.0290479345737112, + "language_loss": 0.97953087, + "learning_rate": 0.000979866002183916, + "loss": 0.99138713, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 1.2578125, + "step": 613, + "time_per_iteration": 2.6752681732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182111, + "balance_loss_mlp": 1.05632174, + "epoch": 0.11812235475182763, + "flos": 667488608256.0, + "grad_norm": 0.030776001440310688, + "language_loss": 0.9883132, + "learning_rate": 0.0009797783914423082, + "loss": 1.00013435, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 1.25683594, + "step": 614, + "time_per_iteration": 2.8556718826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182577, + "balance_loss_mlp": 1.05697787, + "epoch": 0.11831473643709119, + "flos": 622504121856.0, + "grad_norm": 0.02739500646081478, + "language_loss": 0.93579996, + "learning_rate": 0.0009796905944342094, + "loss": 0.94762576, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 1.25488281, + "step": 615, + "time_per_iteration": 2.80253267288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187072, + "balance_loss_mlp": 1.06152117, + "epoch": 0.11850711812235475, + "flos": 457694596608.0, + "grad_norm": 0.020858577781052552, + "language_loss": 0.96166766, + "learning_rate": 0.0009796026111937057, + "loss": 0.9735384, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 1.25439453, + "step": 616, + "time_per_iteration": 2.5763044357299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189497, + "balance_loss_mlp": 1.06404102, + "epoch": 0.11869949980761832, + "flos": 514927137792.0, + "grad_norm": 0.022050319992180305, + "language_loss": 0.96050835, + "learning_rate": 0.0009795144417549552, + "loss": 0.97240329, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 1.25341797, + "step": 617, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186044, + "balance_loss_mlp": 1.06092167, + "epoch": 0.11889188149288188, + "flos": 536156116992.0, + "grad_norm": 0.0238791856796517, + "language_loss": 0.97532642, + "learning_rate": 0.0009794260861521883, + "loss": 0.98718691, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 1.25292969, + "step": 618, + "time_per_iteration": 2.784257173538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_mlp": 1.06445491, + "epoch": 0.11908426317814544, + "flos": 499644266496.0, + "grad_norm": 0.024260475486046627, + "language_loss": 0.96495152, + "learning_rate": 0.0009793375444197075, + "loss": 0.97684348, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 1.25, + "step": 619, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189567, + "balance_loss_mlp": 1.06482673, + "epoch": 0.119276644863409, + "flos": 661067139072.0, + "grad_norm": 0.023292068214373615, + "language_loss": 0.96012962, + "learning_rate": 0.000979248816591888, + "loss": 0.97202522, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 1.25, + "step": 620, + "time_per_iteration": 2.783372640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184512, + "balance_loss_mlp": 1.06001019, + "epoch": 0.11946902654867257, + "flos": 760152021504.0, + "grad_norm": 0.02911418191745056, + "language_loss": 0.95521206, + "learning_rate": 0.0009791599027031766, + "loss": 0.96705711, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 1.24755859, + "step": 621, + "time_per_iteration": 3.04338002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185972, + "balance_loss_mlp": 1.06156564, + "epoch": 0.11966140823393613, + "flos": 682213526016.0, + "grad_norm": 0.0317276180850791, + "language_loss": 0.96021026, + "learning_rate": 0.0009790708027880932, + "loss": 0.97206998, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 1.24658203, + "step": 622, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184547, + "balance_loss_mlp": 1.06171417, + "epoch": 0.11985378991919969, + "flos": 1454298147840.0, + "grad_norm": 0.011779966077399251, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78611839, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 1.23046875, + "step": 623, + "time_per_iteration": 4.88221549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.06291461, + "epoch": 0.12004617160446325, + "flos": 528898718208.0, + "grad_norm": 0.0243802584204396, + "language_loss": 1.01341891, + "learning_rate": 0.0009788920450172487, + "loss": 1.0252955, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 1.25, + "step": 624, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190724, + "balance_loss_mlp": 1.06655562, + "epoch": 0.12023855328972682, + "flos": 475176354816.0, + "grad_norm": 0.025839680970612892, + "language_loss": 0.99598378, + "learning_rate": 0.0009788023872308875, + "loss": 1.00789118, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 1.24414062, + "step": 625, + "time_per_iteration": 2.5168616771698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_mlp": 1.06723785, + "epoch": 0.12043093497499038, + "flos": 1535051880960.0, + "grad_norm": 0.008994278182213968, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76618505, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 1.22460938, + "step": 626, + "time_per_iteration": 4.739393472671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194547, + "balance_loss_mlp": 1.07128501, + "epoch": 0.12062331666025394, + "flos": 540914459136.0, + "grad_norm": 0.025390703641747513, + "language_loss": 1.01758838, + "learning_rate": 0.0009786225140303285, + "loss": 1.02953386, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 1.23486328, + "step": 627, + "time_per_iteration": 2.627995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_mlp": 1.06683803, + "epoch": 0.1208156983455175, + "flos": 512999496192.0, + "grad_norm": 0.027559316114759484, + "language_loss": 1.00245547, + "learning_rate": 0.0009785322986859634, + "loss": 1.0143609, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 1.23925781, + "step": 628, + "time_per_iteration": 2.657465696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011787, + "balance_loss_mlp": 1.05481803, + "epoch": 0.12100808003078108, + "flos": 597589046784.0, + "grad_norm": 0.024406659961039724, + "language_loss": 1.01031506, + "learning_rate": 0.0009784418975588838, + "loss": 1.02210212, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 1.24121094, + "step": 629, + "time_per_iteration": 2.6953535079956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187008, + "balance_loss_mlp": 1.063555, + "epoch": 0.12120046171604464, + "flos": 524066515968.0, + "grad_norm": 0.02180733694842763, + "language_loss": 0.99517697, + "learning_rate": 0.0009783513106841862, + "loss": 1.00704694, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 1.23681641, + "step": 630, + "time_per_iteration": 2.7234978675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189331, + "balance_loss_mlp": 1.06687927, + "epoch": 0.1213928434013082, + "flos": 1557907057152.0, + "grad_norm": 0.011472153843238986, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77922034, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 1.2265625, + "step": 631, + "time_per_iteration": 4.975109100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184278, + "balance_loss_mlp": 1.06072986, + "epoch": 0.12158522508657175, + "flos": 496387869696.0, + "grad_norm": 0.025959921000511615, + "language_loss": 0.96498066, + "learning_rate": 0.0009781695798326854, + "loss": 0.97682351, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 1.23779297, + "step": 632, + "time_per_iteration": 2.5740485191345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_mlp": 1.0608983, + "epoch": 0.12177760677183531, + "flos": 476589703680.0, + "grad_norm": 0.025554774573744533, + "language_loss": 0.96275663, + "learning_rate": 0.0009780784359264365, + "loss": 0.9746002, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 1.23681641, + "step": 633, + "time_per_iteration": 2.604390859603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_mlp": 1.05543518, + "epoch": 0.12196998845709889, + "flos": 1471784635392.0, + "grad_norm": 0.009598735556444526, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75365245, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 1.21289062, + "step": 634, + "time_per_iteration": 4.757449626922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_mlp": 1.05424869, + "epoch": 0.12216237014236245, + "flos": 587748724224.0, + "grad_norm": 0.021555120902870813, + "language_loss": 0.93822527, + "learning_rate": 0.000977895591329867, + "loss": 0.94999647, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 1.23095703, + "step": 635, + "time_per_iteration": 2.7859792709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_mlp": 1.05851305, + "epoch": 0.12235475182762601, + "flos": 599106455040.0, + "grad_norm": 0.023775729584682537, + "language_loss": 0.96009773, + "learning_rate": 0.000977803890710533, + "loss": 0.97191262, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 1.23193359, + "step": 636, + "time_per_iteration": 2.76069712638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180701, + "balance_loss_mlp": 1.05762947, + "epoch": 0.12254713351288957, + "flos": 498760673280.0, + "grad_norm": 0.024707427516876792, + "language_loss": 1.00440359, + "learning_rate": 0.0009777120045912774, + "loss": 1.01621056, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 1.23291016, + "step": 637, + "time_per_iteration": 2.5980072021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118065, + "balance_loss_mlp": 1.05772126, + "epoch": 0.12273951519815314, + "flos": 606980204544.0, + "grad_norm": 0.02489341207380848, + "language_loss": 0.99891078, + "learning_rate": 0.0009776199330077736, + "loss": 1.01071739, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 1.23144531, + "step": 638, + "time_per_iteration": 2.704040288925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181154, + "balance_loss_mlp": 1.05841601, + "epoch": 0.1229318968834167, + "flos": 598984931328.0, + "grad_norm": 0.02631208797714665, + "language_loss": 1.02141118, + "learning_rate": 0.0009775276759957667, + "loss": 1.03322268, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 1.22949219, + "step": 639, + "time_per_iteration": 2.7442896366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.05700564, + "epoch": 0.12312427856868026, + "flos": 679588942848.0, + "grad_norm": 0.026802425502252814, + "language_loss": 1.01084137, + "learning_rate": 0.0009774352335910745, + "loss": 1.02264071, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 1.23144531, + "step": 640, + "time_per_iteration": 2.8294076919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117918, + "balance_loss_mlp": 1.05625129, + "epoch": 0.12331666025394382, + "flos": 610043218944.0, + "grad_norm": 0.020742791942005383, + "language_loss": 1.02118182, + "learning_rate": 0.000977342605829586, + "loss": 1.03297377, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 1.23144531, + "step": 641, + "time_per_iteration": 2.7078418731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180028, + "balance_loss_mlp": 1.05748129, + "epoch": 0.12350904193920739, + "flos": 763840118784.0, + "grad_norm": 0.025027209312251563, + "language_loss": 0.94737858, + "learning_rate": 0.0009772497927472623, + "loss": 0.95917892, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 1.22753906, + "step": 642, + "time_per_iteration": 3.0655579566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177096, + "balance_loss_mlp": 1.05454898, + "epoch": 0.12370142362447095, + "flos": 542049831936.0, + "grad_norm": 0.02608476880613399, + "language_loss": 0.96273685, + "learning_rate": 0.0009771567943801368, + "loss": 0.97450781, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 1.22753906, + "step": 643, + "time_per_iteration": 2.7343406677246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179725, + "balance_loss_mlp": 1.05727291, + "epoch": 0.12389380530973451, + "flos": 549252836352.0, + "grad_norm": 0.02435000122960196, + "language_loss": 0.99357152, + "learning_rate": 0.0009770636107643152, + "loss": 1.00536871, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 1.2265625, + "step": 644, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_mlp": 1.05516136, + "epoch": 0.12408618699499807, + "flos": 541352890368.0, + "grad_norm": 0.02246298440278387, + "language_loss": 0.95392644, + "learning_rate": 0.0009769702419359738, + "loss": 0.96570063, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 1.22460938, + "step": 645, + "time_per_iteration": 2.674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.05904841, + "epoch": 0.12427856868026164, + "flos": 747159361536.0, + "grad_norm": 0.023095982047370255, + "language_loss": 0.97586024, + "learning_rate": 0.000976876687931362, + "loss": 0.98767477, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 1.22607422, + "step": 646, + "time_per_iteration": 2.9833688735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189298, + "balance_loss_mlp": 1.06703711, + "epoch": 0.1244709503655252, + "flos": 534744769536.0, + "grad_norm": 0.03060863164707411, + "language_loss": 0.94044995, + "learning_rate": 0.0009767829487868005, + "loss": 0.95234299, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 1.22460938, + "step": 647, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182997, + "balance_loss_mlp": 1.06073558, + "epoch": 0.12466333205078876, + "flos": 509111285760.0, + "grad_norm": 0.028982594733012217, + "language_loss": 0.98960567, + "learning_rate": 0.000976689024538682, + "loss": 1.00143564, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 1.22460938, + "step": 648, + "time_per_iteration": 2.5837948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183924, + "balance_loss_mlp": 1.06171107, + "epoch": 0.12485571373605232, + "flos": 682639222272.0, + "grad_norm": 0.03213416167398649, + "language_loss": 0.97804081, + "learning_rate": 0.0009765949152234716, + "loss": 0.98988008, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 1.22412109, + "step": 649, + "time_per_iteration": 2.876009702682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_mlp": 1.07243347, + "epoch": 0.1250480954213159, + "flos": 1333198748160.0, + "grad_norm": 0.014891788740719425, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79879445, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 1.2109375, + "step": 650, + "time_per_iteration": 4.675558805465698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_mlp": 1.06152093, + "epoch": 0.12524047710657946, + "flos": 940196754432.0, + "grad_norm": 0.027794334398077363, + "language_loss": 0.91408408, + "learning_rate": 0.0009764061415379919, + "loss": 0.9259119, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 1.21435547, + "step": 651, + "time_per_iteration": 3.260758399963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184193, + "balance_loss_mlp": 1.06288576, + "epoch": 0.12543285879184302, + "flos": 514900941312.0, + "grad_norm": 0.027655948956122736, + "language_loss": 0.97430605, + "learning_rate": 0.0009763114772410109, + "loss": 0.986148, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 1.21484375, + "step": 652, + "time_per_iteration": 2.60402512550354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179616, + "balance_loss_mlp": 1.05849957, + "epoch": 0.12562524047710658, + "flos": 719682829824.0, + "grad_norm": 0.022040452281994895, + "language_loss": 0.94100869, + "learning_rate": 0.0009762166280235146, + "loss": 0.95280486, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 1.21289062, + "step": 653, + "time_per_iteration": 2.953866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177042, + "balance_loss_mlp": 1.05592513, + "epoch": 0.12581762216237014, + "flos": 564798220800.0, + "grad_norm": 0.026345633512325176, + "language_loss": 0.96725851, + "learning_rate": 0.0009761215939223267, + "loss": 0.97902894, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 1.21289062, + "step": 654, + "time_per_iteration": 2.6936216354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176243, + "balance_loss_mlp": 1.0553174, + "epoch": 0.1260100038476337, + "flos": 482900382720.0, + "grad_norm": 0.0302310026354778, + "language_loss": 0.97697163, + "learning_rate": 0.0009760263749743428, + "loss": 0.98873413, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 1.2109375, + "step": 655, + "time_per_iteration": 2.5425992012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173716, + "balance_loss_mlp": 1.05302835, + "epoch": 0.12620238553289725, + "flos": 576701170176.0, + "grad_norm": 0.026173940013352312, + "language_loss": 0.96703827, + "learning_rate": 0.0009759309712165299, + "loss": 0.97877538, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 1.20849609, + "step": 656, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182641, + "balance_loss_mlp": 1.06185794, + "epoch": 0.12639476721816084, + "flos": 532185314304.0, + "grad_norm": 0.024272217680215723, + "language_loss": 1.00863099, + "learning_rate": 0.0009758353826859272, + "loss": 1.02045751, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 1.20947266, + "step": 657, + "time_per_iteration": 2.621317148208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183764, + "balance_loss_mlp": 1.06288576, + "epoch": 0.1265871489034244, + "flos": 691231380480.0, + "grad_norm": 0.02639198012969831, + "language_loss": 0.9913975, + "learning_rate": 0.0009757396094196456, + "loss": 1.00323522, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 1.21044922, + "step": 658, + "time_per_iteration": 2.8867759704589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183942, + "balance_loss_mlp": 1.06311166, + "epoch": 0.12677953058868796, + "flos": 538242212352.0, + "grad_norm": 0.02343039495549204, + "language_loss": 0.91435432, + "learning_rate": 0.0009756436514548673, + "loss": 0.92619371, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 1.20996094, + "step": 659, + "time_per_iteration": 2.8055155277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179962, + "balance_loss_mlp": 1.05903614, + "epoch": 0.12697191227395152, + "flos": 520119908352.0, + "grad_norm": 0.02147737158217614, + "language_loss": 0.94944704, + "learning_rate": 0.0009755475088288466, + "loss": 0.96124667, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 1.2109375, + "step": 660, + "time_per_iteration": 2.713801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179144, + "balance_loss_mlp": 1.05826533, + "epoch": 0.12716429395921508, + "flos": 567665851392.0, + "grad_norm": 0.026687699897107686, + "language_loss": 0.99289566, + "learning_rate": 0.0009754511815789095, + "loss": 1.00468707, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 1.21044922, + "step": 661, + "time_per_iteration": 2.739250898361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176549, + "balance_loss_mlp": 1.05590951, + "epoch": 0.12735667564447864, + "flos": 515141987328.0, + "grad_norm": 0.028028480179563667, + "language_loss": 0.94950283, + "learning_rate": 0.0009753546697424533, + "loss": 0.96126837, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 1.20800781, + "step": 662, + "time_per_iteration": 2.71746826171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180242, + "balance_loss_mlp": 1.05941188, + "epoch": 0.1275490573297422, + "flos": 542321077248.0, + "grad_norm": 0.02443290319898258, + "language_loss": 0.98755229, + "learning_rate": 0.0009752579733569475, + "loss": 0.99935466, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 1.20996094, + "step": 663, + "time_per_iteration": 2.631284713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06030273, + "epoch": 0.12774143901500576, + "flos": 1562024853504.0, + "grad_norm": 0.010147906106003043, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76060903, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 1.19335938, + "step": 664, + "time_per_iteration": 4.941519260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188286, + "balance_loss_mlp": 1.06783676, + "epoch": 0.12793382070026935, + "flos": 614873419776.0, + "grad_norm": 0.028758292375382164, + "language_loss": 1.00255466, + "learning_rate": 0.0009750640270890217, + "loss": 1.01443744, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 1.20605469, + "step": 665, + "time_per_iteration": 2.7382516860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185033, + "balance_loss_mlp": 1.06458378, + "epoch": 0.1281262023855329, + "flos": 709117367808.0, + "grad_norm": 0.02727882395737353, + "language_loss": 1.05972624, + "learning_rate": 0.0009749667772818983, + "loss": 1.0715766, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 1.20605469, + "step": 666, + "time_per_iteration": 2.961103677749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117968, + "balance_loss_mlp": 1.06104279, + "epoch": 0.12831858407079647, + "flos": 1428182572032.0, + "grad_norm": 0.005713660367986308, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78115624, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 1.1875, + "step": 667, + "time_per_iteration": 4.799788475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180825, + "balance_loss_mlp": 1.06056714, + "epoch": 0.12851096575606002, + "flos": 450018232320.0, + "grad_norm": 0.027450705632443572, + "language_loss": 1.04045725, + "learning_rate": 0.0009747717245101093, + "loss": 1.05226541, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 1.20410156, + "step": 668, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181103, + "balance_loss_mlp": 1.0609405, + "epoch": 0.12870334744132358, + "flos": 480909614592.0, + "grad_norm": 0.024743463193645603, + "language_loss": 0.94192064, + "learning_rate": 0.00097467392162117, + "loss": 0.95373166, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 1.203125, + "step": 669, + "time_per_iteration": 2.6341683864593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176215, + "balance_loss_mlp": 1.05609953, + "epoch": 0.12889572912658714, + "flos": 640151064576.0, + "grad_norm": 0.020470833753638586, + "language_loss": 0.98179239, + "learning_rate": 0.0009745759344474708, + "loss": 0.99355447, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 1.20263672, + "step": 670, + "time_per_iteration": 2.8753654956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175464, + "balance_loss_mlp": 1.05530083, + "epoch": 0.1290881108118507, + "flos": 510954333696.0, + "grad_norm": 0.02496408481001148, + "language_loss": 0.98669916, + "learning_rate": 0.0009744777630270536, + "loss": 0.99845386, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 1.203125, + "step": 671, + "time_per_iteration": 2.601480484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173739, + "balance_loss_mlp": 1.05381489, + "epoch": 0.12928049249711426, + "flos": 672290611200.0, + "grad_norm": 0.0267777739546368, + "language_loss": 1.0349828, + "learning_rate": 0.000974379407398032, + "loss": 1.04672015, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 1.20068359, + "step": 672, + "time_per_iteration": 2.8746023178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176311, + "balance_loss_mlp": 1.05633891, + "epoch": 0.12947287418237785, + "flos": 794998743552.0, + "grad_norm": 0.021070447178693698, + "language_loss": 0.89884377, + "learning_rate": 0.0009742808675985913, + "loss": 0.91060686, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 1.20117188, + "step": 673, + "time_per_iteration": 3.106855869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178925, + "balance_loss_mlp": 1.05895269, + "epoch": 0.1296652558676414, + "flos": 486447490560.0, + "grad_norm": 0.028552559493613055, + "language_loss": 1.00707459, + "learning_rate": 0.0009741821436669876, + "loss": 1.0188638, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 1.20117188, + "step": 674, + "time_per_iteration": 2.6221611499786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_mlp": 1.06097043, + "epoch": 0.12985763755290497, + "flos": 454392537600.0, + "grad_norm": 0.03163366532216525, + "language_loss": 1.04449701, + "learning_rate": 0.0009740832356415492, + "loss": 1.05630445, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 1.19921875, + "step": 675, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179614, + "balance_loss_mlp": 1.05968916, + "epoch": 0.13005001923816853, + "flos": 826434617856.0, + "grad_norm": 0.02755997498495484, + "language_loss": 0.99148017, + "learning_rate": 0.0009739841435606756, + "loss": 1.00327623, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 1.20068359, + "step": 676, + "time_per_iteration": 3.026420831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180175, + "balance_loss_mlp": 1.06058431, + "epoch": 0.1302424009234321, + "flos": 532480754688.0, + "grad_norm": 0.02275953253130011, + "language_loss": 0.97366607, + "learning_rate": 0.0009738848674628377, + "loss": 0.98546779, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 1.19726562, + "step": 677, + "time_per_iteration": 2.710205554962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179059, + "balance_loss_mlp": 1.05927801, + "epoch": 0.13043478260869565, + "flos": 526916682240.0, + "grad_norm": 0.02441501439452981, + "language_loss": 0.97902691, + "learning_rate": 0.000973785407386578, + "loss": 0.99081755, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 1.19921875, + "step": 678, + "time_per_iteration": 2.7785394191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184892, + "balance_loss_mlp": 1.06553924, + "epoch": 0.1306271642939592, + "flos": 627416914944.0, + "grad_norm": 0.023801085732510874, + "language_loss": 0.94469249, + "learning_rate": 0.0009736857633705103, + "loss": 0.95654142, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 1.19482422, + "step": 679, + "time_per_iteration": 2.8619470596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177483, + "balance_loss_mlp": 1.05827415, + "epoch": 0.13081954597922277, + "flos": 551840489472.0, + "grad_norm": 0.024512943765722366, + "language_loss": 1.01033652, + "learning_rate": 0.0009735859354533196, + "loss": 1.02211142, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 1.19335938, + "step": 680, + "time_per_iteration": 2.6954457759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176387, + "balance_loss_mlp": 1.05755925, + "epoch": 0.13101192766448633, + "flos": 537955504128.0, + "grad_norm": 0.029188130773433643, + "language_loss": 1.02405858, + "learning_rate": 0.0009734859236737628, + "loss": 1.03582239, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 1.18945312, + "step": 681, + "time_per_iteration": 2.606597661972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172364, + "balance_loss_mlp": 1.05353606, + "epoch": 0.13120430934974991, + "flos": 504513398784.0, + "grad_norm": 0.02625319928532985, + "language_loss": 1.02007055, + "learning_rate": 0.0009733857280706678, + "loss": 1.03179431, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 1.18945312, + "step": 682, + "time_per_iteration": 2.626211404800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_mlp": 1.05010605, + "epoch": 0.13139669103501347, + "flos": 615422641152.0, + "grad_norm": 0.025135553656080285, + "language_loss": 0.9321503, + "learning_rate": 0.000973285348682934, + "loss": 0.94383633, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 1.18603516, + "step": 683, + "time_per_iteration": 2.71779727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190269, + "balance_loss_mlp": 1.07296753, + "epoch": 0.13158907272027703, + "flos": 1488215614464.0, + "grad_norm": 0.025067429703540995, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7908864, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 1.17382812, + "step": 684, + "time_per_iteration": 4.811431169509888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168738, + "balance_loss_mlp": 1.05048192, + "epoch": 0.1317814544055406, + "flos": 987117614592.0, + "grad_norm": 0.026136533405527674, + "language_loss": 0.93269205, + "learning_rate": 0.0009730840387095046, + "loss": 0.94437939, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 1.18359375, + "step": 685, + "time_per_iteration": 3.3154938220977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117288, + "balance_loss_mlp": 1.05443382, + "epoch": 0.13197383609080415, + "flos": 612628870656.0, + "grad_norm": 0.026271684435729213, + "language_loss": 0.99177825, + "learning_rate": 0.0009729831082019642, + "loss": 1.00350702, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 1.18554688, + "step": 686, + "time_per_iteration": 2.79620623588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.06093395, + "epoch": 0.1321662177760677, + "flos": 495554668032.0, + "grad_norm": 0.02508782879826625, + "language_loss": 0.97052312, + "learning_rate": 0.0009728819940660958, + "loss": 0.98231786, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 1.18652344, + "step": 687, + "time_per_iteration": 2.779193162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178983, + "balance_loss_mlp": 1.06067955, + "epoch": 0.13235859946133127, + "flos": 496843765248.0, + "grad_norm": 0.02705130625621755, + "language_loss": 0.97550011, + "learning_rate": 0.0009727806963411557, + "loss": 0.98728997, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 1.18408203, + "step": 688, + "time_per_iteration": 2.5702319145202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.05883551, + "epoch": 0.13255098114659483, + "flos": 512767182336.0, + "grad_norm": 0.022910122085290585, + "language_loss": 0.96022904, + "learning_rate": 0.000972679215066471, + "loss": 0.97200048, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 1.18408203, + "step": 689, + "time_per_iteration": 2.64780592918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178761, + "balance_loss_mlp": 1.06050563, + "epoch": 0.13274336283185842, + "flos": 548399442432.0, + "grad_norm": 0.030606528220640358, + "language_loss": 1.08985806, + "learning_rate": 0.0009725775502814401, + "loss": 1.10164571, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 1.18359375, + "step": 690, + "time_per_iteration": 2.5830535888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06120849, + "epoch": 0.13293574451712198, + "flos": 642002844672.0, + "grad_norm": 0.023439513257655937, + "language_loss": 0.94635952, + "learning_rate": 0.0009724757020255327, + "loss": 0.95815468, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 1.18408203, + "step": 691, + "time_per_iteration": 2.827944278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183334, + "balance_loss_mlp": 1.06517375, + "epoch": 0.13312812620238554, + "flos": 492469459968.0, + "grad_norm": 0.028212898490696088, + "language_loss": 0.96836531, + "learning_rate": 0.0009723736703382902, + "loss": 0.98019874, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 1.18261719, + "step": 692, + "time_per_iteration": 2.6144213676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180114, + "balance_loss_mlp": 1.06200123, + "epoch": 0.1333205078876491, + "flos": 509949216768.0, + "grad_norm": 0.023005533645913036, + "language_loss": 0.90654016, + "learning_rate": 0.0009722714552593244, + "loss": 0.91834128, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 1.18212891, + "step": 693, + "time_per_iteration": 2.600128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180549, + "balance_loss_mlp": 1.06262743, + "epoch": 0.13351288957291266, + "flos": 419591477760.0, + "grad_norm": 0.029950659996273835, + "language_loss": 1.05475199, + "learning_rate": 0.000972169056828319, + "loss": 1.06655741, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 1.18017578, + "step": 694, + "time_per_iteration": 2.466643810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178338, + "balance_loss_mlp": 1.0606066, + "epoch": 0.13370527125817622, + "flos": 617050839552.0, + "grad_norm": 0.021764231653516302, + "language_loss": 0.95444119, + "learning_rate": 0.0009720664750850283, + "loss": 0.96622455, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 1.17822266, + "step": 695, + "time_per_iteration": 2.7776308059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173328, + "balance_loss_mlp": 1.05578816, + "epoch": 0.13389765294343978, + "flos": 627169138176.0, + "grad_norm": 0.026088042391715836, + "language_loss": 1.0165019, + "learning_rate": 0.0009719637100692784, + "loss": 1.0282352, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 1.17626953, + "step": 696, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175294, + "balance_loss_mlp": 1.0578016, + "epoch": 0.13409003462870334, + "flos": 610896612864.0, + "grad_norm": 0.027090913840535472, + "language_loss": 0.92017978, + "learning_rate": 0.0009718607618209661, + "loss": 0.93193275, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 1.17578125, + "step": 697, + "time_per_iteration": 2.8413584232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179845, + "balance_loss_mlp": 1.06235278, + "epoch": 0.13428241631396692, + "flos": 685087887360.0, + "grad_norm": 0.024883061853709334, + "language_loss": 0.95573747, + "learning_rate": 0.0009717576303800595, + "loss": 0.96753585, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 1.17578125, + "step": 698, + "time_per_iteration": 3.047100782394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175386, + "balance_loss_mlp": 1.05794048, + "epoch": 0.13447479799923048, + "flos": 509818960896.0, + "grad_norm": 0.024888049065051182, + "language_loss": 0.95325053, + "learning_rate": 0.0009716543157865975, + "loss": 0.96500432, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 1.17529297, + "step": 699, + "time_per_iteration": 2.7481272220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_mlp": 1.05878782, + "epoch": 0.13466717968449404, + "flos": 899058819072.0, + "grad_norm": 0.023872779385430955, + "language_loss": 0.92076075, + "learning_rate": 0.0009715508180806907, + "loss": 0.93252313, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 1.17529297, + "step": 700, + "time_per_iteration": 3.2107367515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173529, + "balance_loss_mlp": 1.05660856, + "epoch": 0.1348595613697576, + "flos": 991694034432.0, + "grad_norm": 0.023513798430807663, + "language_loss": 1.00262749, + "learning_rate": 0.0009714471373025202, + "loss": 1.01436281, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 1.16992188, + "step": 701, + "time_per_iteration": 3.3966751098632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173715, + "balance_loss_mlp": 1.0566988, + "epoch": 0.13505194305502116, + "flos": 488811561984.0, + "grad_norm": 0.028001983236069502, + "language_loss": 0.99373382, + "learning_rate": 0.0009713432734923386, + "loss": 1.00547099, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 1.17089844, + "step": 702, + "time_per_iteration": 2.615107536315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171408, + "balance_loss_mlp": 1.05439234, + "epoch": 0.13524432474028472, + "flos": 614519582208.0, + "grad_norm": 0.024192478681639117, + "language_loss": 0.96606487, + "learning_rate": 0.0009712392266904696, + "loss": 0.97777891, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 1.17089844, + "step": 703, + "time_per_iteration": 2.7448034286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174325, + "balance_loss_mlp": 1.05740499, + "epoch": 0.13543670642554828, + "flos": 906274558464.0, + "grad_norm": 0.025492480769094515, + "language_loss": 0.96012545, + "learning_rate": 0.0009711349969373076, + "loss": 0.97186869, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 1.16992188, + "step": 704, + "time_per_iteration": 3.1337268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172794, + "balance_loss_mlp": 1.05596876, + "epoch": 0.13562908811081184, + "flos": 551747163648.0, + "grad_norm": 0.026772975251671254, + "language_loss": 0.91034031, + "learning_rate": 0.0009710305842733178, + "loss": 0.9220683, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 1.16894531, + "step": 705, + "time_per_iteration": 2.7571139335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_mlp": 1.05031061, + "epoch": 0.1358214697960754, + "flos": 509037425664.0, + "grad_norm": 0.024292049069741084, + "language_loss": 0.98220038, + "learning_rate": 0.0009709259887390373, + "loss": 0.99387223, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 1.16943359, + "step": 706, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168004, + "balance_loss_mlp": 1.05141699, + "epoch": 0.136013851481339, + "flos": 529923300864.0, + "grad_norm": 0.025926611739077732, + "language_loss": 1.00068641, + "learning_rate": 0.0009708212103750737, + "loss": 1.01236641, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 1.16650391, + "step": 707, + "time_per_iteration": 2.6197190284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168587, + "balance_loss_mlp": 1.05219126, + "epoch": 0.13620623316660255, + "flos": 660320532480.0, + "grad_norm": 0.02235622943703988, + "language_loss": 0.96270919, + "learning_rate": 0.0009707162492221051, + "loss": 0.97439504, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 1.16455078, + "step": 708, + "time_per_iteration": 2.8917648792266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171818, + "balance_loss_mlp": 1.05542207, + "epoch": 0.1363986148518661, + "flos": 673082880000.0, + "grad_norm": 0.027649047287573853, + "language_loss": 0.98132068, + "learning_rate": 0.0009706111053208815, + "loss": 0.99303889, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 1.16455078, + "step": 709, + "time_per_iteration": 2.7827165126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173191, + "balance_loss_mlp": 1.05669987, + "epoch": 0.13659099653712967, + "flos": 474004051968.0, + "grad_norm": 0.02773643003805471, + "language_loss": 0.94597077, + "learning_rate": 0.0009705057787122232, + "loss": 0.9577027, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 1.16552734, + "step": 710, + "time_per_iteration": 2.542836904525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169067, + "balance_loss_mlp": 1.05286229, + "epoch": 0.13678337822239323, + "flos": 453647932416.0, + "grad_norm": 0.0248615327032158, + "language_loss": 0.9884814, + "learning_rate": 0.0009704002694370216, + "loss": 1.00017214, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 1.16259766, + "step": 711, + "time_per_iteration": 2.550527811050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164533, + "balance_loss_mlp": 1.04842281, + "epoch": 0.13697575990765679, + "flos": 520625468928.0, + "grad_norm": 0.0274811578413112, + "language_loss": 0.97066599, + "learning_rate": 0.0009702945775362388, + "loss": 0.98231125, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 1.16162109, + "step": 712, + "time_per_iteration": 2.56953501701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116862, + "balance_loss_mlp": 1.05246294, + "epoch": 0.13716814159292035, + "flos": 481365510144.0, + "grad_norm": 0.025544817797380492, + "language_loss": 0.98621845, + "learning_rate": 0.0009701887030509086, + "loss": 0.99790466, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 1.16210938, + "step": 713, + "time_per_iteration": 2.6443872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_mlp": 1.05663013, + "epoch": 0.1373605232781839, + "flos": 546749776896.0, + "grad_norm": 0.02672517687154734, + "language_loss": 1.02031791, + "learning_rate": 0.0009700826460221346, + "loss": 1.03204811, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 1.16455078, + "step": 714, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_mlp": 1.05508566, + "epoch": 0.1375529049634475, + "flos": 710070091776.0, + "grad_norm": 0.027473841831572973, + "language_loss": 1.03736091, + "learning_rate": 0.0009699764064910921, + "loss": 1.04907441, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 1.16308594, + "step": 715, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_mlp": 1.05281401, + "epoch": 0.13774528664871105, + "flos": 487676189184.0, + "grad_norm": 0.02500038679906112, + "language_loss": 0.96403199, + "learning_rate": 0.0009698699844990268, + "loss": 0.9757241, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 1.16455078, + "step": 716, + "time_per_iteration": 2.638272762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116569, + "balance_loss_mlp": 1.04972363, + "epoch": 0.1379376683339746, + "flos": 681458187264.0, + "grad_norm": 0.024933229917961583, + "language_loss": 0.9565106, + "learning_rate": 0.0009697633800872555, + "loss": 0.96816742, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 1.16015625, + "step": 717, + "time_per_iteration": 2.8989553451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168974, + "balance_loss_mlp": 1.05310297, + "epoch": 0.13813005001923817, + "flos": 612225368064.0, + "grad_norm": 0.02330012063083705, + "language_loss": 1.0130372, + "learning_rate": 0.0009696565932971655, + "loss": 1.02472687, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 1.15917969, + "step": 718, + "time_per_iteration": 2.8472671508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171117, + "balance_loss_mlp": 1.05524576, + "epoch": 0.13832243170450173, + "flos": 589926144000.0, + "grad_norm": 0.027418468702626427, + "language_loss": 0.98498988, + "learning_rate": 0.0009695496241702153, + "loss": 0.99670106, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 1.15917969, + "step": 719, + "time_per_iteration": 2.786895990371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167345, + "balance_loss_mlp": 1.05180764, + "epoch": 0.1385148133897653, + "flos": 701319479808.0, + "grad_norm": 0.026285913371991803, + "language_loss": 0.94868541, + "learning_rate": 0.0009694424727479339, + "loss": 0.96035892, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 1.15576172, + "step": 720, + "time_per_iteration": 2.921644926071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117298, + "balance_loss_mlp": 1.05729949, + "epoch": 0.13870719507502885, + "flos": 599366966784.0, + "grad_norm": 0.024279001882637877, + "language_loss": 0.97845113, + "learning_rate": 0.0009693351390719213, + "loss": 0.99018097, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 1.15722656, + "step": 721, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168632, + "balance_loss_mlp": 1.05304694, + "epoch": 0.1388995767602924, + "flos": 587748724224.0, + "grad_norm": 0.03212240351747381, + "language_loss": 0.98596126, + "learning_rate": 0.000969227623183848, + "loss": 0.99764758, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 1.15625, + "step": 722, + "time_per_iteration": 2.7723541259765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_mlp": 1.05205071, + "epoch": 0.139091958445556, + "flos": 652362189312.0, + "grad_norm": 0.025655198862846312, + "language_loss": 0.99224544, + "learning_rate": 0.0009691199251254554, + "loss": 1.00392079, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 1.15527344, + "step": 723, + "time_per_iteration": 2.8426058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165537, + "balance_loss_mlp": 1.05019021, + "epoch": 0.13928434013081956, + "flos": 576905286144.0, + "grad_norm": 0.022500478429048027, + "language_loss": 0.9243086, + "learning_rate": 0.0009690120449385555, + "loss": 0.93596393, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 1.15380859, + "step": 724, + "time_per_iteration": 2.7558276653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168709, + "balance_loss_mlp": 1.05307627, + "epoch": 0.13947672181608312, + "flos": 564314127360.0, + "grad_norm": 0.02294482348940274, + "language_loss": 1.00981367, + "learning_rate": 0.0009689039826650312, + "loss": 1.02150071, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 1.15673828, + "step": 725, + "time_per_iteration": 2.784708261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211281, + "balance_loss_mlp": 1.09550476, + "epoch": 0.13966910350134668, + "flos": 1524949045248.0, + "grad_norm": 0.02639881420994122, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77734339, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 1.15820312, + "step": 726, + "time_per_iteration": 4.9523255825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171441, + "balance_loss_mlp": 1.05604661, + "epoch": 0.13986148518661023, + "flos": 500855500800.0, + "grad_norm": 0.0321160389091748, + "language_loss": 0.98954523, + "learning_rate": 0.0009686873120259941, + "loss": 1.00125957, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 1.15429688, + "step": 727, + "time_per_iteration": 2.584141731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173326, + "balance_loss_mlp": 1.05850363, + "epoch": 0.1400538668718738, + "flos": 599849058816.0, + "grad_norm": 0.027531106684590426, + "language_loss": 0.93834305, + "learning_rate": 0.0009685787037446004, + "loss": 0.95007634, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 1.1484375, + "step": 728, + "time_per_iteration": 2.770592451095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_mlp": 1.05520177, + "epoch": 0.14024624855713735, + "flos": 595168579584.0, + "grad_norm": 0.026051179565135866, + "language_loss": 0.98294961, + "learning_rate": 0.0009684699135448201, + "loss": 0.99465179, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 1.15039062, + "step": 729, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_mlp": 1.04985154, + "epoch": 0.1404386302424009, + "flos": 507585145344.0, + "grad_norm": 0.02205061924934426, + "language_loss": 0.98307908, + "learning_rate": 0.0009683609414688895, + "loss": 0.99472773, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 1.15039062, + "step": 730, + "time_per_iteration": 2.700016975402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.05254078, + "epoch": 0.14063101192766447, + "flos": 574515018240.0, + "grad_norm": 0.021243768346974407, + "language_loss": 0.95329058, + "learning_rate": 0.0009682517875591154, + "loss": 0.96496415, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 1.1484375, + "step": 731, + "time_per_iteration": 2.743590831756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.05264843, + "epoch": 0.14082339361292806, + "flos": 565764406272.0, + "grad_norm": 0.02284757167221282, + "language_loss": 0.93998873, + "learning_rate": 0.0009681424518578749, + "loss": 0.95166153, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 1.14648438, + "step": 732, + "time_per_iteration": 2.757690668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166596, + "balance_loss_mlp": 1.05215514, + "epoch": 0.14101577529819162, + "flos": 464582694912.0, + "grad_norm": 0.02112517179619274, + "language_loss": 0.95363593, + "learning_rate": 0.000968032934407616, + "loss": 0.96530199, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 1.14453125, + "step": 733, + "time_per_iteration": 2.6260647773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_mlp": 1.05257201, + "epoch": 0.14120815698345518, + "flos": 597261405696.0, + "grad_norm": 0.02235342076428548, + "language_loss": 0.90822989, + "learning_rate": 0.0009679232352508571, + "loss": 0.91990006, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 1.14453125, + "step": 734, + "time_per_iteration": 2.7677996158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167689, + "balance_loss_mlp": 1.05334342, + "epoch": 0.14140053866871874, + "flos": 536231978496.0, + "grad_norm": 0.023954026934244203, + "language_loss": 0.90350544, + "learning_rate": 0.0009678133544301871, + "loss": 0.91518235, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 1.14355469, + "step": 735, + "time_per_iteration": 2.6668286323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165912, + "balance_loss_mlp": 1.05147135, + "epoch": 0.1415929203539823, + "flos": 521276748288.0, + "grad_norm": 0.01836780541558419, + "language_loss": 0.98091269, + "learning_rate": 0.0009677032919882658, + "loss": 0.99257177, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 1.14453125, + "step": 736, + "time_per_iteration": 2.654975652694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_mlp": 1.0601368, + "epoch": 0.14178530203924586, + "flos": 483301883904.0, + "grad_norm": 0.025248480485652293, + "language_loss": 1.00008237, + "learning_rate": 0.000967593047967823, + "loss": 1.01183295, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 1.14941406, + "step": 737, + "time_per_iteration": 2.529147148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167635, + "balance_loss_mlp": 1.05319452, + "epoch": 0.14197768372450942, + "flos": 677839220736.0, + "grad_norm": 0.02278890168576414, + "language_loss": 0.9561522, + "learning_rate": 0.0009674826224116593, + "loss": 0.96782857, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 1.14453125, + "step": 738, + "time_per_iteration": 2.8032455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.05606639, + "epoch": 0.14217006540977298, + "flos": 446992147968.0, + "grad_norm": 0.026055784762538982, + "language_loss": 0.97800839, + "learning_rate": 0.0009673720153626455, + "loss": 0.989712, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 1.14306641, + "step": 739, + "time_per_iteration": 2.629868984222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172861, + "balance_loss_mlp": 1.05889642, + "epoch": 0.14236244709503657, + "flos": 497477580288.0, + "grad_norm": 0.02475738760241807, + "language_loss": 0.95941108, + "learning_rate": 0.0009672612268637235, + "loss": 0.97113973, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 1.13964844, + "step": 740, + "time_per_iteration": 2.6037824153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170194, + "balance_loss_mlp": 1.05618262, + "epoch": 0.14255482878030012, + "flos": 649479095808.0, + "grad_norm": 0.03387034378547869, + "language_loss": 0.95329261, + "learning_rate": 0.0009671502569579048, + "loss": 0.96499455, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 1.14013672, + "step": 741, + "time_per_iteration": 2.7700846195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.05657792, + "epoch": 0.14274721046556368, + "flos": 537274025472.0, + "grad_norm": 0.02433568326488268, + "language_loss": 0.98081231, + "learning_rate": 0.0009670391056882719, + "loss": 0.99251777, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 1.13964844, + "step": 742, + "time_per_iteration": 2.696019172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174036, + "balance_loss_mlp": 1.06002402, + "epoch": 0.14293959215082724, + "flos": 958583572992.0, + "grad_norm": 0.027423351639808666, + "language_loss": 0.96458268, + "learning_rate": 0.0009669277730979776, + "loss": 0.97632295, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 1.14013672, + "step": 743, + "time_per_iteration": 3.2084367275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174905, + "balance_loss_mlp": 1.06103587, + "epoch": 0.1431319738360908, + "flos": 694385719296.0, + "grad_norm": 0.02304461389980259, + "language_loss": 0.94654781, + "learning_rate": 0.0009668162592302449, + "loss": 0.9582969, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 1.13867188, + "step": 744, + "time_per_iteration": 2.8862292766571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184206, + "balance_loss_mlp": 1.07009852, + "epoch": 0.14332435552135436, + "flos": 566502280704.0, + "grad_norm": 0.024928546312887438, + "language_loss": 0.9473027, + "learning_rate": 0.0009667045641283676, + "loss": 0.95914471, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 1.14111328, + "step": 745, + "time_per_iteration": 2.6714677810668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_mlp": 1.05672932, + "epoch": 0.14351673720661792, + "flos": 739695845376.0, + "grad_norm": 0.027004630074695047, + "language_loss": 1.03854704, + "learning_rate": 0.0009665926878357092, + "loss": 1.05025315, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 1.13867188, + "step": 746, + "time_per_iteration": 2.9414963722229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168037, + "balance_loss_mlp": 1.05416811, + "epoch": 0.14370911889188148, + "flos": 550351279104.0, + "grad_norm": 0.024394803732961844, + "language_loss": 0.99195439, + "learning_rate": 0.0009664806303957043, + "loss": 1.00363481, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 1.13867188, + "step": 747, + "time_per_iteration": 2.6798276901245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175063, + "balance_loss_mlp": 1.06109881, + "epoch": 0.14390150057714507, + "flos": 591589271040.0, + "grad_norm": 0.028912253716933817, + "language_loss": 0.96970344, + "learning_rate": 0.0009663683918518571, + "loss": 0.98145401, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 1.13964844, + "step": 748, + "time_per_iteration": 2.894670248031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172034, + "balance_loss_mlp": 1.05845118, + "epoch": 0.14409388226240863, + "flos": 592144496640.0, + "grad_norm": 0.025560266799661176, + "language_loss": 0.96381319, + "learning_rate": 0.0009662559722477428, + "loss": 0.97553355, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 1.13574219, + "step": 749, + "time_per_iteration": 2.702796220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193848, + "balance_loss_mlp": 1.08131409, + "epoch": 0.1442862639476722, + "flos": 1514654828544.0, + "grad_norm": 0.02305864885865106, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77356815, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 1.125, + "step": 750, + "time_per_iteration": 5.010634660720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_mlp": 1.05287659, + "epoch": 0.14447864563293575, + "flos": 497855612928.0, + "grad_norm": 0.023714468612350204, + "language_loss": 0.97989428, + "learning_rate": 0.0009660305900333632, + "loss": 0.99155927, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 1.13623047, + "step": 751, + "time_per_iteration": 2.7064144611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_mlp": 1.05845106, + "epoch": 0.1446710273181993, + "flos": 590794274304.0, + "grad_norm": 0.03190287595859636, + "language_loss": 0.91963172, + "learning_rate": 0.0009659176275105992, + "loss": 0.93135297, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 1.13671875, + "step": 752, + "time_per_iteration": 2.7171401977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171619, + "balance_loss_mlp": 1.05803668, + "epoch": 0.14486340900346287, + "flos": 587012851200.0, + "grad_norm": 0.023715921645424867, + "language_loss": 0.93508279, + "learning_rate": 0.0009658044841025701, + "loss": 0.94679892, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 1.13574219, + "step": 753, + "time_per_iteration": 2.77504563331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172686, + "balance_loss_mlp": 1.05900788, + "epoch": 0.14505579068872643, + "flos": 505740096000.0, + "grad_norm": 0.025730958483317315, + "language_loss": 0.9055903, + "learning_rate": 0.0009656911598532021, + "loss": 0.91731715, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 1.13671875, + "step": 754, + "time_per_iteration": 2.642886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172881, + "balance_loss_mlp": 1.05925071, + "epoch": 0.14524817237399, + "flos": 487815177216.0, + "grad_norm": 0.025261406861214447, + "language_loss": 0.98625988, + "learning_rate": 0.0009655776548064917, + "loss": 0.9979887, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 1.13623047, + "step": 755, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169342, + "balance_loss_mlp": 1.05571139, + "epoch": 0.14544055405925355, + "flos": 729449292288.0, + "grad_norm": 0.025093779151575485, + "language_loss": 0.97407329, + "learning_rate": 0.0009654639690065054, + "loss": 0.98576677, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 1.13623047, + "step": 756, + "time_per_iteration": 2.867976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173831, + "balance_loss_mlp": 1.06024873, + "epoch": 0.14563293574451713, + "flos": 594786544128.0, + "grad_norm": 0.02769433731610086, + "language_loss": 0.96328217, + "learning_rate": 0.00096535010249738, + "loss": 0.97502041, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 1.13574219, + "step": 757, + "time_per_iteration": 2.718595266342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171947, + "balance_loss_mlp": 1.05879402, + "epoch": 0.1458253174297807, + "flos": 561622414848.0, + "grad_norm": 0.027253539371253223, + "language_loss": 0.93671888, + "learning_rate": 0.0009652360553233224, + "loss": 0.94843829, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 1.13134766, + "step": 758, + "time_per_iteration": 2.732665538787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_mlp": 1.06835938, + "epoch": 0.14601769911504425, + "flos": 1561186922496.0, + "grad_norm": 0.016548141494889222, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74954832, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 1.12695312, + "step": 759, + "time_per_iteration": 4.9278404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_mlp": 1.04840457, + "epoch": 0.1462100808003078, + "flos": 867822331392.0, + "grad_norm": 0.024551380524627048, + "language_loss": 0.89752859, + "learning_rate": 0.0009650074191575883, + "loss": 0.90914273, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 1.12988281, + "step": 760, + "time_per_iteration": 3.18084716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011658, + "balance_loss_mlp": 1.05302811, + "epoch": 0.14640246248557137, + "flos": 524029585920.0, + "grad_norm": 0.025729752682943422, + "language_loss": 0.95023656, + "learning_rate": 0.0009648928302546766, + "loss": 0.96189463, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 1.12744141, + "step": 761, + "time_per_iteration": 2.707385301589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161728, + "balance_loss_mlp": 1.04895639, + "epoch": 0.14659484417083493, + "flos": 1032241089024.0, + "grad_norm": 0.022974522077421757, + "language_loss": 0.94352418, + "learning_rate": 0.0009647780608643613, + "loss": 0.95514143, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 1.12744141, + "step": 762, + "time_per_iteration": 3.357776165008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116078, + "balance_loss_mlp": 1.04848516, + "epoch": 0.1467872258560985, + "flos": 501656501760.0, + "grad_norm": 0.027279773355913427, + "language_loss": 0.99627388, + "learning_rate": 0.0009646631110312001, + "loss": 1.00788176, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 1.12255859, + "step": 763, + "time_per_iteration": 2.629650115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159049, + "balance_loss_mlp": 1.04665887, + "epoch": 0.14697960754136205, + "flos": 548935928832.0, + "grad_norm": 0.020644179018096606, + "language_loss": 0.95446718, + "learning_rate": 0.0009645479807998203, + "loss": 0.96605766, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 1.12353516, + "step": 764, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_mlp": 1.04510117, + "epoch": 0.14717198922662564, + "flos": 518901943296.0, + "grad_norm": 0.021535065255329562, + "language_loss": 0.99812603, + "learning_rate": 0.0009644326702149196, + "loss": 1.00970435, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 1.12695312, + "step": 765, + "time_per_iteration": 2.711500406265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158907, + "balance_loss_mlp": 1.04618227, + "epoch": 0.1473643709118892, + "flos": 733483221504.0, + "grad_norm": 0.02504361772442387, + "language_loss": 0.95452881, + "learning_rate": 0.0009643171793212653, + "loss": 0.96611786, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 1.12695312, + "step": 766, + "time_per_iteration": 3.130798578262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163931, + "balance_loss_mlp": 1.05115891, + "epoch": 0.14755675259715276, + "flos": 621668192256.0, + "grad_norm": 0.027740201354691706, + "language_loss": 0.99870968, + "learning_rate": 0.0009642015081636952, + "loss": 1.01034904, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 1.12744141, + "step": 767, + "time_per_iteration": 2.701939344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160055, + "balance_loss_mlp": 1.04761696, + "epoch": 0.14774913428241632, + "flos": 453172571136.0, + "grad_norm": 0.025159341457135456, + "language_loss": 0.98449206, + "learning_rate": 0.0009640856567871166, + "loss": 0.99609256, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 1.12402344, + "step": 768, + "time_per_iteration": 2.516721725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_mlp": 1.05262613, + "epoch": 0.14794151596767988, + "flos": 838654474752.0, + "grad_norm": 0.02612823197324643, + "language_loss": 0.99416363, + "learning_rate": 0.0009639696252365072, + "loss": 1.00581241, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 1.12207031, + "step": 769, + "time_per_iteration": 3.06074857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167068, + "balance_loss_mlp": 1.05472481, + "epoch": 0.14813389765294344, + "flos": 687404295168.0, + "grad_norm": 0.02602975967937929, + "language_loss": 0.89651555, + "learning_rate": 0.0009638534135569144, + "loss": 0.90818626, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 1.12304688, + "step": 770, + "time_per_iteration": 2.9440436363220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169876, + "balance_loss_mlp": 1.05753326, + "epoch": 0.148326279338207, + "flos": 510943600128.0, + "grad_norm": 0.028093178265757666, + "language_loss": 1.01150489, + "learning_rate": 0.0009637370217934554, + "loss": 1.02320373, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 1.12304688, + "step": 771, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166681, + "balance_loss_mlp": 1.05443311, + "epoch": 0.14851866102347056, + "flos": 589331260416.0, + "grad_norm": 0.028336871459981, + "language_loss": 0.90924722, + "learning_rate": 0.0009636204499913175, + "loss": 0.92091405, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 1.12207031, + "step": 772, + "time_per_iteration": 2.8592941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157961, + "balance_loss_mlp": 1.04609525, + "epoch": 0.14871104270873411, + "flos": 692247230976.0, + "grad_norm": 0.030313888046816524, + "language_loss": 0.95830965, + "learning_rate": 0.0009635036981957581, + "loss": 0.96988928, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 1.11816406, + "step": 773, + "time_per_iteration": 2.8690600395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160765, + "balance_loss_mlp": 1.04904246, + "epoch": 0.1489034243939977, + "flos": 656282600448.0, + "grad_norm": 0.02808100337337059, + "language_loss": 0.98035401, + "learning_rate": 0.0009633867664521043, + "loss": 0.99196172, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 1.11669922, + "step": 774, + "time_per_iteration": 2.812833070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159463, + "balance_loss_mlp": 1.04788363, + "epoch": 0.14909580607926126, + "flos": 476795821056.0, + "grad_norm": 0.030787585825694654, + "language_loss": 0.97385693, + "learning_rate": 0.0009632696548057527, + "loss": 0.98545158, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 1.11523438, + "step": 775, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_mlp": 1.04910243, + "epoch": 0.14928818776452482, + "flos": 612283765248.0, + "grad_norm": 0.030552265213122824, + "language_loss": 0.94746792, + "learning_rate": 0.0009631523633021704, + "loss": 0.95907569, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 1.11621094, + "step": 776, + "time_per_iteration": 2.789336919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.04408133, + "epoch": 0.14948056944978838, + "flos": 562916241408.0, + "grad_norm": 0.02653866309736765, + "language_loss": 0.98006344, + "learning_rate": 0.0009630348919868936, + "loss": 0.99161637, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 1.11132812, + "step": 777, + "time_per_iteration": 2.708918571472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115506, + "balance_loss_mlp": 1.04395676, + "epoch": 0.14967295113505194, + "flos": 450111558144.0, + "grad_norm": 0.02761804701826243, + "language_loss": 0.92444694, + "learning_rate": 0.0009629172409055293, + "loss": 0.93599755, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 1.11035156, + "step": 778, + "time_per_iteration": 2.522322177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_mlp": 1.0435462, + "epoch": 0.1498653328203155, + "flos": 572428922880.0, + "grad_norm": 0.02112796064723151, + "language_loss": 0.9446094, + "learning_rate": 0.0009627994101037531, + "loss": 0.9561559, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 1.11035156, + "step": 779, + "time_per_iteration": 2.7606184482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154399, + "balance_loss_mlp": 1.0433439, + "epoch": 0.15005771450557906, + "flos": 632407570944.0, + "grad_norm": 0.02232887996041627, + "language_loss": 0.98232067, + "learning_rate": 0.0009626813996273114, + "loss": 0.99386466, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 1.10986328, + "step": 780, + "time_per_iteration": 2.8442463874816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_mlp": 1.04553461, + "epoch": 0.15025009619084262, + "flos": 579165298176.0, + "grad_norm": 0.021576328362923832, + "language_loss": 0.96611506, + "learning_rate": 0.0009625632095220198, + "loss": 0.97768044, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 1.109375, + "step": 781, + "time_per_iteration": 2.823941469192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156174, + "balance_loss_mlp": 1.04492784, + "epoch": 0.1504424778761062, + "flos": 484856222208.0, + "grad_norm": 0.023769174200548453, + "language_loss": 0.96595448, + "learning_rate": 0.0009624448398337637, + "loss": 0.97751617, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 1.11181641, + "step": 782, + "time_per_iteration": 2.517115354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153917, + "balance_loss_mlp": 1.04286146, + "epoch": 0.15063485956136977, + "flos": 763894513152.0, + "grad_norm": 0.022118467112767815, + "language_loss": 0.97773027, + "learning_rate": 0.0009623262906084984, + "loss": 0.98926944, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 1.10986328, + "step": 783, + "time_per_iteration": 2.9971072673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156171, + "balance_loss_mlp": 1.04554462, + "epoch": 0.15082724124663333, + "flos": 498676079616.0, + "grad_norm": 0.021733375764601555, + "language_loss": 0.99047554, + "learning_rate": 0.0009622075618922486, + "loss": 1.00203729, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 1.10546875, + "step": 784, + "time_per_iteration": 2.7209272384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161923, + "balance_loss_mlp": 1.05110586, + "epoch": 0.15101962293189689, + "flos": 510722019840.0, + "grad_norm": 0.02414763506099098, + "language_loss": 0.95223093, + "learning_rate": 0.0009620886537311091, + "loss": 0.96385014, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 1.10742188, + "step": 785, + "time_per_iteration": 2.668501138687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154688, + "balance_loss_mlp": 1.04406226, + "epoch": 0.15121200461716044, + "flos": 458701714944.0, + "grad_norm": 0.026890312379790088, + "language_loss": 0.97208995, + "learning_rate": 0.000961969566171244, + "loss": 0.98363686, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 1.10546875, + "step": 786, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153217, + "balance_loss_mlp": 1.04278123, + "epoch": 0.151404386302424, + "flos": 539017016832.0, + "grad_norm": 0.02528800532756524, + "language_loss": 1.00058115, + "learning_rate": 0.0009618502992588873, + "loss": 1.01211333, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 1.10351562, + "step": 787, + "time_per_iteration": 2.6463584899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154208, + "balance_loss_mlp": 1.04358232, + "epoch": 0.15159676798768756, + "flos": 689616643584.0, + "grad_norm": 0.023869082053813537, + "language_loss": 0.98612797, + "learning_rate": 0.0009617308530403424, + "loss": 0.99766994, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 1.10546875, + "step": 788, + "time_per_iteration": 3.065110921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158206, + "balance_loss_mlp": 1.04758012, + "epoch": 0.15178914967295112, + "flos": 546432869376.0, + "grad_norm": 0.025092696297707027, + "language_loss": 0.95288265, + "learning_rate": 0.0009616112275619825, + "loss": 0.96446472, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 1.10546875, + "step": 789, + "time_per_iteration": 2.7197253704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_mlp": 1.0478847, + "epoch": 0.1519815313582147, + "flos": 512814845952.0, + "grad_norm": 0.020890571468345706, + "language_loss": 0.90545368, + "learning_rate": 0.0009614914228702503, + "loss": 0.91703737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 1.10400391, + "step": 790, + "time_per_iteration": 2.6894142627716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158071, + "balance_loss_mlp": 1.04782641, + "epoch": 0.15217391304347827, + "flos": 685457187840.0, + "grad_norm": 0.02448742031060442, + "language_loss": 0.96480352, + "learning_rate": 0.0009613714390116581, + "loss": 0.97638422, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 1.1015625, + "step": 791, + "time_per_iteration": 2.9898860454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155788, + "balance_loss_mlp": 1.04568636, + "epoch": 0.15236629472874183, + "flos": 645445893120.0, + "grad_norm": 0.023088199171654812, + "language_loss": 0.93995309, + "learning_rate": 0.0009612512760327879, + "loss": 0.95151103, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 1.10009766, + "step": 792, + "time_per_iteration": 2.855648994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154532, + "balance_loss_mlp": 1.0444783, + "epoch": 0.1525586764140054, + "flos": 413764892160.0, + "grad_norm": 0.024948238648346503, + "language_loss": 0.97790802, + "learning_rate": 0.0009611309339802909, + "loss": 0.98945332, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 1.09960938, + "step": 793, + "time_per_iteration": 2.4684345722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153777, + "balance_loss_mlp": 1.04372334, + "epoch": 0.15275105809926895, + "flos": 804233448960.0, + "grad_norm": 0.02131820977076166, + "language_loss": 0.93039513, + "learning_rate": 0.0009610104129008881, + "loss": 0.94193292, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 1.09960938, + "step": 794, + "time_per_iteration": 3.1013269424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155691, + "balance_loss_mlp": 1.04554129, + "epoch": 0.1529434397845325, + "flos": 613542663168.0, + "grad_norm": 0.024012716250022468, + "language_loss": 0.97966266, + "learning_rate": 0.0009608897128413701, + "loss": 0.99121952, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 1.10058594, + "step": 795, + "time_per_iteration": 2.729837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154149, + "balance_loss_mlp": 1.04419053, + "epoch": 0.15313582146979607, + "flos": 616471418880.0, + "grad_norm": 0.02134077894827986, + "language_loss": 0.93399352, + "learning_rate": 0.0009607688338485965, + "loss": 0.945535, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 1.09863281, + "step": 796, + "time_per_iteration": 2.8517422676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04409015, + "epoch": 0.15332820315505963, + "flos": 794992012800.0, + "grad_norm": 0.02204541106277596, + "language_loss": 0.98951191, + "learning_rate": 0.0009606477759694969, + "loss": 1.00104761, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 1.09375, + "step": 797, + "time_per_iteration": 3.0313384532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153537, + "balance_loss_mlp": 1.0440551, + "epoch": 0.1535205848403232, + "flos": 551256339456.0, + "grad_norm": 0.028291975879130113, + "language_loss": 0.99155664, + "learning_rate": 0.0009605265392510703, + "loss": 1.00309205, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 1.09375, + "step": 798, + "time_per_iteration": 2.6558592319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150991, + "balance_loss_mlp": 1.04122281, + "epoch": 0.15371296652558677, + "flos": 536978585088.0, + "grad_norm": 0.02676367025649214, + "language_loss": 1.00762391, + "learning_rate": 0.0009604051237403846, + "loss": 1.01913381, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 1.09667969, + "step": 799, + "time_per_iteration": 2.6129424571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151653, + "balance_loss_mlp": 1.04198015, + "epoch": 0.15390534821085033, + "flos": 396089751552.0, + "grad_norm": 0.02759928767191203, + "language_loss": 0.9523741, + "learning_rate": 0.0009602835294845776, + "loss": 0.96389061, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 1.09570312, + "step": 800, + "time_per_iteration": 2.4865612983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152453, + "balance_loss_mlp": 1.04297161, + "epoch": 0.1540977298961139, + "flos": 536885259264.0, + "grad_norm": 0.0240348205061721, + "language_loss": 0.99338514, + "learning_rate": 0.0009601617565308565, + "loss": 1.00490952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 1.09375, + "step": 801, + "time_per_iteration": 2.646925449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155144, + "balance_loss_mlp": 1.04551864, + "epoch": 0.15429011158137745, + "flos": 725090449920.0, + "grad_norm": 0.022214532903779557, + "language_loss": 0.94821054, + "learning_rate": 0.0009600398049264977, + "loss": 0.95976186, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 1.09521484, + "step": 802, + "time_per_iteration": 3.0287652015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.04627085, + "epoch": 0.154482493266641, + "flos": 621748783104.0, + "grad_norm": 0.025430739734688717, + "language_loss": 1.02679133, + "learning_rate": 0.0009599176747188469, + "loss": 1.03834927, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 1.09423828, + "step": 803, + "time_per_iteration": 2.8240089416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156206, + "balance_loss_mlp": 1.0467242, + "epoch": 0.15467487495190457, + "flos": 526719297024.0, + "grad_norm": 0.024483654101252486, + "language_loss": 0.90705526, + "learning_rate": 0.0009597953659553196, + "loss": 0.91861731, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 1.09375, + "step": 804, + "time_per_iteration": 2.745878219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153494, + "balance_loss_mlp": 1.04386926, + "epoch": 0.15486725663716813, + "flos": 528759730176.0, + "grad_norm": 0.02516296775651391, + "language_loss": 0.97286022, + "learning_rate": 0.0009596728786833997, + "loss": 0.98439509, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 1.09521484, + "step": 805, + "time_per_iteration": 2.6471030712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_mlp": 1.04244983, + "epoch": 0.1550596383224317, + "flos": 1050278799360.0, + "grad_norm": 0.026563720364072098, + "language_loss": 0.9858942, + "learning_rate": 0.0009595502129506415, + "loss": 0.99741489, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 1.09521484, + "step": 806, + "time_per_iteration": 3.3734352588653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115037, + "balance_loss_mlp": 1.04088783, + "epoch": 0.15525202000769528, + "flos": 614836489728.0, + "grad_norm": 0.02624405223250092, + "language_loss": 0.91745955, + "learning_rate": 0.0009594273688046678, + "loss": 0.92896324, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 1.09375, + "step": 807, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153708, + "balance_loss_mlp": 1.04441667, + "epoch": 0.15544440169295884, + "flos": 534102222336.0, + "grad_norm": 0.028049278390969077, + "language_loss": 0.97350299, + "learning_rate": 0.000959304346293171, + "loss": 0.98504007, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 1.09179688, + "step": 808, + "time_per_iteration": 2.7285830974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164275, + "balance_loss_mlp": 1.05464995, + "epoch": 0.1556367833782224, + "flos": 645886325760.0, + "grad_norm": 0.033021349518653896, + "language_loss": 0.99046445, + "learning_rate": 0.0009591811454639125, + "loss": 1.00210714, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 1.09521484, + "step": 809, + "time_per_iteration": 2.842867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155411, + "balance_loss_mlp": 1.04612005, + "epoch": 0.15582916506348596, + "flos": 544952391168.0, + "grad_norm": 0.02421082053858415, + "language_loss": 0.95793635, + "learning_rate": 0.0009590577663647234, + "loss": 0.96949041, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 1.09179688, + "step": 810, + "time_per_iteration": 2.8207406997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158015, + "balance_loss_mlp": 1.04877126, + "epoch": 0.15602154674874952, + "flos": 581214463488.0, + "grad_norm": 0.022734781081273227, + "language_loss": 0.95110512, + "learning_rate": 0.0009589342090435036, + "loss": 0.96268523, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 1.09130859, + "step": 811, + "time_per_iteration": 2.8413872718811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170356, + "balance_loss_mlp": 1.06068361, + "epoch": 0.15621392843401308, + "flos": 536316572160.0, + "grad_norm": 0.026628933906638022, + "language_loss": 0.97807872, + "learning_rate": 0.0009588104735482223, + "loss": 0.98978221, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 1.09570312, + "step": 812, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164587, + "balance_loss_mlp": 1.05524826, + "epoch": 0.15640631011927664, + "flos": 551981478912.0, + "grad_norm": 0.027865461759282353, + "language_loss": 0.94247007, + "learning_rate": 0.0009586865599269177, + "loss": 0.95411587, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 1.09228516, + "step": 813, + "time_per_iteration": 2.655217409133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159004, + "balance_loss_mlp": 1.04985571, + "epoch": 0.1565986918045402, + "flos": 638635657728.0, + "grad_norm": 0.024501009698068087, + "language_loss": 0.98888743, + "learning_rate": 0.0009585624682276977, + "loss": 1.00047755, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 1.09033203, + "step": 814, + "time_per_iteration": 2.7572293281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160029, + "balance_loss_mlp": 1.05073786, + "epoch": 0.15679107348980378, + "flos": 491781250560.0, + "grad_norm": 0.02545428800843787, + "language_loss": 0.97158241, + "learning_rate": 0.0009584381984987386, + "loss": 0.98318267, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 1.09179688, + "step": 815, + "time_per_iteration": 2.554208517074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160766, + "balance_loss_mlp": 1.05185616, + "epoch": 0.15698345517506734, + "flos": 531002277888.0, + "grad_norm": 0.022736041606184667, + "language_loss": 0.98151159, + "learning_rate": 0.0009583137507882864, + "loss": 0.99311924, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 1.08789062, + "step": 816, + "time_per_iteration": 2.6635444164276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158696, + "balance_loss_mlp": 1.04978669, + "epoch": 0.1571758368603309, + "flos": 547077417984.0, + "grad_norm": 0.024009976747476527, + "language_loss": 0.90921289, + "learning_rate": 0.000958189125144656, + "loss": 0.92079985, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 1.08789062, + "step": 817, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156061, + "balance_loss_mlp": 1.04719925, + "epoch": 0.15736821854559446, + "flos": 566743326720.0, + "grad_norm": 0.021547949482456395, + "language_loss": 0.97883654, + "learning_rate": 0.0009580643216162313, + "loss": 0.99039721, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 1.08740234, + "step": 818, + "time_per_iteration": 2.673997640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157698, + "balance_loss_mlp": 1.04888415, + "epoch": 0.15756060023085802, + "flos": 501953943552.0, + "grad_norm": 0.023826624353146583, + "language_loss": 0.90112716, + "learning_rate": 0.0009579393402514652, + "loss": 0.91270417, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 1.08691406, + "step": 819, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156999, + "balance_loss_mlp": 1.04823244, + "epoch": 0.15775298191612158, + "flos": 520271631360.0, + "grad_norm": 0.023927295219635936, + "language_loss": 0.99075627, + "learning_rate": 0.0009578141810988801, + "loss": 1.00232625, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 1.08642578, + "step": 820, + "time_per_iteration": 2.591036558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.04111433, + "epoch": 0.15794536360138514, + "flos": 467087755776.0, + "grad_norm": 0.026283029611425073, + "language_loss": 1.00067806, + "learning_rate": 0.0009576888442070668, + "loss": 1.01217794, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 1.08740234, + "step": 821, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151894, + "balance_loss_mlp": 1.04279363, + "epoch": 0.1581377452866487, + "flos": 518168071680.0, + "grad_norm": 0.02399653039287492, + "language_loss": 1.01290274, + "learning_rate": 0.0009575633296246854, + "loss": 1.02442169, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 1.08984375, + "step": 822, + "time_per_iteration": 2.579575300216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.04312956, + "epoch": 0.15833012697191226, + "flos": 550837373952.0, + "grad_norm": 0.02407632334340799, + "language_loss": 0.91124117, + "learning_rate": 0.0009574376374004652, + "loss": 0.92275965, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 1.0859375, + "step": 823, + "time_per_iteration": 2.661754608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162901, + "balance_loss_mlp": 1.05446815, + "epoch": 0.15852250865717585, + "flos": 488466456576.0, + "grad_norm": 0.026327967105985502, + "language_loss": 0.90841949, + "learning_rate": 0.000957311767583204, + "loss": 0.92004848, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 1.08300781, + "step": 824, + "time_per_iteration": 2.7887372970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156753, + "balance_loss_mlp": 1.04956055, + "epoch": 0.1587148903424394, + "flos": 1312696909824.0, + "grad_norm": 0.010620587901871582, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.8322835, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 1.0703125, + "step": 825, + "time_per_iteration": 4.766167640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151145, + "balance_loss_mlp": 1.04304576, + "epoch": 0.15890727202770297, + "flos": 467832360960.0, + "grad_norm": 0.02959471781097451, + "language_loss": 1.0376749, + "learning_rate": 0.0009570594953650961, + "loss": 1.04918623, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 1.07958984, + "step": 826, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_mlp": 1.04354417, + "epoch": 0.15909965371296653, + "flos": 778606695936.0, + "grad_norm": 0.024366848241159877, + "language_loss": 0.8923949, + "learning_rate": 0.00095693309306219, + "loss": 0.90391278, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 1.08105469, + "step": 827, + "time_per_iteration": 3.1078274250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_mlp": 1.04449332, + "epoch": 0.1592920353982301, + "flos": 1079962950144.0, + "grad_norm": 0.02547465125103231, + "language_loss": 0.98567259, + "learning_rate": 0.0009568065133621244, + "loss": 0.99719906, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 1.08007812, + "step": 828, + "time_per_iteration": 3.3287436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147109, + "balance_loss_mlp": 1.03872418, + "epoch": 0.15948441708349365, + "flos": 726889837056.0, + "grad_norm": 0.026992334830630314, + "language_loss": 0.93815649, + "learning_rate": 0.0009566797563140422, + "loss": 0.94962764, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 1.08251953, + "step": 829, + "time_per_iteration": 2.8641507625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.03788006, + "epoch": 0.1596767987687572, + "flos": 580075087872.0, + "grad_norm": 0.026140449767567974, + "language_loss": 0.96191794, + "learning_rate": 0.0009565528219671547, + "loss": 0.97337818, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 1.08007812, + "step": 830, + "time_per_iteration": 2.9082329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147169, + "balance_loss_mlp": 1.03902268, + "epoch": 0.15986918045402077, + "flos": 530025358848.0, + "grad_norm": 0.02186736495212519, + "language_loss": 0.93771887, + "learning_rate": 0.0009564257103707418, + "loss": 0.94919056, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 1.08007812, + "step": 831, + "time_per_iteration": 4.109540700912476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.04519463, + "epoch": 0.16006156213928435, + "flos": 575669856768.0, + "grad_norm": 0.025156765484562034, + "language_loss": 1.01463771, + "learning_rate": 0.0009562984215741533, + "loss": 1.02617025, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 1.07910156, + "step": 832, + "time_per_iteration": 2.634381055831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148637, + "balance_loss_mlp": 1.0408721, + "epoch": 0.1602539438245479, + "flos": 516674858496.0, + "grad_norm": 0.023022886756030446, + "language_loss": 0.90665066, + "learning_rate": 0.0009561709556268065, + "loss": 0.91813707, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 1.07617188, + "step": 833, + "time_per_iteration": 2.7094552516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115539, + "balance_loss_mlp": 1.04752922, + "epoch": 0.16044632550981147, + "flos": 622161017856.0, + "grad_norm": 0.02456985500743924, + "language_loss": 1.0306673, + "learning_rate": 0.0009560433125781884, + "loss": 1.04222107, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 1.07714844, + "step": 834, + "time_per_iteration": 2.7217955589294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_mlp": 1.04794765, + "epoch": 0.16063870719507503, + "flos": 562127975424.0, + "grad_norm": 0.02550250825542428, + "language_loss": 1.02622008, + "learning_rate": 0.0009559154924778544, + "loss": 1.03778291, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 1.08203125, + "step": 835, + "time_per_iteration": 4.0438151359558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153381, + "balance_loss_mlp": 1.04509139, + "epoch": 0.1608310888803386, + "flos": 806560590336.0, + "grad_norm": 0.023331498233936678, + "language_loss": 0.93980491, + "learning_rate": 0.0009557874953754284, + "loss": 0.95133871, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 1.08154297, + "step": 836, + "time_per_iteration": 3.0253541469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155161, + "balance_loss_mlp": 1.04739583, + "epoch": 0.16102347056560215, + "flos": 601694108160.0, + "grad_norm": 0.024039154316001603, + "language_loss": 0.9449209, + "learning_rate": 0.0009556593213206038, + "loss": 0.95647246, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 1.07617188, + "step": 837, + "time_per_iteration": 2.815293788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148071, + "balance_loss_mlp": 1.04049647, + "epoch": 0.1612158522508657, + "flos": 554614794240.0, + "grad_norm": 0.024490980939479982, + "language_loss": 0.96443379, + "learning_rate": 0.0009555309703631414, + "loss": 0.9759146, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 1.07421875, + "step": 838, + "time_per_iteration": 2.7353601455688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148397, + "balance_loss_mlp": 1.0406791, + "epoch": 0.16140823393612927, + "flos": 557017797120.0, + "grad_norm": 0.026558461299776022, + "language_loss": 0.98485982, + "learning_rate": 0.0009554024425528722, + "loss": 0.99634379, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 1.07568359, + "step": 839, + "time_per_iteration": 2.801539182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146915, + "balance_loss_mlp": 1.03924477, + "epoch": 0.16160061562139286, + "flos": 544908730368.0, + "grad_norm": 0.023933605454050468, + "language_loss": 0.96992832, + "learning_rate": 0.0009552737379396948, + "loss": 0.98139745, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 1.07519531, + "step": 840, + "time_per_iteration": 2.613037586212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148515, + "balance_loss_mlp": 1.04122651, + "epoch": 0.16179299730665642, + "flos": 605006900736.0, + "grad_norm": 0.020652206840645122, + "language_loss": 0.95695615, + "learning_rate": 0.0009551448565735767, + "loss": 0.96844131, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 1.07128906, + "step": 841, + "time_per_iteration": 2.779979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149052, + "balance_loss_mlp": 1.04128659, + "epoch": 0.16198537899191998, + "flos": 788551077888.0, + "grad_norm": 0.02358864683094414, + "language_loss": 0.96423578, + "learning_rate": 0.0009550157985045543, + "loss": 0.97572625, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 1.07617188, + "step": 842, + "time_per_iteration": 3.0352344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148245, + "balance_loss_mlp": 1.04086173, + "epoch": 0.16217776067718354, + "flos": 520829584896.0, + "grad_norm": 0.02127918945612936, + "language_loss": 0.95624614, + "learning_rate": 0.0009548865637827321, + "loss": 0.96772861, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 1.07226562, + "step": 843, + "time_per_iteration": 2.695211172103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.04027128, + "epoch": 0.1623701423624471, + "flos": 506254388736.0, + "grad_norm": 0.02427958482397641, + "language_loss": 0.99469078, + "learning_rate": 0.0009547571524582838, + "loss": 1.00617111, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 1.07617188, + "step": 844, + "time_per_iteration": 2.586859941482544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_mlp": 1.03842914, + "epoch": 0.16256252404771065, + "flos": 498157057536.0, + "grad_norm": 0.025657026114593633, + "language_loss": 1.02873135, + "learning_rate": 0.0009546275645814512, + "loss": 1.04018748, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 1.0703125, + "step": 845, + "time_per_iteration": 2.735323190689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147597, + "balance_loss_mlp": 1.04040384, + "epoch": 0.16275490573297421, + "flos": 503286701568.0, + "grad_norm": 0.024743383464961046, + "language_loss": 1.00377154, + "learning_rate": 0.0009544978002025446, + "loss": 1.01524746, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 1.0703125, + "step": 846, + "time_per_iteration": 2.5876121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_mlp": 1.04189885, + "epoch": 0.16294728741823777, + "flos": 508353945600.0, + "grad_norm": 0.020876938588178177, + "language_loss": 0.94877481, + "learning_rate": 0.0009543678593719434, + "loss": 0.9602648, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 1.06933594, + "step": 847, + "time_per_iteration": 2.69250750541687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159847, + "balance_loss_mlp": 1.05274892, + "epoch": 0.16313966910350133, + "flos": 510756948480.0, + "grad_norm": 0.020936629725758764, + "language_loss": 0.95534647, + "learning_rate": 0.0009542377421400945, + "loss": 0.96694493, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 1.06933594, + "step": 848, + "time_per_iteration": 2.7832183837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146796, + "balance_loss_mlp": 1.03965068, + "epoch": 0.16333205078876492, + "flos": 545056450560.0, + "grad_norm": 0.023544058946573278, + "language_loss": 0.94486761, + "learning_rate": 0.0009541074485575145, + "loss": 0.95633554, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 1.06982422, + "step": 849, + "time_per_iteration": 2.7163026332855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147161, + "balance_loss_mlp": 1.03996801, + "epoch": 0.16352443247402848, + "flos": 508711785984.0, + "grad_norm": 0.023080110816121054, + "language_loss": 1.00550437, + "learning_rate": 0.0009539769786747874, + "loss": 1.01697588, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 1.0703125, + "step": 850, + "time_per_iteration": 2.5918350219726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152854, + "balance_loss_mlp": 1.04547, + "epoch": 0.16371681415929204, + "flos": 543222134784.0, + "grad_norm": 0.022593715242085626, + "language_loss": 0.90895152, + "learning_rate": 0.0009538463325425665, + "loss": 0.92048007, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 1.07226562, + "step": 851, + "time_per_iteration": 2.701662063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146583, + "balance_loss_mlp": 1.03939056, + "epoch": 0.1639091958445556, + "flos": 521760841728.0, + "grad_norm": 0.025319624949764974, + "language_loss": 0.95562863, + "learning_rate": 0.0009537155102115728, + "loss": 0.96709442, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 1.0703125, + "step": 852, + "time_per_iteration": 2.577416181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.03871727, + "epoch": 0.16410157752981916, + "flos": 548482034688.0, + "grad_norm": 0.022217218078565786, + "language_loss": 0.92332971, + "learning_rate": 0.0009535845117325961, + "loss": 0.93478549, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 1.06689453, + "step": 853, + "time_per_iteration": 2.643528699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148166, + "balance_loss_mlp": 1.04135406, + "epoch": 0.16429395921508272, + "flos": 584025698304.0, + "grad_norm": 0.02024018106959617, + "language_loss": 1.00128078, + "learning_rate": 0.0009534533371564946, + "loss": 1.01276231, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 1.06640625, + "step": 854, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150543, + "balance_loss_mlp": 1.04377949, + "epoch": 0.16448634090034628, + "flos": 531961732608.0, + "grad_norm": 0.02843561601072028, + "language_loss": 1.00094676, + "learning_rate": 0.0009533219865341949, + "loss": 1.01245213, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 1.06591797, + "step": 855, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.05014503, + "epoch": 0.16467872258560984, + "flos": 492960284160.0, + "grad_norm": 0.026495144396752456, + "language_loss": 0.95923662, + "learning_rate": 0.0009531904599166916, + "loss": 0.97080612, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 1.06640625, + "step": 856, + "time_per_iteration": 2.638528823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.04101861, + "epoch": 0.16487110427087343, + "flos": 507259505664.0, + "grad_norm": 0.02303677132947941, + "language_loss": 0.95950538, + "learning_rate": 0.0009530587573550478, + "loss": 0.97098505, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 1.06787109, + "step": 857, + "time_per_iteration": 2.5788354873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.04592896, + "epoch": 0.16506348595613698, + "flos": 1436108714496.0, + "grad_norm": 0.011861304780107247, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75470984, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 1.0546875, + "step": 858, + "time_per_iteration": 5.003005027770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153597, + "balance_loss_mlp": 1.04673755, + "epoch": 0.16525586764140054, + "flos": 478089647616.0, + "grad_norm": 0.02595402254221991, + "language_loss": 0.98057735, + "learning_rate": 0.0009527948246039337, + "loss": 0.99211335, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 1.06689453, + "step": 859, + "time_per_iteration": 2.541255474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152748, + "balance_loss_mlp": 1.04622293, + "epoch": 0.1654482493266641, + "flos": 882540518400.0, + "grad_norm": 0.024187417777422206, + "language_loss": 0.96476752, + "learning_rate": 0.000952662594516931, + "loss": 0.97629499, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 1.06347656, + "step": 860, + "time_per_iteration": 3.102233409881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154678, + "balance_loss_mlp": 1.04791439, + "epoch": 0.16564063101192766, + "flos": 628105124352.0, + "grad_norm": 0.02242324391324738, + "language_loss": 0.93166292, + "learning_rate": 0.0009525301886907234, + "loss": 0.94320977, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 1.06591797, + "step": 861, + "time_per_iteration": 2.871971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151309, + "balance_loss_mlp": 1.04487896, + "epoch": 0.16583301269719122, + "flos": 562592603136.0, + "grad_norm": 0.02248996903194516, + "language_loss": 0.97140592, + "learning_rate": 0.0009523976071767155, + "loss": 0.98291898, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 1.0625, + "step": 862, + "time_per_iteration": 2.653031349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146763, + "balance_loss_mlp": 1.04038036, + "epoch": 0.16602539438245478, + "flos": 568983873024.0, + "grad_norm": 0.020794335354585358, + "language_loss": 0.9646408, + "learning_rate": 0.00095226485002638, + "loss": 0.97610843, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 1.06201172, + "step": 863, + "time_per_iteration": 2.7685163021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147042, + "balance_loss_mlp": 1.04075551, + "epoch": 0.16621777606771834, + "flos": 576021692928.0, + "grad_norm": 0.021581021962121343, + "language_loss": 0.96560466, + "learning_rate": 0.0009521319172912576, + "loss": 0.9770751, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 1.06103516, + "step": 864, + "time_per_iteration": 2.762233257293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149511, + "balance_loss_mlp": 1.0432713, + "epoch": 0.16641015775298193, + "flos": 515597882880.0, + "grad_norm": 0.029880870913045234, + "language_loss": 1.0375855, + "learning_rate": 0.0009519988090229579, + "loss": 1.04908061, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 1.06054688, + "step": 865, + "time_per_iteration": 2.7156929969787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148426, + "balance_loss_mlp": 1.04199588, + "epoch": 0.1666025394382455, + "flos": 622849227264.0, + "grad_norm": 0.023088954173990716, + "language_loss": 0.96669209, + "learning_rate": 0.0009518655252731576, + "loss": 0.9781763, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 1.0625, + "step": 866, + "time_per_iteration": 2.76474928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147261, + "balance_loss_mlp": 1.04102135, + "epoch": 0.16679492112350905, + "flos": 549932313600.0, + "grad_norm": 0.021458749489738967, + "language_loss": 0.98467255, + "learning_rate": 0.0009517320660936022, + "loss": 0.99614513, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 1.06054688, + "step": 867, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151692, + "balance_loss_mlp": 1.04545259, + "epoch": 0.1669873028087726, + "flos": 666865526784.0, + "grad_norm": 0.02209258354681387, + "language_loss": 0.92114806, + "learning_rate": 0.0009515984315361051, + "loss": 0.93266487, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 1.06054688, + "step": 868, + "time_per_iteration": 2.845388412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.04563451, + "epoch": 0.16717968449403617, + "flos": 539603168256.0, + "grad_norm": 0.02501334283432316, + "language_loss": 0.95751995, + "learning_rate": 0.000951464621652548, + "loss": 0.96903574, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 1.05761719, + "step": 869, + "time_per_iteration": 2.623375415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148167, + "balance_loss_mlp": 1.04216599, + "epoch": 0.16737206617929973, + "flos": 531278252544.0, + "grad_norm": 0.02062860382438808, + "language_loss": 0.87610328, + "learning_rate": 0.0009513306364948804, + "loss": 0.88758498, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 1.05810547, + "step": 870, + "time_per_iteration": 2.792346239089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148065, + "balance_loss_mlp": 1.04206407, + "epoch": 0.1675644478645633, + "flos": 481756277760.0, + "grad_norm": 0.023236257285911367, + "language_loss": 0.98118269, + "learning_rate": 0.0009511964761151197, + "loss": 0.99266338, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 1.05810547, + "step": 871, + "time_per_iteration": 2.572923183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152601, + "balance_loss_mlp": 1.04669595, + "epoch": 0.16775682954982685, + "flos": 495541206528.0, + "grad_norm": 0.026661505796453877, + "language_loss": 0.99311042, + "learning_rate": 0.0009510621405653521, + "loss": 1.00463641, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 1.05712891, + "step": 872, + "time_per_iteration": 2.6296472549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_mlp": 1.04484987, + "epoch": 0.1679492112350904, + "flos": 753404912640.0, + "grad_norm": 0.029291148216183213, + "language_loss": 0.93300939, + "learning_rate": 0.0009509276298977309, + "loss": 0.94451261, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 1.05273438, + "step": 873, + "time_per_iteration": 3.0177366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150817, + "balance_loss_mlp": 1.04543638, + "epoch": 0.168141592920354, + "flos": 1137731977728.0, + "grad_norm": 0.021155110884158303, + "language_loss": 0.9134444, + "learning_rate": 0.0009507929441644778, + "loss": 0.92495263, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 1.05175781, + "step": 874, + "time_per_iteration": 3.53277325630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160399, + "balance_loss_mlp": 1.05501771, + "epoch": 0.16833397460561755, + "flos": 633553677312.0, + "grad_norm": 0.025508723945600786, + "language_loss": 0.94342184, + "learning_rate": 0.0009506580834178826, + "loss": 0.95502585, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 1.05175781, + "step": 875, + "time_per_iteration": 2.763296365737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151031, + "balance_loss_mlp": 1.04560196, + "epoch": 0.1685263562908811, + "flos": 542542657536.0, + "grad_norm": 0.0234395143242784, + "language_loss": 1.00066125, + "learning_rate": 0.0009505230477103028, + "loss": 1.01217151, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 1.05224609, + "step": 876, + "time_per_iteration": 2.7256453037261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143495, + "balance_loss_mlp": 1.03801847, + "epoch": 0.16871873797614467, + "flos": 620485155840.0, + "grad_norm": 0.02951425183806971, + "language_loss": 0.91949958, + "learning_rate": 0.0009503878370941641, + "loss": 0.93093449, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 1.05273438, + "step": 877, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143733, + "balance_loss_mlp": 1.038257, + "epoch": 0.16891111966140823, + "flos": 607455565824.0, + "grad_norm": 0.02526909046796152, + "language_loss": 0.99137431, + "learning_rate": 0.0009502524516219595, + "loss": 1.00281167, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 1.05273438, + "step": 878, + "time_per_iteration": 2.7107326984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145725, + "balance_loss_mlp": 1.04005778, + "epoch": 0.1691035013466718, + "flos": 553405561344.0, + "grad_norm": 0.023246247090994255, + "language_loss": 0.99022686, + "learning_rate": 0.0009501168913462506, + "loss": 1.00168419, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 1.0546875, + "step": 879, + "time_per_iteration": 2.654356002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04866791, + "epoch": 0.16929588303193535, + "flos": 1479305822208.0, + "grad_norm": 0.014844444469597292, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.802755, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 1.046875, + "step": 880, + "time_per_iteration": 4.877387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114571, + "balance_loss_mlp": 1.04042399, + "epoch": 0.1694882647171989, + "flos": 927846641664.0, + "grad_norm": 0.023879743421000837, + "language_loss": 0.93963408, + "learning_rate": 0.0009498452465949042, + "loss": 0.95109117, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 1.05078125, + "step": 881, + "time_per_iteration": 3.241151809692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0447762, + "epoch": 0.1696806464024625, + "flos": 547151278080.0, + "grad_norm": 0.02293023114251512, + "language_loss": 0.98854458, + "learning_rate": 0.0009497091622247285, + "loss": 1.0000447, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 1.05029297, + "step": 882, + "time_per_iteration": 2.720453977584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145786, + "balance_loss_mlp": 1.0406431, + "epoch": 0.16987302808772606, + "flos": 530294602752.0, + "grad_norm": 0.02459483675822623, + "language_loss": 1.0302248, + "learning_rate": 0.0009495729032619723, + "loss": 1.04168272, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 1.04931641, + "step": 883, + "time_per_iteration": 2.717176675796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151842, + "balance_loss_mlp": 1.04731977, + "epoch": 0.17006540977298962, + "flos": 756478660608.0, + "grad_norm": 0.02507713686866634, + "language_loss": 0.9295364, + "learning_rate": 0.0009494364697595354, + "loss": 0.94105482, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 1.04589844, + "step": 884, + "time_per_iteration": 2.924898147583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157567, + "balance_loss_mlp": 1.05271089, + "epoch": 0.17025779145825318, + "flos": 559874694144.0, + "grad_norm": 0.025110060032482954, + "language_loss": 0.98774076, + "learning_rate": 0.0009492998617703867, + "loss": 0.99931645, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 1.04833984, + "step": 885, + "time_per_iteration": 2.6759417057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.05104423, + "epoch": 0.17045017314351674, + "flos": 513216347136.0, + "grad_norm": 0.0280627140127875, + "language_loss": 0.96898842, + "learning_rate": 0.0009491630793475619, + "loss": 0.98054218, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 1.04492188, + "step": 886, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149096, + "balance_loss_mlp": 1.04452574, + "epoch": 0.1706425548287803, + "flos": 510012343296.0, + "grad_norm": 0.023090423796267925, + "language_loss": 0.94873035, + "learning_rate": 0.0009490261225441643, + "loss": 0.96022129, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 1.04638672, + "step": 887, + "time_per_iteration": 2.960139513015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_mlp": 1.04508829, + "epoch": 0.17083493651404386, + "flos": 718714642944.0, + "grad_norm": 0.024954435208077393, + "language_loss": 0.98478651, + "learning_rate": 0.0009488889914133656, + "loss": 0.99628592, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 1.04833984, + "step": 888, + "time_per_iteration": 3.0498712062835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_mlp": 1.04649353, + "epoch": 0.17102731819930742, + "flos": 560200333824.0, + "grad_norm": 0.020862133880352407, + "language_loss": 0.97394216, + "learning_rate": 0.0009487516860084047, + "loss": 0.98545229, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 1.046875, + "step": 889, + "time_per_iteration": 2.799579381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115955, + "balance_loss_mlp": 1.0542171, + "epoch": 0.17121969988457098, + "flos": 495764788224.0, + "grad_norm": 0.030159167385703775, + "language_loss": 0.99659365, + "learning_rate": 0.0009486142063825884, + "loss": 1.0081892, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 1.05126953, + "step": 890, + "time_per_iteration": 2.5897767543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05718231, + "epoch": 0.17141208156983456, + "flos": 1552105941504.0, + "grad_norm": 0.012289453069715352, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73586774, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 1.03515625, + "step": 891, + "time_per_iteration": 4.971697807312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05527556, + "epoch": 0.17160446325509812, + "flos": 620700005376.0, + "grad_norm": 0.02677753623279009, + "language_loss": 1.00227833, + "learning_rate": 0.0009483387246819542, + "loss": 1.01388383, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 1.05078125, + "step": 892, + "time_per_iteration": 2.7142419815063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153152, + "balance_loss_mlp": 1.04977417, + "epoch": 0.17179684494036168, + "flos": 1384693300224.0, + "grad_norm": 0.011012484205567044, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.8343873, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 1.03515625, + "step": 893, + "time_per_iteration": 4.678752183914185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159751, + "balance_loss_mlp": 1.05446541, + "epoch": 0.17198922662562524, + "flos": 493641762816.0, + "grad_norm": 0.02464509578240857, + "language_loss": 0.9638195, + "learning_rate": 0.0009480625467392688, + "loss": 0.97541702, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 1.05175781, + "step": 894, + "time_per_iteration": 2.6579103469848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158279, + "balance_loss_mlp": 1.05490112, + "epoch": 0.1721816083108888, + "flos": 1461485689344.0, + "grad_norm": 0.014844728137103481, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79152954, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 1.03515625, + "step": 895, + "time_per_iteration": 4.754615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157074, + "balance_loss_mlp": 1.0523603, + "epoch": 0.17237398999615236, + "flos": 529204892160.0, + "grad_norm": 0.024157534092911288, + "language_loss": 0.95005947, + "learning_rate": 0.0009477856729834196, + "loss": 0.96163023, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 1.046875, + "step": 896, + "time_per_iteration": 2.7640984058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.05742288, + "epoch": 0.17256637168141592, + "flos": 605026366464.0, + "grad_norm": 0.02447501108745492, + "language_loss": 0.9782356, + "learning_rate": 0.0009476469753098809, + "loss": 0.98985219, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 1.04394531, + "step": 897, + "time_per_iteration": 2.7016282081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153769, + "balance_loss_mlp": 1.04957986, + "epoch": 0.17275875336667948, + "flos": 510693821952.0, + "grad_norm": 0.025419887327313116, + "language_loss": 0.94868481, + "learning_rate": 0.0009475081038443738, + "loss": 0.96022242, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 1.04345703, + "step": 898, + "time_per_iteration": 2.5731348991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148609, + "balance_loss_mlp": 1.0446589, + "epoch": 0.17295113505194307, + "flos": 666500955648.0, + "grad_norm": 0.02623291269769982, + "language_loss": 0.95752573, + "learning_rate": 0.0009473690586408124, + "loss": 0.96901178, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 1.04101562, + "step": 899, + "time_per_iteration": 2.8549156188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146417, + "balance_loss_mlp": 1.04227531, + "epoch": 0.17314351673720663, + "flos": 556431645696.0, + "grad_norm": 0.022300666942289, + "language_loss": 0.94826102, + "learning_rate": 0.0009472298397531792, + "loss": 0.9597252, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 1.04296875, + "step": 900, + "time_per_iteration": 2.7165167331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145486, + "balance_loss_mlp": 1.04124928, + "epoch": 0.17333589842247019, + "flos": 504606724608.0, + "grad_norm": 0.023477361471443404, + "language_loss": 0.95443118, + "learning_rate": 0.0009470904472355235, + "loss": 0.96588612, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 1.04394531, + "step": 901, + "time_per_iteration": 2.668320655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_mlp": 1.03967023, + "epoch": 0.17352828010773375, + "flos": 557350167552.0, + "grad_norm": 0.02470997420275152, + "language_loss": 0.90534914, + "learning_rate": 0.0009469508811419626, + "loss": 0.91678727, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 1.04296875, + "step": 902, + "time_per_iteration": 2.714174747467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_mlp": 1.05331421, + "epoch": 0.1737206617929973, + "flos": 1557791537664.0, + "grad_norm": 0.011695515468407039, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7276957, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 1.02539062, + "step": 903, + "time_per_iteration": 4.783574104309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146888, + "balance_loss_mlp": 1.04308009, + "epoch": 0.17391304347826086, + "flos": 517755836928.0, + "grad_norm": 0.027522671456014093, + "language_loss": 0.94518518, + "learning_rate": 0.0009466712284439292, + "loss": 0.95665407, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 1.03955078, + "step": 904, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011486, + "balance_loss_mlp": 1.04503071, + "epoch": 0.17410542516352442, + "flos": 542160622080.0, + "grad_norm": 0.027186859166075866, + "language_loss": 0.99262786, + "learning_rate": 0.0009465311419480276, + "loss": 1.00411391, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 1.03710938, + "step": 905, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153491, + "balance_loss_mlp": 1.05011249, + "epoch": 0.17429780684878798, + "flos": 625081041408.0, + "grad_norm": 0.028950662808853365, + "language_loss": 0.96674442, + "learning_rate": 0.0009463908820933622, + "loss": 0.97827929, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 1.03515625, + "step": 906, + "time_per_iteration": 2.8291828632354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151914, + "balance_loss_mlp": 1.04844034, + "epoch": 0.17449018853405157, + "flos": 576848890368.0, + "grad_norm": 0.03002954803612974, + "language_loss": 0.90420532, + "learning_rate": 0.0009462504489343868, + "loss": 0.91572446, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 1.03613281, + "step": 907, + "time_per_iteration": 2.8554108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_mlp": 1.04341269, + "epoch": 0.17468257021931513, + "flos": 534772967424.0, + "grad_norm": 0.024073731406752365, + "language_loss": 1.01002121, + "learning_rate": 0.0009461098425256222, + "loss": 1.02149189, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 1.03808594, + "step": 908, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114306, + "balance_loss_mlp": 1.03930068, + "epoch": 0.1748749519045787, + "flos": 541808785920.0, + "grad_norm": 0.02493910110608304, + "language_loss": 0.93412566, + "learning_rate": 0.0009459690629216567, + "loss": 0.94555628, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 1.0390625, + "step": 909, + "time_per_iteration": 2.670389413833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150977, + "balance_loss_mlp": 1.04688334, + "epoch": 0.17506733358984225, + "flos": 499626802176.0, + "grad_norm": 0.02402970341263653, + "language_loss": 0.96272469, + "learning_rate": 0.0009458281101771457, + "loss": 0.97423446, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 1.04248047, + "step": 910, + "time_per_iteration": 2.6256320476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153015, + "balance_loss_mlp": 1.04906452, + "epoch": 0.1752597152751058, + "flos": 624132320256.0, + "grad_norm": 0.023679811966199643, + "language_loss": 0.91450173, + "learning_rate": 0.0009456869843468122, + "loss": 0.92603183, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 1.04101562, + "step": 911, + "time_per_iteration": 2.863004207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158204, + "balance_loss_mlp": 1.05434883, + "epoch": 0.17545209696036937, + "flos": 521993155584.0, + "grad_norm": 0.029813530713564303, + "language_loss": 0.92364156, + "learning_rate": 0.0009455456854854459, + "loss": 0.93522358, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 1.04003906, + "step": 912, + "time_per_iteration": 2.616231918334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_mlp": 1.04612815, + "epoch": 0.17564447864563293, + "flos": 462945764352.0, + "grad_norm": 0.02810445184103091, + "language_loss": 0.92624664, + "learning_rate": 0.0009454042136479039, + "loss": 0.93774742, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 1.04101562, + "step": 913, + "time_per_iteration": 2.5944247245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.05199766, + "epoch": 0.1758368603308965, + "flos": 481617289728.0, + "grad_norm": 0.02706355326928303, + "language_loss": 0.91841793, + "learning_rate": 0.0009452625688891103, + "loss": 0.92997456, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 1.03808594, + "step": 914, + "time_per_iteration": 2.580941915512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144051, + "balance_loss_mlp": 1.04200745, + "epoch": 0.17602924201616005, + "flos": 1482084856320.0, + "grad_norm": 0.009713749524187035, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79878789, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 1.02148438, + "step": 915, + "time_per_iteration": 4.592097997665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148996, + "balance_loss_mlp": 1.04523647, + "epoch": 0.17622162370142364, + "flos": 603470026752.0, + "grad_norm": 0.02797967110469985, + "language_loss": 1.03421283, + "learning_rate": 0.0009449787608278015, + "loss": 1.0457027, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 1.0390625, + "step": 916, + "time_per_iteration": 2.755580425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_mlp": 1.04677713, + "epoch": 0.1764140053866872, + "flos": 443605495296.0, + "grad_norm": 0.024189441248888145, + "language_loss": 1.00777316, + "learning_rate": 0.0009448365976354704, + "loss": 1.01927423, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 1.03466797, + "step": 917, + "time_per_iteration": 2.4922571182250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_mlp": 1.04567707, + "epoch": 0.17660638707195075, + "flos": 501591373824.0, + "grad_norm": 0.028333637349232343, + "language_loss": 1.01507974, + "learning_rate": 0.0009446942617422558, + "loss": 1.02657032, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 1.03515625, + "step": 918, + "time_per_iteration": 2.574998378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148191, + "balance_loss_mlp": 1.0448128, + "epoch": 0.17679876875721431, + "flos": 539983202304.0, + "grad_norm": 0.02432410226762854, + "language_loss": 0.94564992, + "learning_rate": 0.0009445517532034176, + "loss": 0.9571318, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 1.03515625, + "step": 919, + "time_per_iteration": 2.7170355319976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153425, + "balance_loss_mlp": 1.05009484, + "epoch": 0.17699115044247787, + "flos": 498715011072.0, + "grad_norm": 0.026165935935680888, + "language_loss": 0.99032271, + "learning_rate": 0.0009444090720742824, + "loss": 1.00185692, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 1.03466797, + "step": 920, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149157, + "balance_loss_mlp": 1.04587448, + "epoch": 0.17718353212774143, + "flos": 663915303936.0, + "grad_norm": 0.025722324934358026, + "language_loss": 0.98290348, + "learning_rate": 0.0009442662184102439, + "loss": 0.99439508, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 1.03417969, + "step": 921, + "time_per_iteration": 2.7612035274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145605, + "balance_loss_mlp": 1.04251313, + "epoch": 0.177375913813005, + "flos": 583847778816.0, + "grad_norm": 0.021564117555322487, + "language_loss": 0.93569565, + "learning_rate": 0.000944123192266763, + "loss": 0.94715166, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 1.03222656, + "step": 922, + "time_per_iteration": 2.8110268115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141792, + "balance_loss_mlp": 1.03865182, + "epoch": 0.17756829549826855, + "flos": 553683537408.0, + "grad_norm": 0.021487036209533367, + "language_loss": 0.92858881, + "learning_rate": 0.0009439799936993671, + "loss": 0.94000673, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 1.03271484, + "step": 923, + "time_per_iteration": 2.7440245151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142202, + "balance_loss_mlp": 1.03901482, + "epoch": 0.17776067718353214, + "flos": 557371634688.0, + "grad_norm": 0.02463154633112553, + "language_loss": 0.97990632, + "learning_rate": 0.0009438366227636511, + "loss": 0.99132836, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 1.03320312, + "step": 924, + "time_per_iteration": 2.7032759189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140208, + "balance_loss_mlp": 1.03721154, + "epoch": 0.1779530588687957, + "flos": 659651788800.0, + "grad_norm": 0.022941473179093813, + "language_loss": 0.94988692, + "learning_rate": 0.0009436930795152763, + "loss": 0.96128899, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 1.03125, + "step": 925, + "time_per_iteration": 2.854522943496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143555, + "balance_loss_mlp": 1.04084456, + "epoch": 0.17814544055405926, + "flos": 645671476224.0, + "grad_norm": 0.02421412975678805, + "language_loss": 0.95479, + "learning_rate": 0.0009435493640099713, + "loss": 0.9662255, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 1.02832031, + "step": 926, + "time_per_iteration": 2.8268251419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143389, + "balance_loss_mlp": 1.04077399, + "epoch": 0.17833782223932282, + "flos": 461884251648.0, + "grad_norm": 0.0252062590806445, + "language_loss": 0.94177145, + "learning_rate": 0.0009434054763035314, + "loss": 0.95320535, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 1.02734375, + "step": 927, + "time_per_iteration": 2.629499673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139685, + "balance_loss_mlp": 1.03706956, + "epoch": 0.17853020392458638, + "flos": 760852965888.0, + "grad_norm": 0.02122720378042075, + "language_loss": 0.93181551, + "learning_rate": 0.0009432614164518185, + "loss": 0.94321233, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 1.02734375, + "step": 928, + "time_per_iteration": 2.9364700317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140803, + "balance_loss_mlp": 1.03818727, + "epoch": 0.17872258560984994, + "flos": 784055248896.0, + "grad_norm": 0.023477252169520995, + "language_loss": 0.93520033, + "learning_rate": 0.000943117184510762, + "loss": 0.94660836, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 1.02734375, + "step": 929, + "time_per_iteration": 3.07600474357605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150169, + "balance_loss_mlp": 1.04831696, + "epoch": 0.1789149672951135, + "flos": 1463031295488.0, + "grad_norm": 0.013755703560815407, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7994014, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 1.01953125, + "step": 930, + "time_per_iteration": 5.029282808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153323, + "balance_loss_mlp": 1.05099344, + "epoch": 0.17910734898037706, + "flos": 504930362880.0, + "grad_norm": 0.023999213273897636, + "language_loss": 0.96652937, + "learning_rate": 0.0009428282045846674, + "loss": 0.97806263, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 1.02441406, + "step": 931, + "time_per_iteration": 2.7112410068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145421, + "balance_loss_mlp": 1.04275823, + "epoch": 0.17929973066564064, + "flos": 747669651456.0, + "grad_norm": 0.02006943819739268, + "language_loss": 0.96385491, + "learning_rate": 0.0009426834567118214, + "loss": 0.97530913, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 1.02783203, + "step": 932, + "time_per_iteration": 3.0711913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143693, + "balance_loss_mlp": 1.04098177, + "epoch": 0.1794921123509042, + "flos": 714572651520.0, + "grad_norm": 0.021210123960592832, + "language_loss": 0.89608383, + "learning_rate": 0.0009425385369740155, + "loss": 0.90752071, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 1.02832031, + "step": 933, + "time_per_iteration": 3.059857130050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114727, + "balance_loss_mlp": 1.0451318, + "epoch": 0.17968449403616776, + "flos": 634361409024.0, + "grad_norm": 0.02299955090486112, + "language_loss": 0.96636283, + "learning_rate": 0.0009423934454275125, + "loss": 0.97783554, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 1.02246094, + "step": 934, + "time_per_iteration": 2.85917592048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146917, + "balance_loss_mlp": 1.04477859, + "epoch": 0.17987687572143132, + "flos": 537378084864.0, + "grad_norm": 0.02461268142415081, + "language_loss": 1.01075852, + "learning_rate": 0.0009422481821286418, + "loss": 1.02222764, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 1.02246094, + "step": 935, + "time_per_iteration": 2.7314486503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150005, + "balance_loss_mlp": 1.04777098, + "epoch": 0.18006925740669488, + "flos": 539119074816.0, + "grad_norm": 0.026258801194945027, + "language_loss": 0.98970592, + "learning_rate": 0.0009421027471337998, + "loss": 1.00120604, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 1.0234375, + "step": 936, + "time_per_iteration": 2.6354496479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151337, + "balance_loss_mlp": 1.04891205, + "epoch": 0.18026163909195844, + "flos": 540534425088.0, + "grad_norm": 0.029056123283387615, + "language_loss": 0.94782555, + "learning_rate": 0.0009419571404994493, + "loss": 0.9593389, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 1.02539062, + "step": 937, + "time_per_iteration": 2.6368348598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_mlp": 1.04649317, + "epoch": 0.180454020777222, + "flos": 501682698240.0, + "grad_norm": 0.026973093946582868, + "language_loss": 1.00715971, + "learning_rate": 0.00094181136228212, + "loss": 1.01864934, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 1.02587891, + "step": 938, + "time_per_iteration": 2.710451602935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145832, + "balance_loss_mlp": 1.043455, + "epoch": 0.18064640246248556, + "flos": 500006836224.0, + "grad_norm": 0.02510488837562242, + "language_loss": 0.93535352, + "learning_rate": 0.0009416654125384077, + "loss": 0.9468118, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 1.02490234, + "step": 939, + "time_per_iteration": 2.728480577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145905, + "balance_loss_mlp": 1.04424286, + "epoch": 0.18083878414774912, + "flos": 1522290808320.0, + "grad_norm": 0.01070150853185005, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80918276, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 1.01757812, + "step": 940, + "time_per_iteration": 4.915560007095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145419, + "balance_loss_mlp": 1.04318535, + "epoch": 0.1810311658330127, + "flos": 728665755648.0, + "grad_norm": 0.023936590350452012, + "language_loss": 0.92724693, + "learning_rate": 0.000941372998698552, + "loss": 0.93870103, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 1.0234375, + "step": 941, + "time_per_iteration": 2.993441343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140689, + "balance_loss_mlp": 1.0385505, + "epoch": 0.18122354751827627, + "flos": 566044383744.0, + "grad_norm": 0.025062658148163358, + "language_loss": 0.94270039, + "learning_rate": 0.0009412265347159336, + "loss": 0.95410728, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 1.02246094, + "step": 942, + "time_per_iteration": 2.731416702270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140669, + "balance_loss_mlp": 1.03848326, + "epoch": 0.18141592920353983, + "flos": 520317293568.0, + "grad_norm": 0.024682729806918415, + "language_loss": 0.94559634, + "learning_rate": 0.0009410798994339829, + "loss": 0.95700312, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 1.02294922, + "step": 943, + "time_per_iteration": 2.6001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.03650522, + "epoch": 0.1816083108888034, + "flos": 513476858880.0, + "grad_norm": 0.022579221317186333, + "language_loss": 0.95589852, + "learning_rate": 0.000940933092909628, + "loss": 0.96728498, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 1.02246094, + "step": 944, + "time_per_iteration": 2.6360957622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_mlp": 1.04550409, + "epoch": 0.18180069257406695, + "flos": 493372518912.0, + "grad_norm": 0.02569410792888805, + "language_loss": 0.9276287, + "learning_rate": 0.0009407861151998649, + "loss": 0.93910229, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 1.01953125, + "step": 945, + "time_per_iteration": 2.6910903453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147749, + "balance_loss_mlp": 1.04608703, + "epoch": 0.1819930742593305, + "flos": 571230423552.0, + "grad_norm": 0.024877151530798884, + "language_loss": 0.95025092, + "learning_rate": 0.0009406389663617552, + "loss": 0.96172833, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 1.01757812, + "step": 946, + "time_per_iteration": 2.689232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_mlp": 1.03669131, + "epoch": 0.18218545594459407, + "flos": 607110460416.0, + "grad_norm": 0.026141117268158143, + "language_loss": 0.96229172, + "learning_rate": 0.000940491646452427, + "loss": 0.97367907, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 1.02148438, + "step": 947, + "time_per_iteration": 2.720996618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136776, + "balance_loss_mlp": 1.03473294, + "epoch": 0.18237783762985763, + "flos": 549738931200.0, + "grad_norm": 0.02114848591843324, + "language_loss": 0.99382234, + "learning_rate": 0.000940344155529075, + "loss": 1.00519001, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 1.02148438, + "step": 948, + "time_per_iteration": 2.655764102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136656, + "balance_loss_mlp": 1.03489935, + "epoch": 0.1825702193151212, + "flos": 451674628608.0, + "grad_norm": 0.027816765537183038, + "language_loss": 0.98392528, + "learning_rate": 0.0009401964936489605, + "loss": 0.99529195, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 1.01855469, + "step": 949, + "time_per_iteration": 2.5372273921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_mlp": 1.03615081, + "epoch": 0.18276260100038477, + "flos": 590384040960.0, + "grad_norm": 0.023066854335363023, + "language_loss": 0.93237805, + "learning_rate": 0.0009400486608694108, + "loss": 0.94375616, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 1.01757812, + "step": 950, + "time_per_iteration": 2.7370681762695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139002, + "balance_loss_mlp": 1.03719783, + "epoch": 0.18295498268564833, + "flos": 788709531648.0, + "grad_norm": 0.02337801281240106, + "language_loss": 0.97100747, + "learning_rate": 0.0009399006572478195, + "loss": 0.98239744, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 1.01904297, + "step": 951, + "time_per_iteration": 3.1136744022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144566, + "balance_loss_mlp": 1.04276168, + "epoch": 0.1831473643709119, + "flos": 579225696768.0, + "grad_norm": 0.024500893588447415, + "language_loss": 0.99522519, + "learning_rate": 0.0009397524828416468, + "loss": 1.00667083, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 1.01904297, + "step": 952, + "time_per_iteration": 2.680551767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.03664696, + "epoch": 0.18333974605617545, + "flos": 567963293184.0, + "grad_norm": 0.023361368133084506, + "language_loss": 1.04812968, + "learning_rate": 0.0009396041377084192, + "loss": 1.05951309, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 1.01806641, + "step": 953, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136097, + "balance_loss_mlp": 1.03443527, + "epoch": 0.183532127741439, + "flos": 528069519360.0, + "grad_norm": 0.02324700647994909, + "language_loss": 0.98137838, + "learning_rate": 0.0009394556219057295, + "loss": 0.99273932, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 1.01757812, + "step": 954, + "time_per_iteration": 2.6928489208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147999, + "balance_loss_mlp": 1.04671907, + "epoch": 0.18372450942670257, + "flos": 595643940864.0, + "grad_norm": 0.02338261009959255, + "language_loss": 0.93879586, + "learning_rate": 0.0009393069354912362, + "loss": 0.95027584, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 1.01367188, + "step": 955, + "time_per_iteration": 2.7496042251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.05067647, + "epoch": 0.18391689111196613, + "flos": 646283824128.0, + "grad_norm": 0.029421035614033756, + "language_loss": 0.90626895, + "learning_rate": 0.0009391580785226649, + "loss": 0.91778857, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 1.01367188, + "step": 956, + "time_per_iteration": 2.9440600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.05253601, + "epoch": 0.18410927279722972, + "flos": 1460391975936.0, + "grad_norm": 0.020211591247266292, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80492932, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 1.0, + "step": 957, + "time_per_iteration": 4.738964796066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138037, + "balance_loss_mlp": 1.03623211, + "epoch": 0.18430165448249328, + "flos": 660003624960.0, + "grad_norm": 0.026926680065899915, + "language_loss": 0.95339954, + "learning_rate": 0.0009388598531545196, + "loss": 0.96477991, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 1.01904297, + "step": 958, + "time_per_iteration": 2.859509229660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138629, + "balance_loss_mlp": 1.03687191, + "epoch": 0.18449403616775684, + "flos": 518949606912.0, + "grad_norm": 0.029778126611616895, + "language_loss": 0.94583583, + "learning_rate": 0.000938710484870727, + "loss": 0.9572221, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 1.01855469, + "step": 959, + "time_per_iteration": 2.565548896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137101, + "balance_loss_mlp": 1.03543901, + "epoch": 0.1846864178530204, + "flos": 553824526848.0, + "grad_norm": 0.027283874554685776, + "language_loss": 0.94945395, + "learning_rate": 0.0009385609462644189, + "loss": 0.96082497, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 1.01757812, + "step": 960, + "time_per_iteration": 2.676379919052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138569, + "balance_loss_mlp": 1.03709817, + "epoch": 0.18487879953828396, + "flos": 467115953664.0, + "grad_norm": 0.025693285519799033, + "language_loss": 0.96468461, + "learning_rate": 0.0009384112373936514, + "loss": 0.97607034, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 1.015625, + "step": 961, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154728, + "balance_loss_mlp": 1.05325735, + "epoch": 0.18507118122354752, + "flos": 649683211776.0, + "grad_norm": 0.02725538915325764, + "language_loss": 1.0098747, + "learning_rate": 0.0009382613583165467, + "loss": 1.02142203, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 1.015625, + "step": 962, + "time_per_iteration": 2.8268754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116263, + "balance_loss_mlp": 1.06125438, + "epoch": 0.18526356290881107, + "flos": 627922475520.0, + "grad_norm": 0.027998512126097927, + "language_loss": 0.99849832, + "learning_rate": 0.0009381113090912928, + "loss": 1.01012468, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 1.01464844, + "step": 963, + "time_per_iteration": 2.7762861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147698, + "balance_loss_mlp": 1.04679894, + "epoch": 0.18545594459407463, + "flos": 433645650432.0, + "grad_norm": 0.027008272304904758, + "language_loss": 0.98634118, + "learning_rate": 0.000937961089776144, + "loss": 0.99781811, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 1.00976562, + "step": 964, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149635, + "balance_loss_mlp": 1.04844999, + "epoch": 0.1856483262793382, + "flos": 750426491904.0, + "grad_norm": 0.028502333826765886, + "language_loss": 0.91998804, + "learning_rate": 0.0009378107004294208, + "loss": 0.93148446, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 1.01269531, + "step": 965, + "time_per_iteration": 2.964561939239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_mlp": 1.05057883, + "epoch": 0.18584070796460178, + "flos": 531401777664.0, + "grad_norm": 0.02451376704559663, + "language_loss": 1.00210857, + "learning_rate": 0.0009376601411095096, + "loss": 1.01362348, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 1.00976562, + "step": 966, + "time_per_iteration": 2.6664164066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150482, + "balance_loss_mlp": 1.04953575, + "epoch": 0.18603308964986534, + "flos": 484083419136.0, + "grad_norm": 0.02282308899195351, + "language_loss": 0.93174511, + "learning_rate": 0.0009375094118748622, + "loss": 0.94324994, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 1.01025391, + "step": 967, + "time_per_iteration": 2.544952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142823, + "balance_loss_mlp": 1.041924, + "epoch": 0.1862254713351289, + "flos": 802681112064.0, + "grad_norm": 0.02495680742184495, + "language_loss": 1.00251484, + "learning_rate": 0.0009373585127839976, + "loss": 1.01394308, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 1.00976562, + "step": 968, + "time_per_iteration": 2.973095417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142113, + "balance_loss_mlp": 1.0413574, + "epoch": 0.18641785302039246, + "flos": 479290148352.0, + "grad_norm": 0.02509872783632802, + "language_loss": 0.9944787, + "learning_rate": 0.0009372074438954994, + "loss": 1.00589979, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 1.00830078, + "step": 969, + "time_per_iteration": 2.5303025245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142663, + "balance_loss_mlp": 1.04181159, + "epoch": 0.18661023470565602, + "flos": 389779072512.0, + "grad_norm": 0.02439046514561532, + "language_loss": 1.00939226, + "learning_rate": 0.0009370562052680181, + "loss": 1.02081895, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 1.00927734, + "step": 970, + "time_per_iteration": 2.5023443698883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.04929316, + "epoch": 0.18680261639091958, + "flos": 565775139840.0, + "grad_norm": 0.02213336285369191, + "language_loss": 0.95379293, + "learning_rate": 0.0009369047969602695, + "loss": 0.96529102, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 1.00585938, + "step": 971, + "time_per_iteration": 2.722823143005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154865, + "balance_loss_mlp": 1.05420506, + "epoch": 0.18699499807618314, + "flos": 480230137344.0, + "grad_norm": 0.029574405329312194, + "language_loss": 0.9913702, + "learning_rate": 0.0009367532190310357, + "loss": 1.00291884, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 1.00732422, + "step": 972, + "time_per_iteration": 2.633387327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.0490092, + "epoch": 0.1871873797614467, + "flos": 554328086016.0, + "grad_norm": 0.02905569815438633, + "language_loss": 0.99535728, + "learning_rate": 0.0009366014715391644, + "loss": 1.00685072, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 1.00390625, + "step": 973, + "time_per_iteration": 2.6549065113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153264, + "balance_loss_mlp": 1.05293763, + "epoch": 0.18737976144671029, + "flos": 553952781312.0, + "grad_norm": 0.023481989115367276, + "language_loss": 0.9123525, + "learning_rate": 0.0009364495545435693, + "loss": 0.92388517, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 1.00390625, + "step": 974, + "time_per_iteration": 4.409714221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_mlp": 1.05479944, + "epoch": 0.18757214313197385, + "flos": 503247770112.0, + "grad_norm": 0.022955013749569684, + "language_loss": 0.97297812, + "learning_rate": 0.0009362974681032297, + "loss": 0.98452938, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 1.00390625, + "step": 975, + "time_per_iteration": 2.61857533454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153706, + "balance_loss_mlp": 1.05352271, + "epoch": 0.1877645248172374, + "flos": 676291613184.0, + "grad_norm": 0.028784531937469084, + "language_loss": 0.98011422, + "learning_rate": 0.0009361452122771907, + "loss": 0.9916513, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 1.00244141, + "step": 976, + "time_per_iteration": 2.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.04923177, + "epoch": 0.18795690650250096, + "flos": 405862944768.0, + "grad_norm": 0.029616845561456457, + "language_loss": 0.95658362, + "learning_rate": 0.0009359927871245635, + "loss": 0.9680773, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 1.00195312, + "step": 977, + "time_per_iteration": 2.563232183456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149302, + "balance_loss_mlp": 1.04916573, + "epoch": 0.18814928818776452, + "flos": 639063355392.0, + "grad_norm": 0.027239481801034963, + "language_loss": 0.98439831, + "learning_rate": 0.0009358401927045246, + "loss": 0.99589127, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 1.00195312, + "step": 978, + "time_per_iteration": 2.8147568702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.04498518, + "epoch": 0.18834166987302808, + "flos": 1140115514880.0, + "grad_norm": 0.022094320674951175, + "language_loss": 0.96123868, + "learning_rate": 0.0009356874290763166, + "loss": 0.9726885, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 1.00048828, + "step": 979, + "time_per_iteration": 3.4719691276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149894, + "balance_loss_mlp": 1.04971051, + "epoch": 0.18853405155829164, + "flos": 505815957504.0, + "grad_norm": 0.02560863383472628, + "language_loss": 0.98637187, + "learning_rate": 0.0009355344962992474, + "loss": 0.99787074, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 1.00244141, + "step": 980, + "time_per_iteration": 2.6199324131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139646, + "balance_loss_mlp": 1.03931963, + "epoch": 0.1887264332435552, + "flos": 609370472448.0, + "grad_norm": 0.02150131271194909, + "language_loss": 0.97900265, + "learning_rate": 0.0009353813944326908, + "loss": 0.99039912, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 1.00390625, + "step": 981, + "time_per_iteration": 2.8862478733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143203, + "balance_loss_mlp": 1.04287672, + "epoch": 0.1889188149288188, + "flos": 553592212992.0, + "grad_norm": 0.027403519760576756, + "language_loss": 0.92598587, + "learning_rate": 0.0009352281235360863, + "loss": 0.93741786, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 1.00390625, + "step": 982, + "time_per_iteration": 2.680797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142003, + "balance_loss_mlp": 1.04167616, + "epoch": 0.18911119661408235, + "flos": 419469954048.0, + "grad_norm": 0.02481781093748577, + "language_loss": 0.92531025, + "learning_rate": 0.0009350746836689389, + "loss": 0.93673027, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 1.00390625, + "step": 983, + "time_per_iteration": 2.5687928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152649, + "balance_loss_mlp": 1.05289459, + "epoch": 0.1893035782993459, + "flos": 1485317784576.0, + "grad_norm": 0.01747927461324531, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82591867, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.99804688, + "step": 984, + "time_per_iteration": 4.978898048400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115218, + "balance_loss_mlp": 1.05237782, + "epoch": 0.18949595998460947, + "flos": 509456391168.0, + "grad_norm": 0.033971943902626214, + "language_loss": 0.94133711, + "learning_rate": 0.0009347672972613634, + "loss": 0.95285892, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.99853516, + "step": 985, + "time_per_iteration": 2.5850014686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153583, + "balance_loss_mlp": 1.05382824, + "epoch": 0.18968834166987303, + "flos": 532192045056.0, + "grad_norm": 0.027626772825507382, + "language_loss": 0.93152702, + "learning_rate": 0.0009346133508402735, + "loss": 0.9430629, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.99804688, + "step": 986, + "time_per_iteration": 2.7262227535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.04658782, + "epoch": 0.1898807233551366, + "flos": 500753442816.0, + "grad_norm": 0.02768975875157221, + "language_loss": 0.95335174, + "learning_rate": 0.0009344592356873166, + "loss": 0.96481234, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.99511719, + "step": 987, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149829, + "balance_loss_mlp": 1.05002666, + "epoch": 0.19007310504040015, + "flos": 603359236608.0, + "grad_norm": 0.02899497531058058, + "language_loss": 0.87347138, + "learning_rate": 0.0009343049518623255, + "loss": 0.88496965, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.99853516, + "step": 988, + "time_per_iteration": 2.726668119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143975, + "balance_loss_mlp": 1.04407787, + "epoch": 0.1902654867256637, + "flos": 602764353024.0, + "grad_norm": 0.022945627178248204, + "language_loss": 0.90576518, + "learning_rate": 0.0009341504994251985, + "loss": 0.91720492, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.99951172, + "step": 989, + "time_per_iteration": 2.8518989086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.05247498, + "epoch": 0.19045786841092727, + "flos": 1579231363584.0, + "grad_norm": 0.011944448483625032, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74672347, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.99414062, + "step": 990, + "time_per_iteration": 5.084089517593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144398, + "balance_loss_mlp": 1.04445326, + "epoch": 0.19065025009619085, + "flos": 683054184960.0, + "grad_norm": 0.025253455013724026, + "language_loss": 0.88680583, + "learning_rate": 0.0009338410889544574, + "loss": 0.8982498, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 1.0, + "step": 991, + "time_per_iteration": 3.007277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_mlp": 1.03949153, + "epoch": 0.1908426317814544, + "flos": 603441828864.0, + "grad_norm": 0.02514183514150974, + "language_loss": 0.96243769, + "learning_rate": 0.000933686131040967, + "loss": 0.97383535, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 1.00341797, + "step": 992, + "time_per_iteration": 2.7673017978668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_mlp": 1.04441845, + "epoch": 0.19103501346671797, + "flos": 587433818112.0, + "grad_norm": 0.025095383977303525, + "language_loss": 0.99126339, + "learning_rate": 0.0009335310047555883, + "loss": 1.00270796, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 1.00097656, + "step": 993, + "time_per_iteration": 2.782841920852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145602, + "balance_loss_mlp": 1.04565716, + "epoch": 0.19122739515198153, + "flos": 546834370560.0, + "grad_norm": 0.0365250692916995, + "language_loss": 0.97246122, + "learning_rate": 0.0009333757101585467, + "loss": 0.98391724, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 1.0, + "step": 994, + "time_per_iteration": 2.6937174797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142239, + "balance_loss_mlp": 1.04229414, + "epoch": 0.1914197768372451, + "flos": 522549107712.0, + "grad_norm": 0.02399514581888075, + "language_loss": 1.00362575, + "learning_rate": 0.0009332202473101329, + "loss": 1.01504803, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 1.0, + "step": 995, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137582, + "balance_loss_mlp": 1.03763652, + "epoch": 0.19161215852250865, + "flos": 612387824640.0, + "grad_norm": 0.024864495797513732, + "language_loss": 0.91319168, + "learning_rate": 0.0009330646162707028, + "loss": 0.92456746, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 1.0, + "step": 996, + "time_per_iteration": 2.7450180053710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113947, + "balance_loss_mlp": 1.03962064, + "epoch": 0.1918045402077722, + "flos": 848182619136.0, + "grad_norm": 0.02592603597590215, + "language_loss": 0.92579019, + "learning_rate": 0.0009329088171006779, + "loss": 0.93718487, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.99902344, + "step": 997, + "time_per_iteration": 3.1890194416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_mlp": 1.04457617, + "epoch": 0.19199692189303577, + "flos": 466892371968.0, + "grad_norm": 0.027577096255712943, + "language_loss": 0.95194477, + "learning_rate": 0.0009327528498605446, + "loss": 0.96338999, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 1.0, + "step": 998, + "time_per_iteration": 2.6845622062683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141712, + "balance_loss_mlp": 1.04143262, + "epoch": 0.19218930357829936, + "flos": 532613011968.0, + "grad_norm": 0.026795980657526523, + "language_loss": 0.98209792, + "learning_rate": 0.0009325967146108548, + "loss": 0.99351501, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 1.00341797, + "step": 999, + "time_per_iteration": 2.690363883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145933, + "balance_loss_mlp": 1.04589295, + "epoch": 0.19238168526356292, + "flos": 602727422976.0, + "grad_norm": 0.025877996038880184, + "language_loss": 0.97816348, + "learning_rate": 0.0009324404114122258, + "loss": 0.98962283, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 1.00097656, + "step": 1000, + "time_per_iteration": 2.717535972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139683, + "balance_loss_mlp": 1.03969073, + "epoch": 0.19257406694882648, + "flos": 573154062336.0, + "grad_norm": 0.0251308575536182, + "language_loss": 0.95425117, + "learning_rate": 0.0009322839403253397, + "loss": 0.96564806, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 1.00048828, + "step": 1001, + "time_per_iteration": 2.8128621578216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147687, + "balance_loss_mlp": 1.04793251, + "epoch": 0.19276644863409004, + "flos": 803156473344.0, + "grad_norm": 0.02827819499351052, + "language_loss": 0.93752921, + "learning_rate": 0.0009321273014109439, + "loss": 0.94900608, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.99804688, + "step": 1002, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115103, + "balance_loss_mlp": 1.05127609, + "epoch": 0.1929588303193536, + "flos": 564479311872.0, + "grad_norm": 0.02425681225612504, + "language_loss": 0.92063946, + "learning_rate": 0.0009319704947298513, + "loss": 0.93214977, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.99804688, + "step": 1003, + "time_per_iteration": 2.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148634, + "balance_loss_mlp": 1.04887998, + "epoch": 0.19315121200461716, + "flos": 627987603456.0, + "grad_norm": 0.023688885680104285, + "language_loss": 0.95116329, + "learning_rate": 0.0009318135203429393, + "loss": 0.96264958, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.99804688, + "step": 1004, + "time_per_iteration": 2.7953245639801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146221, + "balance_loss_mlp": 1.04646707, + "epoch": 0.19334359368988072, + "flos": 518583034368.0, + "grad_norm": 0.02448547542723696, + "language_loss": 0.95706153, + "learning_rate": 0.0009316563783111511, + "loss": 0.9685238, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.99804688, + "step": 1005, + "time_per_iteration": 2.7417562007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141812, + "balance_loss_mlp": 1.04224837, + "epoch": 0.19353597537514428, + "flos": 695399568384.0, + "grad_norm": 0.022656832097962477, + "language_loss": 0.91614294, + "learning_rate": 0.0009314990686954943, + "loss": 0.9275611, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.99609375, + "step": 1006, + "time_per_iteration": 2.921147584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143701, + "balance_loss_mlp": 1.04413795, + "epoch": 0.19372835706040784, + "flos": 1212199226880.0, + "grad_norm": 0.0213605480211332, + "language_loss": 0.89449364, + "learning_rate": 0.000931341591557042, + "loss": 0.90593064, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.99609375, + "step": 1007, + "time_per_iteration": 3.6934237480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142723, + "balance_loss_mlp": 1.04292154, + "epoch": 0.19392073874567142, + "flos": 521684980224.0, + "grad_norm": 0.02492230683936131, + "language_loss": 0.9970367, + "learning_rate": 0.0009311839469569325, + "loss": 1.00846386, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.99853516, + "step": 1008, + "time_per_iteration": 2.66283917427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141437, + "balance_loss_mlp": 1.04187346, + "epoch": 0.19411312043093498, + "flos": 589910681088.0, + "grad_norm": 0.028572464719479444, + "language_loss": 0.9835515, + "learning_rate": 0.0009310261349563687, + "loss": 0.99496591, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.99609375, + "step": 1009, + "time_per_iteration": 2.6913864612579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139912, + "balance_loss_mlp": 1.04034853, + "epoch": 0.19430550211619854, + "flos": 580571916288.0, + "grad_norm": 0.022224830980977262, + "language_loss": 0.9288035, + "learning_rate": 0.0009308681556166186, + "loss": 0.94020259, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.99609375, + "step": 1010, + "time_per_iteration": 2.8937342166900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_mlp": 1.04234338, + "epoch": 0.1944978838014621, + "flos": 622245611520.0, + "grad_norm": 0.028831874511777204, + "language_loss": 1.01060331, + "learning_rate": 0.0009307100089990152, + "loss": 1.02202237, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.99609375, + "step": 1011, + "time_per_iteration": 2.7086822986602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114452, + "balance_loss_mlp": 1.04495597, + "epoch": 0.19469026548672566, + "flos": 599814130176.0, + "grad_norm": 0.02434118582542042, + "language_loss": 0.95591187, + "learning_rate": 0.0009305516951649568, + "loss": 0.96735704, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.99609375, + "step": 1012, + "time_per_iteration": 2.7046425342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114359, + "balance_loss_mlp": 1.04402685, + "epoch": 0.19488264717198922, + "flos": 553247107584.0, + "grad_norm": 0.020712874248618226, + "language_loss": 0.93779677, + "learning_rate": 0.0009303932141759057, + "loss": 0.94923264, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.7684950828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145994, + "balance_loss_mlp": 1.0468123, + "epoch": 0.19507502885725278, + "flos": 667312690176.0, + "grad_norm": 0.029421944235057496, + "language_loss": 0.94045115, + "learning_rate": 0.0009302345660933902, + "loss": 0.95191121, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.9921875, + "step": 1014, + "time_per_iteration": 2.8242082595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.04442382, + "epoch": 0.19526741054251634, + "flos": 672327541248.0, + "grad_norm": 0.024449615989116238, + "language_loss": 0.93477654, + "learning_rate": 0.0009300757509790026, + "loss": 0.94621253, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.9921875, + "step": 1015, + "time_per_iteration": 2.840658664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144964, + "balance_loss_mlp": 1.04578233, + "epoch": 0.19545979222777993, + "flos": 448146986496.0, + "grad_norm": 0.028637929544829934, + "language_loss": 1.02226353, + "learning_rate": 0.0009299167688944005, + "loss": 1.0337131, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.9921875, + "step": 1016, + "time_per_iteration": 2.505427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114266, + "balance_loss_mlp": 1.04376352, + "epoch": 0.1956521739130435, + "flos": 570168910848.0, + "grad_norm": 0.02609870742448671, + "language_loss": 0.93148959, + "learning_rate": 0.0009297576199013063, + "loss": 0.94291621, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.98925781, + "step": 1017, + "time_per_iteration": 2.7357168197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155182, + "balance_loss_mlp": 1.05752563, + "epoch": 0.19584455559830705, + "flos": 1458880571904.0, + "grad_norm": 0.02028337436206496, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74157315, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.9765625, + "step": 1018, + "time_per_iteration": 5.09963059425354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.04962921, + "epoch": 0.1960369372835706, + "flos": 1594481307648.0, + "grad_norm": 0.015251553743586253, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80573392, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.97460938, + "step": 1019, + "time_per_iteration": 6.03454852104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146546, + "balance_loss_mlp": 1.0477457, + "epoch": 0.19622931896883417, + "flos": 617252954112.0, + "grad_norm": 0.02445318741287071, + "language_loss": 0.94190967, + "learning_rate": 0.0009292791720892659, + "loss": 0.9533751, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.98828125, + "step": 1020, + "time_per_iteration": 2.8369834423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147421, + "balance_loss_mlp": 1.0486201, + "epoch": 0.19642170065409773, + "flos": 467207278080.0, + "grad_norm": 0.027280190942869837, + "language_loss": 0.98824823, + "learning_rate": 0.0009291193560807218, + "loss": 0.99972242, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.98828125, + "step": 1021, + "time_per_iteration": 2.5833048820495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.0458802, + "epoch": 0.19661408233936128, + "flos": 516288093696.0, + "grad_norm": 0.025303886608753337, + "language_loss": 0.95740455, + "learning_rate": 0.0009289593734732688, + "loss": 0.96885145, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.98828125, + "step": 1022, + "time_per_iteration": 2.5913774967193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149525, + "balance_loss_mlp": 1.05058122, + "epoch": 0.19680646402462484, + "flos": 393493366272.0, + "grad_norm": 0.0253763529676381, + "language_loss": 1.01103711, + "learning_rate": 0.0009287992243290175, + "loss": 1.02253246, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.98974609, + "step": 1023, + "time_per_iteration": 2.4793736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115501, + "balance_loss_mlp": 1.05635238, + "epoch": 0.19699884570988843, + "flos": 627623032320.0, + "grad_norm": 0.02508480994731895, + "language_loss": 0.99886519, + "learning_rate": 0.0009286389087101435, + "loss": 1.01041532, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.98681641, + "step": 1024, + "time_per_iteration": 2.7772202491760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153446, + "balance_loss_mlp": 1.05483615, + "epoch": 0.197191227395152, + "flos": 559073693184.0, + "grad_norm": 0.02445444816711275, + "language_loss": 0.98426372, + "learning_rate": 0.0009284784266788864, + "loss": 0.99579823, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.98632812, + "step": 1025, + "time_per_iteration": 2.6955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150264, + "balance_loss_mlp": 1.05165374, + "epoch": 0.19738360908041555, + "flos": 666249176064.0, + "grad_norm": 0.021666801749132464, + "language_loss": 0.99231869, + "learning_rate": 0.0009283177782975512, + "loss": 1.00382137, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.98632812, + "step": 1026, + "time_per_iteration": 2.9886229038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05529749, + "epoch": 0.1975759907656791, + "flos": 523510563840.0, + "grad_norm": 0.025961932589349316, + "language_loss": 0.98509014, + "learning_rate": 0.000928156963628507, + "loss": 0.99662918, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.98632812, + "step": 1027, + "time_per_iteration": 2.586740493774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149439, + "balance_loss_mlp": 1.05097175, + "epoch": 0.19776837245094267, + "flos": 463484252160.0, + "grad_norm": 0.02550253779434718, + "language_loss": 0.96135926, + "learning_rate": 0.0009279959827341877, + "loss": 0.97285366, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.98486328, + "step": 1028, + "time_per_iteration": 2.723517894744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146754, + "balance_loss_mlp": 1.04852605, + "epoch": 0.19796075413620623, + "flos": 504057503232.0, + "grad_norm": 0.02160335630411572, + "language_loss": 0.96627682, + "learning_rate": 0.0009278348356770915, + "loss": 0.97774434, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.98242188, + "step": 1029, + "time_per_iteration": 2.566802501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144801, + "balance_loss_mlp": 1.04666746, + "epoch": 0.1981531358214698, + "flos": 508570796544.0, + "grad_norm": 0.024261507948164947, + "language_loss": 0.9528529, + "learning_rate": 0.0009276735225197814, + "loss": 0.96430099, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.98144531, + "step": 1030, + "time_per_iteration": 2.6009340286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145205, + "balance_loss_mlp": 1.04702377, + "epoch": 0.19834551750673335, + "flos": 532639208448.0, + "grad_norm": 0.023062563394134136, + "language_loss": 0.95906407, + "learning_rate": 0.0009275120433248847, + "loss": 0.97051609, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.98193359, + "step": 1031, + "time_per_iteration": 2.684858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145757, + "balance_loss_mlp": 1.0477196, + "epoch": 0.1985378991919969, + "flos": 776969765376.0, + "grad_norm": 0.02469129884935611, + "language_loss": 0.94986421, + "learning_rate": 0.0009273503981550931, + "loss": 0.96132183, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.98046875, + "step": 1032, + "time_per_iteration": 3.058094024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.04737103, + "epoch": 0.1987302808772605, + "flos": 435191256576.0, + "grad_norm": 0.025952536265860523, + "language_loss": 0.96777844, + "learning_rate": 0.0009271885870731626, + "loss": 0.9792316, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.97949219, + "step": 1033, + "time_per_iteration": 2.493664503097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153962, + "balance_loss_mlp": 1.05592442, + "epoch": 0.19892266256252406, + "flos": 554653725696.0, + "grad_norm": 0.029222795446194067, + "language_loss": 1.0035603, + "learning_rate": 0.0009270266101419143, + "loss": 1.01509976, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.98046875, + "step": 1034, + "time_per_iteration": 2.626612901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145521, + "balance_loss_mlp": 1.04748368, + "epoch": 0.19911504424778761, + "flos": 550948164096.0, + "grad_norm": 0.02425528851980561, + "language_loss": 0.92802572, + "learning_rate": 0.0009268644674242328, + "loss": 0.9394809, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.98046875, + "step": 1035, + "time_per_iteration": 2.683253288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148174, + "balance_loss_mlp": 1.04994512, + "epoch": 0.19930742593305117, + "flos": 519312176640.0, + "grad_norm": 0.02646778626346152, + "language_loss": 0.91577774, + "learning_rate": 0.0009267021589830678, + "loss": 0.9272595, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.98242188, + "step": 1036, + "time_per_iteration": 2.7614338397979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218948, + "balance_loss_mlp": 1.11824036, + "epoch": 0.19949980761831473, + "flos": 1512637863936.0, + "grad_norm": 0.02467753292442409, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78846025, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 1.0078125, + "step": 1037, + "time_per_iteration": 4.962339878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114988, + "balance_loss_mlp": 1.05184233, + "epoch": 0.1996921893035783, + "flos": 699439501824.0, + "grad_norm": 0.02757683731024766, + "language_loss": 1.02362621, + "learning_rate": 0.000926377045182406, + "loss": 1.03512502, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.98046875, + "step": 1038, + "time_per_iteration": 2.916594982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155504, + "balance_loss_mlp": 1.05727601, + "epoch": 0.19988457098884185, + "flos": 728394510336.0, + "grad_norm": 0.024851830352508646, + "language_loss": 0.97729039, + "learning_rate": 0.0009262142399491296, + "loss": 0.98884547, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.98242188, + "step": 1039, + "time_per_iteration": 3.0976781845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156606, + "balance_loss_mlp": 1.05837739, + "epoch": 0.2000769526741054, + "flos": 561624416256.0, + "grad_norm": 0.025662568358030838, + "language_loss": 0.98388815, + "learning_rate": 0.0009260512692448105, + "loss": 0.99545419, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.98242188, + "step": 1040, + "time_per_iteration": 2.715479850769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.05308211, + "epoch": 0.200269334359369, + "flos": 573164795904.0, + "grad_norm": 0.022253887646478135, + "language_loss": 0.93097693, + "learning_rate": 0.000925888133132719, + "loss": 0.9424901, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.98242188, + "step": 1041, + "time_per_iteration": 2.7987864017486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011912, + "balance_loss_mlp": 1.0923996, + "epoch": 0.20046171604463256, + "flos": 1489152875520.0, + "grad_norm": 0.020655335232781416, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80801636, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.98828125, + "step": 1042, + "time_per_iteration": 4.944507360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154531, + "balance_loss_mlp": 1.05644536, + "epoch": 0.20065409772989612, + "flos": 497577636864.0, + "grad_norm": 0.02609736880654102, + "language_loss": 0.92129564, + "learning_rate": 0.0009255613649386244, + "loss": 0.932841, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.98095703, + "step": 1043, + "time_per_iteration": 2.6478612422943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157191, + "balance_loss_mlp": 1.05915368, + "epoch": 0.20084647941515968, + "flos": 580463127552.0, + "grad_norm": 0.02650777474930283, + "language_loss": 0.87469566, + "learning_rate": 0.0009253977329834838, + "loss": 0.88626754, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.98046875, + "step": 1044, + "time_per_iteration": 2.7641594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161195, + "balance_loss_mlp": 1.06315744, + "epoch": 0.20103886110042324, + "flos": 643287939072.0, + "grad_norm": 0.030624079602620518, + "language_loss": 0.9713465, + "learning_rate": 0.0009252339358742965, + "loss": 0.98295844, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.98046875, + "step": 1045, + "time_per_iteration": 2.811687707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.0594964, + "epoch": 0.2012312427856868, + "flos": 442969678848.0, + "grad_norm": 0.023268596270985206, + "language_loss": 0.93283701, + "learning_rate": 0.000925069973674654, + "loss": 0.94440854, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.9765625, + "step": 1046, + "time_per_iteration": 2.6709671020507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157527, + "balance_loss_mlp": 1.05948889, + "epoch": 0.20142362447095036, + "flos": 555472190976.0, + "grad_norm": 0.022730221646095148, + "language_loss": 0.96496689, + "learning_rate": 0.000924905846448212, + "loss": 0.97654217, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.98046875, + "step": 1047, + "time_per_iteration": 2.7338547706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115317, + "balance_loss_mlp": 1.05522716, + "epoch": 0.20161600615621392, + "flos": 671554738176.0, + "grad_norm": 0.026697286803692055, + "language_loss": 0.96143991, + "learning_rate": 0.0009247415542586906, + "loss": 0.97297156, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.97949219, + "step": 1048, + "time_per_iteration": 2.849416494369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149865, + "balance_loss_mlp": 1.05216146, + "epoch": 0.2018083878414775, + "flos": 574306899456.0, + "grad_norm": 0.021371049275305663, + "language_loss": 0.91504782, + "learning_rate": 0.0009245770971698735, + "loss": 0.92654645, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.97705078, + "step": 1049, + "time_per_iteration": 2.8751590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151512, + "balance_loss_mlp": 1.05376041, + "epoch": 0.20200076952674106, + "flos": 426794482176.0, + "grad_norm": 0.027360075371486055, + "language_loss": 0.97835737, + "learning_rate": 0.0009244124752456087, + "loss": 0.98987252, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.97753906, + "step": 1050, + "time_per_iteration": 2.4985499382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153257, + "balance_loss_mlp": 1.05531442, + "epoch": 0.20219315121200462, + "flos": 537684258816.0, + "grad_norm": 0.025856302906645603, + "language_loss": 0.95370412, + "learning_rate": 0.0009242476885498081, + "loss": 0.96523666, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.97949219, + "step": 1051, + "time_per_iteration": 2.7127723693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150827, + "balance_loss_mlp": 1.05297983, + "epoch": 0.20238553289726818, + "flos": 478834252800.0, + "grad_norm": 0.02631802181941096, + "language_loss": 0.90995431, + "learning_rate": 0.0009240827371464474, + "loss": 0.92146254, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.97851562, + "step": 1052, + "time_per_iteration": 2.527918577194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144335, + "balance_loss_mlp": 1.04667878, + "epoch": 0.20257791458253174, + "flos": 1153846049280.0, + "grad_norm": 0.025276400477213575, + "language_loss": 0.92167991, + "learning_rate": 0.0009239176210995666, + "loss": 0.93312329, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.9765625, + "step": 1053, + "time_per_iteration": 3.4556469917297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144677, + "balance_loss_mlp": 1.04682982, + "epoch": 0.2027702962677953, + "flos": 668148619776.0, + "grad_norm": 0.025342755763179396, + "language_loss": 1.04358864, + "learning_rate": 0.0009237523404732695, + "loss": 1.05503547, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.97851562, + "step": 1054, + "time_per_iteration": 2.894198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144665, + "balance_loss_mlp": 1.04676986, + "epoch": 0.20296267795305886, + "flos": 642452009472.0, + "grad_norm": 0.02468028394334187, + "language_loss": 0.94787639, + "learning_rate": 0.0009235868953317235, + "loss": 0.95932305, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.97900391, + "step": 1055, + "time_per_iteration": 2.812633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148717, + "balance_loss_mlp": 1.05082273, + "epoch": 0.20315505963832242, + "flos": 932129622528.0, + "grad_norm": 0.02533903757078053, + "language_loss": 0.93907225, + "learning_rate": 0.0009234212857391602, + "loss": 0.95055938, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.97900391, + "step": 1056, + "time_per_iteration": 3.2061142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147493, + "balance_loss_mlp": 1.0496459, + "epoch": 0.20334744132358598, + "flos": 563287543296.0, + "grad_norm": 0.019686870604104637, + "language_loss": 0.97330248, + "learning_rate": 0.000923255511759875, + "loss": 0.98477745, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.97851562, + "step": 1057, + "time_per_iteration": 2.7639002799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150039, + "balance_loss_mlp": 1.05219197, + "epoch": 0.20353982300884957, + "flos": 645428428800.0, + "grad_norm": 0.023252811049323967, + "language_loss": 0.95256209, + "learning_rate": 0.000923089573458227, + "loss": 0.96406245, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.97851562, + "step": 1058, + "time_per_iteration": 2.857612133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114962, + "balance_loss_mlp": 1.05177307, + "epoch": 0.20373220469411313, + "flos": 652705293312.0, + "grad_norm": 0.02395962669603635, + "language_loss": 0.93332446, + "learning_rate": 0.0009229234708986392, + "loss": 0.94482064, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.97851562, + "step": 1059, + "time_per_iteration": 2.877995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150688, + "balance_loss_mlp": 1.05436707, + "epoch": 0.2039245863793767, + "flos": 1440396973056.0, + "grad_norm": 0.013896761524226428, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82817578, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.96289062, + "step": 1060, + "time_per_iteration": 4.659267902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142187, + "balance_loss_mlp": 1.04434025, + "epoch": 0.20411696806464025, + "flos": 598127534592.0, + "grad_norm": 0.026599581611848343, + "language_loss": 0.93894625, + "learning_rate": 0.0009225907732636548, + "loss": 0.95036817, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.97851562, + "step": 1061, + "time_per_iteration": 2.7480902671813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115027, + "balance_loss_mlp": 1.05242312, + "epoch": 0.2043093497499038, + "flos": 574897053696.0, + "grad_norm": 0.026136319737411078, + "language_loss": 0.96460152, + "learning_rate": 0.0009224241783174227, + "loss": 0.97610414, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.97851562, + "step": 1062, + "time_per_iteration": 2.676877021789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146738, + "balance_loss_mlp": 1.04874802, + "epoch": 0.20450173143516737, + "flos": 631523977728.0, + "grad_norm": 0.02709710709634581, + "language_loss": 0.94472104, + "learning_rate": 0.0009222574193715802, + "loss": 0.95618844, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.97998047, + "step": 1063, + "time_per_iteration": 2.7604472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141026, + "balance_loss_mlp": 1.04298854, + "epoch": 0.20469411312043093, + "flos": 575146831872.0, + "grad_norm": 0.022769515120839894, + "language_loss": 0.95189404, + "learning_rate": 0.000922090496490869, + "loss": 0.96330428, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.98046875, + "step": 1064, + "time_per_iteration": 2.728154182434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141583, + "balance_loss_mlp": 1.04383183, + "epoch": 0.20488649480569449, + "flos": 638279818752.0, + "grad_norm": 0.022393105289594414, + "language_loss": 0.97629392, + "learning_rate": 0.0009219234097400937, + "loss": 0.9877097, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.97753906, + "step": 1065, + "time_per_iteration": 2.889946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.03989744, + "epoch": 0.20507887649095807, + "flos": 977437747200.0, + "grad_norm": 0.024872828726298618, + "language_loss": 0.9305777, + "learning_rate": 0.0009217561591841237, + "loss": 0.94195515, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.97851562, + "step": 1066, + "time_per_iteration": 3.296248435974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144611, + "balance_loss_mlp": 1.04681206, + "epoch": 0.20527125817622163, + "flos": 487155165696.0, + "grad_norm": 0.024567371957878288, + "language_loss": 0.90358436, + "learning_rate": 0.0009215887448878913, + "loss": 0.91503048, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.97802734, + "step": 1067, + "time_per_iteration": 2.5662190914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137303, + "balance_loss_mlp": 1.03945625, + "epoch": 0.2054636398614852, + "flos": 528210508800.0, + "grad_norm": 0.02249486638659544, + "language_loss": 0.94470721, + "learning_rate": 0.0009214211669163922, + "loss": 0.9560802, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.97851562, + "step": 1068, + "time_per_iteration": 2.6912589073181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139468, + "balance_loss_mlp": 1.04162145, + "epoch": 0.20565602154674875, + "flos": 559323471360.0, + "grad_norm": 0.022635174506508055, + "language_loss": 1.02501464, + "learning_rate": 0.0009212534253346862, + "loss": 1.03640926, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.97851562, + "step": 1069, + "time_per_iteration": 2.708683490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135123, + "balance_loss_mlp": 1.03746641, + "epoch": 0.2058484032320123, + "flos": 505221073920.0, + "grad_norm": 0.02479403914192968, + "language_loss": 0.95383358, + "learning_rate": 0.0009210855202078964, + "loss": 0.96518481, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.9765625, + "step": 1070, + "time_per_iteration": 2.6434948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132203, + "balance_loss_mlp": 1.03478527, + "epoch": 0.20604078491727587, + "flos": 434047151616.0, + "grad_norm": 0.024632817960327506, + "language_loss": 0.96572351, + "learning_rate": 0.0009209174516012091, + "loss": 0.97704554, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.97412109, + "step": 1071, + "time_per_iteration": 2.4891347885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148822, + "balance_loss_mlp": 1.05130851, + "epoch": 0.20623316660253943, + "flos": 609874031616.0, + "grad_norm": 0.024395492192686875, + "language_loss": 0.97482872, + "learning_rate": 0.0009207492195798747, + "loss": 0.98631692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.97509766, + "step": 1072, + "time_per_iteration": 2.758575201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152495, + "balance_loss_mlp": 1.05502975, + "epoch": 0.206425548287803, + "flos": 481393708032.0, + "grad_norm": 0.027205333287948934, + "language_loss": 0.9402262, + "learning_rate": 0.0009205808242092061, + "loss": 0.95175123, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.97460938, + "step": 1073, + "time_per_iteration": 2.6534366607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152896, + "balance_loss_mlp": 1.05562115, + "epoch": 0.20661792997306658, + "flos": 951122784768.0, + "grad_norm": 0.02943422736446298, + "language_loss": 0.93147469, + "learning_rate": 0.0009204122655545808, + "loss": 0.94300359, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.97265625, + "step": 1074, + "time_per_iteration": 3.317518949508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149199, + "balance_loss_mlp": 1.05201948, + "epoch": 0.20681031165833014, + "flos": 604616133120.0, + "grad_norm": 0.024855118115069977, + "language_loss": 0.88961834, + "learning_rate": 0.0009202435436814388, + "loss": 0.90111029, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.97167969, + "step": 1075, + "time_per_iteration": 2.6815345287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142912, + "balance_loss_mlp": 1.04563749, + "epoch": 0.2070026933435937, + "flos": 710265475584.0, + "grad_norm": 0.027130222852878607, + "language_loss": 0.99239773, + "learning_rate": 0.0009200746586552836, + "loss": 1.00382686, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.97265625, + "step": 1076, + "time_per_iteration": 2.9578917026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141976, + "balance_loss_mlp": 1.04451025, + "epoch": 0.20719507502885726, + "flos": 831254085120.0, + "grad_norm": 0.023090334700176834, + "language_loss": 0.92780054, + "learning_rate": 0.0009199056105416825, + "loss": 0.93922031, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.97460938, + "step": 1077, + "time_per_iteration": 3.0944156646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140475, + "balance_loss_mlp": 1.04324794, + "epoch": 0.20738745671412082, + "flos": 639499785216.0, + "grad_norm": 0.023914471883828003, + "language_loss": 0.96186948, + "learning_rate": 0.0009197363994062654, + "loss": 0.97327423, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.97216797, + "step": 1078, + "time_per_iteration": 2.8147799968719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142489, + "balance_loss_mlp": 1.04521394, + "epoch": 0.20757983839938438, + "flos": 686983328256.0, + "grad_norm": 0.02237329029547868, + "language_loss": 0.90686679, + "learning_rate": 0.0009195670253147262, + "loss": 0.91829169, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.97265625, + "step": 1079, + "time_per_iteration": 2.994058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141016, + "balance_loss_mlp": 1.04383624, + "epoch": 0.20777222008464794, + "flos": 520317293568.0, + "grad_norm": 0.026634413874044322, + "language_loss": 0.92195654, + "learning_rate": 0.0009193974883328216, + "loss": 0.93336666, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.97167969, + "step": 1080, + "time_per_iteration": 2.6506502628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140462, + "balance_loss_mlp": 1.04333031, + "epoch": 0.2079646017699115, + "flos": 512469740544.0, + "grad_norm": 0.025261028079588584, + "language_loss": 0.97185814, + "learning_rate": 0.0009192277885263718, + "loss": 0.98326278, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.97119141, + "step": 1081, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143678, + "balance_loss_mlp": 1.04640269, + "epoch": 0.20815698345517505, + "flos": 933467109888.0, + "grad_norm": 0.02363260569338726, + "language_loss": 0.9496327, + "learning_rate": 0.0009190579259612602, + "loss": 0.96106946, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.97265625, + "step": 1082, + "time_per_iteration": 3.2829811573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150642, + "balance_loss_mlp": 1.05336761, + "epoch": 0.20834936514043864, + "flos": 633553677312.0, + "grad_norm": 0.02436625118168465, + "language_loss": 0.97094011, + "learning_rate": 0.000918887900703433, + "loss": 0.98244655, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.97265625, + "step": 1083, + "time_per_iteration": 2.779474973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147642, + "balance_loss_mlp": 1.05079603, + "epoch": 0.2085417468257022, + "flos": 395243088384.0, + "grad_norm": 0.027448171988374206, + "language_loss": 0.98109657, + "learning_rate": 0.0009187177128188999, + "loss": 0.99257296, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.96826172, + "step": 1084, + "time_per_iteration": 2.487755298614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156746, + "balance_loss_mlp": 1.06118774, + "epoch": 0.20873412851096576, + "flos": 1405195138560.0, + "grad_norm": 0.014888537960634525, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78313285, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.95507812, + "step": 1085, + "time_per_iteration": 4.917901515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146704, + "balance_loss_mlp": 1.04981041, + "epoch": 0.20892651019622932, + "flos": 448761335808.0, + "grad_norm": 0.0275038267286557, + "language_loss": 0.93389261, + "learning_rate": 0.000918376849434071, + "loss": 0.94535965, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.96875, + "step": 1086, + "time_per_iteration": 2.5117850303649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153188, + "balance_loss_mlp": 1.05629456, + "epoch": 0.20911889188149288, + "flos": 494080194048.0, + "grad_norm": 0.034273062806107445, + "language_loss": 1.02428699, + "learning_rate": 0.0009182061740661098, + "loss": 1.03581882, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.96875, + "step": 1087, + "time_per_iteration": 2.5270984172821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154258, + "balance_loss_mlp": 1.05736482, + "epoch": 0.20931127356675644, + "flos": 842748802560.0, + "grad_norm": 0.02361505883443172, + "language_loss": 0.92997056, + "learning_rate": 0.0009180353363361127, + "loss": 0.94151306, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.96875, + "step": 1088, + "time_per_iteration": 3.1549112796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154015, + "balance_loss_mlp": 1.05688298, + "epoch": 0.20950365525202, + "flos": 758523823104.0, + "grad_norm": 0.028384526527587387, + "language_loss": 0.93851304, + "learning_rate": 0.0009178643363104044, + "loss": 0.95005322, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.97119141, + "step": 1089, + "time_per_iteration": 4.693684339523315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.05159688, + "epoch": 0.20969603693728356, + "flos": 473491760640.0, + "grad_norm": 0.03411348227976855, + "language_loss": 1.04663801, + "learning_rate": 0.0009176931740553735, + "loss": 1.05812478, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.97070312, + "step": 1090, + "time_per_iteration": 2.5203866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146066, + "balance_loss_mlp": 1.04917288, + "epoch": 0.20988841862254715, + "flos": 978627514368.0, + "grad_norm": 0.027482857176328385, + "language_loss": 0.92998403, + "learning_rate": 0.0009175218496374708, + "loss": 0.94144469, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.96875, + "step": 1091, + "time_per_iteration": 3.362614870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.05544364, + "epoch": 0.2100808003078107, + "flos": 1094818123776.0, + "grad_norm": 0.028049590852478556, + "language_loss": 0.96363866, + "learning_rate": 0.0009173503631232103, + "loss": 0.97516203, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.96875, + "step": 1092, + "time_per_iteration": 3.359970808029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150696, + "balance_loss_mlp": 1.05399334, + "epoch": 0.21027318199307427, + "flos": 1014559217664.0, + "grad_norm": 0.03210489869185377, + "language_loss": 0.94109344, + "learning_rate": 0.0009171787145791691, + "loss": 0.95260036, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.96679688, + "step": 1093, + "time_per_iteration": 3.2180042266845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150028, + "balance_loss_mlp": 1.05323017, + "epoch": 0.21046556367833782, + "flos": 522412121088.0, + "grad_norm": 0.02762257246471406, + "language_loss": 0.92679179, + "learning_rate": 0.000917006904071987, + "loss": 0.93829209, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.96777344, + "step": 1094, + "time_per_iteration": 2.5961859226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152841, + "balance_loss_mlp": 1.0559479, + "epoch": 0.21065794536360138, + "flos": 604839714816.0, + "grad_norm": 0.02570597393175465, + "language_loss": 0.97250223, + "learning_rate": 0.0009168349316683669, + "loss": 0.98403066, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.96875, + "step": 1095, + "time_per_iteration": 2.7164759635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153125, + "balance_loss_mlp": 1.05642295, + "epoch": 0.21085032704886494, + "flos": 604557735936.0, + "grad_norm": 0.022711755724658188, + "language_loss": 0.91088736, + "learning_rate": 0.0009166627974350741, + "loss": 0.92241859, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.96679688, + "step": 1096, + "time_per_iteration": 2.8912341594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05739498, + "epoch": 0.2110427087341285, + "flos": 638831041536.0, + "grad_norm": 0.027939519002465243, + "language_loss": 1.01164758, + "learning_rate": 0.0009164905014389373, + "loss": 1.02318668, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.96484375, + "step": 1097, + "time_per_iteration": 2.758725881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115008, + "balance_loss_mlp": 1.05356789, + "epoch": 0.21123509041939206, + "flos": 523929529344.0, + "grad_norm": 0.027217895626849283, + "language_loss": 0.96537346, + "learning_rate": 0.0009163180437468476, + "loss": 0.97687429, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.96484375, + "step": 1098, + "time_per_iteration": 2.6157684326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011531, + "balance_loss_mlp": 1.05658853, + "epoch": 0.21142747210465565, + "flos": 452193650688.0, + "grad_norm": 0.025540912808389868, + "language_loss": 0.94842321, + "learning_rate": 0.000916145424425759, + "loss": 0.9599542, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.96484375, + "step": 1099, + "time_per_iteration": 2.6368908882141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157233, + "balance_loss_mlp": 1.06081605, + "epoch": 0.2116198537899192, + "flos": 877625723904.0, + "grad_norm": 0.02885196772961066, + "language_loss": 1.02573156, + "learning_rate": 0.0009159726435426885, + "loss": 1.03730392, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.96386719, + "step": 1100, + "time_per_iteration": 3.0916907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011557, + "balance_loss_mlp": 1.05909276, + "epoch": 0.21181223547518277, + "flos": 524674134528.0, + "grad_norm": 0.025603473018395394, + "language_loss": 0.99936807, + "learning_rate": 0.0009157997011647154, + "loss": 1.01092505, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.96582031, + "step": 1101, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152722, + "balance_loss_mlp": 1.05630529, + "epoch": 0.21200461716044633, + "flos": 573425307648.0, + "grad_norm": 0.02306433427515447, + "language_loss": 0.93708789, + "learning_rate": 0.0009156265973589817, + "loss": 0.94861513, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.96386719, + "step": 1102, + "time_per_iteration": 2.786557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148247, + "balance_loss_mlp": 1.05187845, + "epoch": 0.2121969988457099, + "flos": 546174359040.0, + "grad_norm": 0.023119673851329285, + "language_loss": 0.9826746, + "learning_rate": 0.0009154533321926926, + "loss": 0.99415696, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.96337891, + "step": 1103, + "time_per_iteration": 2.6500911712646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150448, + "balance_loss_mlp": 1.05393636, + "epoch": 0.21238938053097345, + "flos": 845353920000.0, + "grad_norm": 0.02523726215492747, + "language_loss": 0.96587884, + "learning_rate": 0.0009152799057331156, + "loss": 0.97738338, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.96484375, + "step": 1104, + "time_per_iteration": 3.1080517768859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148697, + "balance_loss_mlp": 1.05213737, + "epoch": 0.212581762216237, + "flos": 447141869568.0, + "grad_norm": 0.026678256955328494, + "language_loss": 1.00256824, + "learning_rate": 0.0009151063180475805, + "loss": 1.01405525, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.96533203, + "step": 1105, + "time_per_iteration": 2.530207633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153737, + "balance_loss_mlp": 1.05703473, + "epoch": 0.21277414390150057, + "flos": 515385034752.0, + "grad_norm": 0.026680614248996183, + "language_loss": 0.9432478, + "learning_rate": 0.0009149325692034803, + "loss": 0.95478517, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.96679688, + "step": 1106, + "time_per_iteration": 2.576834201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159119, + "balance_loss_mlp": 1.06413269, + "epoch": 0.21296652558676413, + "flos": 1488512329728.0, + "grad_norm": 0.01358013302766655, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80362546, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.94921875, + "step": 1107, + "time_per_iteration": 4.821696996688843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156765, + "balance_loss_mlp": 1.06006265, + "epoch": 0.21315890727202771, + "flos": 847450748928.0, + "grad_norm": 0.031460519319247274, + "language_loss": 0.96369046, + "learning_rate": 0.0009145845883094678, + "loss": 0.97525811, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.96679688, + "step": 1108, + "time_per_iteration": 3.029548168182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159379, + "balance_loss_mlp": 1.06267655, + "epoch": 0.21335128895729127, + "flos": 630555790848.0, + "grad_norm": 0.028067626854192333, + "language_loss": 0.95182431, + "learning_rate": 0.000914410356394654, + "loss": 0.96341801, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.96679688, + "step": 1109, + "time_per_iteration": 2.737241268157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.06352139, + "epoch": 0.21354367064255483, + "flos": 712284441600.0, + "grad_norm": 0.023599510024272945, + "language_loss": 0.92540836, + "learning_rate": 0.0009142359635914709, + "loss": 0.93701446, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.97070312, + "step": 1110, + "time_per_iteration": 3.0267913341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.0645479, + "epoch": 0.2137360523278184, + "flos": 457210503168.0, + "grad_norm": 0.02473497568188501, + "language_loss": 0.9156003, + "learning_rate": 0.0009140614099676245, + "loss": 0.92721474, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.96875, + "step": 1111, + "time_per_iteration": 2.5756866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164266, + "balance_loss_mlp": 1.06727743, + "epoch": 0.21392843401308195, + "flos": 667265026560.0, + "grad_norm": 0.025344438139363285, + "language_loss": 0.90291333, + "learning_rate": 0.0009138866955908821, + "loss": 0.91455603, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.96972656, + "step": 1112, + "time_per_iteration": 2.9406254291534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_mlp": 1.06319368, + "epoch": 0.2141208156983455, + "flos": 750361363968.0, + "grad_norm": 0.02581510235299489, + "language_loss": 0.89949894, + "learning_rate": 0.0009137118205290738, + "loss": 0.91109931, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.96826172, + "step": 1113, + "time_per_iteration": 2.966989278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162558, + "balance_loss_mlp": 1.06547356, + "epoch": 0.21431319738360907, + "flos": 420010443264.0, + "grad_norm": 0.024953242249854055, + "language_loss": 1.00419319, + "learning_rate": 0.0009135367848500924, + "loss": 1.01581883, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.97070312, + "step": 1114, + "time_per_iteration": 2.4954934120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161216, + "balance_loss_mlp": 1.06456113, + "epoch": 0.21450557906887263, + "flos": 610238602752.0, + "grad_norm": 0.030213425802119154, + "language_loss": 0.9839642, + "learning_rate": 0.0009133615886218927, + "loss": 0.99557638, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.96630859, + "step": 1115, + "time_per_iteration": 2.71352219581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152764, + "balance_loss_mlp": 1.05625272, + "epoch": 0.21469796075413622, + "flos": 562974638592.0, + "grad_norm": 0.027635545182738433, + "language_loss": 0.99806535, + "learning_rate": 0.0009131862319124917, + "loss": 1.00959289, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.96484375, + "step": 1116, + "time_per_iteration": 2.630807876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153717, + "balance_loss_mlp": 1.05720496, + "epoch": 0.21489034243939978, + "flos": 595737266688.0, + "grad_norm": 0.024806539819872384, + "language_loss": 0.94489264, + "learning_rate": 0.0009130107147899691, + "loss": 0.95642984, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.96484375, + "step": 1117, + "time_per_iteration": 2.7123875617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154765, + "balance_loss_mlp": 1.05825305, + "epoch": 0.21508272412466334, + "flos": 442850156544.0, + "grad_norm": 0.024517194331867692, + "language_loss": 0.93784142, + "learning_rate": 0.0009128350373224665, + "loss": 0.9493891, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.96484375, + "step": 1118, + "time_per_iteration": 2.5384151935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169045, + "balance_loss_mlp": 1.07348633, + "epoch": 0.2152751058099269, + "flos": 1499232242688.0, + "grad_norm": 0.019396990855708212, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82625473, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.95507812, + "step": 1119, + "time_per_iteration": 4.644891262054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156688, + "balance_loss_mlp": 1.05989027, + "epoch": 0.21546748749519046, + "flos": 494991985152.0, + "grad_norm": 0.030440112014221473, + "language_loss": 0.9407053, + "learning_rate": 0.0009124832016254005, + "loss": 0.95227218, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.96777344, + "step": 1120, + "time_per_iteration": 2.588834285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163526, + "balance_loss_mlp": 1.06691861, + "epoch": 0.21565986918045402, + "flos": 635694167040.0, + "grad_norm": 0.030206495794058562, + "language_loss": 0.96966755, + "learning_rate": 0.0009123070435324316, + "loss": 0.98130286, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.96582031, + "step": 1121, + "time_per_iteration": 2.786072015762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170601, + "balance_loss_mlp": 1.07542419, + "epoch": 0.21585225086571758, + "flos": 1586798939136.0, + "grad_norm": 0.013013152417503263, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.79046386, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.95117188, + "step": 1122, + "time_per_iteration": 4.946362733840942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.0685885, + "epoch": 0.21604463255098114, + "flos": 685322202624.0, + "grad_norm": 0.027822137906457534, + "language_loss": 0.94040322, + "learning_rate": 0.0009119542471995752, + "loss": 0.95205426, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.96484375, + "step": 1123, + "time_per_iteration": 2.8613343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162625, + "balance_loss_mlp": 1.0660181, + "epoch": 0.2162370142362447, + "flos": 782307528192.0, + "grad_norm": 0.029561600436113455, + "language_loss": 0.90709835, + "learning_rate": 0.0009117776090966554, + "loss": 0.9187246, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.96582031, + "step": 1124, + "time_per_iteration": 2.9557414054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170148, + "balance_loss_mlp": 1.07344532, + "epoch": 0.21642939592150828, + "flos": 1003761441792.0, + "grad_norm": 0.032145354222626064, + "language_loss": 0.98171163, + "learning_rate": 0.0009116008111274899, + "loss": 0.99341309, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.96679688, + "step": 1125, + "time_per_iteration": 3.253286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175423, + "balance_loss_mlp": 1.0798645, + "epoch": 0.21662177760677184, + "flos": 1485762220032.0, + "grad_norm": 0.016361962696647775, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80282342, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.95507812, + "step": 1126, + "time_per_iteration": 4.832986831665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168208, + "balance_loss_mlp": 1.07150567, + "epoch": 0.2168141592920354, + "flos": 888859929600.0, + "grad_norm": 0.027606671666099106, + "language_loss": 0.94760346, + "learning_rate": 0.0009112467358650396, + "loss": 0.9592855, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.96679688, + "step": 1127, + "time_per_iteration": 3.1373836994171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164208, + "balance_loss_mlp": 1.06741047, + "epoch": 0.21700654097729896, + "flos": 547084148736.0, + "grad_norm": 0.025712027239217825, + "language_loss": 0.95734817, + "learning_rate": 0.0009110694587092192, + "loss": 0.96899021, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.96777344, + "step": 1128, + "time_per_iteration": 2.752166986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162506, + "balance_loss_mlp": 1.06580317, + "epoch": 0.21719892266256252, + "flos": 510535368192.0, + "grad_norm": 0.02739880514200537, + "language_loss": 0.95310479, + "learning_rate": 0.0009108920219620815, + "loss": 0.96472991, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.96679688, + "step": 1129, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164125, + "balance_loss_mlp": 1.06742299, + "epoch": 0.21739130434782608, + "flos": 544461566976.0, + "grad_norm": 0.023064586598143682, + "language_loss": 0.97784394, + "learning_rate": 0.0009107144256925133, + "loss": 0.9894852, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.96679688, + "step": 1130, + "time_per_iteration": 2.73559308052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165938, + "balance_loss_mlp": 1.06923568, + "epoch": 0.21758368603308964, + "flos": 617982096384.0, + "grad_norm": 0.027176951765382908, + "language_loss": 0.9233678, + "learning_rate": 0.0009105366699694638, + "loss": 0.93502718, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.96679688, + "step": 1131, + "time_per_iteration": 2.7653839588165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166281, + "balance_loss_mlp": 1.06957853, + "epoch": 0.2177760677183532, + "flos": 636334712832.0, + "grad_norm": 0.021107298895209785, + "language_loss": 0.91459304, + "learning_rate": 0.0009103587548619439, + "loss": 0.92625588, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.96679688, + "step": 1132, + "time_per_iteration": 2.8519365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160184, + "balance_loss_mlp": 1.06367195, + "epoch": 0.2179684494036168, + "flos": 533596661760.0, + "grad_norm": 0.022551614427290693, + "language_loss": 0.95995569, + "learning_rate": 0.0009101806804390261, + "loss": 0.97155756, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.96484375, + "step": 1133, + "time_per_iteration": 2.8218026161193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163206, + "balance_loss_mlp": 1.06664658, + "epoch": 0.21816083108888035, + "flos": 476181471744.0, + "grad_norm": 0.0250418684782295, + "language_loss": 1.00355339, + "learning_rate": 0.0009100024467698453, + "loss": 1.01518536, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.96533203, + "step": 1134, + "time_per_iteration": 2.5639142990112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167151, + "balance_loss_mlp": 1.07059181, + "epoch": 0.2183532127741439, + "flos": 578546219520.0, + "grad_norm": 0.029194142239697657, + "language_loss": 0.95151818, + "learning_rate": 0.0009098240539235981, + "loss": 0.96318972, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.96533203, + "step": 1135, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162362, + "balance_loss_mlp": 1.06565976, + "epoch": 0.21854559445940747, + "flos": 595279369728.0, + "grad_norm": 0.022714398939090653, + "language_loss": 0.96190184, + "learning_rate": 0.0009096455019695423, + "loss": 0.9735254, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.96679688, + "step": 1136, + "time_per_iteration": 2.829479217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166866, + "balance_loss_mlp": 1.06997275, + "epoch": 0.21873797614467103, + "flos": 409549040640.0, + "grad_norm": 0.027737994351600712, + "language_loss": 1.01424551, + "learning_rate": 0.000909466790976998, + "loss": 1.02591419, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.96875, + "step": 1137, + "time_per_iteration": 2.4491164684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165251, + "balance_loss_mlp": 1.06869149, + "epoch": 0.21893035782993459, + "flos": 895654702080.0, + "grad_norm": 0.022710058353260835, + "language_loss": 0.90594929, + "learning_rate": 0.0009092879210153473, + "loss": 0.91760182, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.96533203, + "step": 1138, + "time_per_iteration": 3.155076503753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168627, + "balance_loss_mlp": 1.07192433, + "epoch": 0.21912273951519814, + "flos": 468568233984.0, + "grad_norm": 0.024281064631586205, + "language_loss": 0.97427768, + "learning_rate": 0.0009091088921540333, + "loss": 0.98596388, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.96679688, + "step": 1139, + "time_per_iteration": 2.5309600830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172859, + "balance_loss_mlp": 1.07711029, + "epoch": 0.2193151212004617, + "flos": 1535177407488.0, + "grad_norm": 0.009496329971255709, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76681536, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.95703125, + "step": 1140, + "time_per_iteration": 4.911335229873657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172401, + "balance_loss_mlp": 1.07569873, + "epoch": 0.2195075028857253, + "flos": 592274752512.0, + "grad_norm": 0.033335232647672346, + "language_loss": 0.95078719, + "learning_rate": 0.0009087503580104985, + "loss": 0.96251118, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.96679688, + "step": 1141, + "time_per_iteration": 2.7083888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169701, + "balance_loss_mlp": 1.07295096, + "epoch": 0.21969988457098885, + "flos": 637517749248.0, + "grad_norm": 0.02859165000671714, + "language_loss": 0.90439236, + "learning_rate": 0.0009085708528674728, + "loss": 0.91608942, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.96728516, + "step": 1142, + "time_per_iteration": 2.786891222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162201, + "balance_loss_mlp": 1.06549823, + "epoch": 0.2198922662562524, + "flos": 913859598336.0, + "grad_norm": 0.0328462843269242, + "language_loss": 0.98848528, + "learning_rate": 0.0009083911891031745, + "loss": 1.00010729, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.96679688, + "step": 1143, + "time_per_iteration": 3.1019930839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116483, + "balance_loss_mlp": 1.06793654, + "epoch": 0.22008464794151597, + "flos": 824494241280.0, + "grad_norm": 0.023913565571636344, + "language_loss": 1.01496291, + "learning_rate": 0.0009082113667873553, + "loss": 1.02661121, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.96875, + "step": 1144, + "time_per_iteration": 3.104292869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170855, + "balance_loss_mlp": 1.07405746, + "epoch": 0.22027702962677953, + "flos": 460618622976.0, + "grad_norm": 0.029355186834356364, + "language_loss": 1.00543249, + "learning_rate": 0.0009080313859898283, + "loss": 1.0171411, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.96777344, + "step": 1145, + "time_per_iteration": 2.552457332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170139, + "balance_loss_mlp": 1.07343698, + "epoch": 0.2204694113120431, + "flos": 532287372288.0, + "grad_norm": 0.025362278251747628, + "language_loss": 1.01871562, + "learning_rate": 0.0009078512467804684, + "loss": 1.03041708, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.96679688, + "step": 1146, + "time_per_iteration": 2.6138763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170493, + "balance_loss_mlp": 1.07379043, + "epoch": 0.22066179299730665, + "flos": 523686481920.0, + "grad_norm": 0.02553067563602684, + "language_loss": 1.00136042, + "learning_rate": 0.0009076709492292119, + "loss": 1.01306534, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.96679688, + "step": 1147, + "time_per_iteration": 2.6107985973358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163104, + "balance_loss_mlp": 1.0664016, + "epoch": 0.2208541746825702, + "flos": 547505115648.0, + "grad_norm": 0.02505349531569444, + "language_loss": 0.99364072, + "learning_rate": 0.0009074904934060562, + "loss": 1.00527167, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.96679688, + "step": 1148, + "time_per_iteration": 2.680250644683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166008, + "balance_loss_mlp": 1.06873322, + "epoch": 0.22104655636783377, + "flos": 710059358208.0, + "grad_norm": 0.023468083856487864, + "language_loss": 0.93112767, + "learning_rate": 0.0009073098793810607, + "loss": 0.94278765, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.97265625, + "step": 1149, + "time_per_iteration": 2.9064676761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165673, + "balance_loss_mlp": 1.06882739, + "epoch": 0.22123893805309736, + "flos": 585964073472.0, + "grad_norm": 0.028202445852463846, + "language_loss": 0.98436809, + "learning_rate": 0.000907129107224346, + "loss": 0.99602491, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.96826172, + "step": 1150, + "time_per_iteration": 2.670436382293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165906, + "balance_loss_mlp": 1.06901312, + "epoch": 0.22143131973836092, + "flos": 493250995200.0, + "grad_norm": 0.02267098136900654, + "language_loss": 0.95673937, + "learning_rate": 0.0009069481770060939, + "loss": 0.96839839, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.96875, + "step": 1151, + "time_per_iteration": 2.650136947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167632, + "balance_loss_mlp": 1.07092977, + "epoch": 0.22162370142362448, + "flos": 1081467623424.0, + "grad_norm": 0.023887201965423828, + "language_loss": 0.92357147, + "learning_rate": 0.000906767088796548, + "loss": 0.93524778, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.96679688, + "step": 1152, + "time_per_iteration": 3.4331767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174048, + "balance_loss_mlp": 1.07734585, + "epoch": 0.22181608310888803, + "flos": 493511506944.0, + "grad_norm": 0.021211000774135545, + "language_loss": 0.94297695, + "learning_rate": 0.0009065858426660127, + "loss": 0.9547174, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.96679688, + "step": 1153, + "time_per_iteration": 2.6492207050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171336, + "balance_loss_mlp": 1.07458591, + "epoch": 0.2220084647941516, + "flos": 725324765184.0, + "grad_norm": 0.02806046891368227, + "language_loss": 0.95655924, + "learning_rate": 0.0009064044386848543, + "loss": 0.96827257, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.96728516, + "step": 1154, + "time_per_iteration": 2.9135258197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116775, + "balance_loss_mlp": 1.07090425, + "epoch": 0.22220084647941515, + "flos": 490244376576.0, + "grad_norm": 0.029776005734579798, + "language_loss": 1.00600004, + "learning_rate": 0.0009062228769234997, + "loss": 1.01767755, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.96826172, + "step": 1155, + "time_per_iteration": 2.597781181335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171214, + "balance_loss_mlp": 1.07451141, + "epoch": 0.2223932281646787, + "flos": 537295492608.0, + "grad_norm": 0.030445586519746, + "language_loss": 0.93354964, + "learning_rate": 0.0009060411574524376, + "loss": 0.94526184, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.96679688, + "step": 1156, + "time_per_iteration": 2.7325634956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168314, + "balance_loss_mlp": 1.07151604, + "epoch": 0.22258560984994227, + "flos": 932967553536.0, + "grad_norm": 0.0275078677514356, + "language_loss": 0.98614538, + "learning_rate": 0.0009058592803422178, + "loss": 0.99782854, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.96777344, + "step": 1157, + "time_per_iteration": 3.156981945037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169861, + "balance_loss_mlp": 1.0739212, + "epoch": 0.22277799153520586, + "flos": 1202395286016.0, + "grad_norm": 0.00950920896526599, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79880148, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.95898438, + "step": 1158, + "time_per_iteration": 4.7935662269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.07421494, + "epoch": 0.22297037322046942, + "flos": 502316513280.0, + "grad_norm": 0.05502374006765337, + "language_loss": 0.97024429, + "learning_rate": 0.00090549505348681, + "loss": 0.98195159, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.96484375, + "step": 1159, + "time_per_iteration": 2.579418659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167768, + "balance_loss_mlp": 1.07135153, + "epoch": 0.22316275490573298, + "flos": 754112587776.0, + "grad_norm": 0.025312842068973822, + "language_loss": 0.9244132, + "learning_rate": 0.0009053127038830275, + "loss": 0.93609083, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.96386719, + "step": 1160, + "time_per_iteration": 2.970240592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169788, + "balance_loss_mlp": 1.07346714, + "epoch": 0.22335513659099654, + "flos": 515804000256.0, + "grad_norm": 0.02702757021011719, + "language_loss": 0.97474223, + "learning_rate": 0.000905130196922898, + "loss": 0.98644012, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.96289062, + "step": 1161, + "time_per_iteration": 2.558567762374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175493, + "balance_loss_mlp": 1.07917213, + "epoch": 0.2235475182762601, + "flos": 485507501568.0, + "grad_norm": 0.024760780359754056, + "language_loss": 0.947945, + "learning_rate": 0.0009049475326772769, + "loss": 0.95969993, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.96289062, + "step": 1162, + "time_per_iteration": 2.5948867797851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168008, + "balance_loss_mlp": 1.0716871, + "epoch": 0.22373989996152366, + "flos": 471067290624.0, + "grad_norm": 0.0243609738761747, + "language_loss": 0.92091036, + "learning_rate": 0.0009047647112170811, + "loss": 0.93259048, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.96289062, + "step": 1163, + "time_per_iteration": 2.7958250045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165002, + "balance_loss_mlp": 1.06868088, + "epoch": 0.22393228164678722, + "flos": 1273017807360.0, + "grad_norm": 0.0269563070164892, + "language_loss": 0.98098505, + "learning_rate": 0.0009045817326132876, + "loss": 0.99263507, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.96289062, + "step": 1164, + "time_per_iteration": 3.64853835105896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165462, + "balance_loss_mlp": 1.06914091, + "epoch": 0.22412466333205078, + "flos": 597467523072.0, + "grad_norm": 0.02771003139242203, + "language_loss": 0.94602239, + "learning_rate": 0.0009043985969369357, + "loss": 0.95767695, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.96289062, + "step": 1165, + "time_per_iteration": 2.8231425285339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175209, + "balance_loss_mlp": 1.07860184, + "epoch": 0.22431704501731436, + "flos": 609630984192.0, + "grad_norm": 0.02516811505749033, + "language_loss": 0.93514198, + "learning_rate": 0.0009042153042591245, + "loss": 0.94689411, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.96582031, + "step": 1166, + "time_per_iteration": 2.755671501159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174563, + "balance_loss_mlp": 1.07819414, + "epoch": 0.22450942670257792, + "flos": 908106872832.0, + "grad_norm": 0.024247493396408124, + "language_loss": 0.93277276, + "learning_rate": 0.0009040318546510146, + "loss": 0.94451833, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.96337891, + "step": 1167, + "time_per_iteration": 3.126707077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174214, + "balance_loss_mlp": 1.07770181, + "epoch": 0.22470180838784148, + "flos": 566380756992.0, + "grad_norm": 0.02335770706345326, + "language_loss": 0.94522464, + "learning_rate": 0.0009038482481838275, + "loss": 0.95696682, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.96484375, + "step": 1168, + "time_per_iteration": 2.6482362747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171371, + "balance_loss_mlp": 1.07485878, + "epoch": 0.22489419007310504, + "flos": 835917100032.0, + "grad_norm": 0.021740410096357694, + "language_loss": 0.9467479, + "learning_rate": 0.0009036644849288455, + "loss": 0.95846164, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.96484375, + "step": 1169, + "time_per_iteration": 3.0959203243255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168691, + "balance_loss_mlp": 1.07217908, + "epoch": 0.2250865717583686, + "flos": 582138989568.0, + "grad_norm": 0.028400846177611044, + "language_loss": 0.95971251, + "learning_rate": 0.0009034805649574118, + "loss": 0.97139943, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.96484375, + "step": 1170, + "time_per_iteration": 2.65209698677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171761, + "balance_loss_mlp": 1.07515407, + "epoch": 0.22527895344363216, + "flos": 601670639616.0, + "grad_norm": 0.021879369323455276, + "language_loss": 0.92857611, + "learning_rate": 0.0009032964883409308, + "loss": 0.94029367, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.96582031, + "step": 1171, + "time_per_iteration": 2.8586626052856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175461, + "balance_loss_mlp": 1.07990265, + "epoch": 0.22547133512889572, + "flos": 1443731959296.0, + "grad_norm": 0.011387534292379292, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74225998, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.95507812, + "step": 1172, + "time_per_iteration": 4.9882895946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171774, + "balance_loss_mlp": 1.07526255, + "epoch": 0.22566371681415928, + "flos": 491585866752.0, + "grad_norm": 0.025801800464723818, + "language_loss": 0.97062689, + "learning_rate": 0.0009029278654587462, + "loss": 0.98234463, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.96484375, + "step": 1173, + "time_per_iteration": 2.595419406890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171429, + "balance_loss_mlp": 1.07491696, + "epoch": 0.22585609849942284, + "flos": 605751505920.0, + "grad_norm": 0.02576863859493135, + "language_loss": 0.92400688, + "learning_rate": 0.0009027433193361548, + "loss": 0.93572116, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.96484375, + "step": 1174, + "time_per_iteration": 2.738267183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117476, + "balance_loss_mlp": 1.07824779, + "epoch": 0.22604848018468643, + "flos": 636727481856.0, + "grad_norm": 0.028952390928102957, + "language_loss": 0.97668821, + "learning_rate": 0.00090255861685474, + "loss": 0.98843575, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.96484375, + "step": 1175, + "time_per_iteration": 2.7286014556884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117152, + "balance_loss_mlp": 1.07481766, + "epoch": 0.22624086186995, + "flos": 480844486656.0, + "grad_norm": 0.027877026454804697, + "language_loss": 1.02366519, + "learning_rate": 0.0009023737580862095, + "loss": 1.03538048, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.96679688, + "step": 1176, + "time_per_iteration": 2.553281307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170774, + "balance_loss_mlp": 1.07388091, + "epoch": 0.22643324355521355, + "flos": 496806835200.0, + "grad_norm": 0.02249634447584531, + "language_loss": 0.90840948, + "learning_rate": 0.0009021887431023321, + "loss": 0.92011726, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.96875, + "step": 1177, + "time_per_iteration": 2.5862364768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172189, + "balance_loss_mlp": 1.07539093, + "epoch": 0.2266256252404771, + "flos": 562683927552.0, + "grad_norm": 0.02041789434880362, + "language_loss": 0.95725513, + "learning_rate": 0.0009020035719749369, + "loss": 0.96897697, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.96777344, + "step": 1178, + "time_per_iteration": 2.7553560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176357, + "balance_loss_mlp": 1.0796541, + "epoch": 0.22681800692574067, + "flos": 581032541184.0, + "grad_norm": 0.026733278329428435, + "language_loss": 0.89533567, + "learning_rate": 0.0009018182447759136, + "loss": 0.90709925, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.96679688, + "step": 1179, + "time_per_iteration": 3.012024402618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175508, + "balance_loss_mlp": 1.07904434, + "epoch": 0.22701038861100423, + "flos": 741465033216.0, + "grad_norm": 0.025064804828048133, + "language_loss": 0.90941453, + "learning_rate": 0.0009016327615772126, + "loss": 0.92116958, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.96435547, + "step": 1180, + "time_per_iteration": 2.969684600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172378, + "balance_loss_mlp": 1.07577109, + "epoch": 0.2272027702962678, + "flos": 578305173504.0, + "grad_norm": 0.036813558231106436, + "language_loss": 1.00164366, + "learning_rate": 0.0009014471224508451, + "loss": 1.01336741, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.96582031, + "step": 1181, + "time_per_iteration": 2.664487361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173976, + "balance_loss_mlp": 1.0774641, + "epoch": 0.22739515198153135, + "flos": 545290765824.0, + "grad_norm": 0.028585613124224512, + "language_loss": 0.95647848, + "learning_rate": 0.0009012613274688823, + "loss": 0.96821827, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.96484375, + "step": 1182, + "time_per_iteration": 2.647608518600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177492, + "balance_loss_mlp": 1.08078945, + "epoch": 0.22758753366679493, + "flos": 441091702272.0, + "grad_norm": 0.02755397132508441, + "language_loss": 1.00651419, + "learning_rate": 0.0009010753767034565, + "loss": 1.01828909, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.96679688, + "step": 1183, + "time_per_iteration": 2.528580904006958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176952, + "balance_loss_mlp": 1.08053601, + "epoch": 0.2277799153520585, + "flos": 730823709696.0, + "grad_norm": 0.024484618665474616, + "language_loss": 0.90051508, + "learning_rate": 0.0009008892702267599, + "loss": 0.91228461, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.96386719, + "step": 1184, + "time_per_iteration": 2.990344285964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_mlp": 1.08100891, + "epoch": 0.22797229703732205, + "flos": 527913067008.0, + "grad_norm": 0.030622621699729128, + "language_loss": 1.01022232, + "learning_rate": 0.0009007030081110457, + "loss": 1.02199566, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.96289062, + "step": 1185, + "time_per_iteration": 2.5795140266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172726, + "balance_loss_mlp": 1.07592821, + "epoch": 0.2281646787225856, + "flos": 536520688128.0, + "grad_norm": 0.026616575931436976, + "language_loss": 0.93079567, + "learning_rate": 0.000900516590428627, + "loss": 0.942523, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.96777344, + "step": 1186, + "time_per_iteration": 2.6647558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.07628405, + "epoch": 0.22835706040784917, + "flos": 542477529600.0, + "grad_norm": 0.02522496809839962, + "language_loss": 0.99033505, + "learning_rate": 0.0009003300172518778, + "loss": 1.00206637, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.96826172, + "step": 1187, + "time_per_iteration": 2.7046303749084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177624, + "balance_loss_mlp": 1.08073056, + "epoch": 0.22854944209311273, + "flos": 792004859904.0, + "grad_norm": 0.026332453075710083, + "language_loss": 0.94325852, + "learning_rate": 0.0009001432886532321, + "loss": 0.95503473, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.96875, + "step": 1188, + "time_per_iteration": 2.9583094120025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179036, + "balance_loss_mlp": 1.08233392, + "epoch": 0.2287418237783763, + "flos": 470215898112.0, + "grad_norm": 0.025775869396212594, + "language_loss": 0.97465944, + "learning_rate": 0.0008999564047051843, + "loss": 0.98644984, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.96679688, + "step": 1189, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178388, + "balance_loss_mlp": 1.08154237, + "epoch": 0.22893420546363985, + "flos": 469004663808.0, + "grad_norm": 0.023763579929190374, + "language_loss": 0.94691694, + "learning_rate": 0.0008997693654802894, + "loss": 0.95870078, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.96826172, + "step": 1190, + "time_per_iteration": 2.6276731491088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178257, + "balance_loss_mlp": 1.08145857, + "epoch": 0.22912658714890344, + "flos": 627401452032.0, + "grad_norm": 0.023724149848154047, + "language_loss": 0.95182133, + "learning_rate": 0.0008995821710511625, + "loss": 0.96360391, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.96777344, + "step": 1191, + "time_per_iteration": 2.756840705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117993, + "balance_loss_mlp": 1.08308399, + "epoch": 0.229318968834167, + "flos": 504020573184.0, + "grad_norm": 0.024708694220473774, + "language_loss": 0.93247074, + "learning_rate": 0.0008993948214904786, + "loss": 0.94427001, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.96826172, + "step": 1192, + "time_per_iteration": 2.577340602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190514, + "balance_loss_mlp": 1.09533691, + "epoch": 0.22951135051943056, + "flos": 1377713877504.0, + "grad_norm": 0.021264094300491608, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79612726, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.95117188, + "step": 1193, + "time_per_iteration": 4.850237607955933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179442, + "balance_loss_mlp": 1.08316851, + "epoch": 0.22970373220469412, + "flos": 645549952512.0, + "grad_norm": 0.02667568465905087, + "language_loss": 0.92540175, + "learning_rate": 0.0008990196572654427, + "loss": 0.93719625, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.96240234, + "step": 1194, + "time_per_iteration": 2.8638381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180858, + "balance_loss_mlp": 1.08453715, + "epoch": 0.22989611388995768, + "flos": 501272464896.0, + "grad_norm": 0.02416134539694475, + "language_loss": 0.95937514, + "learning_rate": 0.0008988318427467426, + "loss": 0.97118378, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.96289062, + "step": 1195, + "time_per_iteration": 2.7063868045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182741, + "balance_loss_mlp": 1.08589542, + "epoch": 0.23008849557522124, + "flos": 1098333030912.0, + "grad_norm": 0.02922856270819412, + "language_loss": 0.9667449, + "learning_rate": 0.0008986438733877887, + "loss": 0.97857237, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.96826172, + "step": 1196, + "time_per_iteration": 3.4508113861083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.08043683, + "epoch": 0.2302808772604848, + "flos": 684992560128.0, + "grad_norm": 0.022228440588834414, + "language_loss": 0.91545051, + "learning_rate": 0.0008984557492615576, + "loss": 0.92721808, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.96289062, + "step": 1197, + "time_per_iteration": 2.93611741065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08269298, + "epoch": 0.23047325894574835, + "flos": 529960230912.0, + "grad_norm": 0.026499525382426087, + "language_loss": 0.99148774, + "learning_rate": 0.0008982674704410854, + "loss": 1.0032779, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.96289062, + "step": 1198, + "time_per_iteration": 2.7032008171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180823, + "balance_loss_mlp": 1.08450174, + "epoch": 0.23066564063101191, + "flos": 684126431232.0, + "grad_norm": 0.025326379221325218, + "language_loss": 0.86113322, + "learning_rate": 0.0008980790369994682, + "loss": 0.87294143, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.96289062, + "step": 1199, + "time_per_iteration": 2.9629056453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173246, + "balance_loss_mlp": 1.07673466, + "epoch": 0.2308580223162755, + "flos": 559631646720.0, + "grad_norm": 0.02469990042405053, + "language_loss": 0.95889735, + "learning_rate": 0.000897890449009863, + "loss": 0.97062981, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.96484375, + "step": 1200, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178191, + "balance_loss_mlp": 1.08167911, + "epoch": 0.23105040400153906, + "flos": 556729087488.0, + "grad_norm": 0.021551459012756572, + "language_loss": 0.97633696, + "learning_rate": 0.0008977017065454853, + "loss": 0.98811877, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.96484375, + "step": 1201, + "time_per_iteration": 2.6586263179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176954, + "balance_loss_mlp": 1.08048964, + "epoch": 0.23124278568680262, + "flos": 706049624064.0, + "grad_norm": 0.025666519973580538, + "language_loss": 0.89963996, + "learning_rate": 0.0008975128096796121, + "loss": 0.9114095, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.96435547, + "step": 1202, + "time_per_iteration": 2.8599958419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175929, + "balance_loss_mlp": 1.07989419, + "epoch": 0.23143516737206618, + "flos": 613968359424.0, + "grad_norm": 0.02791489713026627, + "language_loss": 0.96485001, + "learning_rate": 0.0008973237584855794, + "loss": 0.97660929, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.95996094, + "step": 1203, + "time_per_iteration": 2.8814125061035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117493, + "balance_loss_mlp": 1.07903779, + "epoch": 0.23162754905732974, + "flos": 390095980032.0, + "grad_norm": 0.02381480195735972, + "language_loss": 0.91340852, + "learning_rate": 0.0008971345530367832, + "loss": 0.92515785, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.95849609, + "step": 1204, + "time_per_iteration": 2.513951301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176133, + "balance_loss_mlp": 1.08024144, + "epoch": 0.2318199307425933, + "flos": 668969086464.0, + "grad_norm": 0.024943516104182908, + "language_loss": 0.94778013, + "learning_rate": 0.0008969451934066799, + "loss": 0.95954144, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.95849609, + "step": 1205, + "time_per_iteration": 2.80454421043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173068, + "balance_loss_mlp": 1.07712853, + "epoch": 0.23201231242785686, + "flos": 667627596288.0, + "grad_norm": 0.029617322009159303, + "language_loss": 0.92493355, + "learning_rate": 0.0008967556796687854, + "loss": 0.93666422, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.95898438, + "step": 1206, + "time_per_iteration": 2.89932918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173146, + "balance_loss_mlp": 1.07720602, + "epoch": 0.23220469411312042, + "flos": 750094121472.0, + "grad_norm": 0.024264467100448908, + "language_loss": 0.94343531, + "learning_rate": 0.0008965660118966752, + "loss": 0.95516682, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.95898438, + "step": 1207, + "time_per_iteration": 2.9768385887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08307481, + "epoch": 0.232397075798384, + "flos": 668261411328.0, + "grad_norm": 0.02512248807118796, + "language_loss": 0.97498, + "learning_rate": 0.0008963761901639851, + "loss": 0.98677015, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.95898438, + "step": 1208, + "time_per_iteration": 2.8175342082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177539, + "balance_loss_mlp": 1.081599, + "epoch": 0.23258945748364757, + "flos": 611345777664.0, + "grad_norm": 0.025244332610569246, + "language_loss": 0.93465042, + "learning_rate": 0.0008961862145444103, + "loss": 0.9464258, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.95898438, + "step": 1209, + "time_per_iteration": 2.707583427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117491, + "balance_loss_mlp": 1.07901847, + "epoch": 0.23278183916891113, + "flos": 490672074240.0, + "grad_norm": 0.025133767455437463, + "language_loss": 0.96175104, + "learning_rate": 0.0008959960851117059, + "loss": 0.97350019, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.95849609, + "step": 1210, + "time_per_iteration": 2.5783777236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174943, + "balance_loss_mlp": 1.07895589, + "epoch": 0.23297422085417469, + "flos": 512673856512.0, + "grad_norm": 0.027877077505007057, + "language_loss": 0.94183683, + "learning_rate": 0.0008958058019396868, + "loss": 0.95358628, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.95947266, + "step": 1211, + "time_per_iteration": 2.7695388793945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118178, + "balance_loss_mlp": 1.08560216, + "epoch": 0.23316660253943824, + "flos": 547531312128.0, + "grad_norm": 0.0259067341075638, + "language_loss": 0.95459378, + "learning_rate": 0.0008956153651022274, + "loss": 0.96641153, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.96142578, + "step": 1212, + "time_per_iteration": 2.7088377475738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.08181643, + "epoch": 0.2333589842247018, + "flos": 511288705536.0, + "grad_norm": 0.023917692799316066, + "language_loss": 0.93208623, + "learning_rate": 0.0008954247746732618, + "loss": 0.94386959, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.96484375, + "step": 1213, + "time_per_iteration": 2.6319668292999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172909, + "balance_loss_mlp": 1.0766834, + "epoch": 0.23355136590996536, + "flos": 664406128128.0, + "grad_norm": 0.02356648487739955, + "language_loss": 0.98858505, + "learning_rate": 0.0008952340307267837, + "loss": 1.00031424, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.96191406, + "step": 1214, + "time_per_iteration": 2.891026735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172334, + "balance_loss_mlp": 1.07629859, + "epoch": 0.23374374759522892, + "flos": 509465123328.0, + "grad_norm": 0.027978905734491046, + "language_loss": 0.94424212, + "learning_rate": 0.0008950431333368468, + "loss": 0.95596552, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.95996094, + "step": 1215, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173288, + "balance_loss_mlp": 1.07730114, + "epoch": 0.2339361292804925, + "flos": 1296428209152.0, + "grad_norm": 0.026145796218117214, + "language_loss": 0.94705772, + "learning_rate": 0.0008948520825775634, + "loss": 0.95879066, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.95947266, + "step": 1216, + "time_per_iteration": 3.6343605518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174216, + "balance_loss_mlp": 1.07808566, + "epoch": 0.23412851096575607, + "flos": 707176264704.0, + "grad_norm": 0.02578801546488365, + "language_loss": 0.93516719, + "learning_rate": 0.0008946608785231067, + "loss": 0.94690937, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.9609375, + "step": 1217, + "time_per_iteration": 2.8923676013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174557, + "balance_loss_mlp": 1.07842624, + "epoch": 0.23432089265101963, + "flos": 439174794240.0, + "grad_norm": 0.024987781095147748, + "language_loss": 0.94467312, + "learning_rate": 0.0008944695212477084, + "loss": 0.95641869, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.9609375, + "step": 1218, + "time_per_iteration": 2.47641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176273, + "balance_loss_mlp": 1.08028615, + "epoch": 0.2345132743362832, + "flos": 481914731520.0, + "grad_norm": 0.02187031641141441, + "language_loss": 0.9320662, + "learning_rate": 0.0008942780108256599, + "loss": 0.94382894, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.95947266, + "step": 1219, + "time_per_iteration": 2.585204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176397, + "balance_loss_mlp": 1.07993269, + "epoch": 0.23470565602154675, + "flos": 412340809728.0, + "grad_norm": 0.02314471919225668, + "language_loss": 0.95930934, + "learning_rate": 0.0008940863473313121, + "loss": 0.97107327, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.96435547, + "step": 1220, + "time_per_iteration": 2.461904764175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174627, + "balance_loss_mlp": 1.07811534, + "epoch": 0.2348980377068103, + "flos": 546499998720.0, + "grad_norm": 0.029389735884218435, + "language_loss": 0.99771547, + "learning_rate": 0.0008938945308390756, + "loss": 1.00946164, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.96484375, + "step": 1221, + "time_per_iteration": 2.6403567790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179462, + "balance_loss_mlp": 1.08295047, + "epoch": 0.23509041939207387, + "flos": 576842159616.0, + "grad_norm": 0.023502241620232074, + "language_loss": 0.96374851, + "learning_rate": 0.00089370256142342, + "loss": 0.97554314, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.96484375, + "step": 1222, + "time_per_iteration": 2.7148585319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178637, + "balance_loss_mlp": 1.08198178, + "epoch": 0.23528280107733743, + "flos": 589947611136.0, + "grad_norm": 0.022852016666186668, + "language_loss": 0.93682569, + "learning_rate": 0.0008935104391588746, + "loss": 0.94861209, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.96630859, + "step": 1223, + "time_per_iteration": 2.7302677631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179799, + "balance_loss_mlp": 1.08338237, + "epoch": 0.235475182762601, + "flos": 824856811008.0, + "grad_norm": 0.02091323276417278, + "language_loss": 0.91087663, + "learning_rate": 0.0008933181641200276, + "loss": 0.9226746, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.96386719, + "step": 1224, + "time_per_iteration": 3.120337724685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183017, + "balance_loss_mlp": 1.08650565, + "epoch": 0.23566756444786457, + "flos": 681366862848.0, + "grad_norm": 0.027323039985709546, + "language_loss": 0.94355077, + "learning_rate": 0.0008931257363815271, + "loss": 0.95538092, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.96484375, + "step": 1225, + "time_per_iteration": 2.893202543258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178928, + "balance_loss_mlp": 1.08251154, + "epoch": 0.23585994613312813, + "flos": 703134329856.0, + "grad_norm": 0.022860929740297704, + "language_loss": 0.96590424, + "learning_rate": 0.0008929331560180798, + "loss": 0.97769356, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.96386719, + "step": 1226, + "time_per_iteration": 2.913858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176734, + "balance_loss_mlp": 1.08017468, + "epoch": 0.2360523278183917, + "flos": 525195158016.0, + "grad_norm": 0.02227272458953822, + "language_loss": 0.99194574, + "learning_rate": 0.0008927404231044525, + "loss": 1.00371313, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.96533203, + "step": 1227, + "time_per_iteration": 2.7194507122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175869, + "balance_loss_mlp": 1.07921374, + "epoch": 0.23624470950365525, + "flos": 525442934784.0, + "grad_norm": 0.02071878597098496, + "language_loss": 0.89412713, + "learning_rate": 0.0008925475377154703, + "loss": 0.90588582, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.96630859, + "step": 1228, + "time_per_iteration": 2.742506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175669, + "balance_loss_mlp": 1.07896686, + "epoch": 0.2364370911889188, + "flos": 597960348672.0, + "grad_norm": 0.023166098266421232, + "language_loss": 0.90900964, + "learning_rate": 0.0008923544999260183, + "loss": 0.92076635, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.96679688, + "step": 1229, + "time_per_iteration": 2.809842109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177841, + "balance_loss_mlp": 1.08113885, + "epoch": 0.23662947287418237, + "flos": 758171986944.0, + "grad_norm": 0.02725464196132968, + "language_loss": 1.00227833, + "learning_rate": 0.00089216130981104, + "loss": 1.0140568, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.96679688, + "step": 1230, + "time_per_iteration": 3.0096282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178297, + "balance_loss_mlp": 1.08159423, + "epoch": 0.23682185455944593, + "flos": 547207673856.0, + "grad_norm": 0.024713012089740163, + "language_loss": 0.91807795, + "learning_rate": 0.000891967967445539, + "loss": 0.92986089, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.96679688, + "step": 1231, + "time_per_iteration": 2.7001702785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185987, + "balance_loss_mlp": 1.08928442, + "epoch": 0.2370142362447095, + "flos": 663522534912.0, + "grad_norm": 0.02265672956199411, + "language_loss": 0.96654546, + "learning_rate": 0.0008917744729045772, + "loss": 0.97840536, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.96679688, + "step": 1232, + "time_per_iteration": 2.8703036308288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184505, + "balance_loss_mlp": 1.08789778, + "epoch": 0.23720661792997308, + "flos": 684911969280.0, + "grad_norm": 0.02632145570598456, + "language_loss": 0.93737417, + "learning_rate": 0.0008915808262632757, + "loss": 0.94921923, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.96582031, + "step": 1233, + "time_per_iteration": 2.839534044265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185605, + "balance_loss_mlp": 1.08928347, + "epoch": 0.23739899961523664, + "flos": 560022414336.0, + "grad_norm": 0.027552675935845497, + "language_loss": 1.01508975, + "learning_rate": 0.0008913870275968148, + "loss": 1.02694583, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.96289062, + "step": 1234, + "time_per_iteration": 2.7176129817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182161, + "balance_loss_mlp": 1.08545852, + "epoch": 0.2375913813005002, + "flos": 891163602432.0, + "grad_norm": 0.02404650352203449, + "language_loss": 0.9583261, + "learning_rate": 0.0008911930769804342, + "loss": 0.97014773, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.96679688, + "step": 1235, + "time_per_iteration": 3.244257688522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179697, + "balance_loss_mlp": 1.08289862, + "epoch": 0.23778376298576376, + "flos": 642365414400.0, + "grad_norm": 0.020226791074773265, + "language_loss": 0.99461335, + "learning_rate": 0.0008909989744894318, + "loss": 1.00641024, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.96777344, + "step": 1236, + "time_per_iteration": 2.8618855476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179012, + "balance_loss_mlp": 1.08230948, + "epoch": 0.23797614467102732, + "flos": 617945166336.0, + "grad_norm": 0.025060145140963254, + "language_loss": 0.91887248, + "learning_rate": 0.0008908047201991649, + "loss": 0.93066257, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.96679688, + "step": 1237, + "time_per_iteration": 2.7335665225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177715, + "balance_loss_mlp": 1.08120298, + "epoch": 0.23816852635629088, + "flos": 625463076864.0, + "grad_norm": 0.02188809519195417, + "language_loss": 0.92642158, + "learning_rate": 0.0008906103141850502, + "loss": 0.93819869, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.96484375, + "step": 1238, + "time_per_iteration": 2.9244723320007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178141, + "balance_loss_mlp": 1.0816294, + "epoch": 0.23836090804155444, + "flos": 522440318976.0, + "grad_norm": 0.025638098136730073, + "language_loss": 0.97356987, + "learning_rate": 0.0008904157565225621, + "loss": 0.98535126, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.96484375, + "step": 1239, + "time_per_iteration": 2.6046018600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186867, + "balance_loss_mlp": 1.09059334, + "epoch": 0.238553289726818, + "flos": 1155854281728.0, + "grad_norm": 0.0279922632366243, + "language_loss": 0.91224372, + "learning_rate": 0.000890221047287235, + "loss": 0.92411238, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.96240234, + "step": 1240, + "time_per_iteration": 3.503387928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.09512079, + "epoch": 0.23874567141208156, + "flos": 500909895168.0, + "grad_norm": 0.02294407067471098, + "language_loss": 0.98687088, + "learning_rate": 0.0008900261865546615, + "loss": 0.99878532, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.96289062, + "step": 1241, + "time_per_iteration": 2.6329948902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188291, + "balance_loss_mlp": 1.09197009, + "epoch": 0.23893805309734514, + "flos": 558049110528.0, + "grad_norm": 0.02727719764566138, + "language_loss": 0.96105886, + "learning_rate": 0.0008898311744004936, + "loss": 0.97294176, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.96289062, + "step": 1242, + "time_per_iteration": 2.6852729320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011866, + "balance_loss_mlp": 1.0902791, + "epoch": 0.2391304347826087, + "flos": 550316350464.0, + "grad_norm": 0.023767912183342704, + "language_loss": 0.95555472, + "learning_rate": 0.0008896360109004414, + "loss": 0.9674207, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.96289062, + "step": 1243, + "time_per_iteration": 2.6607675552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181953, + "balance_loss_mlp": 1.08558464, + "epoch": 0.23932281646787226, + "flos": 517078361088.0, + "grad_norm": 0.022492500831292953, + "language_loss": 0.92156398, + "learning_rate": 0.0008894406961302742, + "loss": 0.93338358, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.96337891, + "step": 1244, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180796, + "balance_loss_mlp": 1.0844276, + "epoch": 0.23951519815313582, + "flos": 745001407488.0, + "grad_norm": 0.0220414301985699, + "language_loss": 0.9171226, + "learning_rate": 0.0008892452301658201, + "loss": 0.92893052, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.96337891, + "step": 1245, + "time_per_iteration": 2.987859010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189345, + "balance_loss_mlp": 1.09302354, + "epoch": 0.23970757983839938, + "flos": 555174749184.0, + "grad_norm": 0.02624868476300941, + "language_loss": 0.92775297, + "learning_rate": 0.0008890496130829653, + "loss": 0.93964636, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.96289062, + "step": 1246, + "time_per_iteration": 2.7285211086273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011891, + "balance_loss_mlp": 1.09287417, + "epoch": 0.23989996152366294, + "flos": 481617289728.0, + "grad_norm": 0.024405638758005322, + "language_loss": 0.93939734, + "learning_rate": 0.0008888538449576555, + "loss": 0.95128834, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.96191406, + "step": 1247, + "time_per_iteration": 2.603447675704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181648, + "balance_loss_mlp": 1.08532703, + "epoch": 0.2400923432089265, + "flos": 486280304640.0, + "grad_norm": 0.02551404288502155, + "language_loss": 0.9456799, + "learning_rate": 0.0008886579258658944, + "loss": 0.9574964, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.96289062, + "step": 1248, + "time_per_iteration": 2.6195995807647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183672, + "balance_loss_mlp": 1.08735096, + "epoch": 0.24028472489419006, + "flos": 624792331776.0, + "grad_norm": 0.02192042043345247, + "language_loss": 0.93244678, + "learning_rate": 0.0008884618558837446, + "loss": 0.94428349, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.96289062, + "step": 1249, + "time_per_iteration": 2.830350399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187022, + "balance_loss_mlp": 1.09113026, + "epoch": 0.24047710657945365, + "flos": 602808013824.0, + "grad_norm": 0.023766863499936387, + "language_loss": 0.96457344, + "learning_rate": 0.0008882656350873273, + "loss": 0.97644365, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.95849609, + "step": 1250, + "time_per_iteration": 2.8691956996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119127, + "balance_loss_mlp": 1.09547377, + "epoch": 0.2406694882647172, + "flos": 843000582144.0, + "grad_norm": 0.03001641023469985, + "language_loss": 1.00300837, + "learning_rate": 0.0008880692635528219, + "loss": 1.01492119, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.95751953, + "step": 1251, + "time_per_iteration": 3.066152572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187351, + "balance_loss_mlp": 1.09155416, + "epoch": 0.24086186994998077, + "flos": 528134647296.0, + "grad_norm": 0.026461260661865858, + "language_loss": 0.98557454, + "learning_rate": 0.0008878727413564669, + "loss": 0.99744809, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.95751953, + "step": 1252, + "time_per_iteration": 2.7665653228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.11519623, + "epoch": 0.24105425163524433, + "flos": 1341459262464.0, + "grad_norm": 0.018061169603452644, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81344825, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.93945312, + "step": 1253, + "time_per_iteration": 4.899695634841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182732, + "balance_loss_mlp": 1.08679259, + "epoch": 0.24124663332050789, + "flos": 615227257344.0, + "grad_norm": 0.02599071752574661, + "language_loss": 0.90657973, + "learning_rate": 0.0008874792452834528, + "loss": 0.91840708, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.95898438, + "step": 1254, + "time_per_iteration": 2.7407760620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179855, + "balance_loss_mlp": 1.08401072, + "epoch": 0.24143901500577145, + "flos": 576592381440.0, + "grad_norm": 0.0285281411485809, + "language_loss": 0.99380314, + "learning_rate": 0.0008872822715595626, + "loss": 1.00560164, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.95800781, + "step": 1255, + "time_per_iteration": 2.7094287872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176059, + "balance_loss_mlp": 1.08007157, + "epoch": 0.241631396691035, + "flos": 496146823680.0, + "grad_norm": 0.026934202036951318, + "language_loss": 0.98012596, + "learning_rate": 0.0008870851474793598, + "loss": 0.9918865, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.95947266, + "step": 1256, + "time_per_iteration": 2.5717930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180992, + "balance_loss_mlp": 1.08500445, + "epoch": 0.24182377837629856, + "flos": 637396225536.0, + "grad_norm": 0.02721147411023071, + "language_loss": 0.97604549, + "learning_rate": 0.0008868878731193752, + "loss": 0.98785543, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.95947266, + "step": 1257, + "time_per_iteration": 2.835613965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180736, + "balance_loss_mlp": 1.08460534, + "epoch": 0.24201616006156215, + "flos": 516349218816.0, + "grad_norm": 0.023847715865297152, + "language_loss": 0.9613235, + "learning_rate": 0.0008866904485561973, + "loss": 0.97313088, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.9609375, + "step": 1258, + "time_per_iteration": 2.697693347930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182815, + "balance_loss_mlp": 1.08682752, + "epoch": 0.2422085417468257, + "flos": 616378093056.0, + "grad_norm": 0.023106527532664196, + "language_loss": 0.92363685, + "learning_rate": 0.000886492873866473, + "loss": 0.93546498, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.95947266, + "step": 1259, + "time_per_iteration": 2.8120577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118033, + "balance_loss_mlp": 1.08424771, + "epoch": 0.24240092343208927, + "flos": 586912794624.0, + "grad_norm": 0.025402415625288076, + "language_loss": 0.9586736, + "learning_rate": 0.000886295149126908, + "loss": 0.97047698, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.96044922, + "step": 1260, + "time_per_iteration": 2.7276840209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184073, + "balance_loss_mlp": 1.08813286, + "epoch": 0.24259330511735283, + "flos": 763570874880.0, + "grad_norm": 0.0207328591517146, + "language_loss": 0.94417751, + "learning_rate": 0.0008860972744142655, + "loss": 0.95601827, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.95898438, + "step": 1261, + "time_per_iteration": 2.898794412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184052, + "balance_loss_mlp": 1.08816016, + "epoch": 0.2427856868026164, + "flos": 628133322240.0, + "grad_norm": 0.02409331705070074, + "language_loss": 0.89591467, + "learning_rate": 0.0008858992498053671, + "loss": 0.90775526, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.95849609, + "step": 1262, + "time_per_iteration": 2.8477351665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183746, + "balance_loss_mlp": 1.08952332, + "epoch": 0.24297806848787995, + "flos": 1514919343104.0, + "grad_norm": 0.012580587939111834, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77772498, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.94140625, + "step": 1263, + "time_per_iteration": 4.826787710189819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180613, + "balance_loss_mlp": 1.0848639, + "epoch": 0.2431704501731435, + "flos": 543072413184.0, + "grad_norm": 0.025826560533695943, + "language_loss": 0.92586392, + "learning_rate": 0.0008855027512063817, + "loss": 0.93767005, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.95703125, + "step": 1264, + "time_per_iteration": 2.722557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179682, + "balance_loss_mlp": 1.08364689, + "epoch": 0.24336283185840707, + "flos": 524878250496.0, + "grad_norm": 0.025894380889017608, + "language_loss": 0.95614499, + "learning_rate": 0.0008853042773702292, + "loss": 0.96794176, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.95996094, + "step": 1265, + "time_per_iteration": 2.7258307933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118145, + "balance_loss_mlp": 1.0855577, + "epoch": 0.24355521354367063, + "flos": 538205282304.0, + "grad_norm": 0.022817154468993458, + "language_loss": 0.98287719, + "learning_rate": 0.0008851056539456896, + "loss": 0.99469173, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.95849609, + "step": 1266, + "time_per_iteration": 2.6970114707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182961, + "balance_loss_mlp": 1.08692622, + "epoch": 0.24374759522893422, + "flos": 932108155392.0, + "grad_norm": 0.024066297062525326, + "language_loss": 0.9148944, + "learning_rate": 0.0008849068810098755, + "loss": 0.92672402, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.95996094, + "step": 1267, + "time_per_iteration": 3.326692819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118368, + "balance_loss_mlp": 1.08764458, + "epoch": 0.24393997691419778, + "flos": 428685193728.0, + "grad_norm": 0.027357648838687767, + "language_loss": 0.94001949, + "learning_rate": 0.0008847079586399575, + "loss": 0.95185632, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.95996094, + "step": 1268, + "time_per_iteration": 2.466787099838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180763, + "balance_loss_mlp": 1.08482289, + "epoch": 0.24413235859946134, + "flos": 579942104064.0, + "grad_norm": 0.026150492080556795, + "language_loss": 0.95411992, + "learning_rate": 0.0008845088869131641, + "loss": 0.96592754, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.95898438, + "step": 1269, + "time_per_iteration": 2.7016899585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175832, + "balance_loss_mlp": 1.07989287, + "epoch": 0.2443247402847249, + "flos": 530900219904.0, + "grad_norm": 0.025309414349457434, + "language_loss": 0.98951483, + "learning_rate": 0.0008843096659067818, + "loss": 1.00127316, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.95898438, + "step": 1270, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179701, + "balance_loss_mlp": 1.08366621, + "epoch": 0.24451712196998845, + "flos": 697624651776.0, + "grad_norm": 0.020400222299851913, + "language_loss": 0.92813951, + "learning_rate": 0.000884110295698155, + "loss": 0.93993652, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.95996094, + "step": 1271, + "time_per_iteration": 2.945749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180344, + "balance_loss_mlp": 1.08435643, + "epoch": 0.24470950365525201, + "flos": 530863289856.0, + "grad_norm": 0.02434814436965663, + "language_loss": 0.97428346, + "learning_rate": 0.0008839107763646861, + "loss": 0.98608696, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.95947266, + "step": 1272, + "time_per_iteration": 2.5816495418548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182389, + "balance_loss_mlp": 1.08630657, + "epoch": 0.24490188534051557, + "flos": 492347936256.0, + "grad_norm": 0.027277570267404832, + "language_loss": 1.00778949, + "learning_rate": 0.0008837111079838353, + "loss": 1.0196135, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.96044922, + "step": 1273, + "time_per_iteration": 2.675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182001, + "balance_loss_mlp": 1.08587062, + "epoch": 0.24509426702577913, + "flos": 475111226880.0, + "grad_norm": 0.024851656777491255, + "language_loss": 0.98025054, + "learning_rate": 0.000883511290633121, + "loss": 0.99207056, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.9609375, + "step": 1274, + "time_per_iteration": 2.5230517387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183988, + "balance_loss_mlp": 1.08747613, + "epoch": 0.24528664871104272, + "flos": 551647107072.0, + "grad_norm": 0.02070792437524093, + "language_loss": 1.00507927, + "learning_rate": 0.000883311324390119, + "loss": 1.01691914, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.96484375, + "step": 1275, + "time_per_iteration": 2.690488338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.08887982, + "epoch": 0.24547903039630628, + "flos": 827335675392.0, + "grad_norm": 0.02978995697497926, + "language_loss": 0.95172417, + "learning_rate": 0.0008831112093324629, + "loss": 0.96357232, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.95898438, + "step": 1276, + "time_per_iteration": 3.0883522033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184816, + "balance_loss_mlp": 1.08839917, + "epoch": 0.24567141208156984, + "flos": 592693718016.0, + "grad_norm": 0.026400385967418116, + "language_loss": 0.99731994, + "learning_rate": 0.0008829109455378444, + "loss": 1.00916803, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.96386719, + "step": 1277, + "time_per_iteration": 2.670658588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184585, + "balance_loss_mlp": 1.08812118, + "epoch": 0.2458637937668334, + "flos": 548929198080.0, + "grad_norm": 0.022333419000210953, + "language_loss": 0.95654261, + "learning_rate": 0.000882710533084013, + "loss": 0.96838844, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.96435547, + "step": 1278, + "time_per_iteration": 2.641019344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189057, + "balance_loss_mlp": 1.09244978, + "epoch": 0.24605617545209696, + "flos": 516911175168.0, + "grad_norm": 0.022487969609205835, + "language_loss": 0.97332817, + "learning_rate": 0.0008825099720487755, + "loss": 0.98521876, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.96582031, + "step": 1279, + "time_per_iteration": 2.626079559326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193596, + "balance_loss_mlp": 1.09880066, + "epoch": 0.24624855713736052, + "flos": 1515058331136.0, + "grad_norm": 0.0162275920205478, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76454735, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.94726562, + "step": 1280, + "time_per_iteration": 4.846211671829224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118811, + "balance_loss_mlp": 1.09350586, + "epoch": 0.24644093882262408, + "flos": 1530746706432.0, + "grad_norm": 0.013716798372908724, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79132223, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.9453125, + "step": 1281, + "time_per_iteration": 4.781409025192261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189694, + "balance_loss_mlp": 1.09351575, + "epoch": 0.24663332050788764, + "flos": 660348730368.0, + "grad_norm": 0.028995521048395968, + "language_loss": 0.998649, + "learning_rate": 0.0008819073982335619, + "loss": 1.01054597, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.96142578, + "step": 1282, + "time_per_iteration": 2.873255729675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187163, + "balance_loss_mlp": 1.09098482, + "epoch": 0.24682570219315123, + "flos": 542805170688.0, + "grad_norm": 0.0289675073475646, + "language_loss": 0.92590028, + "learning_rate": 0.0008817062436519235, + "loss": 0.93777192, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.96142578, + "step": 1283, + "time_per_iteration": 2.6918435096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08852112, + "epoch": 0.24701808387841478, + "flos": 441658387968.0, + "grad_norm": 0.027350099061339322, + "language_loss": 1.00939846, + "learning_rate": 0.0008815049408787788, + "loss": 1.02124548, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.96142578, + "step": 1284, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190183, + "balance_loss_mlp": 1.09443462, + "epoch": 0.24721046556367834, + "flos": 469032861696.0, + "grad_norm": 0.028209143321693456, + "language_loss": 0.95635927, + "learning_rate": 0.0008813034899922805, + "loss": 0.96826112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.95703125, + "step": 1285, + "time_per_iteration": 2.5152530670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193087, + "balance_loss_mlp": 1.09729075, + "epoch": 0.2474028472489419, + "flos": 505407725568.0, + "grad_norm": 0.027111907557838905, + "language_loss": 1.01196301, + "learning_rate": 0.0008811018910706387, + "loss": 1.02389383, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.95751953, + "step": 1286, + "time_per_iteration": 2.5593316555023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_mlp": 1.09255612, + "epoch": 0.24759522893420546, + "flos": 480955276800.0, + "grad_norm": 0.03276846828627927, + "language_loss": 0.9498859, + "learning_rate": 0.0008809001441921211, + "loss": 0.96176893, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.95703125, + "step": 1287, + "time_per_iteration": 2.7347421646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181619, + "balance_loss_mlp": 1.08567917, + "epoch": 0.24778761061946902, + "flos": 534753501696.0, + "grad_norm": 0.025262665654883373, + "language_loss": 0.97019696, + "learning_rate": 0.0008806982494350528, + "loss": 0.98201311, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.95898438, + "step": 1288, + "time_per_iteration": 2.6499245166778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181206, + "balance_loss_mlp": 1.08526671, + "epoch": 0.24797999230473258, + "flos": 560942937600.0, + "grad_norm": 0.021558514258727474, + "language_loss": 0.9849534, + "learning_rate": 0.0008804962068778161, + "loss": 0.99676538, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.95898438, + "step": 1289, + "time_per_iteration": 2.852257490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186476, + "balance_loss_mlp": 1.09053683, + "epoch": 0.24817237398999614, + "flos": 625480541184.0, + "grad_norm": 0.024913990838324927, + "language_loss": 0.90269625, + "learning_rate": 0.0008802940165988511, + "loss": 0.91456103, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.95898438, + "step": 1290, + "time_per_iteration": 2.846277952194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181135, + "balance_loss_mlp": 1.08471859, + "epoch": 0.2483647556752597, + "flos": 613484265984.0, + "grad_norm": 0.02310813532639645, + "language_loss": 0.96774852, + "learning_rate": 0.000880091678676655, + "loss": 0.97955984, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.96386719, + "step": 1291, + "time_per_iteration": 2.8085777759552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180122, + "balance_loss_mlp": 1.0837059, + "epoch": 0.2485571373605233, + "flos": 584687711232.0, + "grad_norm": 0.021422688776258386, + "language_loss": 0.9855839, + "learning_rate": 0.0008798891931897821, + "loss": 0.99738514, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.96386719, + "step": 1292, + "time_per_iteration": 2.7361133098602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183371, + "balance_loss_mlp": 1.08704984, + "epoch": 0.24874951904578685, + "flos": 495736590336.0, + "grad_norm": 0.02424073807687162, + "language_loss": 0.92916596, + "learning_rate": 0.0008796865602168447, + "loss": 0.94099975, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.96289062, + "step": 1293, + "time_per_iteration": 2.5220131874084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186197, + "balance_loss_mlp": 1.09025729, + "epoch": 0.2489419007310504, + "flos": 457173573120.0, + "grad_norm": 0.023099031146870112, + "language_loss": 0.94818902, + "learning_rate": 0.0008794837798365115, + "loss": 0.96005094, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.95898438, + "step": 1294, + "time_per_iteration": 2.6338109970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187707, + "balance_loss_mlp": 1.09191012, + "epoch": 0.24913428241631397, + "flos": 486565011456.0, + "grad_norm": 0.02215078033303108, + "language_loss": 0.96107936, + "learning_rate": 0.0008792808521275089, + "loss": 0.97295642, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.95751953, + "step": 1295, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182687, + "balance_loss_mlp": 1.0869385, + "epoch": 0.24932666410157753, + "flos": 519917793792.0, + "grad_norm": 0.022601932216391857, + "language_loss": 0.96075213, + "learning_rate": 0.0008790777771686206, + "loss": 0.972579, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.95703125, + "step": 1296, + "time_per_iteration": 2.5746819972991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.08610308, + "epoch": 0.2495190457868411, + "flos": 473556888576.0, + "grad_norm": 0.022656020732285023, + "language_loss": 0.93397439, + "learning_rate": 0.0008788745550386872, + "loss": 0.94579285, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.95703125, + "step": 1297, + "time_per_iteration": 2.55985689163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177725, + "balance_loss_mlp": 1.0820719, + "epoch": 0.24971142747210465, + "flos": 747198292992.0, + "grad_norm": 0.023996141347128058, + "language_loss": 0.88372529, + "learning_rate": 0.0008786711858166063, + "loss": 0.89550251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.95605469, + "step": 1298, + "time_per_iteration": 2.9357082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179743, + "balance_loss_mlp": 1.08399367, + "epoch": 0.2499038091573682, + "flos": 750901853184.0, + "grad_norm": 0.025666304870509565, + "language_loss": 0.93355387, + "learning_rate": 0.0008784676695813332, + "loss": 0.9453513, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.95703125, + "step": 1299, + "time_per_iteration": 2.939739942550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187708, + "balance_loss_mlp": 1.09186363, + "epoch": 0.2500961908426318, + "flos": 746342897664.0, + "grad_norm": 0.02448521774653795, + "language_loss": 0.94308037, + "learning_rate": 0.0008782640064118796, + "loss": 0.95495749, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.95800781, + "step": 1300, + "time_per_iteration": 2.882838249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223068, + "balance_loss_mlp": 1.12808228, + "epoch": 0.2502885725278953, + "flos": 1420523672064.0, + "grad_norm": 0.019515623701574104, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77407825, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.94921875, + "step": 1301, + "time_per_iteration": 5.002445220947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180814, + "balance_loss_mlp": 1.08520806, + "epoch": 0.2504809542131589, + "flos": 516231697920.0, + "grad_norm": 0.028413107884204602, + "language_loss": 0.96116567, + "learning_rate": 0.0008778562395867648, + "loss": 0.97297382, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.95556641, + "step": 1302, + "time_per_iteration": 2.6463139057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183586, + "balance_loss_mlp": 1.08783746, + "epoch": 0.25067333589842244, + "flos": 526851554304.0, + "grad_norm": 0.024791221234372676, + "language_loss": 0.9191972, + "learning_rate": 0.0008776521360894127, + "loss": 0.93103302, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.95703125, + "step": 1303, + "time_per_iteration": 2.60622239112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203766, + "balance_loss_mlp": 1.10897064, + "epoch": 0.25086571758368603, + "flos": 1477157326848.0, + "grad_norm": 0.014632010139538269, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80165827, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.94726562, + "step": 1304, + "time_per_iteration": 4.810328006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188508, + "balance_loss_mlp": 1.09285462, + "epoch": 0.2510580992689496, + "flos": 529402277376.0, + "grad_norm": 0.027485922989720333, + "language_loss": 0.99458921, + "learning_rate": 0.0008772434893213186, + "loss": 1.00647426, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.95605469, + "step": 1305, + "time_per_iteration": 2.6031458377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.09155023, + "epoch": 0.25125048095421315, + "flos": 518465513472.0, + "grad_norm": 0.0302061265456268, + "language_loss": 0.93206942, + "learning_rate": 0.0008770389462092276, + "loss": 0.94393957, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.95410156, + "step": 1306, + "time_per_iteration": 2.636845827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118174, + "balance_loss_mlp": 1.0858953, + "epoch": 0.25144286263947674, + "flos": 621674923008.0, + "grad_norm": 0.026354631998576704, + "language_loss": 0.96568018, + "learning_rate": 0.0008768342567176357, + "loss": 0.97749758, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.95800781, + "step": 1307, + "time_per_iteration": 2.797346591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187952, + "balance_loss_mlp": 1.09220326, + "epoch": 0.25163524432474027, + "flos": 504865234944.0, + "grad_norm": 0.024318536510777332, + "language_loss": 0.99895847, + "learning_rate": 0.0008766294209260107, + "loss": 1.01083803, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.95703125, + "step": 1308, + "time_per_iteration": 2.648099184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180717, + "balance_loss_mlp": 1.0850637, + "epoch": 0.25182762601000386, + "flos": 510079472640.0, + "grad_norm": 0.027727924866539442, + "language_loss": 1.0231359, + "learning_rate": 0.0008764244389138767, + "loss": 1.0349431, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.95605469, + "step": 1309, + "time_per_iteration": 2.575963258743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.08396196, + "epoch": 0.2520200076952674, + "flos": 635097282048.0, + "grad_norm": 0.028356059247082867, + "language_loss": 0.93336231, + "learning_rate": 0.000876219310760815, + "loss": 0.94515896, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.95654297, + "step": 1310, + "time_per_iteration": 2.8647706508636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189244, + "balance_loss_mlp": 1.09330475, + "epoch": 0.252212389380531, + "flos": 495651996672.0, + "grad_norm": 0.024396868749396446, + "language_loss": 0.91954494, + "learning_rate": 0.0008760140365464631, + "loss": 0.93143737, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.95898438, + "step": 1311, + "time_per_iteration": 2.592453718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180261, + "balance_loss_mlp": 1.08451247, + "epoch": 0.2524047710657945, + "flos": 491529470976.0, + "grad_norm": 0.026197758988141227, + "language_loss": 0.97483641, + "learning_rate": 0.0008758086163505156, + "loss": 0.98663902, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.95703125, + "step": 1312, + "time_per_iteration": 2.56319260597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181231, + "balance_loss_mlp": 1.08548176, + "epoch": 0.2525971527510581, + "flos": 648612966912.0, + "grad_norm": 0.0242630752619845, + "language_loss": 0.98733318, + "learning_rate": 0.0008756030502527239, + "loss": 0.99914545, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.95703125, + "step": 1313, + "time_per_iteration": 2.858691930770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180546, + "balance_loss_mlp": 1.08455837, + "epoch": 0.2527895344363217, + "flos": 570373026816.0, + "grad_norm": 0.025539383487616106, + "language_loss": 0.99746555, + "learning_rate": 0.0008753973383328954, + "loss": 1.00927103, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.95947266, + "step": 1314, + "time_per_iteration": 2.6683549880981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180137, + "balance_loss_mlp": 1.0841974, + "epoch": 0.2529819161215852, + "flos": 515068127232.0, + "grad_norm": 0.027266475314614652, + "language_loss": 0.95154297, + "learning_rate": 0.0008751914806708952, + "loss": 0.96334434, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.95898438, + "step": 1315, + "time_per_iteration": 2.6008012294769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178852, + "balance_loss_mlp": 1.08310342, + "epoch": 0.2531742978068488, + "flos": 532350498816.0, + "grad_norm": 0.02508848621911812, + "language_loss": 0.91122246, + "learning_rate": 0.0008749854773466439, + "loss": 0.92301095, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.95703125, + "step": 1316, + "time_per_iteration": 2.6595401763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193828, + "balance_loss_mlp": 1.09822178, + "epoch": 0.25336667949211233, + "flos": 597747500544.0, + "grad_norm": 0.027675397486347803, + "language_loss": 0.92894816, + "learning_rate": 0.0008747793284401192, + "loss": 0.9408865, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.95556641, + "step": 1317, + "time_per_iteration": 2.6975109577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187696, + "balance_loss_mlp": 1.09175622, + "epoch": 0.2535590611773759, + "flos": 603255177216.0, + "grad_norm": 0.02603186041930466, + "language_loss": 0.95462376, + "learning_rate": 0.0008745730340313551, + "loss": 0.96650076, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.95898438, + "step": 1318, + "time_per_iteration": 2.805327892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187328, + "balance_loss_mlp": 1.0915786, + "epoch": 0.25375144286263945, + "flos": 496322741760.0, + "grad_norm": 0.027049333310240738, + "language_loss": 0.95645851, + "learning_rate": 0.0008743665942004422, + "loss": 0.96833169, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.95703125, + "step": 1319, + "time_per_iteration": 2.6340737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185781, + "balance_loss_mlp": 1.0896982, + "epoch": 0.25394382454790304, + "flos": 513476858880.0, + "grad_norm": 0.02784781206620994, + "language_loss": 1.02473438, + "learning_rate": 0.0008741600090275277, + "loss": 1.03659225, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.96044922, + "step": 1320, + "time_per_iteration": 2.573155641555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183427, + "balance_loss_mlp": 1.08763099, + "epoch": 0.25413620623316663, + "flos": 960855045120.0, + "grad_norm": 0.03323105604734599, + "language_loss": 0.94160318, + "learning_rate": 0.0008739532785928151, + "loss": 0.95343745, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.95751953, + "step": 1321, + "time_per_iteration": 3.470245122909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190819, + "balance_loss_mlp": 1.09659576, + "epoch": 0.25432858791843016, + "flos": 1580648715264.0, + "grad_norm": 0.017424496497570757, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76084399, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.94140625, + "step": 1322, + "time_per_iteration": 4.8549723625183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184096, + "balance_loss_mlp": 1.08806074, + "epoch": 0.25452096960369375, + "flos": 584893828608.0, + "grad_norm": 0.025099574916072127, + "language_loss": 0.94150972, + "learning_rate": 0.0008735393822590908, + "loss": 0.95335066, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.95996094, + "step": 1323, + "time_per_iteration": 2.6771461963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187145, + "balance_loss_mlp": 1.0910151, + "epoch": 0.2547133512889573, + "flos": 509641041408.0, + "grad_norm": 0.024104352127734364, + "language_loss": 0.95373654, + "learning_rate": 0.0008733322165207681, + "loss": 0.965608, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.9609375, + "step": 1324, + "time_per_iteration": 2.671187400817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191608, + "balance_loss_mlp": 1.09590697, + "epoch": 0.25490573297422087, + "flos": 784035783168.0, + "grad_norm": 0.02719192919889817, + "language_loss": 0.93181324, + "learning_rate": 0.0008731249058420247, + "loss": 0.94372928, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.95654297, + "step": 1325, + "time_per_iteration": 3.0272371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189078, + "balance_loss_mlp": 1.09332883, + "epoch": 0.2550981146594844, + "flos": 510952332288.0, + "grad_norm": 0.024872253546531747, + "language_loss": 1.00651383, + "learning_rate": 0.0008729174503033459, + "loss": 1.0184046, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.95703125, + "step": 1326, + "time_per_iteration": 2.6320900917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187412, + "balance_loss_mlp": 1.09166288, + "epoch": 0.255290496344748, + "flos": 677930545152.0, + "grad_norm": 0.02807770436691079, + "language_loss": 0.93655276, + "learning_rate": 0.0008727098499852728, + "loss": 0.9484269, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.95703125, + "step": 1327, + "time_per_iteration": 2.8246335983276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187202, + "balance_loss_mlp": 1.09116733, + "epoch": 0.2554828780300115, + "flos": 538984816128.0, + "grad_norm": 0.02304152562423393, + "language_loss": 0.97811985, + "learning_rate": 0.0008725021049684034, + "loss": 0.9899919, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.95996094, + "step": 1328, + "time_per_iteration": 2.783276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.08924699, + "epoch": 0.2556752597152751, + "flos": 825622883328.0, + "grad_norm": 0.024322773499976656, + "language_loss": 0.90949428, + "learning_rate": 0.000872294215333391, + "loss": 0.92134333, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.95605469, + "step": 1329, + "time_per_iteration": 3.1658623218536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184378, + "balance_loss_mlp": 1.08867729, + "epoch": 0.2558676414005387, + "flos": 571890435072.0, + "grad_norm": 0.026114012927401953, + "language_loss": 0.91800833, + "learning_rate": 0.0008720861811609457, + "loss": 0.92985213, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.95654297, + "step": 1330, + "time_per_iteration": 2.725680112838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185897, + "balance_loss_mlp": 1.09024334, + "epoch": 0.2560600230858022, + "flos": 487748047872.0, + "grad_norm": 0.02457760145285043, + "language_loss": 0.93800515, + "learning_rate": 0.0008718780025318338, + "loss": 0.94986409, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.95605469, + "step": 1331, + "time_per_iteration": 2.730424404144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08904529, + "epoch": 0.2562524047710658, + "flos": 514119406080.0, + "grad_norm": 0.027688932662206074, + "language_loss": 0.94349414, + "learning_rate": 0.0008716696795268771, + "loss": 0.9553411, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.95605469, + "step": 1332, + "time_per_iteration": 2.6572844982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183855, + "balance_loss_mlp": 1.0881542, + "epoch": 0.25644478645632934, + "flos": 636109129728.0, + "grad_norm": 0.025705757243887913, + "language_loss": 0.96553451, + "learning_rate": 0.0008714612122269538, + "loss": 0.97737306, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.95654297, + "step": 1333, + "time_per_iteration": 2.867598295211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184376, + "balance_loss_mlp": 1.0888176, + "epoch": 0.25663716814159293, + "flos": 437544594432.0, + "grad_norm": 0.025955971973603553, + "language_loss": 1.00358891, + "learning_rate": 0.0008712526007129982, + "loss": 1.01543272, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.95507812, + "step": 1334, + "time_per_iteration": 2.516052484512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186528, + "balance_loss_mlp": 1.0908742, + "epoch": 0.25682954982685646, + "flos": 499242765312.0, + "grad_norm": 0.021880143416013124, + "language_loss": 0.98599482, + "learning_rate": 0.0008710438450660003, + "loss": 0.99786019, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.95605469, + "step": 1335, + "time_per_iteration": 2.659489870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184319, + "balance_loss_mlp": 1.08861768, + "epoch": 0.25702193151212005, + "flos": 458627854848.0, + "grad_norm": 0.028869593177541276, + "language_loss": 0.98979777, + "learning_rate": 0.0008708349453670064, + "loss": 1.00164104, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.95654297, + "step": 1336, + "time_per_iteration": 2.5267841815948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185282, + "balance_loss_mlp": 1.08953345, + "epoch": 0.2572143131973836, + "flos": 599403896832.0, + "grad_norm": 0.021342480544698176, + "language_loss": 0.99445975, + "learning_rate": 0.0008706259016971185, + "loss": 1.00631261, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.95703125, + "step": 1337, + "time_per_iteration": 2.7561397552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118469, + "balance_loss_mlp": 1.08884537, + "epoch": 0.25740669488264717, + "flos": 699526096896.0, + "grad_norm": 0.032203199948080075, + "language_loss": 0.96320713, + "learning_rate": 0.0008704167141374944, + "loss": 0.97505397, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.95800781, + "step": 1338, + "time_per_iteration": 2.7987895011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118993, + "balance_loss_mlp": 1.09432399, + "epoch": 0.25759907656791076, + "flos": 503378025984.0, + "grad_norm": 0.024717846020590344, + "language_loss": 0.97755861, + "learning_rate": 0.0008702073827693482, + "loss": 0.98945785, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.95556641, + "step": 1339, + "time_per_iteration": 2.694470167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186155, + "balance_loss_mlp": 1.0904057, + "epoch": 0.2577914582531743, + "flos": 775241510400.0, + "grad_norm": 0.025036220674882887, + "language_loss": 0.97113985, + "learning_rate": 0.0008699979076739494, + "loss": 0.98300135, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.95703125, + "step": 1340, + "time_per_iteration": 2.962740421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184068, + "balance_loss_mlp": 1.08836627, + "epoch": 0.2579838399384379, + "flos": 460609890816.0, + "grad_norm": 0.026880962232798965, + "language_loss": 0.99139833, + "learning_rate": 0.0008697882889326234, + "loss": 1.00323892, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.95654297, + "step": 1341, + "time_per_iteration": 2.517382860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185483, + "balance_loss_mlp": 1.08987677, + "epoch": 0.2581762216237014, + "flos": 570262236672.0, + "grad_norm": 0.0242955377416103, + "language_loss": 0.96170259, + "learning_rate": 0.0008695785266267515, + "loss": 0.97355735, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.95556641, + "step": 1342, + "time_per_iteration": 2.6961281299591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118536, + "balance_loss_mlp": 1.08961082, + "epoch": 0.258368603308965, + "flos": 605386934784.0, + "grad_norm": 0.023671890991135848, + "language_loss": 0.9337616, + "learning_rate": 0.0008693686208377704, + "loss": 0.94561517, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.95703125, + "step": 1343, + "time_per_iteration": 2.8561604022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184784, + "balance_loss_mlp": 1.08908272, + "epoch": 0.2585609849942285, + "flos": 492486924288.0, + "grad_norm": 0.022133881226187983, + "language_loss": 0.96849036, + "learning_rate": 0.0008691585716471733, + "loss": 0.98033822, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.95654297, + "step": 1344, + "time_per_iteration": 2.6443324089050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185279, + "balance_loss_mlp": 1.08952987, + "epoch": 0.2587533666794921, + "flos": 641957182464.0, + "grad_norm": 0.02305984249039353, + "language_loss": 0.94482636, + "learning_rate": 0.0008689483791365079, + "loss": 0.95667922, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.95703125, + "step": 1345, + "time_per_iteration": 2.8541483879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185515, + "balance_loss_mlp": 1.08976638, + "epoch": 0.2589457483647557, + "flos": 577994996736.0, + "grad_norm": 0.022382124417400225, + "language_loss": 0.97831523, + "learning_rate": 0.0008687380433873786, + "loss": 0.99017042, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.95703125, + "step": 1346, + "time_per_iteration": 2.8148868083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186141, + "balance_loss_mlp": 1.09048796, + "epoch": 0.25913813005001923, + "flos": 536466293760.0, + "grad_norm": 0.024690786073415343, + "language_loss": 0.93800229, + "learning_rate": 0.0008685275644814448, + "loss": 0.94986367, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.95605469, + "step": 1347, + "time_per_iteration": 2.6872267723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188569, + "balance_loss_mlp": 1.0930109, + "epoch": 0.2593305117352828, + "flos": 722346344448.0, + "grad_norm": 0.028015192621825148, + "language_loss": 0.944291, + "learning_rate": 0.0008683169425004216, + "loss": 0.95617664, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.95507812, + "step": 1348, + "time_per_iteration": 2.9036293029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187318, + "balance_loss_mlp": 1.09171176, + "epoch": 0.25952289342054635, + "flos": 711355186176.0, + "grad_norm": 0.028695706473352366, + "language_loss": 0.9867608, + "learning_rate": 0.0008681061775260799, + "loss": 0.99863392, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.95556641, + "step": 1349, + "time_per_iteration": 2.8635356426239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185365, + "balance_loss_mlp": 1.08942509, + "epoch": 0.25971527510580994, + "flos": 456849934848.0, + "grad_norm": 0.028158951385379896, + "language_loss": 1.01652539, + "learning_rate": 0.0008678952696402458, + "loss": 1.02837896, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.95898438, + "step": 1350, + "time_per_iteration": 2.4997899532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184224, + "balance_loss_mlp": 1.08847523, + "epoch": 0.25990765679107347, + "flos": 613753509888.0, + "grad_norm": 0.022929201317296435, + "language_loss": 0.944794, + "learning_rate": 0.000867684218924801, + "loss": 0.95663619, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.95703125, + "step": 1351, + "time_per_iteration": 2.8553221225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190399, + "balance_loss_mlp": 1.09655762, + "epoch": 0.26010003847633706, + "flos": 1541404219392.0, + "grad_norm": 0.011373150433568688, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80137491, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.9375, + "step": 1352, + "time_per_iteration": 4.894901752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185829, + "balance_loss_mlp": 1.0900805, + "epoch": 0.2602924201616006, + "flos": 717544341504.0, + "grad_norm": 0.021521520095987904, + "language_loss": 0.9327749, + "learning_rate": 0.0008672616893328834, + "loss": 0.94463313, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.95703125, + "step": 1353, + "time_per_iteration": 2.9336133003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181557, + "balance_loss_mlp": 1.08571243, + "epoch": 0.2604848018468642, + "flos": 644685825024.0, + "grad_norm": 0.026147354827328006, + "language_loss": 0.99375951, + "learning_rate": 0.0008670502106204512, + "loss": 1.00557506, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.95800781, + "step": 1354, + "time_per_iteration": 2.828476667404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182712, + "balance_loss_mlp": 1.08677256, + "epoch": 0.26067718353212777, + "flos": 518037815808.0, + "grad_norm": 0.024264679119450936, + "language_loss": 0.92830276, + "learning_rate": 0.0008668385894064892, + "loss": 0.94012988, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.95898438, + "step": 1355, + "time_per_iteration": 2.627603054046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183025, + "balance_loss_mlp": 1.08708537, + "epoch": 0.2608695652173913, + "flos": 824224997376.0, + "grad_norm": 0.021603697394371835, + "language_loss": 0.98353279, + "learning_rate": 0.0008666268257731562, + "loss": 0.995363, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.95898438, + "step": 1356, + "time_per_iteration": 3.104410409927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185288, + "balance_loss_mlp": 1.0894438, + "epoch": 0.2610619469026549, + "flos": 1009449039360.0, + "grad_norm": 0.029063247039842262, + "language_loss": 0.98633218, + "learning_rate": 0.0008664149198026662, + "loss": 0.99818504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.95800781, + "step": 1357, + "time_per_iteration": 3.2552602291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184981, + "balance_loss_mlp": 1.08932745, + "epoch": 0.2612543285879184, + "flos": 537825248256.0, + "grad_norm": 0.02677910773484977, + "language_loss": 0.99748302, + "learning_rate": 0.0008662028715772883, + "loss": 1.00933278, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.95605469, + "step": 1358, + "time_per_iteration": 2.6044809818267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186466, + "balance_loss_mlp": 1.09095597, + "epoch": 0.261446710273182, + "flos": 520438817280.0, + "grad_norm": 0.024887857022763207, + "language_loss": 0.95091379, + "learning_rate": 0.0008659906811793467, + "loss": 0.96277845, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.95458984, + "step": 1359, + "time_per_iteration": 2.660039186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118844, + "balance_loss_mlp": 1.09297669, + "epoch": 0.26163909195844554, + "flos": 584399001600.0, + "grad_norm": 0.02478490455868915, + "language_loss": 0.99414921, + "learning_rate": 0.0008657783486912215, + "loss": 1.00603366, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.95410156, + "step": 1360, + "time_per_iteration": 2.710707187652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189735, + "balance_loss_mlp": 1.09412944, + "epoch": 0.2618314736437091, + "flos": 960368223744.0, + "grad_norm": 0.025390417969386195, + "language_loss": 0.99146813, + "learning_rate": 0.0008655658741953472, + "loss": 1.00336552, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.95556641, + "step": 1361, + "time_per_iteration": 3.2610023021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187461, + "balance_loss_mlp": 1.0919987, + "epoch": 0.26202385532897265, + "flos": 575902170624.0, + "grad_norm": 0.01965876060868175, + "language_loss": 0.95685869, + "learning_rate": 0.0008653532577742136, + "loss": 0.96873331, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.95410156, + "step": 1362, + "time_per_iteration": 2.753920793533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190509, + "balance_loss_mlp": 1.09509337, + "epoch": 0.26221623701423624, + "flos": 446397264384.0, + "grad_norm": 0.024702919408059576, + "language_loss": 0.95440364, + "learning_rate": 0.0008651404995103659, + "loss": 0.96630871, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.95361328, + "step": 1363, + "time_per_iteration": 2.532839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184254, + "balance_loss_mlp": 1.088696, + "epoch": 0.26240861869949983, + "flos": 536755003392.0, + "grad_norm": 0.021936659097783043, + "language_loss": 0.95658946, + "learning_rate": 0.0008649275994864041, + "loss": 0.96843195, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.95507812, + "step": 1364, + "time_per_iteration": 2.6723499298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182727, + "balance_loss_mlp": 1.08735919, + "epoch": 0.26260100038476336, + "flos": 566487544320.0, + "grad_norm": 0.02057443182875544, + "language_loss": 0.93747735, + "learning_rate": 0.0008647145577849834, + "loss": 0.94930464, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.953125, + "step": 1365, + "time_per_iteration": 2.817335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184888, + "balance_loss_mlp": 1.089378, + "epoch": 0.26279338207002695, + "flos": 614320195584.0, + "grad_norm": 0.02000370099851243, + "language_loss": 0.90110707, + "learning_rate": 0.0008645013744888139, + "loss": 0.912956, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.95458984, + "step": 1366, + "time_per_iteration": 2.889956474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190369, + "balance_loss_mlp": 1.09452498, + "epoch": 0.2629857637552905, + "flos": 523944992256.0, + "grad_norm": 0.02433762343961203, + "language_loss": 0.96272296, + "learning_rate": 0.0008642880496806607, + "loss": 0.97462666, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.95800781, + "step": 1367, + "time_per_iteration": 2.7868857383728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186128, + "balance_loss_mlp": 1.09028387, + "epoch": 0.26317814544055407, + "flos": 535654559232.0, + "grad_norm": 0.022945771924384736, + "language_loss": 0.9318915, + "learning_rate": 0.0008640745834433437, + "loss": 0.94375277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.95800781, + "step": 1368, + "time_per_iteration": 2.7556509971618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182695, + "balance_loss_mlp": 1.08718467, + "epoch": 0.2633705271258176, + "flos": 556779479040.0, + "grad_norm": 0.024336346931206027, + "language_loss": 0.96858466, + "learning_rate": 0.000863860975859738, + "loss": 0.98041165, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.95458984, + "step": 1369, + "time_per_iteration": 2.9069716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184914, + "balance_loss_mlp": 1.08945167, + "epoch": 0.2635629088110812, + "flos": 553461957120.0, + "grad_norm": 0.02843668952404612, + "language_loss": 1.00276971, + "learning_rate": 0.0008636472270127733, + "loss": 1.01461875, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.95410156, + "step": 1370, + "time_per_iteration": 2.626201868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185086, + "balance_loss_mlp": 1.08952749, + "epoch": 0.2637552904963448, + "flos": 456915062784.0, + "grad_norm": 0.02826867423240315, + "language_loss": 1.01819849, + "learning_rate": 0.0008634333369854345, + "loss": 1.03004944, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.95507812, + "step": 1371, + "time_per_iteration": 2.5906460285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183664, + "balance_loss_mlp": 1.08820105, + "epoch": 0.2639476721816083, + "flos": 614259070464.0, + "grad_norm": 0.024066040008067748, + "language_loss": 0.95210433, + "learning_rate": 0.0008632193058607608, + "loss": 0.96394098, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.95410156, + "step": 1372, + "time_per_iteration": 2.7260935306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180244, + "balance_loss_mlp": 1.08487642, + "epoch": 0.2641400538668719, + "flos": 573025807872.0, + "grad_norm": 0.02730663798923432, + "language_loss": 0.93146777, + "learning_rate": 0.0008630051337218466, + "loss": 0.94327021, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.953125, + "step": 1373, + "time_per_iteration": 2.7155323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193282, + "balance_loss_mlp": 1.09777129, + "epoch": 0.2643324355521354, + "flos": 583339490304.0, + "grad_norm": 0.02802871933703498, + "language_loss": 0.91373825, + "learning_rate": 0.0008627908206518409, + "loss": 0.9256711, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.95458984, + "step": 1374, + "time_per_iteration": 2.7118475437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189674, + "balance_loss_mlp": 1.09621429, + "epoch": 0.264524817237399, + "flos": 1548025075200.0, + "grad_norm": 0.008601814223210932, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76340932, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.93359375, + "step": 1375, + "time_per_iteration": 4.9838175773620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192464, + "balance_loss_mlp": 1.09709656, + "epoch": 0.26471719892266254, + "flos": 519042932736.0, + "grad_norm": 0.024634755338573868, + "language_loss": 0.99606347, + "learning_rate": 0.0008623617720514241, + "loss": 1.0079881, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.953125, + "step": 1376, + "time_per_iteration": 2.5836029052734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191563, + "balance_loss_mlp": 1.09586143, + "epoch": 0.26490958060792613, + "flos": 518205001728.0, + "grad_norm": 0.02740625444526412, + "language_loss": 0.95827538, + "learning_rate": 0.0008621470366875848, + "loss": 0.97019094, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.95654297, + "step": 1377, + "time_per_iteration": 2.574557304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190438, + "balance_loss_mlp": 1.09507096, + "epoch": 0.26510196229318966, + "flos": 597682372608.0, + "grad_norm": 0.02552910213335578, + "language_loss": 0.96441573, + "learning_rate": 0.0008619321607257966, + "loss": 0.97632015, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.953125, + "step": 1378, + "time_per_iteration": 2.680574655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187734, + "balance_loss_mlp": 1.09227157, + "epoch": 0.26529434397845325, + "flos": 687052459008.0, + "grad_norm": 0.024630390251990656, + "language_loss": 0.90670931, + "learning_rate": 0.000861717144249482, + "loss": 0.91858661, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.95410156, + "step": 1379, + "time_per_iteration": 2.8311944007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181951, + "balance_loss_mlp": 1.08672631, + "epoch": 0.26548672566371684, + "flos": 425259609600.0, + "grad_norm": 0.02240925569996582, + "language_loss": 0.98143864, + "learning_rate": 0.0008615019873421175, + "loss": 0.99325812, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.95166016, + "step": 1380, + "time_per_iteration": 2.472280263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182344, + "balance_loss_mlp": 1.08716714, + "epoch": 0.26567910734898037, + "flos": 490849993728.0, + "grad_norm": 0.024166031959674275, + "language_loss": 0.9586165, + "learning_rate": 0.0008612866900872349, + "loss": 0.97043991, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.95117188, + "step": 1381, + "time_per_iteration": 2.5671043395996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181037, + "balance_loss_mlp": 1.08586013, + "epoch": 0.26587148903424396, + "flos": 535228862976.0, + "grad_norm": 0.024625622440273682, + "language_loss": 0.97316492, + "learning_rate": 0.0008610712525684197, + "loss": 0.98497522, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.95117188, + "step": 1382, + "time_per_iteration": 2.6394782066345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179642, + "balance_loss_mlp": 1.08446515, + "epoch": 0.2660638707195075, + "flos": 1019055046656.0, + "grad_norm": 0.02944222863828147, + "language_loss": 0.96464765, + "learning_rate": 0.0008608556748693121, + "loss": 0.97644401, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.95117188, + "step": 1383, + "time_per_iteration": 3.2514846324920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184353, + "balance_loss_mlp": 1.08941519, + "epoch": 0.2662562524047711, + "flos": 525062900736.0, + "grad_norm": 0.024003921212174706, + "language_loss": 0.95956504, + "learning_rate": 0.000860639957073607, + "loss": 0.97140861, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.94873047, + "step": 1384, + "time_per_iteration": 2.6759448051452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190743, + "balance_loss_mlp": 1.09594798, + "epoch": 0.2664486340900346, + "flos": 553479421440.0, + "grad_norm": 0.02584009515603871, + "language_loss": 0.97059226, + "learning_rate": 0.0008604240992650534, + "loss": 0.98249966, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.94726562, + "step": 1385, + "time_per_iteration": 2.6880476474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187786, + "balance_loss_mlp": 1.09260905, + "epoch": 0.2666410157752982, + "flos": 471208280064.0, + "grad_norm": 0.023709316387392747, + "language_loss": 0.98021734, + "learning_rate": 0.0008602081015274545, + "loss": 0.99209523, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.95117188, + "step": 1386, + "time_per_iteration": 2.71233868598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187602, + "balance_loss_mlp": 1.0924257, + "epoch": 0.2668333974605617, + "flos": 571015574016.0, + "grad_norm": 0.021121239598078063, + "language_loss": 0.90840185, + "learning_rate": 0.0008599919639446684, + "loss": 0.92027789, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.95117188, + "step": 1387, + "time_per_iteration": 2.6656363010406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183674, + "balance_loss_mlp": 1.08840239, + "epoch": 0.2670257791458253, + "flos": 399895369728.0, + "grad_norm": 0.029257146370583235, + "language_loss": 0.92911923, + "learning_rate": 0.000859775686600607, + "loss": 0.940956, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.95214844, + "step": 1388, + "time_per_iteration": 2.5366902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186225, + "balance_loss_mlp": 1.09104884, + "epoch": 0.2672181608310889, + "flos": 516891709440.0, + "grad_norm": 0.02488439836403737, + "language_loss": 0.94369394, + "learning_rate": 0.0008595592695792367, + "loss": 0.95555621, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.95117188, + "step": 1389, + "time_per_iteration": 2.6710469722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184466, + "balance_loss_mlp": 1.08928883, + "epoch": 0.26741054251635243, + "flos": 508525134336.0, + "grad_norm": 0.024055725628873734, + "language_loss": 0.99442971, + "learning_rate": 0.0008593427129645778, + "loss": 1.00627434, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.95117188, + "step": 1390, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184607, + "balance_loss_mlp": 1.08919191, + "epoch": 0.267602924201616, + "flos": 577808345088.0, + "grad_norm": 0.025635319637122064, + "language_loss": 0.93523198, + "learning_rate": 0.0008591260168407052, + "loss": 0.94707805, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.95361328, + "step": 1391, + "time_per_iteration": 2.766150712966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118642, + "balance_loss_mlp": 1.09095728, + "epoch": 0.26779530588687955, + "flos": 524999774208.0, + "grad_norm": 0.02196829508666122, + "language_loss": 0.92168128, + "learning_rate": 0.0008589091812917479, + "loss": 0.93354547, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.95410156, + "step": 1392, + "time_per_iteration": 2.6208953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119079, + "balance_loss_mlp": 1.09580445, + "epoch": 0.26798768757214314, + "flos": 557827530240.0, + "grad_norm": 0.02442636530887492, + "language_loss": 0.95854455, + "learning_rate": 0.0008586922064018887, + "loss": 0.97045243, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.94921875, + "step": 1393, + "time_per_iteration": 2.6643927097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190751, + "balance_loss_mlp": 1.09581244, + "epoch": 0.2681800692574067, + "flos": 932094693888.0, + "grad_norm": 0.0254733622090453, + "language_loss": 0.99184585, + "learning_rate": 0.0008584750922553651, + "loss": 1.00375342, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.94873047, + "step": 1394, + "time_per_iteration": 3.1305503845214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192347, + "balance_loss_mlp": 1.09712303, + "epoch": 0.26837245094267026, + "flos": 702317865984.0, + "grad_norm": 0.023340973249423663, + "language_loss": 0.92753315, + "learning_rate": 0.0008582578389364677, + "loss": 0.93945664, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.95166016, + "step": 1395, + "time_per_iteration": 2.8527095317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184756, + "balance_loss_mlp": 1.08953142, + "epoch": 0.26856483262793385, + "flos": 594393775104.0, + "grad_norm": 0.020526468408011762, + "language_loss": 1.00206113, + "learning_rate": 0.0008580404465295422, + "loss": 1.01390874, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.95166016, + "step": 1396, + "time_per_iteration": 2.784592866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184595, + "balance_loss_mlp": 1.08922791, + "epoch": 0.2687572143131974, + "flos": 715588502016.0, + "grad_norm": 0.024818089102904728, + "language_loss": 0.9790895, + "learning_rate": 0.0008578229151189876, + "loss": 0.99093544, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.953125, + "step": 1397, + "time_per_iteration": 2.901818037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185216, + "balance_loss_mlp": 1.0896579, + "epoch": 0.26894959599846097, + "flos": 468670291968.0, + "grad_norm": 0.028086023154021946, + "language_loss": 0.91012216, + "learning_rate": 0.0008576052447892573, + "loss": 0.92197436, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.95507812, + "step": 1398, + "time_per_iteration": 2.5849812030792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09082139, + "epoch": 0.2691419776837245, + "flos": 469629746688.0, + "grad_norm": 0.022530608820729603, + "language_loss": 0.95147502, + "learning_rate": 0.000857387435624858, + "loss": 0.96333838, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.95458984, + "step": 1399, + "time_per_iteration": 2.5274569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011908, + "balance_loss_mlp": 1.09567106, + "epoch": 0.2693343593689881, + "flos": 939284963328.0, + "grad_norm": 0.02095039568010189, + "language_loss": 0.95472848, + "learning_rate": 0.0008571694877103513, + "loss": 0.96663648, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.95068359, + "step": 1400, + "time_per_iteration": 3.2558727264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190554, + "balance_loss_mlp": 1.09542465, + "epoch": 0.2695267410542516, + "flos": 578793996288.0, + "grad_norm": 0.0241215692671091, + "language_loss": 0.95762217, + "learning_rate": 0.0008569514011303515, + "loss": 0.96952766, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.95068359, + "step": 1401, + "time_per_iteration": 2.8175997734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193641, + "balance_loss_mlp": 1.09846401, + "epoch": 0.2697191227395152, + "flos": 557964516864.0, + "grad_norm": 0.02413892998134183, + "language_loss": 0.96554017, + "learning_rate": 0.0008567331759695277, + "loss": 0.97747654, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.95117188, + "step": 1402, + "time_per_iteration": 2.7052927017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192424, + "balance_loss_mlp": 1.09729552, + "epoch": 0.26991150442477874, + "flos": 530314068480.0, + "grad_norm": 0.024237100625486396, + "language_loss": 0.97319567, + "learning_rate": 0.0008565148123126023, + "loss": 0.98511994, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.95068359, + "step": 1403, + "time_per_iteration": 2.6399028301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187922, + "balance_loss_mlp": 1.09274554, + "epoch": 0.2701038861100423, + "flos": 533086371840.0, + "grad_norm": 0.021620674049761555, + "language_loss": 0.93398714, + "learning_rate": 0.0008562963102443516, + "loss": 0.94586635, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.95117188, + "step": 1404, + "time_per_iteration": 2.6793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185578, + "balance_loss_mlp": 1.09035325, + "epoch": 0.2702962677953059, + "flos": 736504576512.0, + "grad_norm": 0.026106257639691363, + "language_loss": 0.94497591, + "learning_rate": 0.0008560776698496056, + "loss": 0.95683169, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.95166016, + "step": 1405, + "time_per_iteration": 2.8884029388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186883, + "balance_loss_mlp": 1.09170628, + "epoch": 0.27048864948056944, + "flos": 576000225792.0, + "grad_norm": 0.025611862530653208, + "language_loss": 0.95929742, + "learning_rate": 0.0008558588912132481, + "loss": 0.97116625, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.95117188, + "step": 1406, + "time_per_iteration": 2.8396451473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190124, + "balance_loss_mlp": 1.09666443, + "epoch": 0.27068103116583303, + "flos": 1426910212608.0, + "grad_norm": 0.014531874927713828, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77649117, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.93359375, + "step": 1407, + "time_per_iteration": 4.898139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119097, + "balance_loss_mlp": 1.09603214, + "epoch": 0.27087341285109656, + "flos": 533031977472.0, + "grad_norm": 0.024689522623330563, + "language_loss": 0.90804136, + "learning_rate": 0.0008554209195555016, + "loss": 0.91995108, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.94873047, + "step": 1408, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189645, + "balance_loss_mlp": 1.09446859, + "epoch": 0.27106579453636015, + "flos": 582464629248.0, + "grad_norm": 0.0247795195650599, + "language_loss": 0.98232609, + "learning_rate": 0.0008552017267041483, + "loss": 0.99422252, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.95117188, + "step": 1409, + "time_per_iteration": 2.6904594898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118886, + "balance_loss_mlp": 1.09368336, + "epoch": 0.2712581762216237, + "flos": 507880585728.0, + "grad_norm": 0.024309295256612126, + "language_loss": 0.90687084, + "learning_rate": 0.0008549823959512549, + "loss": 0.91875941, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.95117188, + "step": 1410, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189943, + "balance_loss_mlp": 1.09481394, + "epoch": 0.27145055790688727, + "flos": 999142087680.0, + "grad_norm": 0.023895808714677214, + "language_loss": 0.95848304, + "learning_rate": 0.0008547629273819728, + "loss": 0.97038245, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.95068359, + "step": 1411, + "time_per_iteration": 3.36985182762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186274, + "balance_loss_mlp": 1.09109735, + "epoch": 0.2716429395921508, + "flos": 547728697344.0, + "grad_norm": 0.02712613780862537, + "language_loss": 0.93229926, + "learning_rate": 0.0008545433210815074, + "loss": 0.94416201, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.95117188, + "step": 1412, + "time_per_iteration": 2.601452350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182035, + "balance_loss_mlp": 1.08685839, + "epoch": 0.2718353212774144, + "flos": 574310902272.0, + "grad_norm": 0.02439507328911507, + "language_loss": 0.95137858, + "learning_rate": 0.0008543235771351176, + "loss": 0.96319902, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.95117188, + "step": 1413, + "time_per_iteration": 2.7132034301757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197126, + "balance_loss_mlp": 1.10209203, + "epoch": 0.272027702962678, + "flos": 645584881152.0, + "grad_norm": 0.02257567173785872, + "language_loss": 0.91220462, + "learning_rate": 0.0008541036956281154, + "loss": 0.92417586, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.94970703, + "step": 1414, + "time_per_iteration": 2.871951103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187874, + "balance_loss_mlp": 1.09284067, + "epoch": 0.2722200846479415, + "flos": 654995504640.0, + "grad_norm": 0.026411231013774135, + "language_loss": 0.93374348, + "learning_rate": 0.0008538836766458665, + "loss": 0.94562221, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.94970703, + "step": 1415, + "time_per_iteration": 2.8673384189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183666, + "balance_loss_mlp": 1.08868039, + "epoch": 0.2724124663332051, + "flos": 580778033664.0, + "grad_norm": 0.027862690716265133, + "language_loss": 0.96171892, + "learning_rate": 0.0008536635202737897, + "loss": 0.97355556, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.94921875, + "step": 1416, + "time_per_iteration": 2.7829935550689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183251, + "balance_loss_mlp": 1.08831298, + "epoch": 0.2726048480184686, + "flos": 538467795456.0, + "grad_norm": 0.025077003090708358, + "language_loss": 0.93469489, + "learning_rate": 0.0008534432265973573, + "loss": 0.94652736, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.94873047, + "step": 1417, + "time_per_iteration": 2.593364715576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183107, + "balance_loss_mlp": 1.08793056, + "epoch": 0.2727972297037322, + "flos": 997548817920.0, + "grad_norm": 0.025553987949566613, + "language_loss": 0.99255168, + "learning_rate": 0.000853222795702095, + "loss": 1.00438273, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.95117188, + "step": 1418, + "time_per_iteration": 3.387162685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119173, + "balance_loss_mlp": 1.09712589, + "epoch": 0.27298961138899575, + "flos": 607334042112.0, + "grad_norm": 0.02541700118612174, + "language_loss": 0.93465757, + "learning_rate": 0.0008530022276735813, + "loss": 0.94657481, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.9453125, + "step": 1419, + "time_per_iteration": 2.7426016330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.0965513, + "epoch": 0.27318199307425933, + "flos": 530396660736.0, + "grad_norm": 0.025702548257077976, + "language_loss": 0.9374572, + "learning_rate": 0.0008527815225974489, + "loss": 0.94937015, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.94677734, + "step": 1420, + "time_per_iteration": 2.6544342041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118326, + "balance_loss_mlp": 1.08865511, + "epoch": 0.2733743747595229, + "flos": 409911610368.0, + "grad_norm": 0.028874111022423956, + "language_loss": 0.99327809, + "learning_rate": 0.0008525606805593829, + "loss": 1.00511074, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.9453125, + "step": 1421, + "time_per_iteration": 2.4215376377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182106, + "balance_loss_mlp": 1.08721578, + "epoch": 0.27356675644478645, + "flos": 517228082688.0, + "grad_norm": 0.026406413504372096, + "language_loss": 0.92442018, + "learning_rate": 0.0008523397016451213, + "loss": 0.93624127, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.94824219, + "step": 1422, + "time_per_iteration": 2.5680603981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184812, + "balance_loss_mlp": 1.09011269, + "epoch": 0.27375913813005004, + "flos": 1054058221056.0, + "grad_norm": 0.02228341429952914, + "language_loss": 0.94973963, + "learning_rate": 0.0008521185859404564, + "loss": 0.96158779, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.94628906, + "step": 1423, + "time_per_iteration": 3.37345814704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179884, + "balance_loss_mlp": 1.08485043, + "epoch": 0.27395151981531357, + "flos": 626003566080.0, + "grad_norm": 0.02387683630357993, + "language_loss": 0.97909242, + "learning_rate": 0.0008518973335312326, + "loss": 0.99089128, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.94970703, + "step": 1424, + "time_per_iteration": 2.8314859867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184763, + "balance_loss_mlp": 1.08982456, + "epoch": 0.27414390150057716, + "flos": 551414793216.0, + "grad_norm": 0.028545098094769822, + "language_loss": 0.95577884, + "learning_rate": 0.0008516759445033477, + "loss": 0.96762645, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.94873047, + "step": 1425, + "time_per_iteration": 2.6086578369140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.08705389, + "epoch": 0.2743362831858407, + "flos": 540951389184.0, + "grad_norm": 0.02677358847245462, + "language_loss": 0.96958816, + "learning_rate": 0.0008514544189427526, + "loss": 0.9814086, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.94921875, + "step": 1426, + "time_per_iteration": 2.6927483081817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.09713852, + "epoch": 0.2745286648711043, + "flos": 469545153024.0, + "grad_norm": 0.025998263163597202, + "language_loss": 0.95807564, + "learning_rate": 0.0008512327569354511, + "loss": 0.96999258, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.94482422, + "step": 1427, + "time_per_iteration": 2.5617682933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119268, + "balance_loss_mlp": 1.09764659, + "epoch": 0.2747210465563678, + "flos": 473871794688.0, + "grad_norm": 0.02733358796633043, + "language_loss": 0.93333006, + "learning_rate": 0.0008510109585675001, + "loss": 0.94525683, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.94970703, + "step": 1428, + "time_per_iteration": 2.7269434928894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205208, + "balance_loss_mlp": 1.11193848, + "epoch": 0.2749134282416314, + "flos": 1318056866304.0, + "grad_norm": 0.019809968329655446, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82358551, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.93164062, + "step": 1429, + "time_per_iteration": 4.731899738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190948, + "balance_loss_mlp": 1.0958662, + "epoch": 0.275105809926895, + "flos": 972531684864.0, + "grad_norm": 0.03147414200634365, + "language_loss": 0.91184711, + "learning_rate": 0.0008505669530941415, + "loss": 0.92375666, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.95019531, + "step": 1430, + "time_per_iteration": 3.3260724544525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189047, + "balance_loss_mlp": 1.09387004, + "epoch": 0.2752981916121585, + "flos": 528368962560.0, + "grad_norm": 0.025580193945061114, + "language_loss": 0.95012403, + "learning_rate": 0.000850344746161112, + "loss": 0.96201456, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.95117188, + "step": 1431, + "time_per_iteration": 2.5820231437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186021, + "balance_loss_mlp": 1.09093964, + "epoch": 0.2754905732974221, + "flos": 454598654976.0, + "grad_norm": 0.024219881250434897, + "language_loss": 0.962569, + "learning_rate": 0.0008501224032121894, + "loss": 0.97442919, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.95019531, + "step": 1432, + "time_per_iteration": 2.501572847366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188894, + "balance_loss_mlp": 1.09362173, + "epoch": 0.27568295498268564, + "flos": 498508893696.0, + "grad_norm": 0.02427263624604226, + "language_loss": 0.90960014, + "learning_rate": 0.0008498999243336946, + "loss": 0.921489, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.95214844, + "step": 1433, + "time_per_iteration": 2.6212003231048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192375, + "balance_loss_mlp": 1.09715116, + "epoch": 0.2758753366679492, + "flos": 609416134656.0, + "grad_norm": 0.024278981864862804, + "language_loss": 0.95570171, + "learning_rate": 0.0008496773096120021, + "loss": 0.9676255, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.95166016, + "step": 1434, + "time_per_iteration": 2.804689407348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118926, + "balance_loss_mlp": 1.09370184, + "epoch": 0.27606771835321275, + "flos": 741436835328.0, + "grad_norm": 0.025697024392157108, + "language_loss": 0.95037985, + "learning_rate": 0.0008494545591335381, + "loss": 0.96227252, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.95507812, + "step": 1435, + "time_per_iteration": 2.9329347610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195816, + "balance_loss_mlp": 1.10068655, + "epoch": 0.27626010003847634, + "flos": 555748165632.0, + "grad_norm": 0.0206290639721941, + "language_loss": 0.927001, + "learning_rate": 0.0008492316729847823, + "loss": 0.93895912, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.95068359, + "step": 1436, + "time_per_iteration": 2.820913553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09245288, + "epoch": 0.2764524817237399, + "flos": 543695494656.0, + "grad_norm": 0.02424730092158954, + "language_loss": 0.88914406, + "learning_rate": 0.0008490086512522664, + "loss": 0.90102232, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.953125, + "step": 1437, + "time_per_iteration": 2.7454309463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186593, + "balance_loss_mlp": 1.09127319, + "epoch": 0.27664486340900346, + "flos": 407128573440.0, + "grad_norm": 0.024912305575595636, + "language_loss": 0.99286187, + "learning_rate": 0.0008487854940225755, + "loss": 1.00472784, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.95263672, + "step": 1438, + "time_per_iteration": 2.4809510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183239, + "balance_loss_mlp": 1.08834839, + "epoch": 0.27683724509426705, + "flos": 523156726272.0, + "grad_norm": 0.025259333782437998, + "language_loss": 0.98154646, + "learning_rate": 0.0008485622013823466, + "loss": 0.99337876, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.94824219, + "step": 1439, + "time_per_iteration": 2.65401554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183688, + "balance_loss_mlp": 1.08865404, + "epoch": 0.2770296267795306, + "flos": 536409897984.0, + "grad_norm": 0.02898674716386243, + "language_loss": 0.9318651, + "learning_rate": 0.00084833877341827, + "loss": 0.94370198, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.94970703, + "step": 1440, + "time_per_iteration": 2.6294455528259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192537, + "balance_loss_mlp": 1.09755075, + "epoch": 0.27722200846479417, + "flos": 488970015744.0, + "grad_norm": 0.027244615130064133, + "language_loss": 0.90653217, + "learning_rate": 0.000848115210217088, + "loss": 0.91845751, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.94921875, + "step": 1441, + "time_per_iteration": 2.5394957065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118987, + "balance_loss_mlp": 1.09493196, + "epoch": 0.2774143901500577, + "flos": 619443108864.0, + "grad_norm": 0.024388639686817183, + "language_loss": 0.9228884, + "learning_rate": 0.0008478915118655952, + "loss": 0.93478709, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.94873047, + "step": 1442, + "time_per_iteration": 2.7634968757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119119, + "balance_loss_mlp": 1.0962522, + "epoch": 0.2776067718353213, + "flos": 514844545536.0, + "grad_norm": 0.021441164984372, + "language_loss": 0.94525409, + "learning_rate": 0.0008476676784506393, + "loss": 0.95716596, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.94873047, + "step": 1443, + "time_per_iteration": 2.6474499702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.09678042, + "epoch": 0.2777991535205848, + "flos": 1006040919552.0, + "grad_norm": 0.026818715625153876, + "language_loss": 0.93016809, + "learning_rate": 0.0008474437100591201, + "loss": 0.94208288, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.94628906, + "step": 1444, + "time_per_iteration": 3.311842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189789, + "balance_loss_mlp": 1.09494591, + "epoch": 0.2779915352058484, + "flos": 551375861760.0, + "grad_norm": 0.021641305677188864, + "language_loss": 0.95129728, + "learning_rate": 0.0008472196067779898, + "loss": 0.96319526, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.94775391, + "step": 1445, + "time_per_iteration": 2.667910575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186263, + "balance_loss_mlp": 1.091277, + "epoch": 0.278183916891112, + "flos": 875215990272.0, + "grad_norm": 0.030449834007814664, + "language_loss": 0.98351109, + "learning_rate": 0.0008469953686942531, + "loss": 0.99537361, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.94921875, + "step": 1446, + "time_per_iteration": 3.100473403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187264, + "balance_loss_mlp": 1.09246826, + "epoch": 0.2783762985763755, + "flos": 625195834368.0, + "grad_norm": 0.025904191205549917, + "language_loss": 0.93646944, + "learning_rate": 0.0008467709958949668, + "loss": 0.94834208, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.94726562, + "step": 1447, + "time_per_iteration": 2.7201731204986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09333074, + "epoch": 0.2785686802616391, + "flos": 582911792640.0, + "grad_norm": 0.026760771702797625, + "language_loss": 0.94447374, + "learning_rate": 0.0008465464884672403, + "loss": 0.9563536, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.94580078, + "step": 1448, + "time_per_iteration": 2.7300403118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118631, + "balance_loss_mlp": 1.09180129, + "epoch": 0.27876106194690264, + "flos": 588538991616.0, + "grad_norm": 0.0212290178255441, + "language_loss": 0.93077391, + "learning_rate": 0.0008463218464982348, + "loss": 0.94263697, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.94433594, + "step": 1449, + "time_per_iteration": 2.86130952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190148, + "balance_loss_mlp": 1.09520972, + "epoch": 0.27895344363216623, + "flos": 877430340096.0, + "grad_norm": 0.02756647509109648, + "language_loss": 0.96903402, + "learning_rate": 0.0008460970700751645, + "loss": 0.98093557, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.94873047, + "step": 1450, + "time_per_iteration": 3.069391965866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188227, + "balance_loss_mlp": 1.0932883, + "epoch": 0.27914582531742976, + "flos": 605035098624.0, + "grad_norm": 0.025261876769304706, + "language_loss": 0.97766632, + "learning_rate": 0.000845872159285295, + "loss": 0.98954856, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.94873047, + "step": 1451, + "time_per_iteration": 2.748164653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197098, + "balance_loss_mlp": 1.10325623, + "epoch": 0.27933820700269335, + "flos": 1501130411520.0, + "grad_norm": 0.012982305827020523, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78963947, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.9375, + "step": 1452, + "time_per_iteration": 4.906180143356323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198876, + "balance_loss_mlp": 1.10408044, + "epoch": 0.2795305886879569, + "flos": 1033517451264.0, + "grad_norm": 0.027093914793319178, + "language_loss": 0.95323974, + "learning_rate": 0.0008454219349544836, + "loss": 0.9652285, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.94726562, + "step": 1453, + "time_per_iteration": 3.333178758621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194793, + "balance_loss_mlp": 1.10014069, + "epoch": 0.27972297037322047, + "flos": 608226367488.0, + "grad_norm": 0.025225525542022995, + "language_loss": 0.8972255, + "learning_rate": 0.000845196621588334, + "loss": 0.90917349, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.94580078, + "step": 1454, + "time_per_iteration": 2.7425026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191631, + "balance_loss_mlp": 1.09697926, + "epoch": 0.27991535205848406, + "flos": 631560907776.0, + "grad_norm": 0.023908777965609074, + "language_loss": 0.86623406, + "learning_rate": 0.0008449711742049706, + "loss": 0.87815034, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.94580078, + "step": 1455, + "time_per_iteration": 2.8148674964904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188728, + "balance_loss_mlp": 1.09369469, + "epoch": 0.2801077337437476, + "flos": 550353280512.0, + "grad_norm": 0.02989232443782136, + "language_loss": 0.94001353, + "learning_rate": 0.0008447455928919196, + "loss": 0.95190072, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.94970703, + "step": 1456, + "time_per_iteration": 2.6030025482177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186748, + "balance_loss_mlp": 1.09166706, + "epoch": 0.2803001154290112, + "flos": 487741317120.0, + "grad_norm": 0.023726139763527557, + "language_loss": 0.95883709, + "learning_rate": 0.0008445198777367595, + "loss": 0.97070462, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.95019531, + "step": 1457, + "time_per_iteration": 2.598212718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188426, + "balance_loss_mlp": 1.09344053, + "epoch": 0.2804924971142747, + "flos": 523091598336.0, + "grad_norm": 0.027291046925092925, + "language_loss": 0.9210875, + "learning_rate": 0.0008442940288271208, + "loss": 0.93297172, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.94921875, + "step": 1458, + "time_per_iteration": 2.617572069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189438, + "balance_loss_mlp": 1.09473801, + "epoch": 0.2806848787995383, + "flos": 528849053184.0, + "grad_norm": 0.02378106137707509, + "language_loss": 0.95258486, + "learning_rate": 0.0008440680462506856, + "loss": 0.96447927, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.94628906, + "step": 1459, + "time_per_iteration": 2.7465641498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191591, + "balance_loss_mlp": 1.09660506, + "epoch": 0.2808772604848018, + "flos": 486484420608.0, + "grad_norm": 0.02248739277997059, + "language_loss": 0.9351486, + "learning_rate": 0.0008438419300951883, + "loss": 0.94706452, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.94921875, + "step": 1460, + "time_per_iteration": 2.6331160068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188162, + "balance_loss_mlp": 1.09303284, + "epoch": 0.2810696421700654, + "flos": 619339049472.0, + "grad_norm": 0.024684272432392865, + "language_loss": 0.96464884, + "learning_rate": 0.0008436156804484148, + "loss": 0.97653049, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.95068359, + "step": 1461, + "time_per_iteration": 2.7740418910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188616, + "balance_loss_mlp": 1.09358263, + "epoch": 0.28126202385532895, + "flos": 455686364160.0, + "grad_norm": 0.026728942288464865, + "language_loss": 0.99464989, + "learning_rate": 0.0008433892973982031, + "loss": 1.00653601, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.94970703, + "step": 1462, + "time_per_iteration": 2.5151000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188441, + "balance_loss_mlp": 1.09345496, + "epoch": 0.28145440554059253, + "flos": 531738150912.0, + "grad_norm": 0.02863032020985732, + "language_loss": 0.95777607, + "learning_rate": 0.0008431627810324431, + "loss": 0.96966046, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.94921875, + "step": 1463, + "time_per_iteration": 2.64477801322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.09298646, + "epoch": 0.2816467872258561, + "flos": 453163838976.0, + "grad_norm": 0.025052425157320847, + "language_loss": 0.90961307, + "learning_rate": 0.000842936131439076, + "loss": 0.92149282, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.94921875, + "step": 1464, + "time_per_iteration": 2.5910096168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186267, + "balance_loss_mlp": 1.09147155, + "epoch": 0.28183916891111965, + "flos": 473704608768.0, + "grad_norm": 0.02627501463847235, + "language_loss": 0.97073281, + "learning_rate": 0.0008427093487060951, + "loss": 0.98259544, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.94726562, + "step": 1465, + "time_per_iteration": 2.6250505447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187944, + "balance_loss_mlp": 1.09300542, + "epoch": 0.28203155059638324, + "flos": 558188098560.0, + "grad_norm": 0.02108937585301408, + "language_loss": 0.91709232, + "learning_rate": 0.000842482432921545, + "loss": 0.92897177, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.94873047, + "step": 1466, + "time_per_iteration": 2.809101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.09139562, + "epoch": 0.28222393228164677, + "flos": 417878685696.0, + "grad_norm": 0.025824876793605126, + "language_loss": 0.96517414, + "learning_rate": 0.0008422553841735225, + "loss": 0.97703695, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.94824219, + "step": 1467, + "time_per_iteration": 2.468773365020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184331, + "balance_loss_mlp": 1.08963072, + "epoch": 0.28241631396691036, + "flos": 606040215552.0, + "grad_norm": 0.02479925640814435, + "language_loss": 0.92490911, + "learning_rate": 0.0008420282025501757, + "loss": 0.93675244, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.94628906, + "step": 1468, + "time_per_iteration": 2.7617123126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184258, + "balance_loss_mlp": 1.08960581, + "epoch": 0.2826086956521739, + "flos": 574050390528.0, + "grad_norm": 0.023359152371130017, + "language_loss": 0.93868291, + "learning_rate": 0.0008418008881397043, + "loss": 0.95052546, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.94580078, + "step": 1469, + "time_per_iteration": 2.681727886199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185359, + "balance_loss_mlp": 1.09056342, + "epoch": 0.2828010773374375, + "flos": 844318603776.0, + "grad_norm": 0.02469333041166596, + "language_loss": 0.92646587, + "learning_rate": 0.0008415734410303595, + "loss": 0.93831944, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.94726562, + "step": 1470, + "time_per_iteration": 3.1949617862701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186089, + "balance_loss_mlp": 1.09124613, + "epoch": 0.28299345902270107, + "flos": 543771356160.0, + "grad_norm": 0.022743934694793657, + "language_loss": 0.98454034, + "learning_rate": 0.0008413458613104444, + "loss": 0.99640119, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.94775391, + "step": 1471, + "time_per_iteration": 2.679994583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184615, + "balance_loss_mlp": 1.08972394, + "epoch": 0.2831858407079646, + "flos": 572754562560.0, + "grad_norm": 0.02381851847695354, + "language_loss": 0.91435039, + "learning_rate": 0.0008411181490683129, + "loss": 0.92619658, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.94824219, + "step": 1472, + "time_per_iteration": 2.7178077697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186226, + "balance_loss_mlp": 1.09152639, + "epoch": 0.2833782223932282, + "flos": 765170875392.0, + "grad_norm": 0.023393787071714342, + "language_loss": 0.92628008, + "learning_rate": 0.0008408903043923707, + "loss": 0.9381423, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.94628906, + "step": 1473, + "time_per_iteration": 3.0261785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184462, + "balance_loss_mlp": 1.0899055, + "epoch": 0.2835706040784917, + "flos": 540087261696.0, + "grad_norm": 0.026141956799832673, + "language_loss": 0.93214488, + "learning_rate": 0.0008406623273710754, + "loss": 0.94398952, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.94482422, + "step": 1474, + "time_per_iteration": 2.62430739402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118759, + "balance_loss_mlp": 1.09312844, + "epoch": 0.2837629857637553, + "flos": 531653557248.0, + "grad_norm": 0.026627011980012938, + "language_loss": 0.91140723, + "learning_rate": 0.0008404342180929351, + "loss": 0.9232831, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.94384766, + "step": 1475, + "time_per_iteration": 2.6201882362365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191029, + "balance_loss_mlp": 1.09666264, + "epoch": 0.28395536744901884, + "flos": 541109842944.0, + "grad_norm": 0.026942213566754976, + "language_loss": 0.91036892, + "learning_rate": 0.00084020597664651, + "loss": 0.92227924, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.94287109, + "step": 1476, + "time_per_iteration": 2.792515516281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191806, + "balance_loss_mlp": 1.09743977, + "epoch": 0.2841477491342824, + "flos": 574801726464.0, + "grad_norm": 0.0281069748307863, + "language_loss": 0.94561875, + "learning_rate": 0.0008399776031204111, + "loss": 0.95753682, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.94287109, + "step": 1477, + "time_per_iteration": 2.7592930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189206, + "balance_loss_mlp": 1.09479237, + "epoch": 0.28434013081954596, + "flos": 573138599424.0, + "grad_norm": 0.025578880464706598, + "language_loss": 0.90985346, + "learning_rate": 0.0008397490976033009, + "loss": 0.92174542, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.94335938, + "step": 1478, + "time_per_iteration": 2.72312331199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193047, + "balance_loss_mlp": 1.10015869, + "epoch": 0.28453251250480954, + "flos": 1556673629184.0, + "grad_norm": 0.009281527310597816, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.7907269, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.92773438, + "step": 1479, + "time_per_iteration": 4.714428901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188304, + "balance_loss_mlp": 1.0943675, + "epoch": 0.28472489419007313, + "flos": 750426491904.0, + "grad_norm": 0.023822673694276757, + "language_loss": 0.93367732, + "learning_rate": 0.0008392916909509525, + "loss": 0.94556034, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.93847656, + "step": 1480, + "time_per_iteration": 3.0365796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183623, + "balance_loss_mlp": 1.08930516, + "epoch": 0.28491727587533666, + "flos": 491138703360.0, + "grad_norm": 0.028675048847138535, + "language_loss": 0.94468164, + "learning_rate": 0.0008390627899932954, + "loss": 0.95651788, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.94238281, + "step": 1481, + "time_per_iteration": 2.562316656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187714, + "balance_loss_mlp": 1.09353888, + "epoch": 0.28510965756060025, + "flos": 730359081984.0, + "grad_norm": 0.028797322451775676, + "language_loss": 0.96514452, + "learning_rate": 0.000838833757399789, + "loss": 0.97702163, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.94091797, + "step": 1482, + "time_per_iteration": 2.955920696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189825, + "balance_loss_mlp": 1.09593546, + "epoch": 0.2853020392458638, + "flos": 552669688320.0, + "grad_norm": 0.027781834693451857, + "language_loss": 0.92148101, + "learning_rate": 0.0008386045932593515, + "loss": 0.93337923, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.93798828, + "step": 1483, + "time_per_iteration": 2.6609442234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185409, + "balance_loss_mlp": 1.09151959, + "epoch": 0.28549442093112737, + "flos": 756096625152.0, + "grad_norm": 0.023489805753692042, + "language_loss": 0.9365592, + "learning_rate": 0.0008383752976609525, + "loss": 0.94841331, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.93798828, + "step": 1484, + "time_per_iteration": 2.914872646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.09480286, + "epoch": 0.2856868026163909, + "flos": 539703224832.0, + "grad_norm": 0.026354969281760218, + "language_loss": 0.9020288, + "learning_rate": 0.0008381458706936123, + "loss": 0.91391522, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.9375, + "step": 1485, + "time_per_iteration": 2.7100982666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190691, + "balance_loss_mlp": 1.09675431, + "epoch": 0.2858791843016545, + "flos": 584920025088.0, + "grad_norm": 0.026556247425645045, + "language_loss": 0.97539783, + "learning_rate": 0.0008379163124464025, + "loss": 0.98730469, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.93847656, + "step": 1486, + "time_per_iteration": 2.7065536975860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192022, + "balance_loss_mlp": 1.0979898, + "epoch": 0.286071565986918, + "flos": 646051510272.0, + "grad_norm": 0.03147840332437955, + "language_loss": 0.84533966, + "learning_rate": 0.0008376866230084452, + "loss": 0.85725987, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.93945312, + "step": 1487, + "time_per_iteration": 2.818673849105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186798, + "balance_loss_mlp": 1.09295619, + "epoch": 0.2862639476721816, + "flos": 492330471936.0, + "grad_norm": 0.02612625436823832, + "language_loss": 0.963471, + "learning_rate": 0.000837456802468914, + "loss": 0.975339, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.9375, + "step": 1488, + "time_per_iteration": 2.5766210556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185712, + "balance_loss_mlp": 1.09187043, + "epoch": 0.2864563293574452, + "flos": 522744491520.0, + "grad_norm": 0.023875595461199783, + "language_loss": 0.96454561, + "learning_rate": 0.0008372268509170331, + "loss": 0.9764027, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.9375, + "step": 1489, + "time_per_iteration": 2.7241337299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117946, + "balance_loss_mlp": 1.08537972, + "epoch": 0.2866487110427087, + "flos": 548256451584.0, + "grad_norm": 0.022999113981848278, + "language_loss": 0.93815279, + "learning_rate": 0.0008369967684420779, + "loss": 0.94994742, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.93994141, + "step": 1490, + "time_per_iteration": 2.7358930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180309, + "balance_loss_mlp": 1.08656251, + "epoch": 0.2868410927279723, + "flos": 483217290240.0, + "grad_norm": 0.024118055050044187, + "language_loss": 0.93676293, + "learning_rate": 0.0008367665551333736, + "loss": 0.94856608, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.93652344, + "step": 1491, + "time_per_iteration": 2.6094913482666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181201, + "balance_loss_mlp": 1.08731139, + "epoch": 0.28703347441323585, + "flos": 726136499712.0, + "grad_norm": 0.03204326630579906, + "language_loss": 0.96034807, + "learning_rate": 0.0008365362110802977, + "loss": 0.9721601, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.93798828, + "step": 1492, + "time_per_iteration": 2.862281322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180339, + "balance_loss_mlp": 1.08630645, + "epoch": 0.28722585609849943, + "flos": 636213189120.0, + "grad_norm": 0.024948941988181064, + "language_loss": 0.92257547, + "learning_rate": 0.0008363057363722773, + "loss": 0.93437886, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.93945312, + "step": 1493, + "time_per_iteration": 2.8364765644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.08695745, + "epoch": 0.28741823778376296, + "flos": 511251775488.0, + "grad_norm": 0.026788978355157977, + "language_loss": 0.94388151, + "learning_rate": 0.0008360751310987906, + "loss": 0.9556905, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.93847656, + "step": 1494, + "time_per_iteration": 2.5825915336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.09244919, + "epoch": 0.28761061946902655, + "flos": 604931039232.0, + "grad_norm": 0.023099591474152015, + "language_loss": 0.92881125, + "learning_rate": 0.0008358443953493666, + "loss": 0.94067132, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.93457031, + "step": 1495, + "time_per_iteration": 2.8426852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190116, + "balance_loss_mlp": 1.09617913, + "epoch": 0.28780300115429014, + "flos": 408059830272.0, + "grad_norm": 0.026469370193436835, + "language_loss": 0.97524667, + "learning_rate": 0.0008356135292135851, + "loss": 0.98714793, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.93847656, + "step": 1496, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187979, + "balance_loss_mlp": 1.09356499, + "epoch": 0.28799538283955367, + "flos": 375744365568.0, + "grad_norm": 0.028081335314896084, + "language_loss": 1.02447343, + "learning_rate": 0.0008353825327810758, + "loss": 1.03635335, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.94335938, + "step": 1497, + "time_per_iteration": 2.4137980937957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188393, + "balance_loss_mlp": 1.09416974, + "epoch": 0.28818776452481726, + "flos": 593019357696.0, + "grad_norm": 0.027570910872340922, + "language_loss": 0.91214752, + "learning_rate": 0.00083515140614152, + "loss": 0.9240315, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.94140625, + "step": 1498, + "time_per_iteration": 2.7084319591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188877, + "balance_loss_mlp": 1.0943675, + "epoch": 0.2883801462100808, + "flos": 536103724032.0, + "grad_norm": 0.024692508476740448, + "language_loss": 0.97239816, + "learning_rate": 0.0008349201493846485, + "loss": 0.9842869, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.94433594, + "step": 1499, + "time_per_iteration": 2.6401236057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190398, + "balance_loss_mlp": 1.09617448, + "epoch": 0.2885725278953444, + "flos": 481076800512.0, + "grad_norm": 0.026282906035864008, + "language_loss": 0.98523659, + "learning_rate": 0.0008346887626002432, + "loss": 0.99714065, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.94140625, + "step": 1500, + "time_per_iteration": 2.52458119392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.09863722, + "epoch": 0.2887649095806079, + "flos": 465029858304.0, + "grad_norm": 0.024051725112114657, + "language_loss": 0.95880306, + "learning_rate": 0.000834457245878137, + "loss": 0.970734, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.94384766, + "step": 1501, + "time_per_iteration": 2.629535436630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192018, + "balance_loss_mlp": 1.09765196, + "epoch": 0.2889572912658715, + "flos": 932639912448.0, + "grad_norm": 0.02596355901590014, + "language_loss": 0.90450358, + "learning_rate": 0.000834225599308212, + "loss": 0.9164238, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.94287109, + "step": 1502, + "time_per_iteration": 3.2340567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189718, + "balance_loss_mlp": 1.09568572, + "epoch": 0.28914967295113503, + "flos": 571256620032.0, + "grad_norm": 0.02412179831144176, + "language_loss": 0.9487462, + "learning_rate": 0.0008339938229804016, + "loss": 0.96064335, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.93945312, + "step": 1503, + "time_per_iteration": 2.710339069366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193321, + "balance_loss_mlp": 1.10081482, + "epoch": 0.2893420546363986, + "flos": 1489872010752.0, + "grad_norm": 0.01509287591883609, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76628143, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.92382812, + "step": 1504, + "time_per_iteration": 4.937675714492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189832, + "balance_loss_mlp": 1.09579968, + "epoch": 0.2895344363216622, + "flos": 471182083584.0, + "grad_norm": 0.02978733186062401, + "language_loss": 0.95586789, + "learning_rate": 0.0008335298814111094, + "loss": 0.96776623, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.93945312, + "step": 1505, + "time_per_iteration": 2.5757808685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.10075009, + "epoch": 0.28972681800692573, + "flos": 649340107776.0, + "grad_norm": 0.024998045510076724, + "language_loss": 0.95390272, + "learning_rate": 0.0008332977163497455, + "loss": 0.96585107, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.93994141, + "step": 1506, + "time_per_iteration": 2.8062288761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190367, + "balance_loss_mlp": 1.09638238, + "epoch": 0.2899191996921893, + "flos": 573305785344.0, + "grad_norm": 0.023440576211443395, + "language_loss": 0.92864263, + "learning_rate": 0.0008330654218907325, + "loss": 0.94054627, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.93896484, + "step": 1507, + "time_per_iteration": 2.6871397495269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195663, + "balance_loss_mlp": 1.10158336, + "epoch": 0.29011158137745285, + "flos": 662636940288.0, + "grad_norm": 0.026311762315396375, + "language_loss": 0.90949756, + "learning_rate": 0.0008328329981242548, + "loss": 0.92145419, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.93994141, + "step": 1508, + "time_per_iteration": 2.870436906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189885, + "balance_loss_mlp": 1.09585261, + "epoch": 0.29030396306271644, + "flos": 537402279936.0, + "grad_norm": 0.02293974263799261, + "language_loss": 0.95641714, + "learning_rate": 0.0008326004451405475, + "loss": 0.96831596, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.93945312, + "step": 1509, + "time_per_iteration": 2.7639336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191857, + "balance_loss_mlp": 1.09815872, + "epoch": 0.29049634474798, + "flos": 512955835392.0, + "grad_norm": 0.025710607890434264, + "language_loss": 0.93112034, + "learning_rate": 0.0008323677630298957, + "loss": 0.94303894, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.93603516, + "step": 1510, + "time_per_iteration": 2.561455726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118953, + "balance_loss_mlp": 1.09592652, + "epoch": 0.29068872643324356, + "flos": 614982208512.0, + "grad_norm": 0.023671610956976636, + "language_loss": 0.92362118, + "learning_rate": 0.0008321349518826345, + "loss": 0.93551642, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.93505859, + "step": 1511, + "time_per_iteration": 2.807711362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191488, + "balance_loss_mlp": 1.09736073, + "epoch": 0.2908811081185071, + "flos": 547468185600.0, + "grad_norm": 0.029262624151918007, + "language_loss": 1.03824317, + "learning_rate": 0.0008319020117891491, + "loss": 1.05015802, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.94042969, + "step": 1512, + "time_per_iteration": 2.626357316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192195, + "balance_loss_mlp": 1.09840155, + "epoch": 0.2910734898037707, + "flos": 605901227520.0, + "grad_norm": 0.026098769068304807, + "language_loss": 0.96355087, + "learning_rate": 0.0008316689428398751, + "loss": 0.97547281, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.93701172, + "step": 1513, + "time_per_iteration": 2.6982998847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190959, + "balance_loss_mlp": 1.09721279, + "epoch": 0.29126587148903427, + "flos": 575835041280.0, + "grad_norm": 0.02240755749123148, + "language_loss": 0.95587385, + "learning_rate": 0.0008314357451252979, + "loss": 0.96778345, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.93652344, + "step": 1514, + "time_per_iteration": 2.7506277561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185358, + "balance_loss_mlp": 1.09170711, + "epoch": 0.2914582531742978, + "flos": 572133482496.0, + "grad_norm": 0.030106635879309524, + "language_loss": 0.98758858, + "learning_rate": 0.0008312024187359527, + "loss": 0.99944222, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.93554688, + "step": 1515, + "time_per_iteration": 2.6389546394348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186161, + "balance_loss_mlp": 1.09265339, + "epoch": 0.2916506348595614, + "flos": 732302186496.0, + "grad_norm": 0.023105382424412787, + "language_loss": 0.95643955, + "learning_rate": 0.000830968963762425, + "loss": 0.96830118, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.93408203, + "step": 1516, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183995, + "balance_loss_mlp": 1.09048688, + "epoch": 0.2918430165448249, + "flos": 511466625024.0, + "grad_norm": 0.027481799845478876, + "language_loss": 0.92072952, + "learning_rate": 0.0008307353802953497, + "loss": 0.93256938, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.93408203, + "step": 1517, + "time_per_iteration": 2.6852073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188929, + "balance_loss_mlp": 1.09546912, + "epoch": 0.2920353982300885, + "flos": 631606569984.0, + "grad_norm": 0.024841994736450757, + "language_loss": 0.95207542, + "learning_rate": 0.0008305016684254125, + "loss": 0.9639647, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.93359375, + "step": 1518, + "time_per_iteration": 2.78326678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185623, + "balance_loss_mlp": 1.0920676, + "epoch": 0.29222777991535204, + "flos": 502670350848.0, + "grad_norm": 0.02442081482663903, + "language_loss": 0.96402657, + "learning_rate": 0.0008302678282433479, + "loss": 0.97588277, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.93457031, + "step": 1519, + "time_per_iteration": 2.580885887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186077, + "balance_loss_mlp": 1.09261727, + "epoch": 0.2924201616006156, + "flos": 487841373696.0, + "grad_norm": 0.025531334181834578, + "language_loss": 0.92434102, + "learning_rate": 0.0008300338598399411, + "loss": 0.93620181, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.93359375, + "step": 1520, + "time_per_iteration": 2.60040020942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182574, + "balance_loss_mlp": 1.08911419, + "epoch": 0.2926125432858792, + "flos": 477410170368.0, + "grad_norm": 0.025034871095789283, + "language_loss": 1.04410791, + "learning_rate": 0.0008297997633060263, + "loss": 1.05593348, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.93359375, + "step": 1521, + "time_per_iteration": 2.5479507446289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184296, + "balance_loss_mlp": 1.09083581, + "epoch": 0.29280492497114274, + "flos": 677867418624.0, + "grad_norm": 0.023158831925944874, + "language_loss": 0.93757105, + "learning_rate": 0.0008295655387324883, + "loss": 0.94941401, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.93359375, + "step": 1522, + "time_per_iteration": 2.80924916267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184597, + "balance_loss_mlp": 1.09113646, + "epoch": 0.29299730665640633, + "flos": 459344262144.0, + "grad_norm": 0.024881330364852117, + "language_loss": 0.95369709, + "learning_rate": 0.0008293311862102609, + "loss": 0.96554303, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.93359375, + "step": 1523, + "time_per_iteration": 2.5006909370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183918, + "balance_loss_mlp": 1.09055364, + "epoch": 0.29318968834166986, + "flos": 447495707136.0, + "grad_norm": 0.027757525537519354, + "language_loss": 0.99242002, + "learning_rate": 0.0008290967058303275, + "loss": 1.00425935, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.93261719, + "step": 1524, + "time_per_iteration": 2.472071409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.09098816, + "epoch": 0.29338207002693345, + "flos": 451255663104.0, + "grad_norm": 0.024483324027042522, + "language_loss": 0.93697757, + "learning_rate": 0.0008288620976837219, + "loss": 0.9488225, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.93408203, + "step": 1525, + "time_per_iteration": 2.486726760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183678, + "balance_loss_mlp": 1.08997941, + "epoch": 0.293574451712197, + "flos": 503284700160.0, + "grad_norm": 0.025672010983446535, + "language_loss": 0.92014909, + "learning_rate": 0.000828627361861527, + "loss": 0.93198591, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.93603516, + "step": 1526, + "time_per_iteration": 2.557725429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183155, + "balance_loss_mlp": 1.089504, + "epoch": 0.29376683339746057, + "flos": 697683048960.0, + "grad_norm": 0.028193197708561973, + "language_loss": 0.94158876, + "learning_rate": 0.0008283924984548752, + "loss": 0.95342028, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.93554688, + "step": 1527, + "time_per_iteration": 2.866138219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182528, + "balance_loss_mlp": 1.08882964, + "epoch": 0.2939592150827241, + "flos": 479541927936.0, + "grad_norm": 0.024215116577050826, + "language_loss": 0.92182994, + "learning_rate": 0.0008281575075549485, + "loss": 0.93365526, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.93603516, + "step": 1528, + "time_per_iteration": 2.5585758686065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202408, + "balance_loss_mlp": 1.1108551, + "epoch": 0.2941515967679877, + "flos": 1488386803200.0, + "grad_norm": 0.02007823063587109, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78555101, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.9140625, + "step": 1529, + "time_per_iteration": 4.658870697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186281, + "balance_loss_mlp": 1.09267783, + "epoch": 0.2943439784532513, + "flos": 675399287808.0, + "grad_norm": 0.027761434636537758, + "language_loss": 0.99164081, + "learning_rate": 0.0008276871436402469, + "loss": 1.00350356, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.93505859, + "step": 1530, + "time_per_iteration": 2.897517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182983, + "balance_loss_mlp": 1.08909357, + "epoch": 0.2945363601385148, + "flos": 577382648832.0, + "grad_norm": 0.025208295044921922, + "language_loss": 0.95561033, + "learning_rate": 0.000827451770808083, + "loss": 0.96744013, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.93798828, + "step": 1531, + "time_per_iteration": 2.667419910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183127, + "balance_loss_mlp": 1.08923733, + "epoch": 0.2947287418237784, + "flos": 481617289728.0, + "grad_norm": 0.0238323033403859, + "language_loss": 0.92856085, + "learning_rate": 0.0008272162708478674, + "loss": 0.94039214, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.93798828, + "step": 1532, + "time_per_iteration": 2.532593250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190087, + "balance_loss_mlp": 1.09638822, + "epoch": 0.2949211235090419, + "flos": 559260344832.0, + "grad_norm": 0.023856250691152107, + "language_loss": 0.9573307, + "learning_rate": 0.000826980643851029, + "loss": 0.96923155, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.93603516, + "step": 1533, + "time_per_iteration": 2.648393154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190115, + "balance_loss_mlp": 1.09665465, + "epoch": 0.2951135051943055, + "flos": 484856222208.0, + "grad_norm": 0.02761517479674983, + "language_loss": 0.9290787, + "learning_rate": 0.0008267448899090464, + "loss": 0.94097984, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.93359375, + "step": 1534, + "time_per_iteration": 2.5158579349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185677, + "balance_loss_mlp": 1.09226477, + "epoch": 0.29530588687956905, + "flos": 551421523968.0, + "grad_norm": 0.024001584155810263, + "language_loss": 0.90244222, + "learning_rate": 0.0008265090091134473, + "loss": 0.91429895, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.93310547, + "step": 1535, + "time_per_iteration": 2.8246946334838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185762, + "balance_loss_mlp": 1.09234965, + "epoch": 0.29549826856483263, + "flos": 674309577216.0, + "grad_norm": 0.021562014940098434, + "language_loss": 0.8727591, + "learning_rate": 0.0008262730015558088, + "loss": 0.88461667, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.93310547, + "step": 1536, + "time_per_iteration": 2.8568825721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189062, + "balance_loss_mlp": 1.09560144, + "epoch": 0.29569065025009617, + "flos": 766135059456.0, + "grad_norm": 0.0253531059084562, + "language_loss": 0.89567208, + "learning_rate": 0.0008260368673277574, + "loss": 0.90756267, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.93359375, + "step": 1537, + "time_per_iteration": 3.1248908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181656, + "balance_loss_mlp": 1.08781409, + "epoch": 0.29588303193535975, + "flos": 544830867456.0, + "grad_norm": 0.02589470547450269, + "language_loss": 0.93808746, + "learning_rate": 0.0008258006065209682, + "loss": 0.94990402, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.9375, + "step": 1538, + "time_per_iteration": 2.7405824661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.0892235, + "epoch": 0.29607541362062334, + "flos": 598144998912.0, + "grad_norm": 0.02499469713889481, + "language_loss": 0.9045589, + "learning_rate": 0.0008255642192271657, + "loss": 0.91638815, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.93603516, + "step": 1539, + "time_per_iteration": 2.7654454708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183976, + "balance_loss_mlp": 1.09032559, + "epoch": 0.29626779530588687, + "flos": 611037602304.0, + "grad_norm": 0.024707919738005703, + "language_loss": 0.92616487, + "learning_rate": 0.0008253277055381241, + "loss": 0.93800461, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.93554688, + "step": 1540, + "time_per_iteration": 2.803755760192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186228, + "balance_loss_mlp": 1.09252918, + "epoch": 0.29646017699115046, + "flos": 868957704192.0, + "grad_norm": 0.02707124240628881, + "language_loss": 0.95315254, + "learning_rate": 0.0008250910655456658, + "loss": 0.96501482, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.93603516, + "step": 1541, + "time_per_iteration": 3.11143159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181572, + "balance_loss_mlp": 1.08787382, + "epoch": 0.296652558676414, + "flos": 496880695296.0, + "grad_norm": 0.02670504880571787, + "language_loss": 0.9343757, + "learning_rate": 0.0008248542993416625, + "loss": 0.94619143, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.93603516, + "step": 1542, + "time_per_iteration": 2.5893712043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181697, + "balance_loss_mlp": 1.08790362, + "epoch": 0.2968449403616776, + "flos": 572626308096.0, + "grad_norm": 0.02711797813063544, + "language_loss": 0.9310621, + "learning_rate": 0.0008246174070180352, + "loss": 0.94287908, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.93701172, + "step": 1543, + "time_per_iteration": 2.677011489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189648, + "balance_loss_mlp": 1.09614003, + "epoch": 0.2970373220469411, + "flos": 795650022912.0, + "grad_norm": 0.029629985597633038, + "language_loss": 0.9263432, + "learning_rate": 0.0008243803886667537, + "loss": 0.93823969, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.93408203, + "step": 1544, + "time_per_iteration": 3.1022729873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188285, + "balance_loss_mlp": 1.09472907, + "epoch": 0.2972297037322047, + "flos": 662248174080.0, + "grad_norm": 0.0271995559284498, + "language_loss": 0.89610922, + "learning_rate": 0.0008241432443798364, + "loss": 0.90799212, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.93457031, + "step": 1545, + "time_per_iteration": 2.8079423904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181998, + "balance_loss_mlp": 1.08868086, + "epoch": 0.29742208541746823, + "flos": 598231593984.0, + "grad_norm": 0.02196679377417612, + "language_loss": 0.91743886, + "learning_rate": 0.0008239059742493512, + "loss": 0.92925882, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.93212891, + "step": 1546, + "time_per_iteration": 2.703385353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182095, + "balance_loss_mlp": 1.08868301, + "epoch": 0.2976144671027318, + "flos": 771338563584.0, + "grad_norm": 0.02555387631372138, + "language_loss": 0.94145298, + "learning_rate": 0.0008236685783674142, + "loss": 0.95327395, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.93310547, + "step": 1547, + "time_per_iteration": 3.0583412647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221115, + "balance_loss_mlp": 1.12822723, + "epoch": 0.2978068487879954, + "flos": 1487911441920.0, + "grad_norm": 0.023679675459363107, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77442312, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.92773438, + "step": 1548, + "time_per_iteration": 4.846614360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192812, + "balance_loss_mlp": 1.09925652, + "epoch": 0.29799923047325894, + "flos": 476329191936.0, + "grad_norm": 0.02691026692614136, + "language_loss": 0.91868371, + "learning_rate": 0.0008231934097178955, + "loss": 0.93061185, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.93457031, + "step": 1549, + "time_per_iteration": 2.600588798522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189437, + "balance_loss_mlp": 1.09573877, + "epoch": 0.2981916121585225, + "flos": 761167872000.0, + "grad_norm": 0.02304182660847759, + "language_loss": 0.93441629, + "learning_rate": 0.0008229556371347903, + "loss": 0.94631064, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.93603516, + "step": 1550, + "time_per_iteration": 2.9500393867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196641, + "balance_loss_mlp": 1.10256064, + "epoch": 0.29838399384378606, + "flos": 876516547584.0, + "grad_norm": 0.029531977965095095, + "language_loss": 0.90478379, + "learning_rate": 0.0008227177391691874, + "loss": 0.91675019, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.93994141, + "step": 1551, + "time_per_iteration": 3.117060422897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192501, + "balance_loss_mlp": 1.09870708, + "epoch": 0.29857637552904964, + "flos": 580751837184.0, + "grad_norm": 0.026349497602305087, + "language_loss": 0.9813534, + "learning_rate": 0.0008224797159134463, + "loss": 0.99327838, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.93701172, + "step": 1552, + "time_per_iteration": 2.694382429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185823, + "balance_loss_mlp": 1.09212494, + "epoch": 0.2987687572143132, + "flos": 837807811584.0, + "grad_norm": 0.022207279660822626, + "language_loss": 0.8985877, + "learning_rate": 0.0008222415674599765, + "loss": 0.91044593, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.93603516, + "step": 1553, + "time_per_iteration": 3.074347972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186024, + "balance_loss_mlp": 1.09203923, + "epoch": 0.29896113889957676, + "flos": 568167409152.0, + "grad_norm": 0.026892838709900748, + "language_loss": 0.93768913, + "learning_rate": 0.0008220032939012349, + "loss": 0.94954944, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.93896484, + "step": 1554, + "time_per_iteration": 2.6793601512908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190641, + "balance_loss_mlp": 1.0965606, + "epoch": 0.29915352058484035, + "flos": 499835647488.0, + "grad_norm": 0.021647779244158522, + "language_loss": 0.95223451, + "learning_rate": 0.0008217648953297277, + "loss": 0.96414095, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.93994141, + "step": 1555, + "time_per_iteration": 2.836775779724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189405, + "balance_loss_mlp": 1.09546852, + "epoch": 0.2993459022701039, + "flos": 593214741504.0, + "grad_norm": 0.03843372955580003, + "language_loss": 0.88026905, + "learning_rate": 0.0008215263718380095, + "loss": 0.89216304, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.93847656, + "step": 1556, + "time_per_iteration": 2.6840782165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192028, + "balance_loss_mlp": 1.09790027, + "epoch": 0.29953828395536747, + "flos": 573472971264.0, + "grad_norm": 0.02697506762846426, + "language_loss": 0.95771539, + "learning_rate": 0.0008212877235186833, + "loss": 0.96963573, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.94042969, + "step": 1557, + "time_per_iteration": 2.649303674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216583, + "balance_loss_mlp": 1.12350464, + "epoch": 0.299730665640631, + "flos": 1508083637760.0, + "grad_norm": 0.01733611069553414, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78954148, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.9296875, + "step": 1558, + "time_per_iteration": 4.920740365982056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191809, + "balance_loss_mlp": 1.09772909, + "epoch": 0.2999230473258946, + "flos": 514807615488.0, + "grad_norm": 0.03091345134541536, + "language_loss": 0.92723, + "learning_rate": 0.0008208100527678611, + "loss": 0.93914807, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.93994141, + "step": 1559, + "time_per_iteration": 2.628755807876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191055, + "balance_loss_mlp": 1.09692788, + "epoch": 0.3001154290111581, + "flos": 835853973504.0, + "grad_norm": 0.03027255896835194, + "language_loss": 0.86836946, + "learning_rate": 0.0008205710305218135, + "loss": 0.88028002, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.94042969, + "step": 1560, + "time_per_iteration": 3.0076475143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188346, + "balance_loss_mlp": 1.09431422, + "epoch": 0.3003078106964217, + "flos": 557945051136.0, + "grad_norm": 0.023845762720508586, + "language_loss": 0.96495396, + "learning_rate": 0.0008203318838190541, + "loss": 0.9768374, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.93945312, + "step": 1561, + "time_per_iteration": 2.7329952716827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118952, + "balance_loss_mlp": 1.09548759, + "epoch": 0.30050019238168524, + "flos": 527168461824.0, + "grad_norm": 0.030147848994798797, + "language_loss": 0.95915771, + "learning_rate": 0.0008200926127524281, + "loss": 0.97105289, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.93945312, + "step": 1562, + "time_per_iteration": 2.625941753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186113, + "balance_loss_mlp": 1.09217656, + "epoch": 0.3006925740669488, + "flos": 578936987136.0, + "grad_norm": 0.02860364820877459, + "language_loss": 0.92538679, + "learning_rate": 0.0008198532174148289, + "loss": 0.93724799, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.93847656, + "step": 1563, + "time_per_iteration": 2.725884199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207901, + "balance_loss_mlp": 1.11539459, + "epoch": 0.3008849557522124, + "flos": 1493610499584.0, + "grad_norm": 0.014785027254047896, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8189407, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.92382812, + "step": 1564, + "time_per_iteration": 4.830730438232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.10398376, + "epoch": 0.30107733743747594, + "flos": 510824077824.0, + "grad_norm": 0.03423038852538926, + "language_loss": 0.994165, + "learning_rate": 0.0008193740542985244, + "loss": 1.00614524, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.93945312, + "step": 1565, + "time_per_iteration": 2.578756809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194051, + "balance_loss_mlp": 1.10020983, + "epoch": 0.30126971912273953, + "flos": 588820970496.0, + "grad_norm": 0.027351016206119898, + "language_loss": 0.95914042, + "learning_rate": 0.0008191342867058467, + "loss": 0.97108096, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.9375, + "step": 1566, + "time_per_iteration": 2.7046890258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192822, + "balance_loss_mlp": 1.09898102, + "epoch": 0.30146210080800306, + "flos": 603220248576.0, + "grad_norm": 0.029722715632080093, + "language_loss": 0.93181753, + "learning_rate": 0.0008188943952142509, + "loss": 0.94374579, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.9375, + "step": 1567, + "time_per_iteration": 2.7784945964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189204, + "balance_loss_mlp": 1.09588659, + "epoch": 0.30165448249326665, + "flos": 919286684160.0, + "grad_norm": 0.02698998287866622, + "language_loss": 0.91980577, + "learning_rate": 0.0008186543799168711, + "loss": 0.93169785, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.93212891, + "step": 1568, + "time_per_iteration": 3.1082897186279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188766, + "balance_loss_mlp": 1.09530556, + "epoch": 0.3018468641785302, + "flos": 778630164480.0, + "grad_norm": 0.02791954193910651, + "language_loss": 0.98386627, + "learning_rate": 0.0008184142409068892, + "loss": 0.99575394, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.93359375, + "step": 1569, + "time_per_iteration": 3.0047945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187793, + "balance_loss_mlp": 1.09433293, + "epoch": 0.30203924586379377, + "flos": 523389040128.0, + "grad_norm": 0.023468489537567368, + "language_loss": 0.94207543, + "learning_rate": 0.000818173978277536, + "loss": 0.95395339, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.93359375, + "step": 1570, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119455, + "balance_loss_mlp": 1.10094678, + "epoch": 0.3022316275490573, + "flos": 525649052160.0, + "grad_norm": 0.028721303316250762, + "language_loss": 0.92132497, + "learning_rate": 0.000817933592122089, + "loss": 0.93327045, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.93505859, + "step": 1571, + "time_per_iteration": 2.683819055557251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119426, + "balance_loss_mlp": 1.10037029, + "epoch": 0.3024240092343209, + "flos": 480872684544.0, + "grad_norm": 0.028034832338571278, + "language_loss": 0.93476671, + "learning_rate": 0.0008176930825338749, + "loss": 0.94670928, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.93798828, + "step": 1572, + "time_per_iteration": 2.5472469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.09605432, + "epoch": 0.3026163909195845, + "flos": 688430879232.0, + "grad_norm": 0.025848261804373458, + "language_loss": 0.98155606, + "learning_rate": 0.0008174524496062679, + "loss": 0.9934541, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.93652344, + "step": 1573, + "time_per_iteration": 2.90840482711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.0922308, + "epoch": 0.302808772604848, + "flos": 544086262272.0, + "grad_norm": 0.023993082839652336, + "language_loss": 0.9423182, + "learning_rate": 0.0008172116934326894, + "loss": 0.95417649, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.93505859, + "step": 1574, + "time_per_iteration": 2.735853433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197529, + "balance_loss_mlp": 1.10349655, + "epoch": 0.3030011542901116, + "flos": 476051215872.0, + "grad_norm": 0.025758910941944917, + "language_loss": 0.96492219, + "learning_rate": 0.0008169708141066097, + "loss": 0.97689748, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.93945312, + "step": 1575, + "time_per_iteration": 2.5468080043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195411, + "balance_loss_mlp": 1.10123575, + "epoch": 0.30319353597537513, + "flos": 482472685056.0, + "grad_norm": 0.02368764088299644, + "language_loss": 0.97863203, + "learning_rate": 0.0008167298117215465, + "loss": 0.99058616, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.94091797, + "step": 1576, + "time_per_iteration": 2.5703070163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191699, + "balance_loss_mlp": 1.09747636, + "epoch": 0.3033859176606387, + "flos": 706112750592.0, + "grad_norm": 0.02517452757559557, + "language_loss": 0.96809077, + "learning_rate": 0.0008164886863710649, + "loss": 0.98000777, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.94140625, + "step": 1577, + "time_per_iteration": 2.9235777854919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194461, + "balance_loss_mlp": 1.09990454, + "epoch": 0.30357829934590225, + "flos": 766108862976.0, + "grad_norm": 0.022389524212240816, + "language_loss": 0.93041158, + "learning_rate": 0.0008162474381487783, + "loss": 0.94235623, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.94482422, + "step": 1578, + "time_per_iteration": 3.0875654220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198648, + "balance_loss_mlp": 1.10399556, + "epoch": 0.30377068103116583, + "flos": 533448941568.0, + "grad_norm": 0.026496061930467673, + "language_loss": 0.94202471, + "learning_rate": 0.0008160060671483475, + "loss": 0.9540112, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.94580078, + "step": 1579, + "time_per_iteration": 2.69014048576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198759, + "balance_loss_mlp": 1.10415483, + "epoch": 0.3039630627164294, + "flos": 511223577600.0, + "grad_norm": 0.03174839578716906, + "language_loss": 0.93386602, + "learning_rate": 0.0008157645734634809, + "loss": 0.94585359, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.9453125, + "step": 1580, + "time_per_iteration": 2.602752923965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221184, + "balance_loss_mlp": 1.12791443, + "epoch": 0.30415544440169295, + "flos": 1509188084736.0, + "grad_norm": 0.0221653057193215, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78117669, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.93164062, + "step": 1581, + "time_per_iteration": 4.895219802856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.10334778, + "epoch": 0.30434782608695654, + "flos": 1461787133952.0, + "grad_norm": 0.012004742936218659, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74410546, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.92578125, + "step": 1582, + "time_per_iteration": 4.860503196716309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199297, + "balance_loss_mlp": 1.10526431, + "epoch": 0.3045402077722201, + "flos": 483534197760.0, + "grad_norm": 0.030796945736395555, + "language_loss": 0.93027633, + "learning_rate": 0.000815039357240067, + "loss": 0.94226933, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.93945312, + "step": 1583, + "time_per_iteration": 2.6209895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200124, + "balance_loss_mlp": 1.10613978, + "epoch": 0.30473258945748366, + "flos": 544626751488.0, + "grad_norm": 0.03019985050023197, + "language_loss": 0.95277119, + "learning_rate": 0.0008147973737554952, + "loss": 0.9647724, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.93896484, + "step": 1584, + "time_per_iteration": 2.7421703338623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194047, + "balance_loss_mlp": 1.10039604, + "epoch": 0.3049249711427472, + "flos": 568121746944.0, + "grad_norm": 0.05356410902969654, + "language_loss": 0.96138752, + "learning_rate": 0.000814555268055744, + "loss": 0.97332799, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.93554688, + "step": 1585, + "time_per_iteration": 2.632770299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191549, + "balance_loss_mlp": 1.09804094, + "epoch": 0.3051173528280108, + "flos": 529289485824.0, + "grad_norm": 0.02648444030223836, + "language_loss": 0.96492249, + "learning_rate": 0.0008143130402348073, + "loss": 0.97683799, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.93408203, + "step": 1586, + "time_per_iteration": 2.67673659324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201208, + "balance_loss_mlp": 1.10746217, + "epoch": 0.3053097345132743, + "flos": 587599002624.0, + "grad_norm": 0.026229801397330138, + "language_loss": 0.86860031, + "learning_rate": 0.0008140706903867265, + "loss": 0.88061237, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.93652344, + "step": 1587, + "time_per_iteration": 2.800891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198977, + "balance_loss_mlp": 1.10518289, + "epoch": 0.3055021161985379, + "flos": 608200171008.0, + "grad_norm": 0.031935519152889405, + "language_loss": 1.00360334, + "learning_rate": 0.0008138282186055897, + "loss": 1.01559317, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.93701172, + "step": 1588, + "time_per_iteration": 2.735144853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119001, + "balance_loss_mlp": 1.09645426, + "epoch": 0.3056944978838015, + "flos": 574962181632.0, + "grad_norm": 0.02354328369726863, + "language_loss": 0.90634608, + "learning_rate": 0.0008135856249855331, + "loss": 0.91824615, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.93457031, + "step": 1589, + "time_per_iteration": 2.676589012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193478, + "balance_loss_mlp": 1.0996846, + "epoch": 0.305886879569065, + "flos": 635071085568.0, + "grad_norm": 0.031037281782467684, + "language_loss": 0.99387443, + "learning_rate": 0.0008133429096207398, + "loss": 1.00580931, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.93701172, + "step": 1590, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232346, + "balance_loss_mlp": 1.14117432, + "epoch": 0.3060792612543286, + "flos": 1372131065856.0, + "grad_norm": 0.03086145734446917, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76544607, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.91015625, + "step": 1591, + "time_per_iteration": 4.945107460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194266, + "balance_loss_mlp": 1.10051942, + "epoch": 0.30627164293959214, + "flos": 519618350592.0, + "grad_norm": 0.024964882972055902, + "language_loss": 0.95062864, + "learning_rate": 0.0008128571140339123, + "loss": 0.96257126, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.93652344, + "step": 1592, + "time_per_iteration": 2.6392171382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201642, + "balance_loss_mlp": 1.10780036, + "epoch": 0.3064640246248557, + "flos": 456533027328.0, + "grad_norm": 0.029487227531667784, + "language_loss": 0.98122042, + "learning_rate": 0.0008126140340004805, + "loss": 0.9932369, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.9375, + "step": 1593, + "time_per_iteration": 2.504150629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199461, + "balance_loss_mlp": 1.10561943, + "epoch": 0.30665640631011926, + "flos": 851608203264.0, + "grad_norm": 0.026956571268616787, + "language_loss": 0.91923594, + "learning_rate": 0.0008123708325995172, + "loss": 0.93123049, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.9375, + "step": 1594, + "time_per_iteration": 3.184525489807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190831, + "balance_loss_mlp": 1.09713268, + "epoch": 0.30684878799538284, + "flos": 759615535104.0, + "grad_norm": 0.022474213305982697, + "language_loss": 0.88990366, + "learning_rate": 0.0008121275099254414, + "loss": 0.90181196, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.93603516, + "step": 1595, + "time_per_iteration": 2.892902374267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200579, + "balance_loss_mlp": 1.10668933, + "epoch": 0.3070411696806464, + "flos": 518595769344.0, + "grad_norm": 0.025855927391394404, + "language_loss": 0.96650064, + "learning_rate": 0.0008118840660727194, + "loss": 0.97850645, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.93798828, + "step": 1596, + "time_per_iteration": 2.696312665939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191708, + "balance_loss_mlp": 1.09805715, + "epoch": 0.30723355136590996, + "flos": 845790349824.0, + "grad_norm": 0.023513083336694603, + "language_loss": 0.94521677, + "learning_rate": 0.0008116405011358644, + "loss": 0.95713389, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.93554688, + "step": 1597, + "time_per_iteration": 3.1500890254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118938, + "balance_loss_mlp": 1.09572959, + "epoch": 0.30742593305117355, + "flos": 467079023616.0, + "grad_norm": 0.024597056369147573, + "language_loss": 0.89059556, + "learning_rate": 0.0008113968152094369, + "loss": 0.90248942, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.93554688, + "step": 1598, + "time_per_iteration": 2.502336263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191327, + "balance_loss_mlp": 1.09781969, + "epoch": 0.3076183147364371, + "flos": 687816529920.0, + "grad_norm": 0.025330429780868927, + "language_loss": 0.90385377, + "learning_rate": 0.0008111530083880438, + "loss": 0.91576707, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.93408203, + "step": 1599, + "time_per_iteration": 2.8846051692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192126, + "balance_loss_mlp": 1.09847498, + "epoch": 0.30781069642170067, + "flos": 615179593728.0, + "grad_norm": 0.02627563558110635, + "language_loss": 0.95310938, + "learning_rate": 0.0008109090807663399, + "loss": 0.96503073, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.93554688, + "step": 1600, + "time_per_iteration": 2.8132736682891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119763, + "balance_loss_mlp": 1.10402679, + "epoch": 0.3080030781069642, + "flos": 591508680192.0, + "grad_norm": 0.027223292643472258, + "language_loss": 0.96310741, + "learning_rate": 0.0008106650324390257, + "loss": 0.97508371, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.93505859, + "step": 1601, + "time_per_iteration": 2.8477296829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188215, + "balance_loss_mlp": 1.0948981, + "epoch": 0.3081954597922278, + "flos": 563691045888.0, + "grad_norm": 0.027322987260225157, + "language_loss": 0.89918464, + "learning_rate": 0.0008104208635008493, + "loss": 0.91106677, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.93212891, + "step": 1602, + "time_per_iteration": 2.6639676094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192245, + "balance_loss_mlp": 1.09859383, + "epoch": 0.3083878414774913, + "flos": 448761335808.0, + "grad_norm": 0.031035394068971153, + "language_loss": 0.93496901, + "learning_rate": 0.0008101765740466058, + "loss": 0.94689143, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.93554688, + "step": 1603, + "time_per_iteration": 2.4892899990081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.09465039, + "epoch": 0.3085802231627549, + "flos": 494544821760.0, + "grad_norm": 0.029709960428380106, + "language_loss": 0.93853128, + "learning_rate": 0.0008099321641711364, + "loss": 0.95041513, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.93652344, + "step": 1604, + "time_per_iteration": 2.638798952102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011875, + "balance_loss_mlp": 1.09380174, + "epoch": 0.3087726048480185, + "flos": 488690038272.0, + "grad_norm": 0.02367908107469003, + "language_loss": 0.91951108, + "learning_rate": 0.0008096876339693295, + "loss": 0.93138611, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.93603516, + "step": 1605, + "time_per_iteration": 2.6115643978118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189736, + "balance_loss_mlp": 1.09603786, + "epoch": 0.308964986533282, + "flos": 731887223808.0, + "grad_norm": 0.029121548764615916, + "language_loss": 0.90058184, + "learning_rate": 0.0008094429835361206, + "loss": 0.91247922, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.93603516, + "step": 1606, + "time_per_iteration": 2.9361119270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185725, + "balance_loss_mlp": 1.09226441, + "epoch": 0.3091573682185456, + "flos": 606515576832.0, + "grad_norm": 0.024539043330914945, + "language_loss": 0.94318593, + "learning_rate": 0.0008091982129664908, + "loss": 0.95504314, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.93359375, + "step": 1607, + "time_per_iteration": 2.750641345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191863, + "balance_loss_mlp": 1.09821212, + "epoch": 0.30934974990380915, + "flos": 461306832384.0, + "grad_norm": 0.02635007664096696, + "language_loss": 0.92281848, + "learning_rate": 0.0008089533223554687, + "loss": 0.93473709, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.93554688, + "step": 1608, + "time_per_iteration": 2.733422040939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187457, + "balance_loss_mlp": 1.09380579, + "epoch": 0.30954213158907273, + "flos": 554567130624.0, + "grad_norm": 0.025571984513822792, + "language_loss": 0.94345558, + "learning_rate": 0.0008087083117981294, + "loss": 0.95533013, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.93554688, + "step": 1609, + "time_per_iteration": 2.919583797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189683, + "balance_loss_mlp": 1.09665251, + "epoch": 0.30973451327433627, + "flos": 554113236480.0, + "grad_norm": 0.028700236773969223, + "language_loss": 0.98730469, + "learning_rate": 0.0008084631813895943, + "loss": 0.99920154, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.92919922, + "step": 1610, + "time_per_iteration": 2.7721197605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192773, + "balance_loss_mlp": 1.09955156, + "epoch": 0.30992689495959985, + "flos": 566762792448.0, + "grad_norm": 0.027612542910463767, + "language_loss": 0.93469882, + "learning_rate": 0.0008082179312250315, + "loss": 0.94662654, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.93115234, + "step": 1611, + "time_per_iteration": 2.658564805984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219437, + "balance_loss_mlp": 1.12769318, + "epoch": 0.3101192766448634, + "flos": 1445560270848.0, + "grad_norm": 0.021240149379623804, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81075287, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.91601562, + "step": 1612, + "time_per_iteration": 4.8431174755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.13497162, + "epoch": 0.31031165833012697, + "flos": 1535127742464.0, + "grad_norm": 0.019393089292119553, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77856624, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.921875, + "step": 1613, + "time_per_iteration": 5.043596029281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191397, + "balance_loss_mlp": 1.09850931, + "epoch": 0.31050404001539056, + "flos": 993632409600.0, + "grad_norm": 0.029090005547288914, + "language_loss": 0.90590245, + "learning_rate": 0.0008074814631475545, + "loss": 0.91781646, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.92773438, + "step": 1614, + "time_per_iteration": 3.3308844566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011972, + "balance_loss_mlp": 1.10450339, + "epoch": 0.3106964217006541, + "flos": 446972682240.0, + "grad_norm": 0.029174032275502568, + "language_loss": 0.8959738, + "learning_rate": 0.0008072357349114907, + "loss": 0.90794587, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.92578125, + "step": 1615, + "time_per_iteration": 2.660557746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194484, + "balance_loss_mlp": 1.10169172, + "epoch": 0.3108888033859177, + "flos": 511494822912.0, + "grad_norm": 0.027617375290548026, + "language_loss": 0.9836188, + "learning_rate": 0.0008069898873959363, + "loss": 0.99556363, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.92675781, + "step": 1616, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203555, + "balance_loss_mlp": 1.11076295, + "epoch": 0.3110811850711812, + "flos": 521778306048.0, + "grad_norm": 0.027380341091067188, + "language_loss": 0.94434142, + "learning_rate": 0.0008067439206963375, + "loss": 0.95637697, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.92675781, + "step": 1617, + "time_per_iteration": 2.6584017276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120371, + "balance_loss_mlp": 1.11082232, + "epoch": 0.3112735667564448, + "flos": 687729934848.0, + "grad_norm": 0.029016410329411102, + "language_loss": 0.95023614, + "learning_rate": 0.0008064978349081873, + "loss": 0.96227324, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.92773438, + "step": 1618, + "time_per_iteration": 2.911677122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199948, + "balance_loss_mlp": 1.10720289, + "epoch": 0.31146594844170833, + "flos": 534165348864.0, + "grad_norm": 0.025439718165996668, + "language_loss": 0.95660365, + "learning_rate": 0.0008062516301270245, + "loss": 0.96860307, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.92626953, + "step": 1619, + "time_per_iteration": 2.669111490249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196196, + "balance_loss_mlp": 1.10388064, + "epoch": 0.3116583301269719, + "flos": 680841836544.0, + "grad_norm": 0.024218225399572888, + "language_loss": 0.96279341, + "learning_rate": 0.0008060053064484343, + "loss": 0.97475541, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.921875, + "step": 1620, + "time_per_iteration": 2.924476385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189886, + "balance_loss_mlp": 1.09733212, + "epoch": 0.31185071181223545, + "flos": 587329758720.0, + "grad_norm": 0.02529679167102671, + "language_loss": 0.92711556, + "learning_rate": 0.0008057588639680482, + "loss": 0.93901443, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.92431641, + "step": 1621, + "time_per_iteration": 2.74631667137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119125, + "balance_loss_mlp": 1.09817135, + "epoch": 0.31204309349749904, + "flos": 726657523200.0, + "grad_norm": 0.03522846239796161, + "language_loss": 0.93884659, + "learning_rate": 0.0008055123027815434, + "loss": 0.95075905, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.9296875, + "step": 1622, + "time_per_iteration": 2.90444016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189249, + "balance_loss_mlp": 1.09631383, + "epoch": 0.3122354751827626, + "flos": 577894940160.0, + "grad_norm": 0.026492717763192643, + "language_loss": 0.93252558, + "learning_rate": 0.0008052656229846436, + "loss": 0.94441813, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.92822266, + "step": 1623, + "time_per_iteration": 2.680220603942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09519064, + "epoch": 0.31242785686802615, + "flos": 577028811264.0, + "grad_norm": 0.026617450345468772, + "language_loss": 1.00026262, + "learning_rate": 0.0008050188246731182, + "loss": 1.01214242, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.92675781, + "step": 1624, + "time_per_iteration": 2.6526694297790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190099, + "balance_loss_mlp": 1.09711611, + "epoch": 0.31262023855328974, + "flos": 738195901440.0, + "grad_norm": 0.023806346866415393, + "language_loss": 0.9048847, + "learning_rate": 0.0008047719079427834, + "loss": 0.91678566, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.92871094, + "step": 1625, + "time_per_iteration": 3.0077152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119944, + "balance_loss_mlp": 1.108078, + "epoch": 0.3128126202385533, + "flos": 1562591539200.0, + "grad_norm": 0.020013754894949238, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.7555114, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.91210938, + "step": 1626, + "time_per_iteration": 4.793031215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194528, + "balance_loss_mlp": 1.10111523, + "epoch": 0.31300500192381686, + "flos": 515942988288.0, + "grad_norm": 0.023349922932092686, + "language_loss": 0.95821261, + "learning_rate": 0.0008042777196091757, + "loss": 0.97015792, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.93310547, + "step": 1627, + "time_per_iteration": 2.679588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196127, + "balance_loss_mlp": 1.10281038, + "epoch": 0.3131973836090804, + "flos": 527661287424.0, + "grad_norm": 0.026058472156191805, + "language_loss": 0.91163933, + "learning_rate": 0.0008040304481977643, + "loss": 0.92360055, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.93212891, + "step": 1628, + "time_per_iteration": 2.6339213848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.11335361, + "epoch": 0.313389765294344, + "flos": 824209534464.0, + "grad_norm": 0.028324849871922998, + "language_loss": 0.96729648, + "learning_rate": 0.0008037830587512649, + "loss": 0.97936368, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.93261719, + "step": 1629, + "time_per_iteration": 3.052304744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191904, + "balance_loss_mlp": 1.09896827, + "epoch": 0.31358214697960757, + "flos": 394702599168.0, + "grad_norm": 0.026724204555937114, + "language_loss": 0.89292234, + "learning_rate": 0.0008035355513657224, + "loss": 0.90484136, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.92822266, + "step": 1630, + "time_per_iteration": 2.470526695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198859, + "balance_loss_mlp": 1.1059711, + "epoch": 0.3137745286648711, + "flos": 573097666560.0, + "grad_norm": 0.025006494531642755, + "language_loss": 1.00651205, + "learning_rate": 0.0008032879261372279, + "loss": 1.01850057, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.92773438, + "step": 1631, + "time_per_iteration": 2.7967746257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194023, + "balance_loss_mlp": 1.10418701, + "epoch": 0.3139669103501347, + "flos": 1501629241344.0, + "grad_norm": 0.01894627505164378, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80829865, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.89648438, + "step": 1632, + "time_per_iteration": 5.690793991088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187718, + "balance_loss_mlp": 1.09478259, + "epoch": 0.3141592920353982, + "flos": 526358728704.0, + "grad_norm": 0.023739615719740217, + "language_loss": 0.94780874, + "learning_rate": 0.0008027923225359748, + "loss": 0.95968592, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.92822266, + "step": 1633, + "time_per_iteration": 2.619640827178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182027, + "balance_loss_mlp": 1.08894837, + "epoch": 0.3143516737206618, + "flos": 594387044352.0, + "grad_norm": 0.024020227962995952, + "language_loss": 0.97166598, + "learning_rate": 0.0008025443443556267, + "loss": 0.98348624, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.9296875, + "step": 1634, + "time_per_iteration": 2.7105367183685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187192, + "balance_loss_mlp": 1.09397042, + "epoch": 0.31454405540592534, + "flos": 649679208960.0, + "grad_norm": 0.024579905610689918, + "language_loss": 0.95561564, + "learning_rate": 0.000802296248717147, + "loss": 0.96748757, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.93115234, + "step": 1635, + "time_per_iteration": 2.954427480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189389, + "balance_loss_mlp": 1.09616756, + "epoch": 0.3147364370911889, + "flos": 644069474304.0, + "grad_norm": 0.026460377875643523, + "language_loss": 0.89723325, + "learning_rate": 0.0008020480357168554, + "loss": 0.90912724, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.93115234, + "step": 1636, + "time_per_iteration": 2.7983195781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118902, + "balance_loss_mlp": 1.09575093, + "epoch": 0.31492881877645246, + "flos": 472821015552.0, + "grad_norm": 0.024118652497695542, + "language_loss": 0.95980144, + "learning_rate": 0.0008017997054511165, + "loss": 0.97169161, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.93164062, + "step": 1637, + "time_per_iteration": 2.543381690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188761, + "balance_loss_mlp": 1.09544361, + "epoch": 0.31512120046171604, + "flos": 630629650944.0, + "grad_norm": 0.026442486928658162, + "language_loss": 0.94192296, + "learning_rate": 0.0008015512580163407, + "loss": 0.95381057, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.93212891, + "step": 1638, + "time_per_iteration": 2.8069217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189537, + "balance_loss_mlp": 1.09645832, + "epoch": 0.31531358214697963, + "flos": 705053239296.0, + "grad_norm": 0.0247809696854931, + "language_loss": 0.89687169, + "learning_rate": 0.0008013026935089838, + "loss": 0.9087671, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.9296875, + "step": 1639, + "time_per_iteration": 2.8575150966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189099, + "balance_loss_mlp": 1.09592521, + "epoch": 0.31550596383224316, + "flos": 573631425024.0, + "grad_norm": 0.026868409426578303, + "language_loss": 0.92173505, + "learning_rate": 0.0008010540120255472, + "loss": 0.93362606, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.93066406, + "step": 1640, + "time_per_iteration": 2.6781005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118909, + "balance_loss_mlp": 1.09591639, + "epoch": 0.31569834551750675, + "flos": 659512800768.0, + "grad_norm": 0.03030176261580671, + "language_loss": 0.95734656, + "learning_rate": 0.0008008052136625774, + "loss": 0.96923745, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.93066406, + "step": 1641, + "time_per_iteration": 2.8858654499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192627, + "balance_loss_mlp": 1.09950101, + "epoch": 0.3158907272027703, + "flos": 567403338240.0, + "grad_norm": 0.026165343030711524, + "language_loss": 0.94310361, + "learning_rate": 0.0008005562985166666, + "loss": 0.9550299, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.93017578, + "step": 1642, + "time_per_iteration": 2.7097506523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193912, + "balance_loss_mlp": 1.10102403, + "epoch": 0.31608310888803387, + "flos": 537972968448.0, + "grad_norm": 0.020568762002796243, + "language_loss": 0.9172346, + "learning_rate": 0.0008003072666844524, + "loss": 0.92917377, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.92773438, + "step": 1643, + "time_per_iteration": 2.6982197761535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194419, + "balance_loss_mlp": 1.10181749, + "epoch": 0.3162754905732974, + "flos": 487639259136.0, + "grad_norm": 0.02816029335024998, + "language_loss": 0.90344775, + "learning_rate": 0.0008000581182626173, + "loss": 0.91539198, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.92480469, + "step": 1644, + "time_per_iteration": 2.546762466430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193569, + "balance_loss_mlp": 1.10048997, + "epoch": 0.316467872258561, + "flos": 531095603712.0, + "grad_norm": 0.024394566764596542, + "language_loss": 0.93082815, + "learning_rate": 0.0007998088533478894, + "loss": 0.94276381, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.9296875, + "step": 1645, + "time_per_iteration": 2.6320817470550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188922, + "balance_loss_mlp": 1.09622455, + "epoch": 0.3166602539438245, + "flos": 444413227008.0, + "grad_norm": 0.029455070645316363, + "language_loss": 0.9479661, + "learning_rate": 0.000799559472037042, + "loss": 0.95985526, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.92578125, + "step": 1646, + "time_per_iteration": 2.535414457321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187182, + "balance_loss_mlp": 1.09458041, + "epoch": 0.3168526356290881, + "flos": 647102289408.0, + "grad_norm": 0.02168302123393663, + "language_loss": 0.94649625, + "learning_rate": 0.0007993099744268932, + "loss": 0.95836812, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.92480469, + "step": 1647, + "time_per_iteration": 2.912095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182437, + "balance_loss_mlp": 1.08988261, + "epoch": 0.3170450173143517, + "flos": 587257900032.0, + "grad_norm": 0.023943172344495993, + "language_loss": 0.96008313, + "learning_rate": 0.000799060360614307, + "loss": 0.97190744, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.92431641, + "step": 1648, + "time_per_iteration": 2.6763339042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.09482586, + "epoch": 0.3172373989996152, + "flos": 828573106176.0, + "grad_norm": 0.025050943971751935, + "language_loss": 0.91967106, + "learning_rate": 0.0007988106306961917, + "loss": 0.93154484, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.92431641, + "step": 1649, + "time_per_iteration": 3.1265392303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183645, + "balance_loss_mlp": 1.09151971, + "epoch": 0.3174297806848788, + "flos": 528434090496.0, + "grad_norm": 0.026893421102733506, + "language_loss": 0.92866611, + "learning_rate": 0.0007985607847695014, + "loss": 0.94050252, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.91992188, + "step": 1650, + "time_per_iteration": 2.640529155731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184032, + "balance_loss_mlp": 1.09152567, + "epoch": 0.31762216237014235, + "flos": 714481327104.0, + "grad_norm": 0.024008942139765378, + "language_loss": 0.9102264, + "learning_rate": 0.0007983108229312345, + "loss": 0.92206669, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.92382812, + "step": 1651, + "time_per_iteration": 2.890881299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183971, + "balance_loss_mlp": 1.09170341, + "epoch": 0.31781454405540593, + "flos": 484799826432.0, + "grad_norm": 0.027702532543066302, + "language_loss": 0.9509185, + "learning_rate": 0.0007980607452784351, + "loss": 0.96275818, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.92138672, + "step": 1652, + "time_per_iteration": 2.5693578720092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118418, + "balance_loss_mlp": 1.09186423, + "epoch": 0.31800692574066947, + "flos": 549804059136.0, + "grad_norm": 0.028510736103347943, + "language_loss": 0.99507928, + "learning_rate": 0.0007978105519081919, + "loss": 1.00692105, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.921875, + "step": 1653, + "time_per_iteration": 2.674062967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181597, + "balance_loss_mlp": 1.08947253, + "epoch": 0.31819930742593305, + "flos": 517916292096.0, + "grad_norm": 0.029899238666621586, + "language_loss": 0.96953475, + "learning_rate": 0.0007975602429176385, + "loss": 0.98135078, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.91992188, + "step": 1654, + "time_per_iteration": 2.595107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011812, + "balance_loss_mlp": 1.08907461, + "epoch": 0.31839168911119664, + "flos": 456969457152.0, + "grad_norm": 0.02327460697487094, + "language_loss": 0.90136862, + "learning_rate": 0.0007973098184039536, + "loss": 0.91318059, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.91992188, + "step": 1655, + "time_per_iteration": 2.654873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184047, + "balance_loss_mlp": 1.09192252, + "epoch": 0.3185840707964602, + "flos": 627295391232.0, + "grad_norm": 0.025652000789891626, + "language_loss": 0.955365, + "learning_rate": 0.0007970592784643602, + "loss": 0.96720552, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.91992188, + "step": 1656, + "time_per_iteration": 2.8485612869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183486, + "balance_loss_mlp": 1.09107482, + "epoch": 0.31877645248172376, + "flos": 568540712448.0, + "grad_norm": 0.02977939264047221, + "language_loss": 0.94253254, + "learning_rate": 0.0007968086231961272, + "loss": 0.9543674, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.92285156, + "step": 1657, + "time_per_iteration": 2.6949312686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182357, + "balance_loss_mlp": 1.09004128, + "epoch": 0.3189688341669873, + "flos": 490552551936.0, + "grad_norm": 0.03598298081414456, + "language_loss": 0.95643866, + "learning_rate": 0.0007965578526965671, + "loss": 0.96826226, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.921875, + "step": 1658, + "time_per_iteration": 2.5717341899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182583, + "balance_loss_mlp": 1.09012401, + "epoch": 0.3191612158522509, + "flos": 577380647424.0, + "grad_norm": 0.02594626841132509, + "language_loss": 0.93226576, + "learning_rate": 0.0007963069670630377, + "loss": 0.94409156, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.92333984, + "step": 1659, + "time_per_iteration": 2.7431960105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187517, + "balance_loss_mlp": 1.09486747, + "epoch": 0.3193535975375144, + "flos": 539192934912.0, + "grad_norm": 0.026552556196046555, + "language_loss": 0.97412628, + "learning_rate": 0.0007960559663929416, + "loss": 0.98600149, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.92529297, + "step": 1660, + "time_per_iteration": 2.631037473678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09382606, + "epoch": 0.319545979222778, + "flos": 735627714048.0, + "grad_norm": 0.022912970149823363, + "language_loss": 0.94840437, + "learning_rate": 0.0007958048507837259, + "loss": 0.96026772, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.92382812, + "step": 1661, + "time_per_iteration": 2.925752878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191424, + "balance_loss_mlp": 1.09872651, + "epoch": 0.31973836090804153, + "flos": 765767760384.0, + "grad_norm": 0.030797304976158044, + "language_loss": 0.98320282, + "learning_rate": 0.0007955536203328822, + "loss": 0.99511707, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.92578125, + "step": 1662, + "time_per_iteration": 2.9076955318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187513, + "balance_loss_mlp": 1.09486389, + "epoch": 0.3199307425933051, + "flos": 561741937152.0, + "grad_norm": 0.02511010738984868, + "language_loss": 0.90468192, + "learning_rate": 0.0007953022751379469, + "loss": 0.91655713, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.92529297, + "step": 1663, + "time_per_iteration": 2.7703394889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188156, + "balance_loss_mlp": 1.09564936, + "epoch": 0.3201231242785687, + "flos": 752671041024.0, + "grad_norm": 0.029121282383782986, + "language_loss": 0.92101777, + "learning_rate": 0.000795050815296501, + "loss": 0.93289936, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.92382812, + "step": 1664, + "time_per_iteration": 2.966632843017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188504, + "balance_loss_mlp": 1.0960933, + "epoch": 0.32031550596383224, + "flos": 497384254464.0, + "grad_norm": 0.02307975398987516, + "language_loss": 1.00050378, + "learning_rate": 0.0007947992409061695, + "loss": 1.01238883, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.92285156, + "step": 1665, + "time_per_iteration": 2.6264171600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193124, + "balance_loss_mlp": 1.10080826, + "epoch": 0.3205078876490958, + "flos": 732874876416.0, + "grad_norm": 0.02454331261307917, + "language_loss": 0.93550396, + "learning_rate": 0.0007945475520646226, + "loss": 0.9474352, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.921875, + "step": 1666, + "time_per_iteration": 2.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191587, + "balance_loss_mlp": 1.09941399, + "epoch": 0.32070026933435936, + "flos": 550474804224.0, + "grad_norm": 0.02796219722650757, + "language_loss": 0.9429689, + "learning_rate": 0.0007942957488695743, + "loss": 0.95488477, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.92041016, + "step": 1667, + "time_per_iteration": 2.621396780014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186724, + "balance_loss_mlp": 1.09421742, + "epoch": 0.32089265101962294, + "flos": 746684000256.0, + "grad_norm": 0.022875326013334737, + "language_loss": 0.87680244, + "learning_rate": 0.0007940438314187833, + "loss": 0.88866973, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.92382812, + "step": 1668, + "time_per_iteration": 3.0475997924804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187112, + "balance_loss_mlp": 1.0947485, + "epoch": 0.3210850327048865, + "flos": 495196101120.0, + "grad_norm": 0.03400858364934581, + "language_loss": 0.88502395, + "learning_rate": 0.0007937917998100529, + "loss": 0.89689511, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.92236328, + "step": 1669, + "time_per_iteration": 2.6158430576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188853, + "balance_loss_mlp": 1.09658515, + "epoch": 0.32127741439015006, + "flos": 531673022976.0, + "grad_norm": 0.029937804889017615, + "language_loss": 0.92354518, + "learning_rate": 0.0007935396541412302, + "loss": 0.93543375, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.92138672, + "step": 1670, + "time_per_iteration": 2.6148414611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188159, + "balance_loss_mlp": 1.09589148, + "epoch": 0.3214697960754136, + "flos": 502223187456.0, + "grad_norm": 0.027719397006423088, + "language_loss": 0.94146281, + "learning_rate": 0.0007932873945102068, + "loss": 0.95334446, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.92138672, + "step": 1671, + "time_per_iteration": 2.5756680965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189911, + "balance_loss_mlp": 1.09950256, + "epoch": 0.3216621777606772, + "flos": 1386402089472.0, + "grad_norm": 0.015471737686433536, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76951689, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.90234375, + "step": 1672, + "time_per_iteration": 4.848818778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.08975732, + "epoch": 0.32185455944594077, + "flos": 572635040256.0, + "grad_norm": 0.021338606013939526, + "language_loss": 0.94597888, + "learning_rate": 0.0007927825337533461, + "loss": 0.95779347, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.91552734, + "step": 1673, + "time_per_iteration": 2.6742517948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181114, + "balance_loss_mlp": 1.08975172, + "epoch": 0.3220469411312043, + "flos": 544936928256.0, + "grad_norm": 0.029706455848313437, + "language_loss": 0.9645716, + "learning_rate": 0.0007925299328235131, + "loss": 0.97638273, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.91210938, + "step": 1674, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182375, + "balance_loss_mlp": 1.09101272, + "epoch": 0.3222393228164679, + "flos": 492161284608.0, + "grad_norm": 0.02873592636128419, + "language_loss": 0.969607, + "learning_rate": 0.000792277218323488, + "loss": 0.98143071, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.91210938, + "step": 1675, + "time_per_iteration": 2.589118719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182718, + "balance_loss_mlp": 1.0914042, + "epoch": 0.3224317045017314, + "flos": 491362285056.0, + "grad_norm": 0.026517432951267347, + "language_loss": 0.94174361, + "learning_rate": 0.0007920243903513833, + "loss": 0.95357084, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.91162109, + "step": 1676, + "time_per_iteration": 2.5541775226593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08832622, + "epoch": 0.322624086186995, + "flos": 576870357504.0, + "grad_norm": 0.028460659829427477, + "language_loss": 0.94868386, + "learning_rate": 0.0007917714490053556, + "loss": 0.96047986, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.91113281, + "step": 1677, + "time_per_iteration": 2.685833215713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.10454535, + "epoch": 0.32281646787225854, + "flos": 630571253760.0, + "grad_norm": 0.02861547850998442, + "language_loss": 0.93624204, + "learning_rate": 0.0007915183943836055, + "loss": 0.94820398, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.91503906, + "step": 1678, + "time_per_iteration": 2.8957157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184806, + "balance_loss_mlp": 1.09363461, + "epoch": 0.3230088495575221, + "flos": 782807084544.0, + "grad_norm": 0.029736135795599906, + "language_loss": 0.92990124, + "learning_rate": 0.0007912652265843773, + "loss": 0.94174933, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.91015625, + "step": 1679, + "time_per_iteration": 3.0256145000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187663, + "balance_loss_mlp": 1.09620523, + "epoch": 0.3232012312427857, + "flos": 537200165376.0, + "grad_norm": 0.0299548546326655, + "language_loss": 0.88938797, + "learning_rate": 0.0007910119457059597, + "loss": 0.90126455, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.91308594, + "step": 1680, + "time_per_iteration": 2.7195773124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118719, + "balance_loss_mlp": 1.09601843, + "epoch": 0.32339361292804925, + "flos": 706232272896.0, + "grad_norm": 0.03079987155163935, + "language_loss": 0.89790422, + "learning_rate": 0.0007907585518466849, + "loss": 0.90977609, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.91015625, + "step": 1681, + "time_per_iteration": 2.9635961055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186411, + "balance_loss_mlp": 1.09523988, + "epoch": 0.32358599461331283, + "flos": 453257164800.0, + "grad_norm": 0.027692195030378806, + "language_loss": 0.99450397, + "learning_rate": 0.000790505045104929, + "loss": 1.00636816, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.91015625, + "step": 1682, + "time_per_iteration": 2.5084030628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186896, + "balance_loss_mlp": 1.09553456, + "epoch": 0.32377837629857636, + "flos": 602091606528.0, + "grad_norm": 0.028152445524849662, + "language_loss": 0.96712899, + "learning_rate": 0.0007902514255791125, + "loss": 0.97899795, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.91210938, + "step": 1683, + "time_per_iteration": 2.7732536792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185338, + "balance_loss_mlp": 1.09388101, + "epoch": 0.32397075798383995, + "flos": 808898465280.0, + "grad_norm": 0.02645952871958238, + "language_loss": 0.9579218, + "learning_rate": 0.0007899976933676986, + "loss": 0.9697752, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.91308594, + "step": 1684, + "time_per_iteration": 2.985987424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184012, + "balance_loss_mlp": 1.09274495, + "epoch": 0.3241631396691035, + "flos": 602792550912.0, + "grad_norm": 0.02682215462305332, + "language_loss": 0.96423018, + "learning_rate": 0.0007897438485691955, + "loss": 0.97607034, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.91113281, + "step": 1685, + "time_per_iteration": 2.673083543777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185177, + "balance_loss_mlp": 1.09386301, + "epoch": 0.32435552135436707, + "flos": 475176354816.0, + "grad_norm": 0.030260846574811467, + "language_loss": 0.93327641, + "learning_rate": 0.0007894898912821542, + "loss": 0.9451282, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.91162109, + "step": 1686, + "time_per_iteration": 2.526704788208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181419, + "balance_loss_mlp": 1.09015274, + "epoch": 0.3245479030396306, + "flos": 539219131392.0, + "grad_norm": 0.02519584895765407, + "language_loss": 0.95407552, + "learning_rate": 0.0007892358216051695, + "loss": 0.96588969, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.91113281, + "step": 1687, + "time_per_iteration": 2.718292713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186611, + "balance_loss_mlp": 1.09543955, + "epoch": 0.3247402847248942, + "flos": 548696884224.0, + "grad_norm": 0.02873183694146744, + "language_loss": 1.00761271, + "learning_rate": 0.0007889816396368803, + "loss": 1.0194788, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.91015625, + "step": 1688, + "time_per_iteration": 2.6112852096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179714, + "balance_loss_mlp": 1.08835161, + "epoch": 0.3249326664101578, + "flos": 378992030208.0, + "grad_norm": 0.0263136625306578, + "language_loss": 0.95246112, + "learning_rate": 0.0007887273454759687, + "loss": 0.96425825, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.91210938, + "step": 1689, + "time_per_iteration": 2.466093063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185248, + "balance_loss_mlp": 1.09407663, + "epoch": 0.3251250480954213, + "flos": 529122299904.0, + "grad_norm": 0.02633136368880149, + "language_loss": 0.91763788, + "learning_rate": 0.0007884729392211603, + "loss": 0.92949039, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.91015625, + "step": 1690, + "time_per_iteration": 2.633387804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182102, + "balance_loss_mlp": 1.09054887, + "epoch": 0.3253174297806849, + "flos": 450558721536.0, + "grad_norm": 0.03256384134880849, + "language_loss": 0.96271229, + "learning_rate": 0.0007882184209712245, + "loss": 0.97453332, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.9140625, + "step": 1691, + "time_per_iteration": 2.511629104614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183951, + "balance_loss_mlp": 1.09239864, + "epoch": 0.32550981146594843, + "flos": 705489669120.0, + "grad_norm": 0.02306884235196454, + "language_loss": 0.92818689, + "learning_rate": 0.000787963790824974, + "loss": 0.9400264, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.9140625, + "step": 1692, + "time_per_iteration": 2.953939914703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118506, + "balance_loss_mlp": 1.0935545, + "epoch": 0.325702193151212, + "flos": 393558494208.0, + "grad_norm": 0.026666894987577915, + "language_loss": 0.98025191, + "learning_rate": 0.0007877090488812651, + "loss": 0.9921025, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.91357422, + "step": 1693, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178009, + "balance_loss_mlp": 1.08659911, + "epoch": 0.32589457483647555, + "flos": 578583149568.0, + "grad_norm": 0.029080232987036207, + "language_loss": 0.92532402, + "learning_rate": 0.0007874541952389973, + "loss": 0.93710411, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.91259766, + "step": 1694, + "time_per_iteration": 2.660390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179003, + "balance_loss_mlp": 1.08792675, + "epoch": 0.32608695652173914, + "flos": 499329360384.0, + "grad_norm": 0.023433013698769337, + "language_loss": 0.93903476, + "learning_rate": 0.0007871992299971136, + "loss": 0.9508248, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.90917969, + "step": 1695, + "time_per_iteration": 2.5506269931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179394, + "balance_loss_mlp": 1.08822274, + "epoch": 0.32627933820700267, + "flos": 592300948992.0, + "grad_norm": 0.02355558557065364, + "language_loss": 0.91491008, + "learning_rate": 0.0007869441532546001, + "loss": 0.92670405, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.91015625, + "step": 1696, + "time_per_iteration": 2.7493326663970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177542, + "balance_loss_mlp": 1.08618009, + "epoch": 0.32647171989226625, + "flos": 610273531392.0, + "grad_norm": 0.02705729718991907, + "language_loss": 0.87004846, + "learning_rate": 0.0007866889651104867, + "loss": 0.8818239, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.91210938, + "step": 1697, + "time_per_iteration": 2.7824432849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179221, + "balance_loss_mlp": 1.08785892, + "epoch": 0.32666410157752984, + "flos": 478189704192.0, + "grad_norm": 0.028152017440838794, + "language_loss": 0.94142878, + "learning_rate": 0.000786433665663846, + "loss": 0.95322108, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.91210938, + "step": 1698, + "time_per_iteration": 2.6674411296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187877, + "balance_loss_mlp": 1.09670568, + "epoch": 0.3268564832627934, + "flos": 719693563392.0, + "grad_norm": 0.040459779361444057, + "language_loss": 0.95728016, + "learning_rate": 0.0007861782550137942, + "loss": 0.96915889, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.91015625, + "step": 1699, + "time_per_iteration": 2.923370599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187429, + "balance_loss_mlp": 1.09625793, + "epoch": 0.32704886494805696, + "flos": 770105135616.0, + "grad_norm": 0.025720199745930695, + "language_loss": 0.93479955, + "learning_rate": 0.0007859227332594901, + "loss": 0.94667387, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.91015625, + "step": 1700, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191948, + "balance_loss_mlp": 1.10120583, + "epoch": 0.3272412466333205, + "flos": 851404087296.0, + "grad_norm": 0.0329500691508657, + "language_loss": 0.94768298, + "learning_rate": 0.0007856671005001365, + "loss": 0.95960248, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.90576172, + "step": 1701, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118211, + "balance_loss_mlp": 1.09065294, + "epoch": 0.3274336283185841, + "flos": 833040737280.0, + "grad_norm": 0.029774404200988806, + "language_loss": 0.90405869, + "learning_rate": 0.0007854113568349787, + "loss": 0.91587985, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.91308594, + "step": 1702, + "time_per_iteration": 3.107083559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186026, + "balance_loss_mlp": 1.09471202, + "epoch": 0.3276260100038476, + "flos": 693252347904.0, + "grad_norm": 0.029328613393929583, + "language_loss": 0.89606428, + "learning_rate": 0.0007851555023633052, + "loss": 0.90792453, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.91162109, + "step": 1703, + "time_per_iteration": 2.8335254192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011877, + "balance_loss_mlp": 1.09643364, + "epoch": 0.3278183916891112, + "flos": 436977908736.0, + "grad_norm": 0.03479764223743197, + "language_loss": 0.91987431, + "learning_rate": 0.0007848995371844474, + "loss": 0.93175125, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.91113281, + "step": 1704, + "time_per_iteration": 2.51261043548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118827, + "balance_loss_mlp": 1.09728956, + "epoch": 0.3280107733743748, + "flos": 462016508928.0, + "grad_norm": 0.027955151013136243, + "language_loss": 0.90236068, + "learning_rate": 0.0007846434613977801, + "loss": 0.91424334, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.90820312, + "step": 1705, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185335, + "balance_loss_mlp": 1.09464061, + "epoch": 0.3282031550596383, + "flos": 680528931840.0, + "grad_norm": 0.0285448105624817, + "language_loss": 0.86403298, + "learning_rate": 0.0007843872751027203, + "loss": 0.87588632, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.90527344, + "step": 1706, + "time_per_iteration": 2.7977733612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183945, + "balance_loss_mlp": 1.0931555, + "epoch": 0.3283955367449019, + "flos": 546254949888.0, + "grad_norm": 0.024438576566567966, + "language_loss": 0.93906903, + "learning_rate": 0.0007841309783987287, + "loss": 0.95090854, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.90625, + "step": 1707, + "time_per_iteration": 2.737680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178748, + "balance_loss_mlp": 1.08757639, + "epoch": 0.32858791843016544, + "flos": 482240371200.0, + "grad_norm": 0.027193371904651382, + "language_loss": 0.97315758, + "learning_rate": 0.0007838745713853084, + "loss": 0.98494506, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.91015625, + "step": 1708, + "time_per_iteration": 2.5702459812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189964, + "balance_loss_mlp": 1.09879303, + "epoch": 0.328780300115429, + "flos": 567915629568.0, + "grad_norm": 0.029427091701823335, + "language_loss": 0.93208408, + "learning_rate": 0.0007836180541620053, + "loss": 0.94398379, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.91015625, + "step": 1709, + "time_per_iteration": 2.7365195751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189596, + "balance_loss_mlp": 1.09852052, + "epoch": 0.32897268180069256, + "flos": 476991204864.0, + "grad_norm": 0.02924752300223344, + "language_loss": 0.94609785, + "learning_rate": 0.0007833614268284082, + "loss": 0.95799387, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.90917969, + "step": 1710, + "time_per_iteration": 2.575416326522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186913, + "balance_loss_mlp": 1.09745789, + "epoch": 0.32916506348595614, + "flos": 1580450603520.0, + "grad_norm": 0.014653073497659498, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75296688, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.89257812, + "step": 1711, + "time_per_iteration": 4.8569114208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117837, + "balance_loss_mlp": 1.08681703, + "epoch": 0.3293574451712197, + "flos": 483851105280.0, + "grad_norm": 0.027096123044633498, + "language_loss": 0.8678506, + "learning_rate": 0.0007828478422289016, + "loss": 0.87963432, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.9140625, + "step": 1712, + "time_per_iteration": 2.5748305320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181971, + "balance_loss_mlp": 1.09041798, + "epoch": 0.32954982685648326, + "flos": 623724088320.0, + "grad_norm": 0.027491608740018197, + "language_loss": 0.97854888, + "learning_rate": 0.0007825908851623833, + "loss": 0.99036855, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.9140625, + "step": 1713, + "time_per_iteration": 2.7387707233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180742, + "balance_loss_mlp": 1.0893327, + "epoch": 0.32974220854174685, + "flos": 546070299648.0, + "grad_norm": 0.028986059756107307, + "language_loss": 0.93660253, + "learning_rate": 0.0007823338183843533, + "loss": 0.94840991, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.91259766, + "step": 1714, + "time_per_iteration": 2.7061285972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.10341084, + "epoch": 0.3299345902270104, + "flos": 983822286336.0, + "grad_norm": 0.02918308821255402, + "language_loss": 0.89344442, + "learning_rate": 0.0007820766419946141, + "loss": 0.90539211, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.91210938, + "step": 1715, + "time_per_iteration": 3.2698333263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119133, + "balance_loss_mlp": 1.10206604, + "epoch": 0.33012697191227397, + "flos": 1406901926400.0, + "grad_norm": 0.008988097140154246, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.8086381, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.890625, + "step": 1716, + "time_per_iteration": 4.931420564651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193588, + "balance_loss_mlp": 1.10213029, + "epoch": 0.3303193535975375, + "flos": 506169795072.0, + "grad_norm": 0.03043585823380059, + "language_loss": 0.87317824, + "learning_rate": 0.0007815619607794288, + "loss": 0.88511419, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.91308594, + "step": 1717, + "time_per_iteration": 2.611924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198413, + "balance_loss_mlp": 1.10676467, + "epoch": 0.3305117352828011, + "flos": 939484349952.0, + "grad_norm": 0.029759763631388395, + "language_loss": 0.92828202, + "learning_rate": 0.0007813044561538001, + "loss": 0.94026613, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.91503906, + "step": 1718, + "time_per_iteration": 3.188633680343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186368, + "balance_loss_mlp": 1.09495842, + "epoch": 0.3307041169680646, + "flos": 722793507840.0, + "grad_norm": 0.027827869889066197, + "language_loss": 0.97286105, + "learning_rate": 0.0007810468423160958, + "loss": 0.9847247, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.91259766, + "step": 1719, + "time_per_iteration": 2.8963494300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179653, + "balance_loss_mlp": 1.08829057, + "epoch": 0.3308964986533282, + "flos": 584815965696.0, + "grad_norm": 0.0232486528054596, + "language_loss": 0.89203978, + "learning_rate": 0.0007807891193663306, + "loss": 0.90383637, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.91210938, + "step": 1720, + "time_per_iteration": 2.784005880355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188579, + "balance_loss_mlp": 1.09712148, + "epoch": 0.33108888033859174, + "flos": 474525075456.0, + "grad_norm": 0.03234593548431852, + "language_loss": 0.92577451, + "learning_rate": 0.0007805312874045614, + "loss": 0.93766028, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.91308594, + "step": 1721, + "time_per_iteration": 2.5072579383850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187856, + "balance_loss_mlp": 1.09635103, + "epoch": 0.3312812620238553, + "flos": 386996035584.0, + "grad_norm": 0.030880666413309405, + "language_loss": 0.96009982, + "learning_rate": 0.0007802733465308874, + "loss": 0.97197837, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.91357422, + "step": 1722, + "time_per_iteration": 2.460878372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193288, + "balance_loss_mlp": 1.10173571, + "epoch": 0.3314736437091189, + "flos": 495604333056.0, + "grad_norm": 0.02871647017272099, + "language_loss": 0.9219079, + "learning_rate": 0.0007800152968454501, + "loss": 0.93384075, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.9140625, + "step": 1723, + "time_per_iteration": 2.6537680625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185112, + "balance_loss_mlp": 1.09365499, + "epoch": 0.33166602539438245, + "flos": 654930376704.0, + "grad_norm": 0.0223046700763118, + "language_loss": 0.96869862, + "learning_rate": 0.0007797571384484334, + "loss": 0.98054969, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.91308594, + "step": 1724, + "time_per_iteration": 2.8509135246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180603, + "balance_loss_mlp": 1.08909798, + "epoch": 0.33185840707964603, + "flos": 521834701824.0, + "grad_norm": 0.02731483808063424, + "language_loss": 1.00636935, + "learning_rate": 0.0007794988714400633, + "loss": 1.01817536, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.91357422, + "step": 1725, + "time_per_iteration": 2.5883586406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180377, + "balance_loss_mlp": 1.08901501, + "epoch": 0.33205078876490957, + "flos": 437898432000.0, + "grad_norm": 0.028871117282170154, + "language_loss": 0.94438303, + "learning_rate": 0.0007792404959206079, + "loss": 0.95618677, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.91210938, + "step": 1726, + "time_per_iteration": 2.522392988204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196305, + "balance_loss_mlp": 1.10499096, + "epoch": 0.33224317045017315, + "flos": 770094402048.0, + "grad_norm": 0.026417182809826974, + "language_loss": 0.89548182, + "learning_rate": 0.0007789820119903774, + "loss": 0.90744483, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.91162109, + "step": 1727, + "time_per_iteration": 3.015399217605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119368, + "balance_loss_mlp": 1.10441589, + "epoch": 0.3324355521354367, + "flos": 1469293584384.0, + "grad_norm": 0.009201187704085647, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79686344, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.890625, + "step": 1728, + "time_per_iteration": 4.849627494812012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187682, + "balance_loss_mlp": 1.09641564, + "epoch": 0.3326279338207003, + "flos": 497799217152.0, + "grad_norm": 0.02618775195690524, + "language_loss": 0.91979456, + "learning_rate": 0.0007784647192990428, + "loss": 0.93167138, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.91113281, + "step": 1729, + "time_per_iteration": 2.6944785118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178599, + "balance_loss_mlp": 1.08761811, + "epoch": 0.33282031550596386, + "flos": 637053121536.0, + "grad_norm": 0.02771760173732663, + "language_loss": 0.88792735, + "learning_rate": 0.0007782059107387696, + "loss": 0.89971334, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.90820312, + "step": 1730, + "time_per_iteration": 2.8583710193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179548, + "balance_loss_mlp": 1.0887109, + "epoch": 0.3330126971912274, + "flos": 690721090560.0, + "grad_norm": 0.027739782699759397, + "language_loss": 0.98025161, + "learning_rate": 0.0007779469941693826, + "loss": 0.99204707, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.90673828, + "step": 1731, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184359, + "balance_loss_mlp": 1.09361696, + "epoch": 0.333205078876491, + "flos": 567553059840.0, + "grad_norm": 0.03096728777448764, + "language_loss": 0.86715639, + "learning_rate": 0.0007776879696914029, + "loss": 0.87899995, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.90576172, + "step": 1732, + "time_per_iteration": 2.8331797122955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179804, + "balance_loss_mlp": 1.08906233, + "epoch": 0.3333974605617545, + "flos": 642170030592.0, + "grad_norm": 0.024377484958938406, + "language_loss": 0.95668435, + "learning_rate": 0.000777428837405392, + "loss": 0.96848238, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.90576172, + "step": 1733, + "time_per_iteration": 2.8495984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.087345, + "epoch": 0.3335898422470181, + "flos": 462778578432.0, + "grad_norm": 0.02888991438897714, + "language_loss": 0.96001673, + "learning_rate": 0.0007771695974119544, + "loss": 0.97179955, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.90771484, + "step": 1734, + "time_per_iteration": 2.581843614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193993, + "balance_loss_mlp": 1.10267842, + "epoch": 0.33378222393228163, + "flos": 854336845824.0, + "grad_norm": 0.031032438471150628, + "language_loss": 0.84453082, + "learning_rate": 0.0007769102498117359, + "loss": 0.85647076, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.91162109, + "step": 1735, + "time_per_iteration": 3.092892646789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118579, + "balance_loss_mlp": 1.09471452, + "epoch": 0.3339746056175452, + "flos": 956308824576.0, + "grad_norm": 0.02638013374987503, + "language_loss": 0.87690091, + "learning_rate": 0.000776650794705424, + "loss": 0.88875878, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.90917969, + "step": 1736, + "time_per_iteration": 3.26749587059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188294, + "balance_loss_mlp": 1.09693241, + "epoch": 0.33416698730280875, + "flos": 545894381568.0, + "grad_norm": 0.025194797458818457, + "language_loss": 0.89670336, + "learning_rate": 0.0007763912321937483, + "loss": 0.90858638, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.91210938, + "step": 1737, + "time_per_iteration": 2.680321455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.09522188, + "epoch": 0.33435936898807234, + "flos": 1015875237888.0, + "grad_norm": 0.02847992800895855, + "language_loss": 0.91932124, + "learning_rate": 0.0007761315623774799, + "loss": 0.93118894, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.9140625, + "step": 1738, + "time_per_iteration": 3.3992278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.10014248, + "epoch": 0.3345517506733359, + "flos": 616371362304.0, + "grad_norm": 0.027566762490977777, + "language_loss": 0.97487831, + "learning_rate": 0.0007758717853574313, + "loss": 0.9867962, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.91503906, + "step": 1739, + "time_per_iteration": 2.7331244945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195023, + "balance_loss_mlp": 1.10327947, + "epoch": 0.33474413235859946, + "flos": 495569404416.0, + "grad_norm": 0.027457607023843998, + "language_loss": 0.9961037, + "learning_rate": 0.0007756119012344571, + "loss": 1.00805402, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.91601562, + "step": 1740, + "time_per_iteration": 2.5305063724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189378, + "balance_loss_mlp": 1.09772944, + "epoch": 0.33493651404386304, + "flos": 629487547392.0, + "grad_norm": 0.029043894294382887, + "language_loss": 0.93616855, + "learning_rate": 0.0007753519101094535, + "loss": 0.9480623, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.91503906, + "step": 1741, + "time_per_iteration": 2.7408056259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177762, + "balance_loss_mlp": 1.08630431, + "epoch": 0.3351288957291266, + "flos": 514742487552.0, + "grad_norm": 0.027889242250670986, + "language_loss": 0.95720202, + "learning_rate": 0.0007750918120833575, + "loss": 0.96897966, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.91308594, + "step": 1742, + "time_per_iteration": 2.5787625312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08818376, + "epoch": 0.33532127741439016, + "flos": 648482711040.0, + "grad_norm": 0.029208114264274002, + "language_loss": 0.95614851, + "learning_rate": 0.0007748316072571485, + "loss": 0.96794444, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.91259766, + "step": 1743, + "time_per_iteration": 2.751394033432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178526, + "balance_loss_mlp": 1.08764088, + "epoch": 0.3355136590996537, + "flos": 769788228096.0, + "grad_norm": 0.02678280054581141, + "language_loss": 0.86505532, + "learning_rate": 0.0007745712957318467, + "loss": 0.87684047, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.90722656, + "step": 1744, + "time_per_iteration": 2.9703569412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179715, + "balance_loss_mlp": 1.088925, + "epoch": 0.3357060407849173, + "flos": 596649057792.0, + "grad_norm": 0.023433474800662903, + "language_loss": 0.94101429, + "learning_rate": 0.0007743108776085141, + "loss": 0.95281148, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.90625, + "step": 1745, + "time_per_iteration": 2.7529683113098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184954, + "balance_loss_mlp": 1.09435499, + "epoch": 0.3358984224701808, + "flos": 599801395200.0, + "grad_norm": 0.02538707782704008, + "language_loss": 0.88967884, + "learning_rate": 0.0007740503529882543, + "loss": 0.9015283, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.90429688, + "step": 1746, + "time_per_iteration": 2.79131817817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188552, + "balance_loss_mlp": 1.09780991, + "epoch": 0.3360908041554444, + "flos": 579429812736.0, + "grad_norm": 0.028485119021284356, + "language_loss": 0.99668056, + "learning_rate": 0.0007737897219722114, + "loss": 1.00856614, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.90576172, + "step": 1747, + "time_per_iteration": 2.685925006866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189008, + "balance_loss_mlp": 1.09836173, + "epoch": 0.336283185840708, + "flos": 514620963840.0, + "grad_norm": 0.027318502045144608, + "language_loss": 0.90481317, + "learning_rate": 0.0007735289846615716, + "loss": 0.91670322, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.90478516, + "step": 1748, + "time_per_iteration": 2.62443470954895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189766, + "balance_loss_mlp": 1.09902358, + "epoch": 0.3364755675259715, + "flos": 526013623296.0, + "grad_norm": 0.026723032477842582, + "language_loss": 0.90137696, + "learning_rate": 0.0007732681411575621, + "loss": 0.91327465, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.90576172, + "step": 1749, + "time_per_iteration": 2.646358013153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182694, + "balance_loss_mlp": 1.09209466, + "epoch": 0.3366679492112351, + "flos": 555973748736.0, + "grad_norm": 0.023573972968583972, + "language_loss": 0.93333745, + "learning_rate": 0.0007730071915614514, + "loss": 0.94516432, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.90429688, + "step": 1750, + "time_per_iteration": 2.6758012771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08901942, + "epoch": 0.33686033089649864, + "flos": 428164170240.0, + "grad_norm": 0.030830494146199924, + "language_loss": 0.97502697, + "learning_rate": 0.0007727461359745489, + "loss": 0.98682547, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.90673828, + "step": 1751, + "time_per_iteration": 2.4563541412353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182248, + "balance_loss_mlp": 1.09145832, + "epoch": 0.3370527125817622, + "flos": 542840099328.0, + "grad_norm": 0.023246790346845608, + "language_loss": 0.93729055, + "learning_rate": 0.0007724849744982056, + "loss": 0.94911301, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.90625, + "step": 1752, + "time_per_iteration": 2.668113946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179422, + "balance_loss_mlp": 1.08858418, + "epoch": 0.33724509426702576, + "flos": 543230866944.0, + "grad_norm": 0.02371236203418416, + "language_loss": 0.90932786, + "learning_rate": 0.0007722237072338131, + "loss": 0.92112207, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.90673828, + "step": 1753, + "time_per_iteration": 2.69787335395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.08753431, + "epoch": 0.33743747595228935, + "flos": 473752272384.0, + "grad_norm": 0.029898359882718887, + "language_loss": 0.95709926, + "learning_rate": 0.0007719623342828046, + "loss": 0.96888256, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.90625, + "step": 1754, + "time_per_iteration": 2.4994091987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183652, + "balance_loss_mlp": 1.09295714, + "epoch": 0.33762985763755293, + "flos": 470836978176.0, + "grad_norm": 0.02665869511949433, + "language_loss": 0.93777692, + "learning_rate": 0.000771700855746654, + "loss": 0.94961339, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.90527344, + "step": 1755, + "time_per_iteration": 2.58086895942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178715, + "balance_loss_mlp": 1.08792567, + "epoch": 0.33782223932281646, + "flos": 493250995200.0, + "grad_norm": 0.024252070816233498, + "language_loss": 0.95916575, + "learning_rate": 0.0007714392717268763, + "loss": 0.97095293, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.90625, + "step": 1756, + "time_per_iteration": 2.5631322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180772, + "balance_loss_mlp": 1.08988702, + "epoch": 0.33801462100808005, + "flos": 466017510912.0, + "grad_norm": 0.025388958299120416, + "language_loss": 0.95127004, + "learning_rate": 0.0007711775823250273, + "loss": 0.96307778, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.90722656, + "step": 1757, + "time_per_iteration": 2.5053045749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178431, + "balance_loss_mlp": 1.08754551, + "epoch": 0.3382070026933436, + "flos": 797067374592.0, + "grad_norm": 0.024419621343361942, + "language_loss": 0.92107689, + "learning_rate": 0.0007709157876427039, + "loss": 0.93286121, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.90722656, + "step": 1758, + "time_per_iteration": 3.1007301807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178269, + "balance_loss_mlp": 1.08738351, + "epoch": 0.33839938437860717, + "flos": 509428193280.0, + "grad_norm": 0.024832384176200758, + "language_loss": 0.94253516, + "learning_rate": 0.0007706538877815439, + "loss": 0.95431781, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.90722656, + "step": 1759, + "time_per_iteration": 2.588744640350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178646, + "balance_loss_mlp": 1.0878557, + "epoch": 0.3385917660638707, + "flos": 485273186304.0, + "grad_norm": 0.02369115174437829, + "language_loss": 0.89945841, + "learning_rate": 0.0007703918828432259, + "loss": 0.91124481, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.90625, + "step": 1760, + "time_per_iteration": 2.5859875679016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178403, + "balance_loss_mlp": 1.08770907, + "epoch": 0.3387841477491343, + "flos": 546415405056.0, + "grad_norm": 0.02534991906570622, + "language_loss": 0.96946132, + "learning_rate": 0.000770129772929469, + "loss": 0.9812454, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.90527344, + "step": 1761, + "time_per_iteration": 2.633229970932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117744, + "balance_loss_mlp": 1.08684063, + "epoch": 0.3389765294343978, + "flos": 721063251456.0, + "grad_norm": 0.027907228809642075, + "language_loss": 0.96886694, + "learning_rate": 0.0007698675581420334, + "loss": 0.98064131, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.90429688, + "step": 1762, + "time_per_iteration": 2.8309946060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190138, + "balance_loss_mlp": 1.09987259, + "epoch": 0.3391689111196614, + "flos": 701263084032.0, + "grad_norm": 0.028701846645649853, + "language_loss": 0.87853253, + "learning_rate": 0.0007696052385827199, + "loss": 0.89043397, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.90087891, + "step": 1763, + "time_per_iteration": 2.9673497676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183311, + "balance_loss_mlp": 1.09304607, + "epoch": 0.339361292804925, + "flos": 628248115200.0, + "grad_norm": 0.027144566695111814, + "language_loss": 0.85910845, + "learning_rate": 0.00076934281435337, + "loss": 0.87094158, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.90087891, + "step": 1764, + "time_per_iteration": 2.7069530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011791, + "balance_loss_mlp": 1.08869135, + "epoch": 0.33955367449018853, + "flos": 610794554880.0, + "grad_norm": 0.025973604998757366, + "language_loss": 0.94002628, + "learning_rate": 0.0007690802855558658, + "loss": 0.95181727, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.90234375, + "step": 1765, + "time_per_iteration": 2.8596885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198868, + "balance_loss_mlp": 1.11151123, + "epoch": 0.3397460561754521, + "flos": 1456586357760.0, + "grad_norm": 0.018873382807181687, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77573818, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.87109375, + "step": 1766, + "time_per_iteration": 4.900039434432983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183458, + "balance_loss_mlp": 1.09304976, + "epoch": 0.33993843786071565, + "flos": 488290538496.0, + "grad_norm": 0.033631077459875626, + "language_loss": 1.00266671, + "learning_rate": 0.0007685549146641262, + "loss": 1.01450121, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.90234375, + "step": 1767, + "time_per_iteration": 2.521587610244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176512, + "balance_loss_mlp": 1.08557928, + "epoch": 0.34013081954597923, + "flos": 418232523264.0, + "grad_norm": 0.024531175575557927, + "language_loss": 0.95696396, + "learning_rate": 0.0007682920727738579, + "loss": 0.96872908, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.90771484, + "step": 1768, + "time_per_iteration": 2.4606878757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177177, + "balance_loss_mlp": 1.08614898, + "epoch": 0.34032320123124277, + "flos": 438430189056.0, + "grad_norm": 0.027457130501572214, + "language_loss": 0.93990809, + "learning_rate": 0.000768029126723369, + "loss": 0.95167989, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.90869141, + "step": 1769, + "time_per_iteration": 2.494699478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.09077609, + "epoch": 0.34051558291650635, + "flos": 458543261184.0, + "grad_norm": 0.027949795017340132, + "language_loss": 0.90377855, + "learning_rate": 0.0007677660766147447, + "loss": 0.91559708, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.90917969, + "step": 1770, + "time_per_iteration": 2.5302748680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183578, + "balance_loss_mlp": 1.09469604, + "epoch": 0.3407079646017699, + "flos": 1562137645056.0, + "grad_norm": 0.011444512115251876, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73654521, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.88671875, + "step": 1771, + "time_per_iteration": 4.913311004638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188847, + "balance_loss_mlp": 1.09758055, + "epoch": 0.3409003462870335, + "flos": 493530972672.0, + "grad_norm": 0.032062498304007335, + "language_loss": 0.91194993, + "learning_rate": 0.0007672396646316306, + "loss": 0.92383844, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.91113281, + "step": 1772, + "time_per_iteration": 2.539181709289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.08885825, + "epoch": 0.34109272797229706, + "flos": 809820989952.0, + "grad_norm": 0.028470010979029077, + "language_loss": 0.88439053, + "learning_rate": 0.000766976302961512, + "loss": 0.89618981, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.90917969, + "step": 1773, + "time_per_iteration": 3.006547212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181829, + "balance_loss_mlp": 1.09094357, + "epoch": 0.3412851096575606, + "flos": 471099491328.0, + "grad_norm": 0.02901021255147234, + "language_loss": 0.91066158, + "learning_rate": 0.0007667128376420003, + "loss": 0.92247993, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.90722656, + "step": 1774, + "time_per_iteration": 2.534266233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118318, + "balance_loss_mlp": 1.09253371, + "epoch": 0.3414774913428242, + "flos": 596770581504.0, + "grad_norm": 0.02876896591079206, + "language_loss": 0.92739397, + "learning_rate": 0.0007664492687753817, + "loss": 0.93922579, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.90478516, + "step": 1775, + "time_per_iteration": 2.671475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181574, + "balance_loss_mlp": 1.09102285, + "epoch": 0.3416698730280877, + "flos": 528507950592.0, + "grad_norm": 0.025483549401886952, + "language_loss": 0.89018893, + "learning_rate": 0.000766185596463983, + "loss": 0.90200466, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.90380859, + "step": 1776, + "time_per_iteration": 2.6099884510040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177935, + "balance_loss_mlp": 1.08719325, + "epoch": 0.3418622547133513, + "flos": 876117047808.0, + "grad_norm": 0.026020404961979337, + "language_loss": 0.84743214, + "learning_rate": 0.0007659218208101706, + "loss": 0.8592115, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.90576172, + "step": 1777, + "time_per_iteration": 3.1272366046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118093, + "balance_loss_mlp": 1.08994997, + "epoch": 0.34205463639861483, + "flos": 604876644864.0, + "grad_norm": 0.024068405360429687, + "language_loss": 0.91582745, + "learning_rate": 0.0007656579419163515, + "loss": 0.92763674, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.90820312, + "step": 1778, + "time_per_iteration": 2.7243831157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.0894556, + "epoch": 0.3422470180838784, + "flos": 464714952192.0, + "grad_norm": 0.02739040164484414, + "language_loss": 0.86445272, + "learning_rate": 0.0007653939598849724, + "loss": 0.87625706, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.90820312, + "step": 1779, + "time_per_iteration": 2.4913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180695, + "balance_loss_mlp": 1.09143066, + "epoch": 0.34243939976914195, + "flos": 1589816291328.0, + "grad_norm": 0.01051605552964957, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84060901, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.890625, + "step": 1780, + "time_per_iteration": 4.891184091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176554, + "balance_loss_mlp": 1.085621, + "epoch": 0.34263178145440554, + "flos": 874443187200.0, + "grad_norm": 0.026322112436007235, + "language_loss": 0.88782489, + "learning_rate": 0.000764865686819522, + "loss": 0.89959043, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.90771484, + "step": 1781, + "time_per_iteration": 3.048123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176352, + "balance_loss_mlp": 1.08551466, + "epoch": 0.3428241631396691, + "flos": 507873854976.0, + "grad_norm": 0.024622696081698998, + "language_loss": 0.93515933, + "learning_rate": 0.0007646013959905449, + "loss": 0.94692284, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.90673828, + "step": 1782, + "time_per_iteration": 2.565661907196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176257, + "balance_loss_mlp": 1.08565772, + "epoch": 0.34301654482493266, + "flos": 881524667904.0, + "grad_norm": 0.0252118274748732, + "language_loss": 0.880337, + "learning_rate": 0.0007643370024341949, + "loss": 0.89209956, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.90429688, + "step": 1783, + "time_per_iteration": 3.0695888996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180625, + "balance_loss_mlp": 1.08959711, + "epoch": 0.34320892651019624, + "flos": 432668731392.0, + "grad_norm": 0.024350173092139916, + "language_loss": 0.89407057, + "learning_rate": 0.0007640725062531195, + "loss": 0.90587682, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.90869141, + "step": 1784, + "time_per_iteration": 2.5120832920074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184023, + "balance_loss_mlp": 1.09294736, + "epoch": 0.3434013081954598, + "flos": 464593428480.0, + "grad_norm": 0.02877111448667641, + "language_loss": 0.95969987, + "learning_rate": 0.0007638079075500047, + "loss": 0.97154009, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.90917969, + "step": 1785, + "time_per_iteration": 2.5176198482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194351, + "balance_loss_mlp": 1.10546875, + "epoch": 0.34359368988072336, + "flos": 1560674631168.0, + "grad_norm": 0.01088995253456435, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.7637502, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.88671875, + "step": 1786, + "time_per_iteration": 5.021549463272095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183341, + "balance_loss_mlp": 1.09278917, + "epoch": 0.3437860715659869, + "flos": 496572519936.0, + "grad_norm": 0.024204144242014246, + "language_loss": 0.90540475, + "learning_rate": 0.0007632784029886026, + "loss": 0.91723818, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.90380859, + "step": 1787, + "time_per_iteration": 2.6350793838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178894, + "balance_loss_mlp": 1.08791375, + "epoch": 0.3439784532512505, + "flos": 719608969728.0, + "grad_norm": 0.025958683961259412, + "language_loss": 0.93068433, + "learning_rate": 0.0007630134973358873, + "loss": 0.94247323, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.90820312, + "step": 1788, + "time_per_iteration": 2.93084454536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178793, + "balance_loss_mlp": 1.08785999, + "epoch": 0.34417083493651407, + "flos": 566921246208.0, + "grad_norm": 0.025032512144454056, + "language_loss": 0.92506206, + "learning_rate": 0.0007627484895722763, + "loss": 0.93685007, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.90771484, + "step": 1789, + "time_per_iteration": 2.649689197540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177857, + "balance_loss_mlp": 1.08706772, + "epoch": 0.3443632166217776, + "flos": 797701189632.0, + "grad_norm": 0.027302991531117576, + "language_loss": 0.89870507, + "learning_rate": 0.0007624833798006552, + "loss": 0.9104836, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.90625, + "step": 1790, + "time_per_iteration": 3.0469179153442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117862, + "balance_loss_mlp": 1.08811665, + "epoch": 0.3445555983070412, + "flos": 570392492544.0, + "grad_norm": 0.0288389056738737, + "language_loss": 0.92729777, + "learning_rate": 0.0007622181681239483, + "loss": 0.93908393, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.90332031, + "step": 1791, + "time_per_iteration": 2.6440184116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178949, + "balance_loss_mlp": 1.08849263, + "epoch": 0.3447479799923047, + "flos": 569980257792.0, + "grad_norm": 0.022982775931836206, + "language_loss": 0.91584516, + "learning_rate": 0.0007619528546451202, + "loss": 0.9276346, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.90283203, + "step": 1792, + "time_per_iteration": 2.797133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177091, + "balance_loss_mlp": 1.08673048, + "epoch": 0.3449403616775683, + "flos": 969331683840.0, + "grad_norm": 0.02628926210615307, + "language_loss": 0.90923131, + "learning_rate": 0.0007616874394671745, + "loss": 0.92100227, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.90185547, + "step": 1793, + "time_per_iteration": 3.3191378116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178301, + "balance_loss_mlp": 1.08784556, + "epoch": 0.34513274336283184, + "flos": 569676085248.0, + "grad_norm": 0.03267712320672132, + "language_loss": 0.9558928, + "learning_rate": 0.0007614219226931547, + "loss": 0.96767581, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.90283203, + "step": 1794, + "time_per_iteration": 2.677525043487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178051, + "balance_loss_mlp": 1.0875473, + "epoch": 0.3453251250480954, + "flos": 461858055168.0, + "grad_norm": 0.024689469906648515, + "language_loss": 0.92397773, + "learning_rate": 0.0007611563044261435, + "loss": 0.93575823, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.90332031, + "step": 1795, + "time_per_iteration": 2.5183908939361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178812, + "balance_loss_mlp": 1.08835602, + "epoch": 0.34551750673335896, + "flos": 416519731200.0, + "grad_norm": 0.027710199676415265, + "language_loss": 0.96473086, + "learning_rate": 0.0007608905847692631, + "loss": 0.97651899, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.90283203, + "step": 1796, + "time_per_iteration": 2.4600772857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182482, + "balance_loss_mlp": 1.09212101, + "epoch": 0.34570988841862255, + "flos": 589114409472.0, + "grad_norm": 0.023363368939277738, + "language_loss": 0.92555124, + "learning_rate": 0.0007606247638256749, + "loss": 0.93737608, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.90185547, + "step": 1797, + "time_per_iteration": 2.8326525688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183395, + "balance_loss_mlp": 1.09565735, + "epoch": 0.34590227010388613, + "flos": 1571142764544.0, + "grad_norm": 0.009651567236440416, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79353684, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.875, + "step": 1798, + "time_per_iteration": 4.921091794967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.09259033, + "epoch": 0.34609465178914967, + "flos": 1540928131584.0, + "grad_norm": 0.004186018133500934, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.8050791, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.87890625, + "step": 1799, + "time_per_iteration": 4.76463508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177428, + "balance_loss_mlp": 1.08692396, + "epoch": 0.34628703347441325, + "flos": 610516578816.0, + "grad_norm": 0.027319297321258894, + "language_loss": 0.94778776, + "learning_rate": 0.0007598266943068686, + "loss": 0.95956194, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.90332031, + "step": 1800, + "time_per_iteration": 2.741830348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180421, + "balance_loss_mlp": 1.0898217, + "epoch": 0.3464794151596768, + "flos": 474264563712.0, + "grad_norm": 0.0268607754896097, + "language_loss": 0.91417915, + "learning_rate": 0.0007595604692488507, + "loss": 0.92598337, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.90429688, + "step": 1801, + "time_per_iteration": 2.5253777503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117756, + "balance_loss_mlp": 1.08719921, + "epoch": 0.34667179684494037, + "flos": 606821750784.0, + "grad_norm": 0.0251267071243342, + "language_loss": 0.907076, + "learning_rate": 0.0007592941434205215, + "loss": 0.91885161, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.90185547, + "step": 1802, + "time_per_iteration": 2.7729735374450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175873, + "balance_loss_mlp": 1.0877533, + "epoch": 0.3468641785302039, + "flos": 1568359727616.0, + "grad_norm": 0.004114808875680539, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74746931, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.87890625, + "step": 1803, + "time_per_iteration": 5.036771774291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178076, + "balance_loss_mlp": 1.08776271, + "epoch": 0.3470565602154675, + "flos": 908723223552.0, + "grad_norm": 0.03174792037748739, + "language_loss": 0.90712535, + "learning_rate": 0.0007587611898665566, + "loss": 0.91890609, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.90136719, + "step": 1804, + "time_per_iteration": 3.0725910663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177414, + "balance_loss_mlp": 1.08719671, + "epoch": 0.347248941900731, + "flos": 640059740160.0, + "grad_norm": 0.023310551488003612, + "language_loss": 0.90306699, + "learning_rate": 0.0007584945623478315, + "loss": 0.91484118, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.90039062, + "step": 1805, + "time_per_iteration": 2.8080646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176916, + "balance_loss_mlp": 1.08655512, + "epoch": 0.3474413235859946, + "flos": 848781505536.0, + "grad_norm": 0.027596494202169034, + "language_loss": 0.90514499, + "learning_rate": 0.000758227834472617, + "loss": 0.91691411, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.90185547, + "step": 1806, + "time_per_iteration": 3.0443291664123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179899, + "balance_loss_mlp": 1.08972931, + "epoch": 0.3476337052712582, + "flos": 516696325632.0, + "grad_norm": 0.02724510251762829, + "language_loss": 0.86438924, + "learning_rate": 0.0007579610063444664, + "loss": 0.87618828, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.89990234, + "step": 1807, + "time_per_iteration": 2.716522455215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177066, + "balance_loss_mlp": 1.08694386, + "epoch": 0.34782608695652173, + "flos": 915114493440.0, + "grad_norm": 0.02927822844999151, + "language_loss": 0.96424794, + "learning_rate": 0.0007576940780669712, + "loss": 0.97601861, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.89941406, + "step": 1808, + "time_per_iteration": 3.21464204788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08734941, + "epoch": 0.3480184686417853, + "flos": 775083056640.0, + "grad_norm": 0.026376675364870938, + "language_loss": 0.91835052, + "learning_rate": 0.0007574270497437624, + "loss": 0.93012476, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.89892578, + "step": 1809, + "time_per_iteration": 2.965306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177298, + "balance_loss_mlp": 1.0874145, + "epoch": 0.34821085032704885, + "flos": 578003728896.0, + "grad_norm": 0.024336980271772477, + "language_loss": 0.95592844, + "learning_rate": 0.000757159921478509, + "loss": 0.96770144, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.89697266, + "step": 1810, + "time_per_iteration": 2.781496047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177093, + "balance_loss_mlp": 1.088974, + "epoch": 0.34840323201231244, + "flos": 1528039531008.0, + "grad_norm": 0.007178450494277746, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75627732, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.87890625, + "step": 1811, + "time_per_iteration": 4.719515562057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176704, + "balance_loss_mlp": 1.08691561, + "epoch": 0.34859561369757597, + "flos": 510181530624.0, + "grad_norm": 0.02648580139398905, + "language_loss": 0.96071857, + "learning_rate": 0.0007566253655367423, + "loss": 0.97248554, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.89599609, + "step": 1812, + "time_per_iteration": 2.5699198246002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177921, + "balance_loss_mlp": 1.08822834, + "epoch": 0.34878799538283956, + "flos": 549756395520.0, + "grad_norm": 0.036663453377328174, + "language_loss": 0.96810794, + "learning_rate": 0.000756357938067762, + "loss": 0.97988713, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.89501953, + "step": 1813, + "time_per_iteration": 2.6622092723846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179077, + "balance_loss_mlp": 1.08885992, + "epoch": 0.34898037706810314, + "flos": 985193975808.0, + "grad_norm": 0.026013801782247825, + "language_loss": 0.90032709, + "learning_rate": 0.0007560904110718033, + "loss": 0.91211784, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.90039062, + "step": 1814, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.08639514, + "epoch": 0.3491727587533667, + "flos": 682836607488.0, + "grad_norm": 0.025025787643359835, + "language_loss": 0.91824377, + "learning_rate": 0.0007558227846527297, + "loss": 0.93000984, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.90039062, + "step": 1815, + "time_per_iteration": 2.870858907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176853, + "balance_loss_mlp": 1.08673084, + "epoch": 0.34936514043863026, + "flos": 394889250816.0, + "grad_norm": 0.0291076708707547, + "language_loss": 0.91979998, + "learning_rate": 0.0007555550589144429, + "loss": 0.9315685, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.89941406, + "step": 1816, + "time_per_iteration": 2.4363009929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08739722, + "epoch": 0.3495575221238938, + "flos": 462340147200.0, + "grad_norm": 0.02440335273431038, + "language_loss": 0.92281306, + "learning_rate": 0.000755287233960883, + "loss": 0.9345873, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.8984375, + "step": 1817, + "time_per_iteration": 2.538250207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117706, + "balance_loss_mlp": 1.08693826, + "epoch": 0.3497499038091574, + "flos": 725428824576.0, + "grad_norm": 0.028430093115180927, + "language_loss": 0.88002723, + "learning_rate": 0.0007550193098960292, + "loss": 0.89179784, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.89941406, + "step": 1818, + "time_per_iteration": 2.8685545921325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08411181, + "epoch": 0.3499422854944209, + "flos": 829196187648.0, + "grad_norm": 0.021653398091314287, + "language_loss": 0.92103571, + "learning_rate": 0.0007547512868238988, + "loss": 0.93277991, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.90136719, + "step": 1819, + "time_per_iteration": 3.115814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.092013, + "epoch": 0.3501346671796845, + "flos": 494542820352.0, + "grad_norm": 0.026515438979626053, + "language_loss": 0.9198699, + "learning_rate": 0.0007544831648485473, + "loss": 0.93169028, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.8984375, + "step": 1820, + "time_per_iteration": 2.6666150093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178247, + "balance_loss_mlp": 1.08783865, + "epoch": 0.35032704886494803, + "flos": 579848778240.0, + "grad_norm": 0.026574936148936048, + "language_loss": 0.89372301, + "learning_rate": 0.0007542149440740694, + "loss": 0.90550542, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.90234375, + "step": 1821, + "time_per_iteration": 2.6776442527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178869, + "balance_loss_mlp": 1.08841276, + "epoch": 0.3505194305502116, + "flos": 585831816192.0, + "grad_norm": 0.02674162112947977, + "language_loss": 0.9602831, + "learning_rate": 0.000753946624604597, + "loss": 0.97207189, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.90283203, + "step": 1822, + "time_per_iteration": 2.746363639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175368, + "balance_loss_mlp": 1.08491182, + "epoch": 0.3507118122354752, + "flos": 527978194944.0, + "grad_norm": 0.02703682960411951, + "language_loss": 0.95658362, + "learning_rate": 0.0007536782065443015, + "loss": 0.9683373, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.90283203, + "step": 1823, + "time_per_iteration": 2.5945184230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175188, + "balance_loss_mlp": 1.08458936, + "epoch": 0.35090419392073874, + "flos": 512545602048.0, + "grad_norm": 0.03278557538641046, + "language_loss": 0.86822712, + "learning_rate": 0.0007534096899973919, + "loss": 0.87997901, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.90429688, + "step": 1824, + "time_per_iteration": 2.56933331489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184456, + "balance_loss_mlp": 1.0944289, + "epoch": 0.3510965756060023, + "flos": 565195719168.0, + "grad_norm": 0.023191753507183704, + "language_loss": 0.89392567, + "learning_rate": 0.0007531410750681154, + "loss": 0.90577018, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.8984375, + "step": 1825, + "time_per_iteration": 2.7223169803619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186327, + "balance_loss_mlp": 1.09630024, + "epoch": 0.35128895729126586, + "flos": 1022253046272.0, + "grad_norm": 0.026424599574572643, + "language_loss": 0.93470478, + "learning_rate": 0.0007528723618607575, + "loss": 0.94656801, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.8984375, + "step": 1826, + "time_per_iteration": 3.404395580291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182394, + "balance_loss_mlp": 1.09236717, + "epoch": 0.35148133897652944, + "flos": 589424586240.0, + "grad_norm": 0.02767542011563751, + "language_loss": 0.89242589, + "learning_rate": 0.0007526035504796422, + "loss": 0.90424991, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.8984375, + "step": 1827, + "time_per_iteration": 2.820510149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117853, + "balance_loss_mlp": 1.08850324, + "epoch": 0.351673720661793, + "flos": 496285811712.0, + "grad_norm": 0.02845608163714707, + "language_loss": 0.94670665, + "learning_rate": 0.0007523346410291312, + "loss": 0.95849192, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.8984375, + "step": 1828, + "time_per_iteration": 2.763277053833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177518, + "balance_loss_mlp": 1.08753836, + "epoch": 0.35186610234705656, + "flos": 763998572544.0, + "grad_norm": 0.028566964886064136, + "language_loss": 0.91855693, + "learning_rate": 0.0007520656336136245, + "loss": 0.93033206, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.89794922, + "step": 1829, + "time_per_iteration": 2.9501917362213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179113, + "balance_loss_mlp": 1.08908641, + "epoch": 0.3520584840323201, + "flos": 627388717056.0, + "grad_norm": 0.0235814228834027, + "language_loss": 0.94624627, + "learning_rate": 0.0007517965283375599, + "loss": 0.95803738, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.8984375, + "step": 1830, + "time_per_iteration": 2.8197402954101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08992577, + "epoch": 0.3522508657175837, + "flos": 538448329728.0, + "grad_norm": 0.025024391475303026, + "language_loss": 0.97205818, + "learning_rate": 0.0007515273253054132, + "loss": 0.9838568, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.89746094, + "step": 1831, + "time_per_iteration": 2.6376330852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191124, + "balance_loss_mlp": 1.10109711, + "epoch": 0.35244324740284727, + "flos": 568501780992.0, + "grad_norm": 0.029882616882314406, + "language_loss": 0.9266001, + "learning_rate": 0.0007512580246216988, + "loss": 0.93851131, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.8984375, + "step": 1832, + "time_per_iteration": 2.708432912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179716, + "balance_loss_mlp": 1.08964145, + "epoch": 0.3526356290881108, + "flos": 514054278144.0, + "grad_norm": 0.030813246422457925, + "language_loss": 0.91671479, + "learning_rate": 0.000750988626390968, + "loss": 0.92851192, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.89892578, + "step": 1833, + "time_per_iteration": 2.592047929763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179987, + "balance_loss_mlp": 1.09010315, + "epoch": 0.3528280107733744, + "flos": 596972696064.0, + "grad_norm": 0.024705197674389605, + "language_loss": 0.91622353, + "learning_rate": 0.0007507191307178108, + "loss": 0.9280234, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.89697266, + "step": 1834, + "time_per_iteration": 2.7884535789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176506, + "balance_loss_mlp": 1.08652651, + "epoch": 0.3530203924586379, + "flos": 552298386432.0, + "grad_norm": 0.0302975798262418, + "language_loss": 0.83893424, + "learning_rate": 0.0007504495377068543, + "loss": 0.85069931, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.89794922, + "step": 1835, + "time_per_iteration": 2.7751786708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175764, + "balance_loss_mlp": 1.08573675, + "epoch": 0.3532127741439015, + "flos": 654305293824.0, + "grad_norm": 0.027517554164180617, + "language_loss": 0.90655488, + "learning_rate": 0.0007501798474627642, + "loss": 0.91831255, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.8984375, + "step": 1836, + "time_per_iteration": 2.9638845920562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179149, + "balance_loss_mlp": 1.08926523, + "epoch": 0.35340515582916504, + "flos": 724150460928.0, + "grad_norm": 0.024568481275515953, + "language_loss": 0.91140759, + "learning_rate": 0.0007499100600902433, + "loss": 0.92319906, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.89697266, + "step": 1837, + "time_per_iteration": 2.9948322772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184038, + "balance_loss_mlp": 1.09396327, + "epoch": 0.35359753751442863, + "flos": 595997778432.0, + "grad_norm": 0.031821297821065, + "language_loss": 0.92654896, + "learning_rate": 0.0007496401756940324, + "loss": 0.9383893, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.89892578, + "step": 1838, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176486, + "balance_loss_mlp": 1.08665001, + "epoch": 0.3537899191996922, + "flos": 633805456896.0, + "grad_norm": 0.02718368250353396, + "language_loss": 0.91091663, + "learning_rate": 0.0007493701943789098, + "loss": 0.92268145, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.89648438, + "step": 1839, + "time_per_iteration": 2.779574155807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175825, + "balance_loss_mlp": 1.08608413, + "epoch": 0.35398230088495575, + "flos": 507352831488.0, + "grad_norm": 0.028671493841357993, + "language_loss": 0.91863656, + "learning_rate": 0.000749100116249692, + "loss": 0.93039483, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.89550781, + "step": 1840, + "time_per_iteration": 2.607614755630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189406, + "balance_loss_mlp": 1.09980869, + "epoch": 0.35417468257021933, + "flos": 509046157824.0, + "grad_norm": 0.03229862826848899, + "language_loss": 0.95953786, + "learning_rate": 0.0007488299414112321, + "loss": 0.97143197, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.89404297, + "step": 1841, + "time_per_iteration": 2.566596746444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181321, + "balance_loss_mlp": 1.09210455, + "epoch": 0.35436706425548287, + "flos": 657659019264.0, + "grad_norm": 0.02732135002339032, + "language_loss": 0.86453879, + "learning_rate": 0.0007485596699684215, + "loss": 0.87635195, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.89013672, + "step": 1842, + "time_per_iteration": 2.8111371994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185021, + "balance_loss_mlp": 1.09575689, + "epoch": 0.35455944594074645, + "flos": 653888329728.0, + "grad_norm": 0.026686949506238997, + "language_loss": 0.92940086, + "learning_rate": 0.000748289302026189, + "loss": 0.94125104, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.890625, + "step": 1843, + "time_per_iteration": 2.8244054317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187203, + "balance_loss_mlp": 1.09793901, + "epoch": 0.35475182762601, + "flos": 850010204160.0, + "grad_norm": 0.02649701564047654, + "language_loss": 0.9307664, + "learning_rate": 0.0007480188376895004, + "loss": 0.94263846, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.890625, + "step": 1844, + "time_per_iteration": 3.041001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187935, + "balance_loss_mlp": 1.10115051, + "epoch": 0.3549442093112736, + "flos": 1524775128576.0, + "grad_norm": 0.01173136965559212, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74999273, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.86914062, + "step": 1845, + "time_per_iteration": 4.865761756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183261, + "balance_loss_mlp": 1.09390223, + "epoch": 0.3551365909965371, + "flos": 652714025472.0, + "grad_norm": 0.028658093872898062, + "language_loss": 0.85614175, + "learning_rate": 0.0007474776202528074, + "loss": 0.8679744, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.89160156, + "step": 1846, + "time_per_iteration": 2.9342904090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184977, + "balance_loss_mlp": 1.0954746, + "epoch": 0.3553289726818007, + "flos": 898921832448.0, + "grad_norm": 0.03609141350995601, + "language_loss": 0.89849555, + "learning_rate": 0.000747206867362922, + "loss": 0.91034532, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.89306641, + "step": 1847, + "time_per_iteration": 3.1089484691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185041, + "balance_loss_mlp": 1.09553862, + "epoch": 0.3555213543670643, + "flos": 689733437952.0, + "grad_norm": 0.0286779566522822, + "language_loss": 0.9096849, + "learning_rate": 0.0007469360184988194, + "loss": 0.92153525, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.89306641, + "step": 1848, + "time_per_iteration": 2.820265293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183493, + "balance_loss_mlp": 1.09399033, + "epoch": 0.3557137360523278, + "flos": 539603168256.0, + "grad_norm": 0.02648998316664428, + "language_loss": 0.93967247, + "learning_rate": 0.0007466650737656518, + "loss": 0.95150745, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.89306641, + "step": 1849, + "time_per_iteration": 2.596639394760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183541, + "balance_loss_mlp": 1.09427702, + "epoch": 0.3559061177375914, + "flos": 403153767936.0, + "grad_norm": 0.02765421607491624, + "language_loss": 0.97574586, + "learning_rate": 0.0007463940332686098, + "loss": 0.98758125, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.890625, + "step": 1850, + "time_per_iteration": 2.478158473968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177245, + "balance_loss_mlp": 1.08764756, + "epoch": 0.35609849942285493, + "flos": 697893895680.0, + "grad_norm": 0.023379973164811964, + "language_loss": 0.90857208, + "learning_rate": 0.0007461228971129205, + "loss": 0.92034447, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.89404297, + "step": 1851, + "time_per_iteration": 2.9202487468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179211, + "balance_loss_mlp": 1.08966124, + "epoch": 0.3562908811081185, + "flos": 570001724928.0, + "grad_norm": 0.028863121832353986, + "language_loss": 0.92692959, + "learning_rate": 0.0007458516654038483, + "loss": 0.93872178, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.89355469, + "step": 1852, + "time_per_iteration": 2.658867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179202, + "balance_loss_mlp": 1.08936572, + "epoch": 0.35648326279338205, + "flos": 683609410560.0, + "grad_norm": 0.028040747176241956, + "language_loss": 0.94642723, + "learning_rate": 0.0007455803382466946, + "loss": 0.95821923, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.89648438, + "step": 1853, + "time_per_iteration": 2.86330509185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183408, + "balance_loss_mlp": 1.09376252, + "epoch": 0.35667564447864564, + "flos": 630340941312.0, + "grad_norm": 0.02553826751691769, + "language_loss": 0.94946796, + "learning_rate": 0.0007453089157467979, + "loss": 0.96130198, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.89453125, + "step": 1854, + "time_per_iteration": 2.792577028274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180437, + "balance_loss_mlp": 1.09093451, + "epoch": 0.35686802616390917, + "flos": 815504584704.0, + "grad_norm": 0.02468703395074296, + "language_loss": 0.8986901, + "learning_rate": 0.0007450373980095341, + "loss": 0.91049451, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.89306641, + "step": 1855, + "time_per_iteration": 3.0555014610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182657, + "balance_loss_mlp": 1.09334552, + "epoch": 0.35706040784917276, + "flos": 527205391872.0, + "grad_norm": 0.02890256158864057, + "language_loss": 0.93639445, + "learning_rate": 0.0007447657851403155, + "loss": 0.94822103, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.89111328, + "step": 1856, + "time_per_iteration": 2.589708089828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182935, + "balance_loss_mlp": 1.09367096, + "epoch": 0.35725278953443634, + "flos": 513064624128.0, + "grad_norm": 0.032008561774258475, + "language_loss": 0.88987339, + "learning_rate": 0.0007444940772445915, + "loss": 0.9017027, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.890625, + "step": 1857, + "time_per_iteration": 2.7185556888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180668, + "balance_loss_mlp": 1.09169042, + "epoch": 0.3574451712196999, + "flos": 488492653056.0, + "grad_norm": 0.02708223160327311, + "language_loss": 0.88387084, + "learning_rate": 0.0007442222744278484, + "loss": 0.89567751, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.88769531, + "step": 1858, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182567, + "balance_loss_mlp": 1.09339869, + "epoch": 0.35763755290496346, + "flos": 551821023744.0, + "grad_norm": 0.023402609147138306, + "language_loss": 0.90506786, + "learning_rate": 0.0007439503767956099, + "loss": 0.91689354, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.88964844, + "step": 1859, + "time_per_iteration": 2.7072699069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180801, + "balance_loss_mlp": 1.09249115, + "epoch": 0.357829934590227, + "flos": 1507225514496.0, + "grad_norm": 0.010565166743096084, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80852401, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.88085938, + "step": 1860, + "time_per_iteration": 4.9006147384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177337, + "balance_loss_mlp": 1.08835948, + "epoch": 0.3580223162754906, + "flos": 569841269760.0, + "grad_norm": 0.022894220472823423, + "language_loss": 0.92520916, + "learning_rate": 0.000743406297506922, + "loss": 0.93698251, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.88769531, + "step": 1861, + "time_per_iteration": 2.7065579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09741747, + "epoch": 0.3582146979607541, + "flos": 627760018944.0, + "grad_norm": 0.02759787968542248, + "language_loss": 0.91638815, + "learning_rate": 0.0007431341160617031, + "loss": 0.92825067, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.88623047, + "step": 1862, + "time_per_iteration": 2.9316203594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179684, + "balance_loss_mlp": 1.09089661, + "epoch": 0.3584070796460177, + "flos": 508319016960.0, + "grad_norm": 0.024526236298265516, + "language_loss": 0.95309365, + "learning_rate": 0.0007428618402234491, + "loss": 0.96489048, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.88574219, + "step": 1863, + "time_per_iteration": 2.648061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179939, + "balance_loss_mlp": 1.09129453, + "epoch": 0.3585994613312813, + "flos": 607640216064.0, + "grad_norm": 0.026400757424935653, + "language_loss": 0.88735509, + "learning_rate": 0.0007425894700978668, + "loss": 0.89915442, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.88427734, + "step": 1864, + "time_per_iteration": 2.7512128353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178956, + "balance_loss_mlp": 1.0905509, + "epoch": 0.3587918430165448, + "flos": 1415087675904.0, + "grad_norm": 0.025937088976099313, + "language_loss": 0.86489892, + "learning_rate": 0.0007423170057906996, + "loss": 0.87668848, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.88183594, + "step": 1865, + "time_per_iteration": 3.8491222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181386, + "balance_loss_mlp": 1.0926944, + "epoch": 0.3589842247018084, + "flos": 479513730048.0, + "grad_norm": 0.0296684402619103, + "language_loss": 0.94328964, + "learning_rate": 0.0007420444474077275, + "loss": 0.95510352, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.88476562, + "step": 1866, + "time_per_iteration": 2.5396502017974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.09458029, + "epoch": 0.35917660638707194, + "flos": 505705167360.0, + "grad_norm": 0.030930075238968464, + "language_loss": 0.98337018, + "learning_rate": 0.0007417717950547671, + "loss": 0.99520147, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.88330078, + "step": 1867, + "time_per_iteration": 2.562638759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182945, + "balance_loss_mlp": 1.09654236, + "epoch": 0.3593689880723355, + "flos": 1495481745408.0, + "grad_norm": 0.008554058370081398, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77179551, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.86523438, + "step": 1868, + "time_per_iteration": 4.885401487350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184482, + "balance_loss_mlp": 1.09583843, + "epoch": 0.35956136975759906, + "flos": 529671521280.0, + "grad_norm": 0.02257875970711003, + "language_loss": 0.91369003, + "learning_rate": 0.0007412262088623299, + "loss": 0.92553484, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.88427734, + "step": 1869, + "time_per_iteration": 2.755620241165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184303, + "balance_loss_mlp": 1.09584975, + "epoch": 0.35975375144286265, + "flos": 535999664640.0, + "grad_norm": 0.02945163599469251, + "language_loss": 0.8810817, + "learning_rate": 0.0007409532752346684, + "loss": 0.89292467, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.88232422, + "step": 1870, + "time_per_iteration": 2.6426498889923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09860992, + "epoch": 0.3599461331281262, + "flos": 505928749056.0, + "grad_norm": 0.025692069404306732, + "language_loss": 0.95194697, + "learning_rate": 0.0007406802480606491, + "loss": 0.96382141, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.88623047, + "step": 1871, + "time_per_iteration": 2.6156716346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180117, + "balance_loss_mlp": 1.09123456, + "epoch": 0.36013851481338977, + "flos": 512536869888.0, + "grad_norm": 0.029138864413584674, + "language_loss": 0.9874596, + "learning_rate": 0.0007404071274462707, + "loss": 0.99926078, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.88671875, + "step": 1872, + "time_per_iteration": 2.5790889263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179425, + "balance_loss_mlp": 1.09054244, + "epoch": 0.36033089649865335, + "flos": 548631756288.0, + "grad_norm": 0.029675252163234106, + "language_loss": 0.91584998, + "learning_rate": 0.0007401339134975682, + "loss": 0.92764425, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.88671875, + "step": 1873, + "time_per_iteration": 2.6279983520507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185016, + "balance_loss_mlp": 1.09613371, + "epoch": 0.3605232781839169, + "flos": 459613506048.0, + "grad_norm": 0.030657976300352024, + "language_loss": 0.92556155, + "learning_rate": 0.0007398606063206122, + "loss": 0.93741173, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.88671875, + "step": 1874, + "time_per_iteration": 2.5750958919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178477, + "balance_loss_mlp": 1.0895946, + "epoch": 0.36071565986918047, + "flos": 510563566080.0, + "grad_norm": 0.029863822651947862, + "language_loss": 0.87000763, + "learning_rate": 0.0007395872060215101, + "loss": 0.88179243, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.88671875, + "step": 1875, + "time_per_iteration": 2.599595546722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180043, + "balance_loss_mlp": 1.09101713, + "epoch": 0.360908041554444, + "flos": 560256729600.0, + "grad_norm": 0.02914010843617622, + "language_loss": 0.95866597, + "learning_rate": 0.0007393137127064056, + "loss": 0.97046638, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.88818359, + "step": 1876, + "time_per_iteration": 2.629855155944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179718, + "balance_loss_mlp": 1.09064531, + "epoch": 0.3611004232397076, + "flos": 524878250496.0, + "grad_norm": 0.029199641876594032, + "language_loss": 0.93452048, + "learning_rate": 0.0007390401264814779, + "loss": 0.94631773, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.88867188, + "step": 1877, + "time_per_iteration": 2.6057403087615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182123, + "balance_loss_mlp": 1.0932405, + "epoch": 0.3612928049249711, + "flos": 542032367616.0, + "grad_norm": 0.029384759310162312, + "language_loss": 0.93887711, + "learning_rate": 0.0007387664474529427, + "loss": 0.95069838, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.88671875, + "step": 1878, + "time_per_iteration": 2.612924814224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181149, + "balance_loss_mlp": 1.09207559, + "epoch": 0.3614851866102347, + "flos": 553629143040.0, + "grad_norm": 0.028847856052759763, + "language_loss": 0.99400896, + "learning_rate": 0.0007384926757270518, + "loss": 1.00582051, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.88867188, + "step": 1879, + "time_per_iteration": 2.631417751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183007, + "balance_loss_mlp": 1.09364784, + "epoch": 0.36167756829549824, + "flos": 773426660352.0, + "grad_norm": 0.027790454764264987, + "language_loss": 0.87101346, + "learning_rate": 0.0007382188114100924, + "loss": 0.88284349, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.89160156, + "step": 1880, + "time_per_iteration": 3.0146212577819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182663, + "balance_loss_mlp": 1.09330404, + "epoch": 0.36186994998076183, + "flos": 713187500544.0, + "grad_norm": 0.025874200926848077, + "language_loss": 0.89437282, + "learning_rate": 0.0007379448546083884, + "loss": 0.90619946, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.89160156, + "step": 1881, + "time_per_iteration": 2.9882314205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182414, + "balance_loss_mlp": 1.09305489, + "epoch": 0.3620623316660254, + "flos": 748900351488.0, + "grad_norm": 0.028120122690860328, + "language_loss": 0.95218164, + "learning_rate": 0.0007376708054282992, + "loss": 0.96400583, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.89160156, + "step": 1882, + "time_per_iteration": 2.937251329421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185259, + "balance_loss_mlp": 1.09609008, + "epoch": 0.36225471335128895, + "flos": 483534197760.0, + "grad_norm": 0.025051425069896712, + "language_loss": 0.90089262, + "learning_rate": 0.0007373966639762201, + "loss": 0.91274524, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.88964844, + "step": 1883, + "time_per_iteration": 2.5956366062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189104, + "balance_loss_mlp": 1.09964943, + "epoch": 0.36244709503655254, + "flos": 507910785024.0, + "grad_norm": 0.028814908336841725, + "language_loss": 0.97620124, + "learning_rate": 0.0007371224303585822, + "loss": 0.9880923, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.89257812, + "step": 1884, + "time_per_iteration": 2.5689563751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188454, + "balance_loss_mlp": 1.10205078, + "epoch": 0.36263947672181607, + "flos": 1397052145152.0, + "grad_norm": 0.012535477100621303, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8154552, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.86523438, + "step": 1885, + "time_per_iteration": 4.708393573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184768, + "balance_loss_mlp": 1.09531295, + "epoch": 0.36283185840707965, + "flos": 654522144768.0, + "grad_norm": 0.026882878095346403, + "language_loss": 0.90798199, + "learning_rate": 0.0007365736870525335, + "loss": 0.91982961, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.89257812, + "step": 1886, + "time_per_iteration": 2.8096718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188121, + "balance_loss_mlp": 1.09842801, + "epoch": 0.3630242400923432, + "flos": 489844876800.0, + "grad_norm": 0.028488669634490066, + "language_loss": 0.90766525, + "learning_rate": 0.000736299177577164, + "loss": 0.91954637, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.89501953, + "step": 1887, + "time_per_iteration": 2.5731940269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184527, + "balance_loss_mlp": 1.09488153, + "epoch": 0.3632166217776068, + "flos": 518231198208.0, + "grad_norm": 0.0291282657352475, + "language_loss": 0.90900671, + "learning_rate": 0.0007360245763623174, + "loss": 0.92085195, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.89453125, + "step": 1888, + "time_per_iteration": 2.6255550384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184122, + "balance_loss_mlp": 1.09457171, + "epoch": 0.36340900346287036, + "flos": 647347338240.0, + "grad_norm": 0.024297388169127104, + "language_loss": 0.96519047, + "learning_rate": 0.0007357498835146039, + "loss": 0.97703171, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.89355469, + "step": 1889, + "time_per_iteration": 2.8253488540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183322, + "balance_loss_mlp": 1.09386766, + "epoch": 0.3636013851481339, + "flos": 554410678272.0, + "grad_norm": 0.02538543495771105, + "language_loss": 0.93937147, + "learning_rate": 0.0007354750991406684, + "loss": 0.95120472, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.89257812, + "step": 1890, + "time_per_iteration": 2.692335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182823, + "balance_loss_mlp": 1.09336889, + "epoch": 0.3637937668333975, + "flos": 547691767296.0, + "grad_norm": 0.028084450652072174, + "language_loss": 0.88223994, + "learning_rate": 0.0007352002233471919, + "loss": 0.89406812, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.89257812, + "step": 1891, + "time_per_iteration": 2.620753765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181153, + "balance_loss_mlp": 1.09212756, + "epoch": 0.363986148518661, + "flos": 539210399232.0, + "grad_norm": 0.027970426809957948, + "language_loss": 0.87592262, + "learning_rate": 0.0007349252562408906, + "loss": 0.88773412, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.88818359, + "step": 1892, + "time_per_iteration": 2.6963558197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186893, + "balance_loss_mlp": 1.09762907, + "epoch": 0.3641785302039246, + "flos": 661510299648.0, + "grad_norm": 0.026164868426956554, + "language_loss": 0.89186442, + "learning_rate": 0.0007346501979285158, + "loss": 0.90373337, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.890625, + "step": 1893, + "time_per_iteration": 2.880326747894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_mlp": 1.10150909, + "epoch": 0.36437091188918813, + "flos": 1472082077184.0, + "grad_norm": 0.013556454199407954, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81727207, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.8671875, + "step": 1894, + "time_per_iteration": 4.7823100090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189424, + "balance_loss_mlp": 1.10011292, + "epoch": 0.3645632935744517, + "flos": 598444442112.0, + "grad_norm": 0.028411509484180794, + "language_loss": 0.93676329, + "learning_rate": 0.0007340998081127308, + "loss": 0.94865751, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.89111328, + "step": 1895, + "time_per_iteration": 2.7800211906433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179101, + "balance_loss_mlp": 1.08998048, + "epoch": 0.36475567525971525, + "flos": 600695721984.0, + "grad_norm": 0.025932670803143428, + "language_loss": 0.98669052, + "learning_rate": 0.0007338244768230007, + "loss": 0.99848151, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.88916016, + "step": 1896, + "time_per_iteration": 2.7945594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180722, + "balance_loss_mlp": 1.09169638, + "epoch": 0.36494805694497884, + "flos": 799830945792.0, + "grad_norm": 0.022772977260465788, + "language_loss": 0.94548512, + "learning_rate": 0.0007335490547545578, + "loss": 0.95729244, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.88818359, + "step": 1897, + "time_per_iteration": 3.031527280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182826, + "balance_loss_mlp": 1.09389579, + "epoch": 0.3651404386302424, + "flos": 638477203968.0, + "grad_norm": 0.024439781626348547, + "language_loss": 0.90189934, + "learning_rate": 0.0007332735420143308, + "loss": 0.91372758, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.88720703, + "step": 1898, + "time_per_iteration": 2.743051767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118252, + "balance_loss_mlp": 1.09363747, + "epoch": 0.36533282031550596, + "flos": 492562785792.0, + "grad_norm": 0.03052059755540218, + "language_loss": 0.95941794, + "learning_rate": 0.0007329979387092826, + "loss": 0.97124314, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.88671875, + "step": 1899, + "time_per_iteration": 2.5555779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181449, + "balance_loss_mlp": 1.09247124, + "epoch": 0.36552520200076954, + "flos": 857508648960.0, + "grad_norm": 0.02266050351879182, + "language_loss": 0.89947438, + "learning_rate": 0.0007327222449464124, + "loss": 0.91128886, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.88769531, + "step": 1900, + "time_per_iteration": 3.2362029552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181183, + "balance_loss_mlp": 1.09206235, + "epoch": 0.3657175836860331, + "flos": 484715232768.0, + "grad_norm": 0.026374750280255838, + "language_loss": 0.95288622, + "learning_rate": 0.0007324464608327538, + "loss": 0.96469808, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.88916016, + "step": 1901, + "time_per_iteration": 2.5933730602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179798, + "balance_loss_mlp": 1.09058213, + "epoch": 0.36590996537129666, + "flos": 435721012224.0, + "grad_norm": 0.02685373461110618, + "language_loss": 0.96213037, + "learning_rate": 0.0007321705864753758, + "loss": 0.97392833, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.89013672, + "step": 1902, + "time_per_iteration": 2.6981201171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180605, + "balance_loss_mlp": 1.09124577, + "epoch": 0.3661023470565602, + "flos": 713513140224.0, + "grad_norm": 0.022756571637903334, + "language_loss": 0.91225153, + "learning_rate": 0.0007318946219813823, + "loss": 0.9240576, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.89160156, + "step": 1903, + "time_per_iteration": 2.992624044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183651, + "balance_loss_mlp": 1.09443474, + "epoch": 0.3662947287418238, + "flos": 565822803456.0, + "grad_norm": 0.027935940535232063, + "language_loss": 0.96619356, + "learning_rate": 0.000731618567457912, + "loss": 0.97803003, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.89013672, + "step": 1904, + "time_per_iteration": 2.685476064682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183785, + "balance_loss_mlp": 1.09433067, + "epoch": 0.3664871104270873, + "flos": 791201857536.0, + "grad_norm": 0.029459392082425068, + "language_loss": 0.95166355, + "learning_rate": 0.000731342423012139, + "loss": 0.96350139, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.89257812, + "step": 1905, + "time_per_iteration": 3.0574183464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184501, + "balance_loss_mlp": 1.09480846, + "epoch": 0.3666794921123509, + "flos": 753980330496.0, + "grad_norm": 0.028631588758117728, + "language_loss": 0.89661896, + "learning_rate": 0.0007310661887512722, + "loss": 0.90846401, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.89501953, + "step": 1906, + "time_per_iteration": 3.024423122406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183077, + "balance_loss_mlp": 1.09343171, + "epoch": 0.3668718737976145, + "flos": 524607005184.0, + "grad_norm": 0.02900954708937733, + "language_loss": 0.89823443, + "learning_rate": 0.0007307898647825549, + "loss": 0.91006529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.89453125, + "step": 1907, + "time_per_iteration": 2.6485068798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182186, + "balance_loss_mlp": 1.09277892, + "epoch": 0.367064255482878, + "flos": 573045273600.0, + "grad_norm": 0.031417651983294596, + "language_loss": 0.98967636, + "learning_rate": 0.0007305134512132659, + "loss": 1.00149822, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.89208984, + "step": 1908, + "time_per_iteration": 2.646838903427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180724, + "balance_loss_mlp": 1.09107888, + "epoch": 0.3672566371681416, + "flos": 448053660672.0, + "grad_norm": 0.03289649974011927, + "language_loss": 0.93253779, + "learning_rate": 0.0007302369481507183, + "loss": 0.94434512, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.89453125, + "step": 1909, + "time_per_iteration": 2.562856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_mlp": 1.10011292, + "epoch": 0.36744901885340514, + "flos": 1543364061696.0, + "grad_norm": 0.010877058892954462, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81150377, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.8828125, + "step": 1910, + "time_per_iteration": 4.90735387802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011789, + "balance_loss_mlp": 1.08949292, + "epoch": 0.36764140053866873, + "flos": 564761290752.0, + "grad_norm": 0.024499581587470617, + "language_loss": 0.92626876, + "learning_rate": 0.000729683673975274, + "loss": 0.93805778, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.89208984, + "step": 1911, + "time_per_iteration": 2.6646595001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182116, + "balance_loss_mlp": 1.09285223, + "epoch": 0.36783378222393226, + "flos": 1218650895360.0, + "grad_norm": 0.021973130552363645, + "language_loss": 0.89050859, + "learning_rate": 0.0007294069030771774, + "loss": 0.90232974, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.890625, + "step": 1912, + "time_per_iteration": 3.6834843158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189865, + "balance_loss_mlp": 1.10021913, + "epoch": 0.36802616390919585, + "flos": 499720128000.0, + "grad_norm": 0.028676866730684987, + "language_loss": 0.97328013, + "learning_rate": 0.0007291300431154224, + "loss": 0.98517883, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.89453125, + "step": 1913, + "time_per_iteration": 2.587052822113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195931, + "balance_loss_mlp": 1.10838318, + "epoch": 0.36821854559445943, + "flos": 1585615902720.0, + "grad_norm": 0.013013835157786544, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71585667, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.87695312, + "step": 1914, + "time_per_iteration": 4.952203989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185283, + "balance_loss_mlp": 1.09582841, + "epoch": 0.36841092727972297, + "flos": 837089402880.0, + "grad_norm": 0.02834339080565921, + "language_loss": 0.8768307, + "learning_rate": 0.0007285760564309179, + "loss": 0.88868356, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.89257812, + "step": 1915, + "time_per_iteration": 3.100893974304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185476, + "balance_loss_mlp": 1.09602106, + "epoch": 0.36860330896498655, + "flos": 691209913344.0, + "grad_norm": 0.028423235038061073, + "language_loss": 0.92041719, + "learning_rate": 0.0007282989299232448, + "loss": 0.93227196, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.89257812, + "step": 1916, + "time_per_iteration": 3.0683393478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.10048962, + "epoch": 0.3687956906502501, + "flos": 555239877120.0, + "grad_norm": 0.03332088686108748, + "language_loss": 0.92434603, + "learning_rate": 0.0007280217147820668, + "loss": 0.93624407, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.89111328, + "step": 1917, + "time_per_iteration": 2.635451078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188211, + "balance_loss_mlp": 1.09894717, + "epoch": 0.3689880723355137, + "flos": 577819078656.0, + "grad_norm": 0.027623597033391085, + "language_loss": 0.8697632, + "learning_rate": 0.0007277444111150079, + "loss": 0.88164532, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.890625, + "step": 1918, + "time_per_iteration": 2.810635805130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184664, + "balance_loss_mlp": 1.09540033, + "epoch": 0.3691804540207772, + "flos": 529886370816.0, + "grad_norm": 0.029489830132381867, + "language_loss": 0.91299617, + "learning_rate": 0.0007274670190297272, + "loss": 0.92484283, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.890625, + "step": 1919, + "time_per_iteration": 2.615386486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118238, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3693728357060408, + "flos": 562180368384.0, + "grad_norm": 0.025570373781710027, + "language_loss": 0.90037912, + "learning_rate": 0.0007271895386339179, + "loss": 0.91220295, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.88476562, + "step": 1920, + "time_per_iteration": 2.7868921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192586, + "balance_loss_mlp": 1.10375118, + "epoch": 0.3695652173913043, + "flos": 580899557376.0, + "grad_norm": 0.02893533685872539, + "language_loss": 0.90819347, + "learning_rate": 0.0007269119700353073, + "loss": 0.92011935, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.88623047, + "step": 1921, + "time_per_iteration": 2.7836573123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178636, + "balance_loss_mlp": 1.09023082, + "epoch": 0.3697575990765679, + "flos": 514059007488.0, + "grad_norm": 0.024390447267758214, + "language_loss": 0.90977228, + "learning_rate": 0.0007266343133416571, + "loss": 0.92155862, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.8828125, + "step": 1922, + "time_per_iteration": 2.800387382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173615, + "balance_loss_mlp": 1.08816528, + "epoch": 0.3699499807618315, + "flos": 1573903607808.0, + "grad_norm": 0.0066311072211368925, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78290522, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.85546875, + "step": 1923, + "time_per_iteration": 4.845300912857056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176045, + "balance_loss_mlp": 1.08844995, + "epoch": 0.37014236244709503, + "flos": 498324243456.0, + "grad_norm": 0.031949393340513096, + "language_loss": 0.9351213, + "learning_rate": 0.0007260787361004556, + "loss": 0.94688171, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.87744141, + "step": 1924, + "time_per_iteration": 2.5984597206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175598, + "balance_loss_mlp": 1.0905304, + "epoch": 0.3703347441323586, + "flos": 1447605433344.0, + "grad_norm": 0.008500773473990196, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74937099, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.8515625, + "step": 1925, + "time_per_iteration": 4.886027097702026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197031, + "balance_loss_mlp": 1.10862505, + "epoch": 0.37052712581762215, + "flos": 564713627136.0, + "grad_norm": 0.03178088368953176, + "language_loss": 0.94516188, + "learning_rate": 0.0007255228077730903, + "loss": 0.95713222, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.88183594, + "step": 1926, + "time_per_iteration": 2.6847593784332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185383, + "balance_loss_mlp": 1.09731126, + "epoch": 0.37071950750288574, + "flos": 927570667008.0, + "grad_norm": 0.029564625514678724, + "language_loss": 0.89603549, + "learning_rate": 0.0007252447122218632, + "loss": 0.90788931, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.88037109, + "step": 1927, + "time_per_iteration": 3.106748342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179784, + "balance_loss_mlp": 1.0919987, + "epoch": 0.37091188918814927, + "flos": 419200710144.0, + "grad_norm": 0.03402230349378661, + "language_loss": 0.98334146, + "learning_rate": 0.0007249665292228834, + "loss": 0.99513936, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.87939453, + "step": 1928, + "time_per_iteration": 2.5786120891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186321, + "balance_loss_mlp": 1.09801054, + "epoch": 0.37110427087341286, + "flos": 464146265088.0, + "grad_norm": 0.029271450765855984, + "language_loss": 0.9102214, + "learning_rate": 0.000724688258884151, + "loss": 0.92208457, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.88183594, + "step": 1929, + "time_per_iteration": 2.5388894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185686, + "balance_loss_mlp": 1.09780467, + "epoch": 0.3712966525586764, + "flos": 851080449024.0, + "grad_norm": 0.02435916983518334, + "language_loss": 0.9136247, + "learning_rate": 0.0007244099013137002, + "loss": 0.92548156, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.88037109, + "step": 1930, + "time_per_iteration": 3.0708000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.09159458, + "epoch": 0.37148903424394, + "flos": 927557932032.0, + "grad_norm": 0.024720397528266293, + "language_loss": 0.95256186, + "learning_rate": 0.0007241314566195993, + "loss": 0.96435952, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.88232422, + "step": 1931, + "time_per_iteration": 3.2293543815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179876, + "balance_loss_mlp": 1.09180403, + "epoch": 0.37168141592920356, + "flos": 520820852736.0, + "grad_norm": 0.029266961451931986, + "language_loss": 0.92750597, + "learning_rate": 0.0007238529249099496, + "loss": 0.93930471, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.88232422, + "step": 1932, + "time_per_iteration": 2.6091582775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.10263062, + "epoch": 0.3718737976144671, + "flos": 1449059715072.0, + "grad_norm": 0.015165360012205364, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79045337, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.859375, + "step": 1933, + "time_per_iteration": 4.854676246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184357, + "balance_loss_mlp": 1.09614182, + "epoch": 0.3720661792997307, + "flos": 760953022464.0, + "grad_norm": 0.028795817149727888, + "language_loss": 0.88381398, + "learning_rate": 0.000723295600876581, + "loss": 0.89565754, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.8828125, + "step": 1934, + "time_per_iteration": 2.9830405712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118189, + "balance_loss_mlp": 1.09396136, + "epoch": 0.3722585609849942, + "flos": 518044546560.0, + "grad_norm": 0.028690096062057496, + "language_loss": 0.95446575, + "learning_rate": 0.0007230168087692344, + "loss": 0.96628463, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.88085938, + "step": 1935, + "time_per_iteration": 2.651982307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181923, + "balance_loss_mlp": 1.09404159, + "epoch": 0.3724509426702578, + "flos": 783868597248.0, + "grad_norm": 0.02900654324264667, + "language_loss": 0.88952625, + "learning_rate": 0.0007227379300790839, + "loss": 0.90134549, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.88037109, + "step": 1936, + "time_per_iteration": 3.0127265453338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177948, + "balance_loss_mlp": 1.09006691, + "epoch": 0.37264332435552133, + "flos": 392599039488.0, + "grad_norm": 0.02836050450865214, + "language_loss": 0.94049299, + "learning_rate": 0.0007224589649143997, + "loss": 0.95227242, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.88037109, + "step": 1937, + "time_per_iteration": 2.5600061416625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178201, + "balance_loss_mlp": 1.09074926, + "epoch": 0.3728357060407849, + "flos": 543912345600.0, + "grad_norm": 0.027673862011078548, + "language_loss": 0.89373219, + "learning_rate": 0.0007221799133834861, + "loss": 0.90551418, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.87597656, + "step": 1938, + "time_per_iteration": 2.646632671356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011797, + "balance_loss_mlp": 1.0919621, + "epoch": 0.3730280877260485, + "flos": 434483581440.0, + "grad_norm": 0.03019004471989451, + "language_loss": 0.90666437, + "learning_rate": 0.00072190077559468, + "loss": 0.91846132, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.87890625, + "step": 1939, + "time_per_iteration": 2.5193679332733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118304, + "balance_loss_mlp": 1.0957315, + "epoch": 0.37322046941131204, + "flos": 532510953984.0, + "grad_norm": 0.02812892901872328, + "language_loss": 0.95514065, + "learning_rate": 0.0007216215516563527, + "loss": 0.96697104, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.87451172, + "step": 1940, + "time_per_iteration": 2.6975200176239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184025, + "balance_loss_mlp": 1.09666896, + "epoch": 0.3734128510965756, + "flos": 532576081920.0, + "grad_norm": 0.028733495674926814, + "language_loss": 0.91960251, + "learning_rate": 0.0007213422416769083, + "loss": 0.93144274, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.875, + "step": 1941, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183262, + "balance_loss_mlp": 1.09561944, + "epoch": 0.37360523278183916, + "flos": 501432920064.0, + "grad_norm": 0.028111058318233337, + "language_loss": 0.83044219, + "learning_rate": 0.0007210628457647849, + "loss": 0.84227479, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.87792969, + "step": 1942, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182498, + "balance_loss_mlp": 1.09475958, + "epoch": 0.37379761446710275, + "flos": 549111846912.0, + "grad_norm": 0.03172951338735415, + "language_loss": 0.86608446, + "learning_rate": 0.000720783364028453, + "loss": 0.87790942, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.87890625, + "step": 1943, + "time_per_iteration": 2.7782797813415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176645, + "balance_loss_mlp": 1.08909822, + "epoch": 0.3739899961523663, + "flos": 476739425280.0, + "grad_norm": 0.0265564263320471, + "language_loss": 0.94348681, + "learning_rate": 0.0007205037965764177, + "loss": 0.95525324, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.87695312, + "step": 1944, + "time_per_iteration": 2.5670034885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198539, + "balance_loss_mlp": 1.11003804, + "epoch": 0.37418237783762986, + "flos": 613076034048.0, + "grad_norm": 0.032068934234115415, + "language_loss": 0.94037992, + "learning_rate": 0.0007202241435172161, + "loss": 0.95236534, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.8828125, + "step": 1945, + "time_per_iteration": 2.7505762577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119095, + "balance_loss_mlp": 1.10283065, + "epoch": 0.3743747595228934, + "flos": 767628272640.0, + "grad_norm": 0.02891432689626354, + "language_loss": 0.95249915, + "learning_rate": 0.0007199444049594198, + "loss": 0.9644087, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.88085938, + "step": 1946, + "time_per_iteration": 2.9690663814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179721, + "balance_loss_mlp": 1.09188759, + "epoch": 0.374567141208157, + "flos": 525490598400.0, + "grad_norm": 0.029648083740235674, + "language_loss": 0.90769064, + "learning_rate": 0.0007196645810116322, + "loss": 0.91948783, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.87988281, + "step": 1947, + "time_per_iteration": 2.690214157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178535, + "balance_loss_mlp": 1.09065437, + "epoch": 0.37475952289342057, + "flos": 682613025792.0, + "grad_norm": 0.029716110952303924, + "language_loss": 0.91939867, + "learning_rate": 0.0007193846717824912, + "loss": 0.93118405, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.88037109, + "step": 1948, + "time_per_iteration": 2.9668121337890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179187, + "balance_loss_mlp": 1.09140122, + "epoch": 0.3749519045786841, + "flos": 461215507968.0, + "grad_norm": 0.032662314662123194, + "language_loss": 0.97396064, + "learning_rate": 0.0007191046773806669, + "loss": 0.98575246, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.87939453, + "step": 1949, + "time_per_iteration": 2.5580427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189402, + "balance_loss_mlp": 1.10166442, + "epoch": 0.3751442862639477, + "flos": 956386687488.0, + "grad_norm": 0.03764484603893814, + "language_loss": 0.94282359, + "learning_rate": 0.0007188245979148631, + "loss": 0.95471758, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.87890625, + "step": 1950, + "time_per_iteration": 3.1307644844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185097, + "balance_loss_mlp": 1.09678674, + "epoch": 0.3753366679492112, + "flos": 528805392384.0, + "grad_norm": 0.0321726971318772, + "language_loss": 0.95554888, + "learning_rate": 0.0007185444334938157, + "loss": 0.96739984, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.8828125, + "step": 1951, + "time_per_iteration": 2.7235019207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181124, + "balance_loss_mlp": 1.09324276, + "epoch": 0.3755290496344748, + "flos": 522848550912.0, + "grad_norm": 0.029170285322497422, + "language_loss": 0.91979843, + "learning_rate": 0.0007182641842262947, + "loss": 0.93160963, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.88037109, + "step": 1952, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179821, + "balance_loss_mlp": 1.09193957, + "epoch": 0.37572143131973834, + "flos": 622371864576.0, + "grad_norm": 0.029206332986401715, + "language_loss": 0.85116351, + "learning_rate": 0.0007179838502211022, + "loss": 0.86296165, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.88037109, + "step": 1953, + "time_per_iteration": 2.8308520317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185603, + "balance_loss_mlp": 1.0973407, + "epoch": 0.37591381300500193, + "flos": 772273823232.0, + "grad_norm": 0.030259488278154622, + "language_loss": 0.94510454, + "learning_rate": 0.0007177034315870738, + "loss": 0.9569605, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.88232422, + "step": 1954, + "time_per_iteration": 2.966627359390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09908688, + "epoch": 0.37610619469026546, + "flos": 521480864256.0, + "grad_norm": 0.02960656624392615, + "language_loss": 0.99060822, + "learning_rate": 0.0007174229284330773, + "loss": 1.00248265, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.88330078, + "step": 1955, + "time_per_iteration": 2.642186403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182076, + "balance_loss_mlp": 1.09338391, + "epoch": 0.37629857637552905, + "flos": 599970582528.0, + "grad_norm": 0.025408092842649905, + "language_loss": 0.92700577, + "learning_rate": 0.0007171423408680141, + "loss": 0.93882644, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.88671875, + "step": 1956, + "time_per_iteration": 2.8501906394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180409, + "balance_loss_mlp": 1.09138381, + "epoch": 0.37649095806079264, + "flos": 566018187264.0, + "grad_norm": 0.027446848492574977, + "language_loss": 0.96095192, + "learning_rate": 0.0007168616690008176, + "loss": 0.97275609, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.88818359, + "step": 1957, + "time_per_iteration": 2.658282995223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183288, + "balance_loss_mlp": 1.09440601, + "epoch": 0.37668333974605617, + "flos": 593568579072.0, + "grad_norm": 0.029268558303355535, + "language_loss": 0.93381131, + "learning_rate": 0.0007165809129404545, + "loss": 0.9456442, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.88671875, + "step": 1958, + "time_per_iteration": 2.738896608352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185047, + "balance_loss_mlp": 1.09621239, + "epoch": 0.37687572143131975, + "flos": 420364280832.0, + "grad_norm": 0.028940223287944336, + "language_loss": 0.94791234, + "learning_rate": 0.0007163000727959239, + "loss": 0.95976275, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.88623047, + "step": 1959, + "time_per_iteration": 2.5175514221191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122541, + "balance_loss_mlp": 1.14034271, + "epoch": 0.3770681031165833, + "flos": 1360384568832.0, + "grad_norm": 0.031863979933265396, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79184484, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.8515625, + "step": 1960, + "time_per_iteration": 4.834294557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187625, + "balance_loss_mlp": 1.0985992, + "epoch": 0.3772604848018469, + "flos": 646153568256.0, + "grad_norm": 0.027699188267120346, + "language_loss": 0.9236567, + "learning_rate": 0.00071573814069052, + "loss": 0.93553299, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.88818359, + "step": 1961, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195985, + "balance_loss_mlp": 1.10681665, + "epoch": 0.3774528664871104, + "flos": 903200810496.0, + "grad_norm": 0.025601029742712816, + "language_loss": 0.93588847, + "learning_rate": 0.0007154570489478081, + "loss": 0.94784832, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.88964844, + "step": 1962, + "time_per_iteration": 3.2312510013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198663, + "balance_loss_mlp": 1.1095897, + "epoch": 0.377645248172374, + "flos": 789462868992.0, + "grad_norm": 0.028157211525065163, + "language_loss": 0.92405236, + "learning_rate": 0.0007151758735572514, + "loss": 0.93603897, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.88867188, + "step": 1963, + "time_per_iteration": 3.0338857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192995, + "balance_loss_mlp": 1.10396981, + "epoch": 0.3778376298576376, + "flos": 587924642304.0, + "grad_norm": 0.030822839560022956, + "language_loss": 0.89740217, + "learning_rate": 0.0007148946146280119, + "loss": 0.90933216, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.88818359, + "step": 1964, + "time_per_iteration": 2.795830488204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193161, + "balance_loss_mlp": 1.10656738, + "epoch": 0.3780300115429011, + "flos": 1399669997568.0, + "grad_norm": 0.013238700163895742, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.7338531, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.8671875, + "step": 1965, + "time_per_iteration": 4.866962909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120089, + "balance_loss_mlp": 1.11372375, + "epoch": 0.3782223932281647, + "flos": 1360631619072.0, + "grad_norm": 0.015556792607008025, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76542836, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.87304688, + "step": 1966, + "time_per_iteration": 4.942438364028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179172, + "balance_loss_mlp": 1.09114802, + "epoch": 0.37841477491342823, + "flos": 705515865600.0, + "grad_norm": 0.024767419651172896, + "language_loss": 0.90831983, + "learning_rate": 0.0007140503377003022, + "loss": 0.92011154, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.88183594, + "step": 1967, + "time_per_iteration": 2.9852232933044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118121, + "balance_loss_mlp": 1.09318614, + "epoch": 0.3786071565986918, + "flos": 530155614720.0, + "grad_norm": 0.02676934241732637, + "language_loss": 0.92451024, + "learning_rate": 0.000713768745708599, + "loss": 0.93632239, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.88183594, + "step": 1968, + "time_per_iteration": 2.6276321411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180899, + "balance_loss_mlp": 1.09311283, + "epoch": 0.37879953828395535, + "flos": 994900039680.0, + "grad_norm": 0.026029915049846697, + "language_loss": 0.85207623, + "learning_rate": 0.0007134870707245085, + "loss": 0.86388516, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.87939453, + "step": 1969, + "time_per_iteration": 3.2757370471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118867, + "balance_loss_mlp": 1.10074103, + "epoch": 0.37899191996921894, + "flos": 627792219648.0, + "grad_norm": 0.029282968357198087, + "language_loss": 0.91297084, + "learning_rate": 0.0007132053128573864, + "loss": 0.92485756, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.88085938, + "step": 1970, + "time_per_iteration": 2.713987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184407, + "balance_loss_mlp": 1.09633517, + "epoch": 0.37918430165448247, + "flos": 687519088128.0, + "grad_norm": 0.026716081838251738, + "language_loss": 0.91701669, + "learning_rate": 0.0007129234722166211, + "loss": 0.92886078, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.88232422, + "step": 1971, + "time_per_iteration": 2.830312728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178089, + "balance_loss_mlp": 1.09025514, + "epoch": 0.37937668333974606, + "flos": 476617901568.0, + "grad_norm": 0.023390773702336033, + "language_loss": 0.97041333, + "learning_rate": 0.0007126415489116328, + "loss": 0.98219419, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.87988281, + "step": 1972, + "time_per_iteration": 2.6577088832855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186585, + "balance_loss_mlp": 1.09903812, + "epoch": 0.37956906502500964, + "flos": 708823928832.0, + "grad_norm": 0.02822522227358307, + "language_loss": 0.89341533, + "learning_rate": 0.0007123595430518736, + "loss": 0.90528119, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.87695312, + "step": 1973, + "time_per_iteration": 2.8803040981292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187247, + "balance_loss_mlp": 1.09974778, + "epoch": 0.3797614467102732, + "flos": 427558553088.0, + "grad_norm": 0.030455517002935972, + "language_loss": 0.93240166, + "learning_rate": 0.0007120774547468282, + "loss": 0.94427419, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.87646484, + "step": 1974, + "time_per_iteration": 2.5190658569335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185963, + "balance_loss_mlp": 1.09836841, + "epoch": 0.37995382839553676, + "flos": 482880916992.0, + "grad_norm": 0.028219754054602288, + "language_loss": 0.89357984, + "learning_rate": 0.0007117952841060128, + "loss": 0.9054395, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.87744141, + "step": 1975, + "time_per_iteration": 2.6428894996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184241, + "balance_loss_mlp": 1.09631252, + "epoch": 0.3801462100808003, + "flos": 561670078464.0, + "grad_norm": 0.02907805968320273, + "language_loss": 0.90876186, + "learning_rate": 0.0007115130312389756, + "loss": 0.92060423, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.88085938, + "step": 1976, + "time_per_iteration": 2.669287919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.10066783, + "epoch": 0.3803385917660639, + "flos": 465887255040.0, + "grad_norm": 0.031138982719559682, + "language_loss": 0.88565898, + "learning_rate": 0.0007112306962552973, + "loss": 0.89754546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.88134766, + "step": 1977, + "time_per_iteration": 2.617105007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188488, + "balance_loss_mlp": 1.10055935, + "epoch": 0.3805309734513274, + "flos": 522904946688.0, + "grad_norm": 0.027881475391737562, + "language_loss": 0.92461807, + "learning_rate": 0.0007109482792645896, + "loss": 0.93650293, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.88085938, + "step": 1978, + "time_per_iteration": 2.7350404262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191644, + "balance_loss_mlp": 1.10352468, + "epoch": 0.380723355136591, + "flos": 592552728576.0, + "grad_norm": 0.03010131618310245, + "language_loss": 0.91373634, + "learning_rate": 0.0007106657803764969, + "loss": 0.92565274, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.88183594, + "step": 1979, + "time_per_iteration": 2.7113609313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188099, + "balance_loss_mlp": 1.10007489, + "epoch": 0.38091573682185453, + "flos": 623854344192.0, + "grad_norm": 0.03122566409921124, + "language_loss": 0.90192807, + "learning_rate": 0.0007103831997006948, + "loss": 0.91380906, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.88183594, + "step": 1980, + "time_per_iteration": 2.7460203170776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183293, + "balance_loss_mlp": 1.09507859, + "epoch": 0.3811081185071181, + "flos": 570175641600.0, + "grad_norm": 0.027157726640451497, + "language_loss": 0.92157245, + "learning_rate": 0.0007101005373468908, + "loss": 0.9334054, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.8828125, + "step": 1981, + "time_per_iteration": 2.869722604751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176795, + "balance_loss_mlp": 1.08891392, + "epoch": 0.3813005001923817, + "flos": 585990269952.0, + "grad_norm": 0.026054611177121254, + "language_loss": 0.92786968, + "learning_rate": 0.0007098177934248242, + "loss": 0.9396376, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.88037109, + "step": 1982, + "time_per_iteration": 2.7341668605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179814, + "balance_loss_mlp": 1.09188521, + "epoch": 0.38149288187764524, + "flos": 622810295808.0, + "grad_norm": 0.03120804506271422, + "language_loss": 0.94404829, + "learning_rate": 0.0007095349680442661, + "loss": 0.95584643, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.88085938, + "step": 1983, + "time_per_iteration": 2.845836639404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182966, + "balance_loss_mlp": 1.09522831, + "epoch": 0.3816852635629088, + "flos": 571797109248.0, + "grad_norm": 0.027372063240090748, + "language_loss": 0.86448967, + "learning_rate": 0.0007092520613150188, + "loss": 0.87631935, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.87890625, + "step": 1984, + "time_per_iteration": 2.6740176677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178711, + "balance_loss_mlp": 1.09106863, + "epoch": 0.38187764524817236, + "flos": 566678198784.0, + "grad_norm": 0.03160695384354602, + "language_loss": 0.87573516, + "learning_rate": 0.0007089690733469165, + "loss": 0.88752234, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.87792969, + "step": 1985, + "time_per_iteration": 2.717921733856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178571, + "balance_loss_mlp": 1.09073794, + "epoch": 0.38207002693343595, + "flos": 632398838784.0, + "grad_norm": 0.031031403109496963, + "language_loss": 0.90504575, + "learning_rate": 0.000708686004249825, + "loss": 0.91683149, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.87988281, + "step": 1986, + "time_per_iteration": 2.758554697036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179432, + "balance_loss_mlp": 1.09164619, + "epoch": 0.3822624086186995, + "flos": 549840989184.0, + "grad_norm": 0.025201133141653974, + "language_loss": 0.97533029, + "learning_rate": 0.0007084028541336413, + "loss": 0.98712462, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.87939453, + "step": 1987, + "time_per_iteration": 2.6981115341186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187219, + "balance_loss_mlp": 1.09909916, + "epoch": 0.38245479030396307, + "flos": 615066802176.0, + "grad_norm": 0.02853553744793089, + "language_loss": 0.9291808, + "learning_rate": 0.0007081196231082942, + "loss": 0.94105303, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.8828125, + "step": 1988, + "time_per_iteration": 2.7912278175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.09851646, + "epoch": 0.38264717198922665, + "flos": 669303458304.0, + "grad_norm": 0.029318681320032423, + "language_loss": 0.88455558, + "learning_rate": 0.0007078363112837436, + "loss": 0.89642197, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.8828125, + "step": 1989, + "time_per_iteration": 2.8133885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187352, + "balance_loss_mlp": 1.09927964, + "epoch": 0.3828395536744902, + "flos": 455686364160.0, + "grad_norm": 0.029265262626364436, + "language_loss": 0.9249233, + "learning_rate": 0.000707552918769981, + "loss": 0.93679678, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.88232422, + "step": 1990, + "time_per_iteration": 2.538587808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180802, + "balance_loss_mlp": 1.09277809, + "epoch": 0.3830319353597538, + "flos": 500482197504.0, + "grad_norm": 0.02588536582900798, + "language_loss": 0.91112638, + "learning_rate": 0.000707269445677029, + "loss": 0.92293441, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.88183594, + "step": 1991, + "time_per_iteration": 2.7578041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183391, + "balance_loss_mlp": 1.09536684, + "epoch": 0.3832243170450173, + "flos": 745466035200.0, + "grad_norm": 0.02707218781991338, + "language_loss": 0.91718936, + "learning_rate": 0.0007069858921149416, + "loss": 0.92902327, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.88183594, + "step": 1992, + "time_per_iteration": 2.948418617248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184259, + "balance_loss_mlp": 1.09613955, + "epoch": 0.3834166987302809, + "flos": 579345219072.0, + "grad_norm": 0.02587271093699699, + "language_loss": 0.92343616, + "learning_rate": 0.0007067022581938043, + "loss": 0.93527877, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.8828125, + "step": 1993, + "time_per_iteration": 2.881967782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09965289, + "epoch": 0.3836090804155444, + "flos": 537608397312.0, + "grad_norm": 0.029882536442049617, + "language_loss": 0.91833031, + "learning_rate": 0.0007064185440237334, + "loss": 0.9302085, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.88330078, + "step": 1994, + "time_per_iteration": 2.7481510639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.10189474, + "epoch": 0.383801462100808, + "flos": 603051061248.0, + "grad_norm": 0.027232179622410133, + "language_loss": 0.91516536, + "learning_rate": 0.0007061347497148764, + "loss": 0.92706549, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.8828125, + "step": 1995, + "time_per_iteration": 2.762807846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191619, + "balance_loss_mlp": 1.10321367, + "epoch": 0.38399384378607154, + "flos": 573798610944.0, + "grad_norm": 0.03191203592253993, + "language_loss": 0.9478448, + "learning_rate": 0.0007058508753774122, + "loss": 0.95976096, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.88476562, + "step": 1996, + "time_per_iteration": 2.7208473682403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185202, + "balance_loss_mlp": 1.09708297, + "epoch": 0.38418622547133513, + "flos": 537779586048.0, + "grad_norm": 0.03234926235653744, + "language_loss": 0.93760306, + "learning_rate": 0.0007055669211215505, + "loss": 0.94945514, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.8828125, + "step": 1997, + "time_per_iteration": 2.6605474948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182194, + "balance_loss_mlp": 1.09397876, + "epoch": 0.3843786071565987, + "flos": 574013460480.0, + "grad_norm": 0.03558568539094479, + "language_loss": 0.86620909, + "learning_rate": 0.0007052828870575322, + "loss": 0.87803102, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.88378906, + "step": 1998, + "time_per_iteration": 2.6478962898254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179215, + "balance_loss_mlp": 1.09100008, + "epoch": 0.38457098884186225, + "flos": 730079104512.0, + "grad_norm": 0.027610192556292087, + "language_loss": 0.94167769, + "learning_rate": 0.0007049987732956291, + "loss": 0.95346981, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.88378906, + "step": 1999, + "time_per_iteration": 2.9643850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190926, + "balance_loss_mlp": 1.10199583, + "epoch": 0.38476337052712584, + "flos": 584620581888.0, + "grad_norm": 0.023866575274933036, + "language_loss": 0.8787694, + "learning_rate": 0.0007047145799461439, + "loss": 0.89067864, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.88720703, + "step": 2000, + "time_per_iteration": 2.8542819023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191076, + "balance_loss_mlp": 1.10200322, + "epoch": 0.38495575221238937, + "flos": 554158898688.0, + "grad_norm": 0.025960095413567152, + "language_loss": 0.89154112, + "learning_rate": 0.00070443030711941, + "loss": 0.90345186, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.88867188, + "step": 2001, + "time_per_iteration": 2.770023822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189246, + "balance_loss_mlp": 1.10084057, + "epoch": 0.38514813389765296, + "flos": 655676983296.0, + "grad_norm": 0.026490656569535233, + "language_loss": 0.88696259, + "learning_rate": 0.0007041459549257924, + "loss": 0.89885509, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.88476562, + "step": 2002, + "time_per_iteration": 4.357714414596558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_mlp": 1.09392142, + "epoch": 0.3853405155829165, + "flos": 869645913600.0, + "grad_norm": 0.03138294802585753, + "language_loss": 0.86704218, + "learning_rate": 0.0007038615234756859, + "loss": 0.87886453, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.88476562, + "step": 2003, + "time_per_iteration": 3.154315233230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09135854, + "epoch": 0.3855328972681801, + "flos": 547468185600.0, + "grad_norm": 0.030993794918127784, + "language_loss": 0.91032863, + "learning_rate": 0.000703577012879517, + "loss": 0.92212439, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.88378906, + "step": 2004, + "time_per_iteration": 2.6320230960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184907, + "balance_loss_mlp": 1.09673953, + "epoch": 0.3857252789534436, + "flos": 535098607104.0, + "grad_norm": 0.029525133384240967, + "language_loss": 0.9687134, + "learning_rate": 0.0007032924232477423, + "loss": 0.98056245, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.88330078, + "step": 2005, + "time_per_iteration": 2.650982618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184324, + "balance_loss_mlp": 1.09630013, + "epoch": 0.3859176606387072, + "flos": 492766901760.0, + "grad_norm": 0.029334702789067958, + "language_loss": 0.8823278, + "learning_rate": 0.0007030077546908493, + "loss": 0.89417106, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.88183594, + "step": 2006, + "time_per_iteration": 2.642333745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203979, + "balance_loss_mlp": 1.11700439, + "epoch": 0.3861100423239708, + "flos": 1490155991040.0, + "grad_norm": 0.02217822259323008, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84268641, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.87109375, + "step": 2007, + "time_per_iteration": 4.759521961212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184336, + "balance_loss_mlp": 1.09635913, + "epoch": 0.3863024240092343, + "flos": 474692261376.0, + "grad_norm": 0.030825589148035897, + "language_loss": 0.87378025, + "learning_rate": 0.0007024381812438117, + "loss": 0.88562357, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.88134766, + "step": 2008, + "time_per_iteration": 2.5227372646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184691, + "balance_loss_mlp": 1.09728634, + "epoch": 0.3864948056944979, + "flos": 717978769920.0, + "grad_norm": 0.032935981886219476, + "language_loss": 0.91112518, + "learning_rate": 0.0007021532765747951, + "loss": 0.92297208, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.87548828, + "step": 2009, + "time_per_iteration": 2.963550567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182737, + "balance_loss_mlp": 1.0952853, + "epoch": 0.38668718737976143, + "flos": 728954465280.0, + "grad_norm": 0.030267959416106823, + "language_loss": 0.86631739, + "learning_rate": 0.0007018682934229162, + "loss": 0.87814474, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.87597656, + "step": 2010, + "time_per_iteration": 2.955132246017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179617, + "balance_loss_mlp": 1.09235525, + "epoch": 0.386879569065025, + "flos": 526488984576.0, + "grad_norm": 0.02588052645359636, + "language_loss": 0.89375025, + "learning_rate": 0.0007015832318988152, + "loss": 0.90554643, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.87402344, + "step": 2011, + "time_per_iteration": 2.612443208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117942, + "balance_loss_mlp": 1.09454346, + "epoch": 0.38707195075028855, + "flos": 1530724512768.0, + "grad_norm": 0.010241364382771095, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.75069499, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.84960938, + "step": 2012, + "time_per_iteration": 4.952507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187813, + "balance_loss_mlp": 1.10040927, + "epoch": 0.38726433243555214, + "flos": 558385483776.0, + "grad_norm": 0.026729103388188073, + "language_loss": 0.89776802, + "learning_rate": 0.0007010128741766604, + "loss": 0.90964615, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.87548828, + "step": 2013, + "time_per_iteration": 2.759916067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184734, + "balance_loss_mlp": 1.09756815, + "epoch": 0.38745671412081567, + "flos": 554755783680.0, + "grad_norm": 0.0314384592840016, + "language_loss": 0.91517645, + "learning_rate": 0.0007007275782000391, + "loss": 0.92702377, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.87304688, + "step": 2014, + "time_per_iteration": 2.6659133434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181864, + "balance_loss_mlp": 1.09469819, + "epoch": 0.38764909580607926, + "flos": 459344262144.0, + "grad_norm": 0.028810992523736655, + "language_loss": 0.92611015, + "learning_rate": 0.0007004422042940605, + "loss": 0.9379288, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.87304688, + "step": 2015, + "time_per_iteration": 2.4901411533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180932, + "balance_loss_mlp": 1.09376657, + "epoch": 0.38784147749134285, + "flos": 523258784256.0, + "grad_norm": 0.030339968140386194, + "language_loss": 0.98432136, + "learning_rate": 0.0007001567525695169, + "loss": 0.99613065, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.87304688, + "step": 2016, + "time_per_iteration": 2.605134963989258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182969, + "balance_loss_mlp": 1.09575546, + "epoch": 0.3880338591766064, + "flos": 667400011776.0, + "grad_norm": 0.023304348995526428, + "language_loss": 0.90603948, + "learning_rate": 0.0006998712231372303, + "loss": 0.91786909, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.87353516, + "step": 2017, + "time_per_iteration": 2.9866511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187647, + "balance_loss_mlp": 1.10024321, + "epoch": 0.38822624086186996, + "flos": 595175310336.0, + "grad_norm": 0.027834044235160192, + "language_loss": 0.92810535, + "learning_rate": 0.0006995856161080532, + "loss": 0.93998176, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.87548828, + "step": 2018, + "time_per_iteration": 2.8917806148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181908, + "balance_loss_mlp": 1.09426534, + "epoch": 0.3884186225471335, + "flos": 613681651200.0, + "grad_norm": 0.030912624722110756, + "language_loss": 0.90135586, + "learning_rate": 0.0006992999315928679, + "loss": 0.91317499, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.87792969, + "step": 2019, + "time_per_iteration": 2.821570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179846, + "balance_loss_mlp": 1.0924896, + "epoch": 0.3886110042323971, + "flos": 608243831808.0, + "grad_norm": 0.025167723735071885, + "language_loss": 0.91748118, + "learning_rate": 0.0006990141697025871, + "loss": 0.92927969, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.875, + "step": 2020, + "time_per_iteration": 2.774073600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181915, + "balance_loss_mlp": 1.09684753, + "epoch": 0.3888033859176606, + "flos": 1531193869824.0, + "grad_norm": 0.011544022481713089, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77541554, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.8515625, + "step": 2021, + "time_per_iteration": 4.741650581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174887, + "balance_loss_mlp": 1.08734, + "epoch": 0.3889957676029242, + "flos": 693671313408.0, + "grad_norm": 0.03334226176751645, + "language_loss": 0.90383756, + "learning_rate": 0.0006984424142405392, + "loss": 0.91558647, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.87695312, + "step": 2022, + "time_per_iteration": 2.839838981628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174992, + "balance_loss_mlp": 1.08734977, + "epoch": 0.3891881492881878, + "flos": 516194767872.0, + "grad_norm": 0.031660307701904165, + "language_loss": 0.90829813, + "learning_rate": 0.0006981564208907474, + "loss": 0.92004812, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.87792969, + "step": 2023, + "time_per_iteration": 2.6160523891448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179623, + "balance_loss_mlp": 1.09178972, + "epoch": 0.3893805309734513, + "flos": 630175756800.0, + "grad_norm": 0.02822603249283798, + "language_loss": 0.96692258, + "learning_rate": 0.0006978703506098102, + "loss": 0.97871882, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.87988281, + "step": 2024, + "time_per_iteration": 2.770775556564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177682, + "balance_loss_mlp": 1.08994389, + "epoch": 0.3895729126587149, + "flos": 545206172160.0, + "grad_norm": 0.026225366557941037, + "language_loss": 0.95314252, + "learning_rate": 0.00069758420350879, + "loss": 0.96491939, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.87890625, + "step": 2025, + "time_per_iteration": 2.615687608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179844, + "balance_loss_mlp": 1.09201062, + "epoch": 0.38976529434397844, + "flos": 619406178816.0, + "grad_norm": 0.03181269468531491, + "language_loss": 0.9379099, + "learning_rate": 0.000697297979698779, + "loss": 0.94970834, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.87988281, + "step": 2026, + "time_per_iteration": 2.723860740661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187768, + "balance_loss_mlp": 1.10007727, + "epoch": 0.38995767602924203, + "flos": 836344797696.0, + "grad_norm": 0.025703512313876988, + "language_loss": 0.89683533, + "learning_rate": 0.0006970116792908992, + "loss": 0.90871298, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.87841797, + "step": 2027, + "time_per_iteration": 3.0871434211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117977, + "balance_loss_mlp": 1.09203207, + "epoch": 0.39015005771450556, + "flos": 542646716928.0, + "grad_norm": 0.03022946762166595, + "language_loss": 0.88945854, + "learning_rate": 0.000696725302396302, + "loss": 0.9012562, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.87890625, + "step": 2028, + "time_per_iteration": 2.632178783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174959, + "balance_loss_mlp": 1.0871253, + "epoch": 0.39034243939976915, + "flos": 1009140864000.0, + "grad_norm": 0.026055335602768993, + "language_loss": 0.92111158, + "learning_rate": 0.0006964388491261692, + "loss": 0.93286121, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.87988281, + "step": 2029, + "time_per_iteration": 3.2683680057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174119, + "balance_loss_mlp": 1.08633304, + "epoch": 0.3905348210850327, + "flos": 680240222208.0, + "grad_norm": 0.029787695509808892, + "language_loss": 0.96251416, + "learning_rate": 0.0006961523195917114, + "loss": 0.97425532, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.87939453, + "step": 2030, + "time_per_iteration": 2.807161331176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182527, + "balance_loss_mlp": 1.09459865, + "epoch": 0.39072720277029627, + "flos": 549988709376.0, + "grad_norm": 0.03099080969443711, + "language_loss": 0.86433041, + "learning_rate": 0.0006958657139041696, + "loss": 0.87615567, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.88085938, + "step": 2031, + "time_per_iteration": 2.728208065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119693, + "balance_loss_mlp": 1.11052704, + "epoch": 0.39091958445555985, + "flos": 1551051159552.0, + "grad_norm": 0.01789751173127641, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77909899, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.86523438, + "step": 2032, + "time_per_iteration": 4.911708354949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09193051, + "epoch": 0.3911119661408234, + "flos": 505051886592.0, + "grad_norm": 0.03095157096826047, + "language_loss": 0.85940099, + "learning_rate": 0.0006952922745149434, + "loss": 0.87119675, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.87792969, + "step": 2033, + "time_per_iteration": 2.649538040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_mlp": 1.08903146, + "epoch": 0.391304347826087, + "flos": 558329088000.0, + "grad_norm": 0.028319463440814277, + "language_loss": 0.94666743, + "learning_rate": 0.000695005441035888, + "loss": 0.95843232, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.87597656, + "step": 2034, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178574, + "balance_loss_mlp": 1.09293365, + "epoch": 0.3914967295113505, + "flos": 1502941807104.0, + "grad_norm": 0.0063133772361172544, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7490201, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.85742188, + "step": 2035, + "time_per_iteration": 4.863725423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180506, + "balance_loss_mlp": 1.09338748, + "epoch": 0.3916891111966141, + "flos": 708329101824.0, + "grad_norm": 0.025753563122139746, + "language_loss": 0.86980474, + "learning_rate": 0.0006944315470656863, + "loss": 0.88160974, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.87255859, + "step": 2036, + "time_per_iteration": 2.936588764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188418, + "balance_loss_mlp": 1.10110939, + "epoch": 0.3918814928818776, + "flos": 557408564736.0, + "grad_norm": 0.031943380680049066, + "language_loss": 0.99613088, + "learning_rate": 0.000694144486797345, + "loss": 1.00801504, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.87451172, + "step": 2037, + "time_per_iteration": 2.676107883453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193756, + "balance_loss_mlp": 1.10868835, + "epoch": 0.3920738745671412, + "flos": 1541685471744.0, + "grad_norm": 0.012882287356254449, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8071419, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.8515625, + "step": 2038, + "time_per_iteration": 4.63246750831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178826, + "balance_loss_mlp": 1.0916127, + "epoch": 0.39226625625240474, + "flos": 499804721664.0, + "grad_norm": 0.027391930017631044, + "language_loss": 0.96627682, + "learning_rate": 0.0006935701402514156, + "loss": 0.97806513, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.87353516, + "step": 2039, + "time_per_iteration": 2.5613086223602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177521, + "balance_loss_mlp": 1.092453, + "epoch": 0.39245863793766833, + "flos": 1350450920448.0, + "grad_norm": 0.011737641894846437, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74212414, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.8515625, + "step": 2040, + "time_per_iteration": 4.902123689651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176176, + "balance_loss_mlp": 1.08881962, + "epoch": 0.3926510196229319, + "flos": 1348114142208.0, + "grad_norm": 0.028665962134257456, + "language_loss": 0.92107272, + "learning_rate": 0.0006929954931031422, + "loss": 0.93283451, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.875, + "step": 2041, + "time_per_iteration": 3.7387020587921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.08902013, + "epoch": 0.39284340130819545, + "flos": 500603721216.0, + "grad_norm": 0.024641039111334598, + "language_loss": 0.95021844, + "learning_rate": 0.0006927080570819805, + "loss": 0.96198076, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.87353516, + "step": 2042, + "time_per_iteration": 2.5837514400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117531, + "balance_loss_mlp": 1.08814418, + "epoch": 0.39303578299345904, + "flos": 521341876224.0, + "grad_norm": 0.03605238478740547, + "language_loss": 0.89998531, + "learning_rate": 0.0006924205462449161, + "loss": 0.9117384, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.87304688, + "step": 2043, + "time_per_iteration": 2.560842514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.08664155, + "epoch": 0.39322816467872257, + "flos": 909537686016.0, + "grad_norm": 0.029197625514705252, + "language_loss": 0.89668262, + "learning_rate": 0.0006921329607035702, + "loss": 0.90841925, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.87158203, + "step": 2044, + "time_per_iteration": 3.2215418815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185916, + "balance_loss_mlp": 1.09860718, + "epoch": 0.39342054636398616, + "flos": 518641431552.0, + "grad_norm": 0.026194219642157263, + "language_loss": 0.94294739, + "learning_rate": 0.0006918453005695938, + "loss": 0.95480657, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.87451172, + "step": 2045, + "time_per_iteration": 2.637197732925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183114, + "balance_loss_mlp": 1.09594774, + "epoch": 0.3936129280492497, + "flos": 549011790336.0, + "grad_norm": 0.026944227420126074, + "language_loss": 0.91576457, + "learning_rate": 0.0006915575659546662, + "loss": 0.92759573, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.87304688, + "step": 2046, + "time_per_iteration": 2.7570858001708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185485, + "balance_loss_mlp": 1.098176, + "epoch": 0.3938053097345133, + "flos": 527140263936.0, + "grad_norm": 0.02948359624940754, + "language_loss": 0.88347399, + "learning_rate": 0.0006912697569704959, + "loss": 0.89532876, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.87451172, + "step": 2047, + "time_per_iteration": 2.635467290878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09899104, + "epoch": 0.39399769141977686, + "flos": 472588701696.0, + "grad_norm": 0.02995196024762557, + "language_loss": 0.93503523, + "learning_rate": 0.0006909818737288205, + "loss": 0.94689775, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.87402344, + "step": 2048, + "time_per_iteration": 2.558013916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181668, + "balance_loss_mlp": 1.09488404, + "epoch": 0.3941900731050404, + "flos": 502726746624.0, + "grad_norm": 0.02878603575662113, + "language_loss": 0.88763595, + "learning_rate": 0.000690693916341406, + "loss": 0.89945263, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.86914062, + "step": 2049, + "time_per_iteration": 2.5820720195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178505, + "balance_loss_mlp": 1.09152949, + "epoch": 0.394382454790304, + "flos": 582006732288.0, + "grad_norm": 0.024885306311727563, + "language_loss": 0.90003175, + "learning_rate": 0.0006904058849200475, + "loss": 0.91181684, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.87109375, + "step": 2050, + "time_per_iteration": 2.7304697036743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118427, + "balance_loss_mlp": 1.09700906, + "epoch": 0.3945748364755675, + "flos": 514844545536.0, + "grad_norm": 0.02745844528377672, + "language_loss": 0.91741204, + "learning_rate": 0.0006901177795765683, + "loss": 0.92925465, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.87402344, + "step": 2051, + "time_per_iteration": 2.610621213912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180664, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3947672181608311, + "flos": 595057789440.0, + "grad_norm": 0.03028158635704326, + "language_loss": 0.89240891, + "learning_rate": 0.0006898296004228213, + "loss": 0.90421557, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.87109375, + "step": 2052, + "time_per_iteration": 2.747377395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119046, + "balance_loss_mlp": 1.10634613, + "epoch": 0.39495959984609463, + "flos": 1551049158144.0, + "grad_norm": 0.018267218432335405, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.793172, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.84179688, + "step": 2053, + "time_per_iteration": 4.871596336364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117553, + "balance_loss_mlp": 1.08845937, + "epoch": 0.3951519815313582, + "flos": 497523242496.0, + "grad_norm": 0.028876315996474663, + "language_loss": 0.87133646, + "learning_rate": 0.0006892530211320763, + "loss": 0.88309175, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.87207031, + "step": 2054, + "time_per_iteration": 2.696796417236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117541, + "balance_loss_mlp": 1.08824456, + "epoch": 0.39534436321662175, + "flos": 532222244352.0, + "grad_norm": 0.031248767008087052, + "language_loss": 0.9121244, + "learning_rate": 0.000688964621218926, + "loss": 0.92387855, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.87304688, + "step": 2055, + "time_per_iteration": 2.6398446559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176401, + "balance_loss_mlp": 1.08899677, + "epoch": 0.39553674490188534, + "flos": 703724484096.0, + "grad_norm": 0.031024749515969993, + "language_loss": 0.88066703, + "learning_rate": 0.0006886761479432037, + "loss": 0.89243108, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.87548828, + "step": 2056, + "time_per_iteration": 2.896899700164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184707, + "balance_loss_mlp": 1.09720743, + "epoch": 0.3957291265871489, + "flos": 410656215552.0, + "grad_norm": 0.031805347037857014, + "language_loss": 0.92354834, + "learning_rate": 0.0006883876014169045, + "loss": 0.93539548, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.87646484, + "step": 2057, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118858, + "balance_loss_mlp": 1.10108006, + "epoch": 0.39592150827241246, + "flos": 619638492672.0, + "grad_norm": 0.03245947566344542, + "language_loss": 0.97519982, + "learning_rate": 0.000688098981752052, + "loss": 0.98708564, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.87646484, + "step": 2058, + "time_per_iteration": 2.7079999446868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183973, + "balance_loss_mlp": 1.09642518, + "epoch": 0.39611388995767605, + "flos": 822720324096.0, + "grad_norm": 0.029593298786174956, + "language_loss": 0.88381338, + "learning_rate": 0.0006878102890606982, + "loss": 0.89565313, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.87695312, + "step": 2059, + "time_per_iteration": 3.089268922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182646, + "balance_loss_mlp": 1.09524131, + "epoch": 0.3963062716429396, + "flos": 493214065152.0, + "grad_norm": 0.03350279358204369, + "language_loss": 0.88991904, + "learning_rate": 0.0006875215234549239, + "loss": 0.9017455, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.87548828, + "step": 2060, + "time_per_iteration": 2.538806200027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182648, + "balance_loss_mlp": 1.09533882, + "epoch": 0.39649865332820317, + "flos": 585833817600.0, + "grad_norm": 0.030947291001002426, + "language_loss": 0.93147129, + "learning_rate": 0.0006872326850468376, + "loss": 0.9432978, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.87451172, + "step": 2061, + "time_per_iteration": 2.6593003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179357, + "balance_loss_mlp": 1.09214342, + "epoch": 0.3966910350134667, + "flos": 459511448064.0, + "grad_norm": 0.03264577108022065, + "language_loss": 0.89072591, + "learning_rate": 0.0006869437739485762, + "loss": 0.90251946, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.87353516, + "step": 2062, + "time_per_iteration": 2.605191230773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180604, + "balance_loss_mlp": 1.0932951, + "epoch": 0.3968834166987303, + "flos": 509614844928.0, + "grad_norm": 0.02743430972643364, + "language_loss": 0.9889155, + "learning_rate": 0.0006866547902723053, + "loss": 1.00072145, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.87451172, + "step": 2063, + "time_per_iteration": 2.6466383934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178614, + "balance_loss_mlp": 1.09116209, + "epoch": 0.3970757983839938, + "flos": 573742215168.0, + "grad_norm": 0.030016333454088624, + "language_loss": 0.87640852, + "learning_rate": 0.000686365734130218, + "loss": 0.88819462, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.87597656, + "step": 2064, + "time_per_iteration": 2.6795899868011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178875, + "balance_loss_mlp": 1.09161353, + "epoch": 0.3972681800692574, + "flos": 482585476608.0, + "grad_norm": 0.03115409384976, + "language_loss": 0.90479839, + "learning_rate": 0.000686076605634536, + "loss": 0.91658711, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.87402344, + "step": 2065, + "time_per_iteration": 2.6956639289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176026, + "balance_loss_mlp": 1.0887177, + "epoch": 0.397460561754521, + "flos": 488904887808.0, + "grad_norm": 0.028660372999824147, + "language_loss": 0.91924292, + "learning_rate": 0.0006857874048975088, + "loss": 0.93100321, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.87451172, + "step": 2066, + "time_per_iteration": 2.541707992553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182319, + "balance_loss_mlp": 1.09515274, + "epoch": 0.3976529434397845, + "flos": 422895538176.0, + "grad_norm": 0.03007540042591745, + "language_loss": 0.93814421, + "learning_rate": 0.0006854981320314142, + "loss": 0.94996738, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.87304688, + "step": 2067, + "time_per_iteration": 2.455916166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118284, + "balance_loss_mlp": 1.09586513, + "epoch": 0.3978453251250481, + "flos": 546621522432.0, + "grad_norm": 0.0330596148196893, + "language_loss": 0.94973123, + "learning_rate": 0.0006852087871485579, + "loss": 0.96155965, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.87109375, + "step": 2068, + "time_per_iteration": 2.609492063522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175372, + "balance_loss_mlp": 1.08801544, + "epoch": 0.39803770681031164, + "flos": 652001620992.0, + "grad_norm": 0.0336676185790188, + "language_loss": 0.8912071, + "learning_rate": 0.0006849193703612735, + "loss": 0.90296078, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.875, + "step": 2069, + "time_per_iteration": 2.816309690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.09071827, + "epoch": 0.39823008849557523, + "flos": 741426101760.0, + "grad_norm": 0.026625397702565265, + "language_loss": 0.84925234, + "learning_rate": 0.0006846298817819225, + "loss": 0.86102879, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.87060547, + "step": 2070, + "time_per_iteration": 2.9875504970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175555, + "balance_loss_mlp": 1.088485, + "epoch": 0.39842247018083876, + "flos": 385888860672.0, + "grad_norm": 0.03226539532166374, + "language_loss": 0.89664173, + "learning_rate": 0.0006843403215228945, + "loss": 0.90839732, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.87207031, + "step": 2071, + "time_per_iteration": 2.4326088428497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173604, + "balance_loss_mlp": 1.08648539, + "epoch": 0.39861485186610235, + "flos": 534762233856.0, + "grad_norm": 0.028550920618746804, + "language_loss": 0.88238078, + "learning_rate": 0.0006840506896966065, + "loss": 0.89411676, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.87255859, + "step": 2072, + "time_per_iteration": 2.6961326599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_mlp": 1.09084272, + "epoch": 0.39880723355136594, + "flos": 644412578304.0, + "grad_norm": 0.03366874484709253, + "language_loss": 0.90951228, + "learning_rate": 0.0006837609864155038, + "loss": 0.9212895, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.87011719, + "step": 2073, + "time_per_iteration": 2.8584561347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119321, + "balance_loss_mlp": 1.10623515, + "epoch": 0.39899961523662947, + "flos": 516891709440.0, + "grad_norm": 0.031985803275243696, + "language_loss": 0.90341693, + "learning_rate": 0.0006834712117920592, + "loss": 0.91534901, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.87109375, + "step": 2074, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186501, + "balance_loss_mlp": 1.09933496, + "epoch": 0.39919199692189306, + "flos": 465338033664.0, + "grad_norm": 0.0320663192521817, + "language_loss": 0.92968071, + "learning_rate": 0.0006831813659387729, + "loss": 0.94154572, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.87304688, + "step": 2075, + "time_per_iteration": 2.5216238498687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184926, + "balance_loss_mlp": 1.09785569, + "epoch": 0.3993843786071566, + "flos": 532678139904.0, + "grad_norm": 0.03441409861038799, + "language_loss": 0.91210699, + "learning_rate": 0.0006828914489681733, + "loss": 0.92395616, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.87207031, + "step": 2076, + "time_per_iteration": 2.686810255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186966, + "balance_loss_mlp": 1.10008633, + "epoch": 0.3995767602924202, + "flos": 505023688704.0, + "grad_norm": 0.02837279486305722, + "language_loss": 0.91445708, + "learning_rate": 0.0006826014609928162, + "loss": 0.92632675, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.87011719, + "step": 2077, + "time_per_iteration": 2.6775381565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225517, + "balance_loss_mlp": 1.13892365, + "epoch": 0.3997691419776837, + "flos": 1457471225856.0, + "grad_norm": 0.023004253676312834, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84424907, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.8671875, + "step": 2078, + "time_per_iteration": 4.87092661857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117794, + "balance_loss_mlp": 1.09134626, + "epoch": 0.3999615236629473, + "flos": 531755615232.0, + "grad_norm": 0.028989200184594895, + "language_loss": 0.86860782, + "learning_rate": 0.0006820212724781896, + "loss": 0.88038719, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.8671875, + "step": 2079, + "time_per_iteration": 2.6908116340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176834, + "balance_loss_mlp": 1.09033561, + "epoch": 0.4001539053482108, + "flos": 696361024512.0, + "grad_norm": 0.02837619494351951, + "language_loss": 0.90808308, + "learning_rate": 0.0006817310721641694, + "loss": 0.91985142, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.86621094, + "step": 2080, + "time_per_iteration": 2.8117949962615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190878, + "balance_loss_mlp": 1.10437989, + "epoch": 0.4003462870334744, + "flos": 521378806272.0, + "grad_norm": 0.0346474179870518, + "language_loss": 0.91806537, + "learning_rate": 0.00068144080129589, + "loss": 0.9299742, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.86621094, + "step": 2081, + "time_per_iteration": 2.596397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190824, + "balance_loss_mlp": 1.10422993, + "epoch": 0.400538668718738, + "flos": 493502774784.0, + "grad_norm": 0.03225854359639043, + "language_loss": 0.90241659, + "learning_rate": 0.0006811504599860441, + "loss": 0.91432476, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.8671875, + "step": 2082, + "time_per_iteration": 2.5100014209747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187111, + "balance_loss_mlp": 1.10075557, + "epoch": 0.40073105040400153, + "flos": 491451608064.0, + "grad_norm": 0.02371927790759806, + "language_loss": 0.91368544, + "learning_rate": 0.0006808600483473526, + "loss": 0.92555654, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.86474609, + "step": 2083, + "time_per_iteration": 2.9103221893310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178586, + "balance_loss_mlp": 1.0923264, + "epoch": 0.4009234320892651, + "flos": 563539322880.0, + "grad_norm": 0.025152017879447597, + "language_loss": 0.9285866, + "learning_rate": 0.0006805695664925629, + "loss": 0.94037247, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.86376953, + "step": 2084, + "time_per_iteration": 2.804859161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170802, + "balance_loss_mlp": 1.08444667, + "epoch": 0.40111581377452865, + "flos": 426852879360.0, + "grad_norm": 0.029415551527707178, + "language_loss": 0.90934992, + "learning_rate": 0.0006802790145344506, + "loss": 0.92105794, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.86474609, + "step": 2085, + "time_per_iteration": 2.476952075958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117314, + "balance_loss_mlp": 1.0870235, + "epoch": 0.40130819545979224, + "flos": 613642719744.0, + "grad_norm": 0.028611036161279673, + "language_loss": 0.93620002, + "learning_rate": 0.0006799883925858176, + "loss": 0.94793141, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.86230469, + "step": 2086, + "time_per_iteration": 2.8800101280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.10112, + "epoch": 0.40150057714505577, + "flos": 524450552832.0, + "grad_norm": 0.02956813955479834, + "language_loss": 0.92602348, + "learning_rate": 0.0006796977007594933, + "loss": 0.93789732, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.86376953, + "step": 2087, + "time_per_iteration": 2.6013576984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191969, + "balance_loss_mlp": 1.10537529, + "epoch": 0.40169295883031936, + "flos": 562553671680.0, + "grad_norm": 0.03319927890150985, + "language_loss": 0.92797327, + "learning_rate": 0.0006794069391683345, + "loss": 0.93989295, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.8671875, + "step": 2088, + "time_per_iteration": 2.7359838485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177019, + "balance_loss_mlp": 1.09095037, + "epoch": 0.4018853405155829, + "flos": 520019851776.0, + "grad_norm": 0.03157379152927814, + "language_loss": 0.87612534, + "learning_rate": 0.0006791161079252248, + "loss": 0.88789552, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.86181641, + "step": 2089, + "time_per_iteration": 2.596851348876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118277, + "balance_loss_mlp": 1.09655797, + "epoch": 0.4020777222008465, + "flos": 527287984128.0, + "grad_norm": 0.02654740933555753, + "language_loss": 0.89437628, + "learning_rate": 0.0006788252071430747, + "loss": 0.90620387, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.86328125, + "step": 2090, + "time_per_iteration": 2.8311312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184846, + "balance_loss_mlp": 1.09853876, + "epoch": 0.40227010388611006, + "flos": 526840820736.0, + "grad_norm": 0.026844852664274194, + "language_loss": 0.92195117, + "learning_rate": 0.0006785342369348222, + "loss": 0.93379962, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.86425781, + "step": 2091, + "time_per_iteration": 2.7458736896514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191242, + "balance_loss_mlp": 1.10488725, + "epoch": 0.4024624855713736, + "flos": 433226684928.0, + "grad_norm": 0.031284534475277, + "language_loss": 0.86698365, + "learning_rate": 0.0006782431974134316, + "loss": 0.87889606, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.86474609, + "step": 2092, + "time_per_iteration": 2.607151985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176082, + "balance_loss_mlp": 1.08996522, + "epoch": 0.4026548672566372, + "flos": 768090898944.0, + "grad_norm": 0.02657615147076362, + "language_loss": 0.96284211, + "learning_rate": 0.0006779520886918949, + "loss": 0.97460294, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.86230469, + "step": 2093, + "time_per_iteration": 3.03474760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173331, + "balance_loss_mlp": 1.08711922, + "epoch": 0.4028472489419007, + "flos": 644117137920.0, + "grad_norm": 0.02625373299959776, + "language_loss": 0.87827718, + "learning_rate": 0.0006776609108832301, + "loss": 0.89001048, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.86328125, + "step": 2094, + "time_per_iteration": 2.7667970657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171496, + "balance_loss_mlp": 1.08537877, + "epoch": 0.4030396306271643, + "flos": 492823297536.0, + "grad_norm": 0.02676539061642846, + "language_loss": 0.91710174, + "learning_rate": 0.0006773696641004828, + "loss": 0.92881668, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.86230469, + "step": 2095, + "time_per_iteration": 2.6013715267181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177786, + "balance_loss_mlp": 1.09119189, + "epoch": 0.40323201231242783, + "flos": 903194079744.0, + "grad_norm": 0.03019422222161545, + "language_loss": 0.84170926, + "learning_rate": 0.0006770783484567247, + "loss": 0.85348713, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.8671875, + "step": 2096, + "time_per_iteration": 3.1032629013061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180554, + "balance_loss_mlp": 1.09405565, + "epoch": 0.4034243939976914, + "flos": 571729979904.0, + "grad_norm": 0.026575026001379017, + "language_loss": 0.91571426, + "learning_rate": 0.000676786964065055, + "loss": 0.9275198, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.86621094, + "step": 2097, + "time_per_iteration": 2.8030343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179089, + "balance_loss_mlp": 1.09254348, + "epoch": 0.403616775682955, + "flos": 508460006400.0, + "grad_norm": 0.029415731928054877, + "language_loss": 0.85702783, + "learning_rate": 0.0006764955110385986, + "loss": 0.86881876, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.86669922, + "step": 2098, + "time_per_iteration": 2.7224180698394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175119, + "balance_loss_mlp": 1.08857322, + "epoch": 0.40380915736821854, + "flos": 520410619392.0, + "grad_norm": 0.02850929110585318, + "language_loss": 0.87608683, + "learning_rate": 0.0006762039894905083, + "loss": 0.88783801, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.86669922, + "step": 2099, + "time_per_iteration": 2.5972354412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08313072, + "epoch": 0.40400153905348213, + "flos": 442887086592.0, + "grad_norm": 0.05130464738927161, + "language_loss": 0.88512945, + "learning_rate": 0.000675912399533962, + "loss": 0.89682674, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.8671875, + "step": 2100, + "time_per_iteration": 2.502772808074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168649, + "balance_loss_mlp": 1.08210301, + "epoch": 0.40419392073874566, + "flos": 773704636416.0, + "grad_norm": 0.02210637201548751, + "language_loss": 0.90372586, + "learning_rate": 0.0006756207412821656, + "loss": 0.91541237, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.86669922, + "step": 2101, + "time_per_iteration": 2.991191864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169884, + "balance_loss_mlp": 1.08319497, + "epoch": 0.40438630242400925, + "flos": 767988840960.0, + "grad_norm": 0.03154624750871164, + "language_loss": 0.88513219, + "learning_rate": 0.0006753290148483505, + "loss": 0.89683104, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.86816406, + "step": 2102, + "time_per_iteration": 3.005350112915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166151, + "balance_loss_mlp": 1.07950926, + "epoch": 0.4045786841092728, + "flos": 416128963584.0, + "grad_norm": 0.026413403572192035, + "language_loss": 0.86387646, + "learning_rate": 0.0006750372203457752, + "loss": 0.87553799, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.86767578, + "step": 2103, + "time_per_iteration": 2.4381816387176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168631, + "balance_loss_mlp": 1.08203721, + "epoch": 0.40477106579453637, + "flos": 540308841984.0, + "grad_norm": 0.025857351914300337, + "language_loss": 0.93101668, + "learning_rate": 0.0006747453578877242, + "loss": 0.94270301, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.8671875, + "step": 2104, + "time_per_iteration": 2.7268197536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169336, + "balance_loss_mlp": 1.08269489, + "epoch": 0.4049634474797999, + "flos": 828091014144.0, + "grad_norm": 0.03225143111931073, + "language_loss": 0.91022515, + "learning_rate": 0.0006744534275875085, + "loss": 0.92191851, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.86767578, + "step": 2105, + "time_per_iteration": 3.0087900161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176017, + "balance_loss_mlp": 1.08970928, + "epoch": 0.4051558291650635, + "flos": 573752948736.0, + "grad_norm": 0.02821186929772288, + "language_loss": 0.92500931, + "learning_rate": 0.0006741614295584657, + "loss": 0.93676949, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.86425781, + "step": 2106, + "time_per_iteration": 2.666135787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183174, + "balance_loss_mlp": 1.09691453, + "epoch": 0.4053482108503271, + "flos": 733244176896.0, + "grad_norm": 0.04647201706044112, + "language_loss": 0.85025966, + "learning_rate": 0.0006738693639139595, + "loss": 0.86209136, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.86376953, + "step": 2107, + "time_per_iteration": 2.9633677005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177787, + "balance_loss_mlp": 1.09100294, + "epoch": 0.4055405925355906, + "flos": 1214949336576.0, + "grad_norm": 0.0302025425082437, + "language_loss": 0.85097325, + "learning_rate": 0.0006735772307673796, + "loss": 0.86275113, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.86914062, + "step": 2108, + "time_per_iteration": 3.5333871841430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177556, + "balance_loss_mlp": 1.09105742, + "epoch": 0.4057329742208542, + "flos": 717107911680.0, + "grad_norm": 0.026166055652869804, + "language_loss": 0.8899157, + "learning_rate": 0.0006732850302321421, + "loss": 0.90169132, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.86621094, + "step": 2109, + "time_per_iteration": 2.8610079288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170548, + "balance_loss_mlp": 1.0842886, + "epoch": 0.4059253559061177, + "flos": 565953059328.0, + "grad_norm": 0.026405563608612303, + "language_loss": 0.90377712, + "learning_rate": 0.00067299276242169, + "loss": 0.91548264, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.86376953, + "step": 2110, + "time_per_iteration": 2.709127426147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197311, + "balance_loss_mlp": 1.11319733, + "epoch": 0.4061177375913813, + "flos": 1597186481664.0, + "grad_norm": 0.02594110918583908, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75579476, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.84179688, + "step": 2111, + "time_per_iteration": 4.906593322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117304, + "balance_loss_mlp": 1.08654153, + "epoch": 0.40631011927664484, + "flos": 616621140480.0, + "grad_norm": 0.028870166263774127, + "language_loss": 0.85570323, + "learning_rate": 0.0006724080254290395, + "loss": 0.86743361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.86621094, + "step": 2112, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168033, + "balance_loss_mlp": 1.08134389, + "epoch": 0.40650250096190843, + "flos": 558748053504.0, + "grad_norm": 0.030551496532206422, + "language_loss": 0.96733952, + "learning_rate": 0.0006721155564738566, + "loss": 0.97901982, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.86816406, + "step": 2113, + "time_per_iteration": 2.6917896270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174904, + "balance_loss_mlp": 1.08964539, + "epoch": 0.40669488264717196, + "flos": 1583542542336.0, + "grad_norm": 0.010618058744132962, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79797542, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.85351562, + "step": 2114, + "time_per_iteration": 4.959328651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.08476496, + "epoch": 0.40688726433243555, + "flos": 508655390208.0, + "grad_norm": 0.033503716654157654, + "language_loss": 0.93188733, + "learning_rate": 0.0006715304182135078, + "loss": 0.9436028, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.86914062, + "step": 2115, + "time_per_iteration": 2.6056840419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172073, + "balance_loss_mlp": 1.08528888, + "epoch": 0.40707964601769914, + "flos": 590351840256.0, + "grad_norm": 0.028307470802153102, + "language_loss": 0.95287716, + "learning_rate": 0.0006712377491355127, + "loss": 0.96459788, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.86914062, + "step": 2116, + "time_per_iteration": 2.8985562324523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177825, + "balance_loss_mlp": 1.09146965, + "epoch": 0.40727202770296267, + "flos": 581650893312.0, + "grad_norm": 0.026081347286493965, + "language_loss": 0.86969304, + "learning_rate": 0.0006709450135771274, + "loss": 0.88147128, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.86474609, + "step": 2117, + "time_per_iteration": 2.938913345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116718, + "balance_loss_mlp": 1.08058655, + "epoch": 0.40746440938822626, + "flos": 505108282368.0, + "grad_norm": 0.02500723808493834, + "language_loss": 0.92501736, + "learning_rate": 0.0006706522116520023, + "loss": 0.93668914, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.8671875, + "step": 2118, + "time_per_iteration": 2.6295557022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169158, + "balance_loss_mlp": 1.08246934, + "epoch": 0.4076567910734898, + "flos": 606710960640.0, + "grad_norm": 0.031046149511695622, + "language_loss": 0.91392642, + "learning_rate": 0.0006703593434738127, + "loss": 0.92561805, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.86816406, + "step": 2119, + "time_per_iteration": 2.6925787925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170168, + "balance_loss_mlp": 1.08371782, + "epoch": 0.4078491727587534, + "flos": 480518846976.0, + "grad_norm": 0.026436329156680958, + "language_loss": 0.85361552, + "learning_rate": 0.0006700664091562604, + "loss": 0.86531723, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.86572266, + "step": 2120, + "time_per_iteration": 2.567094087600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177249, + "balance_loss_mlp": 1.09065557, + "epoch": 0.4080415544440169, + "flos": 511418961408.0, + "grad_norm": 0.02549175858454111, + "language_loss": 0.92328954, + "learning_rate": 0.0006697734088130725, + "loss": 0.93506193, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.8671875, + "step": 2121, + "time_per_iteration": 2.618701934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175348, + "balance_loss_mlp": 1.0889926, + "epoch": 0.4082339361292805, + "flos": 735927157248.0, + "grad_norm": 0.030272250235271202, + "language_loss": 0.93378723, + "learning_rate": 0.0006694803425580018, + "loss": 0.94554067, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.86474609, + "step": 2122, + "time_per_iteration": 2.983313798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174826, + "balance_loss_mlp": 1.08851826, + "epoch": 0.4084263178145441, + "flos": 458404273152.0, + "grad_norm": 0.031322708915370194, + "language_loss": 0.925843, + "learning_rate": 0.0006691872105048268, + "loss": 0.93759131, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.86425781, + "step": 2123, + "time_per_iteration": 2.570157766342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171971, + "balance_loss_mlp": 1.08566332, + "epoch": 0.4086186994998076, + "flos": 564025417728.0, + "grad_norm": 0.026602974246623758, + "language_loss": 0.91457534, + "learning_rate": 0.0006688940127673513, + "loss": 0.92629504, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.86425781, + "step": 2124, + "time_per_iteration": 2.6775970458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172213, + "balance_loss_mlp": 1.08609629, + "epoch": 0.4088110811850712, + "flos": 574893050880.0, + "grad_norm": 0.023493992507127005, + "language_loss": 0.90594321, + "learning_rate": 0.0006686007494594049, + "loss": 0.91766536, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.86230469, + "step": 2125, + "time_per_iteration": 2.8212904930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166923, + "balance_loss_mlp": 1.08028209, + "epoch": 0.40900346287033473, + "flos": 457846319616.0, + "grad_norm": 0.03600016157180187, + "language_loss": 0.89846623, + "learning_rate": 0.0006683074206948425, + "loss": 0.91013545, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.86767578, + "step": 2126, + "time_per_iteration": 2.4914121627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165958, + "balance_loss_mlp": 1.07926905, + "epoch": 0.4091958445555983, + "flos": 618594444288.0, + "grad_norm": 0.027616550174826966, + "language_loss": 0.88032037, + "learning_rate": 0.0006680140265875443, + "loss": 0.89197993, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.86816406, + "step": 2127, + "time_per_iteration": 2.8309690952301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164825, + "balance_loss_mlp": 1.07846975, + "epoch": 0.40938822624086185, + "flos": 473370236928.0, + "grad_norm": 0.02755246393115647, + "language_loss": 1.01638341, + "learning_rate": 0.0006677205672514162, + "loss": 1.02803159, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.86474609, + "step": 2128, + "time_per_iteration": 2.716601610183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170358, + "balance_loss_mlp": 1.08395457, + "epoch": 0.40958060792612544, + "flos": 571117632000.0, + "grad_norm": 0.024298637355030545, + "language_loss": 0.93714547, + "learning_rate": 0.000667427042800389, + "loss": 0.94884908, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.86523438, + "step": 2129, + "time_per_iteration": 2.7863857746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181584, + "balance_loss_mlp": 1.09499085, + "epoch": 0.40977298961138897, + "flos": 610470916608.0, + "grad_norm": 0.027297656005279614, + "language_loss": 0.89951032, + "learning_rate": 0.0006671334533484192, + "loss": 0.91132617, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.8671875, + "step": 2130, + "time_per_iteration": 2.7272608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177813, + "balance_loss_mlp": 1.09160113, + "epoch": 0.40996537129665256, + "flos": 582872861184.0, + "grad_norm": 0.02438545141207517, + "language_loss": 0.89143705, + "learning_rate": 0.0006668397990094881, + "loss": 0.90321517, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.86328125, + "step": 2131, + "time_per_iteration": 2.74776554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173755, + "balance_loss_mlp": 1.08739984, + "epoch": 0.41015775298191615, + "flos": 517553722368.0, + "grad_norm": 0.026155362463659675, + "language_loss": 0.91776133, + "learning_rate": 0.0006665460798976027, + "loss": 0.92949885, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.86474609, + "step": 2132, + "time_per_iteration": 2.728180170059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172912, + "balance_loss_mlp": 1.08679533, + "epoch": 0.4103501346671797, + "flos": 511445157888.0, + "grad_norm": 0.02671704384652658, + "language_loss": 0.87880147, + "learning_rate": 0.0006662522961267947, + "loss": 0.89053059, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.86230469, + "step": 2133, + "time_per_iteration": 2.6707494258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172576, + "balance_loss_mlp": 1.08636391, + "epoch": 0.41054251635244327, + "flos": 550926696960.0, + "grad_norm": 0.02310158230225749, + "language_loss": 0.93120432, + "learning_rate": 0.0006659584478111211, + "loss": 0.9429301, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.86328125, + "step": 2134, + "time_per_iteration": 2.7634923458099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167834, + "balance_loss_mlp": 1.08162224, + "epoch": 0.4107348980377068, + "flos": 841298523648.0, + "grad_norm": 0.0323112144897684, + "language_loss": 0.91370595, + "learning_rate": 0.000665664535064664, + "loss": 0.9253844, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.86328125, + "step": 2135, + "time_per_iteration": 3.028343677520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170594, + "balance_loss_mlp": 1.08447671, + "epoch": 0.4109272797229704, + "flos": 504763176960.0, + "grad_norm": 0.026958983372987907, + "language_loss": 0.8977797, + "learning_rate": 0.0006653705580015303, + "loss": 0.90948564, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.86230469, + "step": 2136, + "time_per_iteration": 2.6786246299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173433, + "balance_loss_mlp": 1.08731592, + "epoch": 0.4111196614082339, + "flos": 612023253504.0, + "grad_norm": 0.02687154551301225, + "language_loss": 0.92936879, + "learning_rate": 0.0006650765167358523, + "loss": 0.9411031, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.86230469, + "step": 2137, + "time_per_iteration": 2.765503168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170304, + "balance_loss_mlp": 1.08409154, + "epoch": 0.4113120430934975, + "flos": 454103827968.0, + "grad_norm": 0.029691236683527498, + "language_loss": 0.97143424, + "learning_rate": 0.0006647824113817864, + "loss": 0.98313725, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.86328125, + "step": 2138, + "time_per_iteration": 2.490111827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179698, + "balance_loss_mlp": 1.09329462, + "epoch": 0.41150442477876104, + "flos": 542709843456.0, + "grad_norm": 0.027637209651618533, + "language_loss": 0.88423729, + "learning_rate": 0.000664488242053515, + "loss": 0.89603424, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.86523438, + "step": 2139, + "time_per_iteration": 2.7109243869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193887, + "balance_loss_mlp": 1.10748434, + "epoch": 0.4116968064640246, + "flos": 577391380992.0, + "grad_norm": 0.026757188222196804, + "language_loss": 0.8939023, + "learning_rate": 0.0006641940088652445, + "loss": 0.90584123, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.86523438, + "step": 2140, + "time_per_iteration": 2.7461891174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186164, + "balance_loss_mlp": 1.09952235, + "epoch": 0.4118891881492882, + "flos": 497149939200.0, + "grad_norm": 0.030186458882164903, + "language_loss": 0.90177953, + "learning_rate": 0.0006638997119312065, + "loss": 0.91364121, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.86767578, + "step": 2141, + "time_per_iteration": 2.7632482051849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206482, + "balance_loss_mlp": 1.11969757, + "epoch": 0.41208156983455174, + "flos": 1541570678784.0, + "grad_norm": 0.01865751049600735, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76269788, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.86914062, + "step": 2142, + "time_per_iteration": 4.916187286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117268, + "balance_loss_mlp": 1.0864203, + "epoch": 0.41227395151981533, + "flos": 586057399296.0, + "grad_norm": 0.03006664462158482, + "language_loss": 0.91539335, + "learning_rate": 0.000663310927282877, + "loss": 0.92712009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.86376953, + "step": 2143, + "time_per_iteration": 2.783862829208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178317, + "balance_loss_mlp": 1.09220016, + "epoch": 0.41246633320507886, + "flos": 443892203520.0, + "grad_norm": 0.03021664461702893, + "language_loss": 0.92787349, + "learning_rate": 0.000663016439797172, + "loss": 0.93965667, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.86230469, + "step": 2144, + "time_per_iteration": 2.617626428604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177938, + "balance_loss_mlp": 1.09177303, + "epoch": 0.41265871489034245, + "flos": 581094941184.0, + "grad_norm": 0.031114344129188405, + "language_loss": 0.87895894, + "learning_rate": 0.0006627218890228724, + "loss": 0.89073837, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.86279297, + "step": 2145, + "time_per_iteration": 2.823136329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172469, + "balance_loss_mlp": 1.08611357, + "epoch": 0.412851096575606, + "flos": 762528827904.0, + "grad_norm": 0.03009040753958223, + "language_loss": 0.9065426, + "learning_rate": 0.0006624272750743326, + "loss": 0.91826725, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.86474609, + "step": 2146, + "time_per_iteration": 3.009969472885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172508, + "balance_loss_mlp": 1.08615267, + "epoch": 0.41304347826086957, + "flos": 556520968704.0, + "grad_norm": 0.023356325653820006, + "language_loss": 0.88529593, + "learning_rate": 0.0006621325980659322, + "loss": 0.89702094, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.86474609, + "step": 2147, + "time_per_iteration": 2.7459471225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.09083641, + "epoch": 0.41323585994613315, + "flos": 666893724672.0, + "grad_norm": 0.029406479855093332, + "language_loss": 0.8760705, + "learning_rate": 0.000661837858112075, + "loss": 0.88783997, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.86230469, + "step": 2148, + "time_per_iteration": 2.816408634185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173515, + "balance_loss_mlp": 1.08763647, + "epoch": 0.4134282416313967, + "flos": 549784593408.0, + "grad_norm": 0.02816234486414791, + "language_loss": 0.9661653, + "learning_rate": 0.0006615430553271888, + "loss": 0.97790039, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.85986328, + "step": 2149, + "time_per_iteration": 2.7518115043640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08859468, + "epoch": 0.4136206233166603, + "flos": 647512522752.0, + "grad_norm": 0.025697121170903614, + "language_loss": 0.9133321, + "learning_rate": 0.0006612481898257264, + "loss": 0.92507643, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.859375, + "step": 2150, + "time_per_iteration": 2.841632127761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179143, + "balance_loss_mlp": 1.09364581, + "epoch": 0.4138130050019238, + "flos": 518363455488.0, + "grad_norm": 0.029278566016903075, + "language_loss": 0.9170779, + "learning_rate": 0.000660953261722165, + "loss": 0.92886931, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.85595703, + "step": 2151, + "time_per_iteration": 2.6203365325927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178641, + "balance_loss_mlp": 1.09309638, + "epoch": 0.4140053866871874, + "flos": 610368858624.0, + "grad_norm": 0.02858072061503926, + "language_loss": 0.90138143, + "learning_rate": 0.0006606582711310055, + "loss": 0.91316783, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.85644531, + "step": 2152, + "time_per_iteration": 2.71352481842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167147, + "balance_loss_mlp": 1.08103001, + "epoch": 0.4141977683724509, + "flos": 580845163008.0, + "grad_norm": 0.02998636441804494, + "language_loss": 0.9075436, + "learning_rate": 0.0006603632181667736, + "loss": 0.91921502, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.86230469, + "step": 2153, + "time_per_iteration": 2.766855478286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175224, + "balance_loss_mlp": 1.09034729, + "epoch": 0.4143901500577145, + "flos": 1310176386048.0, + "grad_norm": 0.007725969282803628, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80118549, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.84960938, + "step": 2154, + "time_per_iteration": 4.895019292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175162, + "balance_loss_mlp": 1.08890247, + "epoch": 0.41458253174297804, + "flos": 461122182144.0, + "grad_norm": 0.032062709167589486, + "language_loss": 0.89760709, + "learning_rate": 0.0006597729255773153, + "loss": 0.90935868, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.86376953, + "step": 2155, + "time_per_iteration": 2.5811779499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170487, + "balance_loss_mlp": 1.08413148, + "epoch": 0.41477491342824163, + "flos": 554438876160.0, + "grad_norm": 0.02646748417883587, + "language_loss": 0.88947552, + "learning_rate": 0.0006594776861812608, + "loss": 0.90118033, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.86474609, + "step": 2156, + "time_per_iteration": 2.6486780643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174434, + "balance_loss_mlp": 1.08803129, + "epoch": 0.4149672951135052, + "flos": 699085664256.0, + "grad_norm": 0.02893226937169889, + "language_loss": 0.92862517, + "learning_rate": 0.0006591823848704776, + "loss": 0.94036949, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.86523438, + "step": 2157, + "time_per_iteration": 2.9617741107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175673, + "balance_loss_mlp": 1.08946109, + "epoch": 0.41515967679876875, + "flos": 566836652544.0, + "grad_norm": 0.025963915394380376, + "language_loss": 0.87666786, + "learning_rate": 0.0006588870217596117, + "loss": 0.88842458, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.86328125, + "step": 2158, + "time_per_iteration": 2.7438344955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175578, + "balance_loss_mlp": 1.08927035, + "epoch": 0.41535205848403234, + "flos": 502177525248.0, + "grad_norm": 0.03336248103115958, + "language_loss": 0.93542749, + "learning_rate": 0.0006585915969633334, + "loss": 0.94718325, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.86425781, + "step": 2159, + "time_per_iteration": 2.5621583461761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170555, + "balance_loss_mlp": 1.08429492, + "epoch": 0.41554444016929587, + "flos": 608701728768.0, + "grad_norm": 0.03070944646834424, + "language_loss": 0.95915914, + "learning_rate": 0.0006582961105963366, + "loss": 0.97086465, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.86376953, + "step": 2160, + "time_per_iteration": 2.798051118850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171192, + "balance_loss_mlp": 1.08498013, + "epoch": 0.41573682185455946, + "flos": 530155614720.0, + "grad_norm": 0.02743693152360054, + "language_loss": 0.85023397, + "learning_rate": 0.0006580005627733395, + "loss": 0.86194587, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.86328125, + "step": 2161, + "time_per_iteration": 2.6954233646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168175, + "balance_loss_mlp": 1.08234429, + "epoch": 0.415929203539823, + "flos": 506037537792.0, + "grad_norm": 0.027357224978205523, + "language_loss": 0.88365781, + "learning_rate": 0.0006577049536090838, + "loss": 0.89533949, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.859375, + "step": 2162, + "time_per_iteration": 2.6762402057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167145, + "balance_loss_mlp": 1.08140957, + "epoch": 0.4161215852250866, + "flos": 583823583744.0, + "grad_norm": 0.02816159229600616, + "language_loss": 0.92433643, + "learning_rate": 0.000657409283218335, + "loss": 0.93600792, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.85839844, + "step": 2163, + "time_per_iteration": 2.708815574645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116891, + "balance_loss_mlp": 1.0833174, + "epoch": 0.4163139669103501, + "flos": 491759783424.0, + "grad_norm": 0.02622965675004396, + "language_loss": 0.87195617, + "learning_rate": 0.0006571135517158829, + "loss": 0.8836453, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.85693359, + "step": 2164, + "time_per_iteration": 2.7412045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177162, + "balance_loss_mlp": 1.0930481, + "epoch": 0.4165063485956137, + "flos": 1291020767232.0, + "grad_norm": 0.0113690904759025, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77941221, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.84179688, + "step": 2165, + "time_per_iteration": 4.793722867965698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172447, + "balance_loss_mlp": 1.08680665, + "epoch": 0.4166987302808773, + "flos": 496257613824.0, + "grad_norm": 0.031372404533623194, + "language_loss": 0.90335643, + "learning_rate": 0.0006565219058351444, + "loss": 0.9150809, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.85742188, + "step": 2166, + "time_per_iteration": 2.5605039596557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169955, + "balance_loss_mlp": 1.08412397, + "epoch": 0.4168911119661408, + "flos": 465066788352.0, + "grad_norm": 0.02745374217966413, + "language_loss": 0.89900762, + "learning_rate": 0.0006562259916865553, + "loss": 0.91070712, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.859375, + "step": 2167, + "time_per_iteration": 2.5815963745117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011761, + "balance_loss_mlp": 1.09055507, + "epoch": 0.4170834936514044, + "flos": 537942769152.0, + "grad_norm": 0.0279390150832869, + "language_loss": 0.86569649, + "learning_rate": 0.0006559300168856573, + "loss": 0.8774575, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.85644531, + "step": 2168, + "time_per_iteration": 2.7917275428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181119, + "balance_loss_mlp": 1.09547901, + "epoch": 0.41727587533666793, + "flos": 551749165056.0, + "grad_norm": 0.026888463962073755, + "language_loss": 0.92254919, + "learning_rate": 0.0006556339815473577, + "loss": 0.93436038, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.85742188, + "step": 2169, + "time_per_iteration": 2.640456438064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170658, + "balance_loss_mlp": 1.08492219, + "epoch": 0.4174682570219315, + "flos": 632377371648.0, + "grad_norm": 0.027558904728032622, + "language_loss": 0.91870886, + "learning_rate": 0.000655337885786588, + "loss": 0.93041539, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.85839844, + "step": 2170, + "time_per_iteration": 2.885754108428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170686, + "balance_loss_mlp": 1.08485556, + "epoch": 0.41766063870719505, + "flos": 520755724800.0, + "grad_norm": 0.031037248087189308, + "language_loss": 0.9245193, + "learning_rate": 0.0006550417297183025, + "loss": 0.93622619, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.859375, + "step": 2171, + "time_per_iteration": 2.607590436935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175474, + "balance_loss_mlp": 1.08945298, + "epoch": 0.41785302039245864, + "flos": 559054227456.0, + "grad_norm": 0.02737354340834092, + "language_loss": 0.87721866, + "learning_rate": 0.0006547455134574793, + "loss": 0.88897336, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.86132812, + "step": 2172, + "time_per_iteration": 2.7324562072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184116, + "balance_loss_mlp": 1.09833348, + "epoch": 0.41804540207772223, + "flos": 790027553280.0, + "grad_norm": 0.06230752646239431, + "language_loss": 0.90406793, + "learning_rate": 0.0006544492371191198, + "loss": 0.91590911, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.85888672, + "step": 2173, + "time_per_iteration": 3.1248764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186676, + "balance_loss_mlp": 1.10089302, + "epoch": 0.41823778376298576, + "flos": 905890521600.0, + "grad_norm": 0.03053935653615099, + "language_loss": 0.9052453, + "learning_rate": 0.0006541529008182485, + "loss": 0.91711211, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.85888672, + "step": 2174, + "time_per_iteration": 3.2052760124206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169526, + "balance_loss_mlp": 1.08383834, + "epoch": 0.41843016544824935, + "flos": 512573799936.0, + "grad_norm": 0.02722476190126499, + "language_loss": 0.93815506, + "learning_rate": 0.0006538565046699136, + "loss": 0.94985026, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.85791016, + "step": 2175, + "time_per_iteration": 2.578150987625122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167473, + "balance_loss_mlp": 1.08183265, + "epoch": 0.4186225471335129, + "flos": 654289830912.0, + "grad_norm": 0.03154991846739093, + "language_loss": 0.89587617, + "learning_rate": 0.0006535600487891862, + "loss": 0.90755087, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.85742188, + "step": 2176, + "time_per_iteration": 2.8699960708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167918, + "balance_loss_mlp": 1.08218253, + "epoch": 0.41881492881877647, + "flos": 570225306624.0, + "grad_norm": 0.027441287945076498, + "language_loss": 0.94665354, + "learning_rate": 0.0006532635332911603, + "loss": 0.95833272, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.85839844, + "step": 2177, + "time_per_iteration": 2.695180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168273, + "balance_loss_mlp": 1.08239508, + "epoch": 0.41900731050404, + "flos": 913484293632.0, + "grad_norm": 0.030353783790969455, + "language_loss": 0.86808872, + "learning_rate": 0.0006529669582909541, + "loss": 0.87977153, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.85986328, + "step": 2178, + "time_per_iteration": 3.2746284008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116623, + "balance_loss_mlp": 1.08073354, + "epoch": 0.4191996921893036, + "flos": 536783201280.0, + "grad_norm": 0.031775111638151596, + "language_loss": 0.93350971, + "learning_rate": 0.0006526703239037077, + "loss": 0.94517195, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.85595703, + "step": 2179, + "time_per_iteration": 2.6485140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167238, + "balance_loss_mlp": 1.08159792, + "epoch": 0.4193920738745671, + "flos": 583730257920.0, + "grad_norm": 0.027399178820930566, + "language_loss": 0.92623031, + "learning_rate": 0.0006523736302445851, + "loss": 0.93790269, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.85742188, + "step": 2180, + "time_per_iteration": 2.8337948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.08149683, + "epoch": 0.4195844555598307, + "flos": 1337800459776.0, + "grad_norm": 0.031235958835637387, + "language_loss": 0.83915186, + "learning_rate": 0.0006520768774287728, + "loss": 0.85082471, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.85888672, + "step": 2181, + "time_per_iteration": 3.725524663925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170743, + "balance_loss_mlp": 1.08505547, + "epoch": 0.4197768372450943, + "flos": 599996779008.0, + "grad_norm": 0.025797087070179033, + "language_loss": 0.91158509, + "learning_rate": 0.0006517800655714806, + "loss": 0.92329252, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.85791016, + "step": 2182, + "time_per_iteration": 2.8207623958587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172108, + "balance_loss_mlp": 1.08646846, + "epoch": 0.4199692189303578, + "flos": 736595900928.0, + "grad_norm": 0.0300192342725077, + "language_loss": 0.91644537, + "learning_rate": 0.0006514831947879407, + "loss": 0.92816639, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.85742188, + "step": 2183, + "time_per_iteration": 2.9593582153320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170186, + "balance_loss_mlp": 1.08454573, + "epoch": 0.4201616006156214, + "flos": 751661921280.0, + "grad_norm": 0.02826942186100045, + "language_loss": 0.84773123, + "learning_rate": 0.0006511862651934091, + "loss": 0.85943305, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.85742188, + "step": 2184, + "time_per_iteration": 3.1170709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168703, + "balance_loss_mlp": 1.08301497, + "epoch": 0.42035398230088494, + "flos": 548091267072.0, + "grad_norm": 0.027950639773315498, + "language_loss": 0.89124084, + "learning_rate": 0.0006508892769031638, + "loss": 0.90292788, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.85791016, + "step": 2185, + "time_per_iteration": 2.6419410705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116924, + "balance_loss_mlp": 1.08379054, + "epoch": 0.42054636398614853, + "flos": 618047224320.0, + "grad_norm": 0.03133969262582121, + "language_loss": 0.94198585, + "learning_rate": 0.000650592230032506, + "loss": 0.95367819, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.85546875, + "step": 2186, + "time_per_iteration": 2.7254862785339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175, + "balance_loss_mlp": 1.08935976, + "epoch": 0.42073874567141206, + "flos": 641666471424.0, + "grad_norm": 0.02942747497692904, + "language_loss": 0.9171921, + "learning_rate": 0.0006502951246967595, + "loss": 0.92894208, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.85742188, + "step": 2187, + "time_per_iteration": 2.8912041187286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174567, + "balance_loss_mlp": 1.08897436, + "epoch": 0.42093112735667565, + "flos": 494822797824.0, + "grad_norm": 0.02515329577356359, + "language_loss": 0.92510098, + "learning_rate": 0.0006499979610112706, + "loss": 0.93684661, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.85693359, + "step": 2188, + "time_per_iteration": 2.710610866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119078, + "balance_loss_mlp": 1.1055218, + "epoch": 0.4211235090419392, + "flos": 543436984320.0, + "grad_norm": 0.027549100686041793, + "language_loss": 0.89267701, + "learning_rate": 0.000649700739091409, + "loss": 0.90458483, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.85351562, + "step": 2189, + "time_per_iteration": 2.770158290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.09321594, + "epoch": 0.42131589072720277, + "flos": 1535388254208.0, + "grad_norm": 0.007480893247264192, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.75013411, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.83984375, + "step": 2190, + "time_per_iteration": 4.826355218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168739, + "balance_loss_mlp": 1.08381474, + "epoch": 0.42150827241246636, + "flos": 567935095296.0, + "grad_norm": 0.025807507169531153, + "language_loss": 0.91430855, + "learning_rate": 0.0006491061210101557, + "loss": 0.92599595, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.85009766, + "step": 2191, + "time_per_iteration": 2.6813712120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170756, + "balance_loss_mlp": 1.08568799, + "epoch": 0.4217006540977299, + "flos": 708841393152.0, + "grad_norm": 0.02710796189326301, + "language_loss": 0.90667284, + "learning_rate": 0.0006488087250796157, + "loss": 0.91838038, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.8515625, + "step": 2192, + "time_per_iteration": 2.8864076137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117035, + "balance_loss_mlp": 1.08528221, + "epoch": 0.4218930357829935, + "flos": 628561019904.0, + "grad_norm": 0.0271709214243351, + "language_loss": 0.87769991, + "learning_rate": 0.0006485112713764049, + "loss": 0.8894034, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.8515625, + "step": 2193, + "time_per_iteration": 2.9007742404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170953, + "balance_loss_mlp": 1.08578944, + "epoch": 0.422085417468257, + "flos": 461289368064.0, + "grad_norm": 0.026123872435626132, + "language_loss": 0.89901912, + "learning_rate": 0.0006482137600160051, + "loss": 0.91072869, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.85253906, + "step": 2194, + "time_per_iteration": 2.4960973262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170401, + "balance_loss_mlp": 1.08533287, + "epoch": 0.4222777991535206, + "flos": 474980971008.0, + "grad_norm": 0.02685495955741856, + "language_loss": 0.90204549, + "learning_rate": 0.0006479161911139206, + "loss": 0.91374946, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.8515625, + "step": 2195, + "time_per_iteration": 2.574496030807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170408, + "balance_loss_mlp": 1.08534062, + "epoch": 0.4224701808387841, + "flos": 471844096512.0, + "grad_norm": 0.03212817551635824, + "language_loss": 0.93686366, + "learning_rate": 0.0006476185647856778, + "loss": 0.94856775, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.8515625, + "step": 2196, + "time_per_iteration": 2.558581829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169081, + "balance_loss_mlp": 1.08401346, + "epoch": 0.4226625625240477, + "flos": 678822870528.0, + "grad_norm": 0.034209207392335836, + "language_loss": 0.88652933, + "learning_rate": 0.0006473208811468255, + "loss": 0.89822018, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.8515625, + "step": 2197, + "time_per_iteration": 2.8745005130767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169989, + "balance_loss_mlp": 1.08487344, + "epoch": 0.4228549442093113, + "flos": 504559060992.0, + "grad_norm": 0.02694559660877684, + "language_loss": 0.9045344, + "learning_rate": 0.0006470231403129347, + "loss": 0.91623431, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.85205078, + "step": 2198, + "time_per_iteration": 2.6385552883148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.08594668, + "epoch": 0.42304732589457483, + "flos": 613074032640.0, + "grad_norm": 0.02362792419875934, + "language_loss": 0.86769903, + "learning_rate": 0.0006467253423995988, + "loss": 0.87941062, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.85302734, + "step": 2199, + "time_per_iteration": 2.8800480365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169589, + "balance_loss_mlp": 1.08418751, + "epoch": 0.4232397075798384, + "flos": 516648662016.0, + "grad_norm": 0.0345778065938135, + "language_loss": 0.86613309, + "learning_rate": 0.000646427487522433, + "loss": 0.87782902, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.85498047, + "step": 2200, + "time_per_iteration": 2.658045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170112, + "balance_loss_mlp": 1.08451986, + "epoch": 0.42343208926510195, + "flos": 590933262336.0, + "grad_norm": 0.02424061904629306, + "language_loss": 0.89308071, + "learning_rate": 0.0006461295757970749, + "loss": 0.90478176, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.85693359, + "step": 2201, + "time_per_iteration": 2.8574764728546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170293, + "balance_loss_mlp": 1.08465314, + "epoch": 0.42362447095036554, + "flos": 641818194432.0, + "grad_norm": 0.03053594684877434, + "language_loss": 0.89224029, + "learning_rate": 0.0006458316073391839, + "loss": 0.90394318, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.85742188, + "step": 2202, + "time_per_iteration": 2.932666063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168878, + "balance_loss_mlp": 1.08318996, + "epoch": 0.42381685263562907, + "flos": 513717904896.0, + "grad_norm": 0.025745877239568934, + "language_loss": 0.93694568, + "learning_rate": 0.0006455335822644422, + "loss": 0.94863445, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.85791016, + "step": 2203, + "time_per_iteration": 2.6537110805511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169969, + "balance_loss_mlp": 1.0842818, + "epoch": 0.42400923432089266, + "flos": 547822023168.0, + "grad_norm": 0.028367329203477194, + "language_loss": 0.84440267, + "learning_rate": 0.0006452355006885527, + "loss": 0.85610235, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.85791016, + "step": 2204, + "time_per_iteration": 2.639218330383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169105, + "balance_loss_mlp": 1.08346462, + "epoch": 0.4242016160061562, + "flos": 623287658496.0, + "grad_norm": 0.03537327431533643, + "language_loss": 0.96295106, + "learning_rate": 0.0006449373627272412, + "loss": 0.9746421, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.85742188, + "step": 2205, + "time_per_iteration": 2.728724956512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168771, + "balance_loss_mlp": 1.08317852, + "epoch": 0.4243939976914198, + "flos": 572971413504.0, + "grad_norm": 0.029625174738980242, + "language_loss": 0.88551587, + "learning_rate": 0.0006446391684962553, + "loss": 0.89720356, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.85693359, + "step": 2206, + "time_per_iteration": 2.6687116622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167518, + "balance_loss_mlp": 1.08192575, + "epoch": 0.42458637937668336, + "flos": 449664394752.0, + "grad_norm": 0.02816858253159587, + "language_loss": 0.89565998, + "learning_rate": 0.000644340918111364, + "loss": 0.90733516, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.85693359, + "step": 2207, + "time_per_iteration": 2.620295763015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167512, + "balance_loss_mlp": 1.08206332, + "epoch": 0.4247787610619469, + "flos": 436335361536.0, + "grad_norm": 0.0303416400904182, + "language_loss": 0.92792743, + "learning_rate": 0.0006440426116883585, + "loss": 0.93960261, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.85546875, + "step": 2208, + "time_per_iteration": 2.5411367416381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171139, + "balance_loss_mlp": 1.08602309, + "epoch": 0.4249711427472105, + "flos": 497121741312.0, + "grad_norm": 0.025596497409994177, + "language_loss": 0.92383361, + "learning_rate": 0.0006437442493430519, + "loss": 0.93554503, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.85205078, + "step": 2209, + "time_per_iteration": 2.6431679725646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172012, + "balance_loss_mlp": 1.08694398, + "epoch": 0.425163524432474, + "flos": 657107796480.0, + "grad_norm": 0.030657116246539617, + "language_loss": 0.93065524, + "learning_rate": 0.000643445831191278, + "loss": 0.94237542, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.8515625, + "step": 2210, + "time_per_iteration": 2.9031519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117009, + "balance_loss_mlp": 1.08502185, + "epoch": 0.4253559061177376, + "flos": 651778039296.0, + "grad_norm": 0.031032190975230387, + "language_loss": 0.88729775, + "learning_rate": 0.0006431473573488937, + "loss": 0.89899862, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.8515625, + "step": 2211, + "time_per_iteration": 2.745398759841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170674, + "balance_loss_mlp": 1.08560598, + "epoch": 0.42554828780300114, + "flos": 555202947072.0, + "grad_norm": 0.03338022114707726, + "language_loss": 0.92210639, + "learning_rate": 0.0006428488279317765, + "loss": 0.93381315, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.8515625, + "step": 2212, + "time_per_iteration": 2.6822004318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172615, + "balance_loss_mlp": 1.08797669, + "epoch": 0.4257406694882647, + "flos": 515421964800.0, + "grad_norm": 0.02921339084637532, + "language_loss": 0.9444955, + "learning_rate": 0.0006425502430558259, + "loss": 0.95622164, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.84716797, + "step": 2213, + "time_per_iteration": 2.6147451400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173123, + "balance_loss_mlp": 1.08824575, + "epoch": 0.42593305117352825, + "flos": 516705057792.0, + "grad_norm": 0.028975617453248656, + "language_loss": 0.90705556, + "learning_rate": 0.0006422516028369628, + "loss": 0.91878676, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.84960938, + "step": 2214, + "time_per_iteration": 2.634315013885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169159, + "balance_loss_mlp": 1.08423436, + "epoch": 0.42612543285879184, + "flos": 589237934592.0, + "grad_norm": 0.02737510916321625, + "language_loss": 0.88997841, + "learning_rate": 0.0006419529073911296, + "loss": 0.90166998, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.85009766, + "step": 2215, + "time_per_iteration": 2.934429168701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168143, + "balance_loss_mlp": 1.08321857, + "epoch": 0.42631781454405543, + "flos": 636751676928.0, + "grad_norm": 0.02841677319990709, + "language_loss": 0.91541028, + "learning_rate": 0.0006416541568342901, + "loss": 0.92709166, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.85009766, + "step": 2216, + "time_per_iteration": 2.924881935119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167669, + "balance_loss_mlp": 1.08269632, + "epoch": 0.42651019622931896, + "flos": 542245215744.0, + "grad_norm": 0.024048936266806608, + "language_loss": 0.89849669, + "learning_rate": 0.0006413553512824297, + "loss": 0.91017342, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.85058594, + "step": 2217, + "time_per_iteration": 2.7312259674072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166506, + "balance_loss_mlp": 1.08096182, + "epoch": 0.42670257791458255, + "flos": 559223414784.0, + "grad_norm": 0.030670266673020908, + "language_loss": 0.90927672, + "learning_rate": 0.0006410564908515549, + "loss": 0.92094177, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.85644531, + "step": 2218, + "time_per_iteration": 2.646705389022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165047, + "balance_loss_mlp": 1.07964516, + "epoch": 0.4268949595998461, + "flos": 622449727488.0, + "grad_norm": 0.03126891192332862, + "language_loss": 0.92295194, + "learning_rate": 0.0006407575756576935, + "loss": 0.93460238, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.85498047, + "step": 2219, + "time_per_iteration": 2.750229597091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163243, + "balance_loss_mlp": 1.07769799, + "epoch": 0.42708734128510967, + "flos": 539015015424.0, + "grad_norm": 0.029393225010211587, + "language_loss": 0.93690813, + "learning_rate": 0.0006404586058168951, + "loss": 0.94854057, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.85644531, + "step": 2220, + "time_per_iteration": 2.75992488861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166043, + "balance_loss_mlp": 1.08049834, + "epoch": 0.4272797229703732, + "flos": 503862119424.0, + "grad_norm": 0.0277791101580606, + "language_loss": 0.93672097, + "learning_rate": 0.0006401595814452296, + "loss": 0.94838136, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.85644531, + "step": 2221, + "time_per_iteration": 2.6034135818481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166502, + "balance_loss_mlp": 1.08081436, + "epoch": 0.4274721046556368, + "flos": 493437646848.0, + "grad_norm": 0.028798228067485887, + "language_loss": 0.8755163, + "learning_rate": 0.000639860502658789, + "loss": 0.88718128, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.85791016, + "step": 2222, + "time_per_iteration": 2.6364476680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168114, + "balance_loss_mlp": 1.08242607, + "epoch": 0.4276644863409004, + "flos": 569461235712.0, + "grad_norm": 0.025058965600795662, + "language_loss": 0.90727627, + "learning_rate": 0.0006395613695736853, + "loss": 0.91895741, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.85791016, + "step": 2223, + "time_per_iteration": 2.7128536701202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170105, + "balance_loss_mlp": 1.08432245, + "epoch": 0.4278568680261639, + "flos": 608562740736.0, + "grad_norm": 0.029982203504376047, + "language_loss": 0.88910139, + "learning_rate": 0.0006392621823060529, + "loss": 0.90080237, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.85888672, + "step": 2224, + "time_per_iteration": 2.7404489517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167658, + "balance_loss_mlp": 1.08177996, + "epoch": 0.4280492497114275, + "flos": 561578754048.0, + "grad_norm": 0.03210591854722722, + "language_loss": 0.92597878, + "learning_rate": 0.0006389629409720465, + "loss": 0.93765533, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.85986328, + "step": 2225, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170504, + "balance_loss_mlp": 1.08467305, + "epoch": 0.428241631396691, + "flos": 721901182464.0, + "grad_norm": 0.03010502161811575, + "language_loss": 0.95236158, + "learning_rate": 0.0006386636456878417, + "loss": 0.96406662, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.859375, + "step": 2226, + "time_per_iteration": 2.866391897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168906, + "balance_loss_mlp": 1.08307493, + "epoch": 0.4284340130819546, + "flos": 430369787904.0, + "grad_norm": 0.032531705768225685, + "language_loss": 0.99370027, + "learning_rate": 0.0006383642965696353, + "loss": 1.00538921, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.859375, + "step": 2227, + "time_per_iteration": 2.4586703777313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169599, + "balance_loss_mlp": 1.08376861, + "epoch": 0.42862639476721814, + "flos": 526159342080.0, + "grad_norm": 0.030010487503704626, + "language_loss": 0.90640998, + "learning_rate": 0.000638064893733645, + "loss": 0.91810596, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.859375, + "step": 2228, + "time_per_iteration": 2.71899676322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168473, + "balance_loss_mlp": 1.08269, + "epoch": 0.42881877645248173, + "flos": 466378079232.0, + "grad_norm": 0.029133853286813928, + "language_loss": 0.95973945, + "learning_rate": 0.000637765437296109, + "loss": 0.97142416, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.85888672, + "step": 2229, + "time_per_iteration": 2.6824750900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166344, + "balance_loss_mlp": 1.08075178, + "epoch": 0.42901115813774526, + "flos": 561355172352.0, + "grad_norm": 0.028234307189641095, + "language_loss": 0.92378092, + "learning_rate": 0.000637465927373287, + "loss": 0.93544424, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.85693359, + "step": 2230, + "time_per_iteration": 2.65869402885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166629, + "balance_loss_mlp": 1.08137035, + "epoch": 0.42920353982300885, + "flos": 562527475200.0, + "grad_norm": 0.03139177124565146, + "language_loss": 0.86247277, + "learning_rate": 0.000637166364081459, + "loss": 0.87413907, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.85351562, + "step": 2231, + "time_per_iteration": 2.7071642875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165657, + "balance_loss_mlp": 1.080446, + "epoch": 0.42939592150827244, + "flos": 557315238912.0, + "grad_norm": 0.03049902562345181, + "language_loss": 0.89974546, + "learning_rate": 0.0006368667475369256, + "loss": 0.91140211, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.85302734, + "step": 2232, + "time_per_iteration": 2.74843168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166412, + "balance_loss_mlp": 1.08363342, + "epoch": 0.42958830319353597, + "flos": 1524942314496.0, + "grad_norm": 0.009964168253272706, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79694188, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.828125, + "step": 2233, + "time_per_iteration": 4.862222909927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.08236694, + "epoch": 0.42978068487879956, + "flos": 1498869672960.0, + "grad_norm": 0.007691227120989337, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80060571, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.828125, + "step": 2234, + "time_per_iteration": 4.816195011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167111, + "balance_loss_mlp": 1.08242488, + "epoch": 0.4299730665640631, + "flos": 548063069184.0, + "grad_norm": 0.02593969644103988, + "language_loss": 0.92186785, + "learning_rate": 0.0006359675795504112, + "loss": 0.93353903, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.84765625, + "step": 2235, + "time_per_iteration": 2.6802918910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167601, + "balance_loss_mlp": 1.08300984, + "epoch": 0.4301654482493267, + "flos": 1131115124736.0, + "grad_norm": 0.035304816631346984, + "language_loss": 0.82753956, + "learning_rate": 0.0006356677511584775, + "loss": 0.83921564, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.84667969, + "step": 2236, + "time_per_iteration": 3.444307327270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08522856, + "epoch": 0.4303578299345902, + "flos": 496741707264.0, + "grad_norm": 0.0313639268125667, + "language_loss": 0.9209317, + "learning_rate": 0.0006353678700956511, + "loss": 0.93262899, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.84570312, + "step": 2237, + "time_per_iteration": 2.5677876472473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164152, + "balance_loss_mlp": 1.07965648, + "epoch": 0.4305502116198538, + "flos": 616929315840.0, + "grad_norm": 0.02814766917627989, + "language_loss": 0.90743506, + "learning_rate": 0.0006350679364783569, + "loss": 0.91907656, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.84570312, + "step": 2238, + "time_per_iteration": 2.7363951206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175266, + "balance_loss_mlp": 1.09081805, + "epoch": 0.4307425933051173, + "flos": 560321857536.0, + "grad_norm": 0.032687311784007, + "language_loss": 0.92748511, + "learning_rate": 0.0006347679504230393, + "loss": 0.93923771, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.84521484, + "step": 2239, + "time_per_iteration": 2.6805875301361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172185, + "balance_loss_mlp": 1.08749855, + "epoch": 0.4309349749903809, + "flos": 973816779264.0, + "grad_norm": 0.03249158230487725, + "language_loss": 0.83304834, + "learning_rate": 0.0006344679120461632, + "loss": 0.84477019, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.84765625, + "step": 2240, + "time_per_iteration": 3.4101555347442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166292, + "balance_loss_mlp": 1.08146274, + "epoch": 0.4311273566756445, + "flos": 542972356608.0, + "grad_norm": 0.03524791345855764, + "language_loss": 0.87825459, + "learning_rate": 0.0006341678214642134, + "loss": 0.88991749, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.84912109, + "step": 2241, + "time_per_iteration": 2.625896692276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165486, + "balance_loss_mlp": 1.08041823, + "epoch": 0.43131973836090803, + "flos": 763110976512.0, + "grad_norm": 0.027424867307564667, + "language_loss": 0.89878041, + "learning_rate": 0.0006338676787936963, + "loss": 0.91043526, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.8515625, + "step": 2242, + "time_per_iteration": 3.063455820083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167252, + "balance_loss_mlp": 1.08199346, + "epoch": 0.4315121200461716, + "flos": 555602446848.0, + "grad_norm": 0.031429355894507384, + "language_loss": 0.916659, + "learning_rate": 0.0006335674841511367, + "loss": 0.92833149, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.85351562, + "step": 2243, + "time_per_iteration": 2.666233777999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192352, + "balance_loss_mlp": 1.10804749, + "epoch": 0.43170450173143515, + "flos": 1488686972928.0, + "grad_norm": 0.015912473948710273, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80373514, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.84375, + "step": 2244, + "time_per_iteration": 4.980380535125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183075, + "balance_loss_mlp": 1.09877014, + "epoch": 0.43189688341669874, + "flos": 1476907548672.0, + "grad_norm": 0.014137336443723746, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78548628, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.84375, + "step": 2245, + "time_per_iteration": 4.896914005279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011678, + "balance_loss_mlp": 1.08254158, + "epoch": 0.43208926510196227, + "flos": 493984866816.0, + "grad_norm": 0.02893589890767333, + "language_loss": 0.89212227, + "learning_rate": 0.0006326665895567652, + "loss": 0.90380025, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.85351562, + "step": 2246, + "time_per_iteration": 2.6488964557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169613, + "balance_loss_mlp": 1.08430731, + "epoch": 0.43228164678722586, + "flos": 521302944768.0, + "grad_norm": 0.0351368535627373, + "language_loss": 0.94705987, + "learning_rate": 0.0006323661881916976, + "loss": 0.95875597, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.85400391, + "step": 2247, + "time_per_iteration": 2.7094948291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170289, + "balance_loss_mlp": 1.08522093, + "epoch": 0.4324740284724894, + "flos": 797395015680.0, + "grad_norm": 0.0300569180656374, + "language_loss": 0.88277382, + "learning_rate": 0.0006320657354375179, + "loss": 0.89447677, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.8515625, + "step": 2248, + "time_per_iteration": 2.942108154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166997, + "balance_loss_mlp": 1.08188176, + "epoch": 0.432666410157753, + "flos": 483097767936.0, + "grad_norm": 0.027676603795042543, + "language_loss": 0.93945193, + "learning_rate": 0.0006317652314108726, + "loss": 0.95112193, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.85205078, + "step": 2249, + "time_per_iteration": 2.559255838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167847, + "balance_loss_mlp": 1.08268416, + "epoch": 0.43285879184301657, + "flos": 501209338368.0, + "grad_norm": 0.028764721331973258, + "language_loss": 0.98109567, + "learning_rate": 0.0006314646762284277, + "loss": 0.99277413, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.85253906, + "step": 2250, + "time_per_iteration": 2.6713576316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188225, + "balance_loss_mlp": 1.10582733, + "epoch": 0.4330511735282801, + "flos": 1513790701056.0, + "grad_norm": 0.02095115440391329, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76614058, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.82421875, + "step": 2251, + "time_per_iteration": 4.936391592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170203, + "balance_loss_mlp": 1.08518302, + "epoch": 0.4332435552135437, + "flos": 700837387776.0, + "grad_norm": 0.037779543880407794, + "language_loss": 0.84241956, + "learning_rate": 0.0006308634128629022, + "loss": 0.85412163, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.85107422, + "step": 2252, + "time_per_iteration": 2.890848398208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168176, + "balance_loss_mlp": 1.0830133, + "epoch": 0.4334359368988072, + "flos": 593481984000.0, + "grad_norm": 0.0295787243575072, + "language_loss": 0.93934762, + "learning_rate": 0.0006305627049132531, + "loss": 0.95102942, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.85253906, + "step": 2253, + "time_per_iteration": 2.7571680545806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167414, + "balance_loss_mlp": 1.08220303, + "epoch": 0.4336283185840708, + "flos": 844274942976.0, + "grad_norm": 0.0242542623992157, + "language_loss": 0.90322375, + "learning_rate": 0.0006302619462746662, + "loss": 0.91489786, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.85302734, + "step": 2254, + "time_per_iteration": 3.1296751499176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167279, + "balance_loss_mlp": 1.0821631, + "epoch": 0.43382070026933434, + "flos": 627401452032.0, + "grad_norm": 0.02849659363202695, + "language_loss": 0.96522522, + "learning_rate": 0.0006299611370639069, + "loss": 0.97689807, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.85205078, + "step": 2255, + "time_per_iteration": 2.7125463485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167069, + "balance_loss_mlp": 1.08181024, + "epoch": 0.4340130819545979, + "flos": 592209624576.0, + "grad_norm": 0.029264792527705672, + "language_loss": 0.85361564, + "learning_rate": 0.0006296602773977593, + "loss": 0.86528635, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.85351562, + "step": 2256, + "time_per_iteration": 2.692830801010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166353, + "balance_loss_mlp": 1.0810945, + "epoch": 0.4342054636398615, + "flos": 491955167232.0, + "grad_norm": 0.02531800088280138, + "language_loss": 0.92533612, + "learning_rate": 0.0006293593673930277, + "loss": 0.93699974, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.85351562, + "step": 2257, + "time_per_iteration": 2.6522371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118061, + "balance_loss_mlp": 1.09568477, + "epoch": 0.43439784532512504, + "flos": 700259968512.0, + "grad_norm": 0.028144633410819173, + "language_loss": 0.84340745, + "learning_rate": 0.0006290584071665358, + "loss": 0.85521352, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.85009766, + "step": 2258, + "time_per_iteration": 2.878753662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179592, + "balance_loss_mlp": 1.09452426, + "epoch": 0.43459022701038863, + "flos": 486801328128.0, + "grad_norm": 0.028951325004384125, + "language_loss": 0.88270766, + "learning_rate": 0.0006287573968351266, + "loss": 0.89450359, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.8515625, + "step": 2259, + "time_per_iteration": 2.55161190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173139, + "balance_loss_mlp": 1.08830976, + "epoch": 0.43478260869565216, + "flos": 644266859520.0, + "grad_norm": 0.030714073024811012, + "language_loss": 0.91379642, + "learning_rate": 0.0006284563365156626, + "loss": 0.92552781, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.84912109, + "step": 2260, + "time_per_iteration": 2.778975009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177671, + "balance_loss_mlp": 1.09274662, + "epoch": 0.43497499038091575, + "flos": 427009331712.0, + "grad_norm": 0.03207934204379992, + "language_loss": 0.94470251, + "learning_rate": 0.0006281552263250261, + "loss": 0.95647919, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.85009766, + "step": 2261, + "time_per_iteration": 2.540102005004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175407, + "balance_loss_mlp": 1.09281921, + "epoch": 0.4351673720661793, + "flos": 1541525016576.0, + "grad_norm": 0.010664027023399645, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81866938, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.82617188, + "step": 2262, + "time_per_iteration": 4.828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167291, + "balance_loss_mlp": 1.08260465, + "epoch": 0.43535975375144287, + "flos": 750465423360.0, + "grad_norm": 0.02969029135984414, + "language_loss": 0.88281786, + "learning_rate": 0.0006275528567978593, + "loss": 0.89449072, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.84765625, + "step": 2263, + "time_per_iteration": 2.9683096408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167048, + "balance_loss_mlp": 1.08193278, + "epoch": 0.4355521354367064, + "flos": 862751084544.0, + "grad_norm": 0.03226302104273745, + "language_loss": 0.89985508, + "learning_rate": 0.0006272515976951898, + "loss": 0.91152549, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.85205078, + "step": 2264, + "time_per_iteration": 4.429616689682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166942, + "balance_loss_mlp": 1.08182704, + "epoch": 0.43574451712197, + "flos": 735842563584.0, + "grad_norm": 0.02499576623287147, + "language_loss": 0.84365284, + "learning_rate": 0.0006269502891890687, + "loss": 0.8553223, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.85205078, + "step": 2265, + "time_per_iteration": 3.0444254875183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166782, + "balance_loss_mlp": 1.08214331, + "epoch": 0.4359368988072336, + "flos": 571712515584.0, + "grad_norm": 0.02707186340155289, + "language_loss": 0.93191004, + "learning_rate": 0.0006266489313964743, + "loss": 0.94357783, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.84716797, + "step": 2266, + "time_per_iteration": 2.7227466106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164913, + "balance_loss_mlp": 1.0802747, + "epoch": 0.4361292804924971, + "flos": 556670690304.0, + "grad_norm": 0.03376827968070452, + "language_loss": 0.92200565, + "learning_rate": 0.0006263475244344041, + "loss": 0.93365479, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.84716797, + "step": 2267, + "time_per_iteration": 2.845227003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167657, + "balance_loss_mlp": 1.08335233, + "epoch": 0.4363216621777607, + "flos": 558348553728.0, + "grad_norm": 0.031080273211388402, + "language_loss": 0.91650617, + "learning_rate": 0.0006260460684198746, + "loss": 0.92818272, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.84375, + "step": 2268, + "time_per_iteration": 2.652310371398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165668, + "balance_loss_mlp": 1.08141088, + "epoch": 0.4365140438630242, + "flos": 479196822528.0, + "grad_norm": 0.029843008840560653, + "language_loss": 0.92140841, + "learning_rate": 0.0006257445634699213, + "loss": 0.93306512, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.84326172, + "step": 2269, + "time_per_iteration": 2.5779240131378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164543, + "balance_loss_mlp": 1.08042932, + "epoch": 0.4367064255482878, + "flos": 580007232000.0, + "grad_norm": 0.028296510675920098, + "language_loss": 0.89645165, + "learning_rate": 0.0006254430097015993, + "loss": 0.90809709, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.84179688, + "step": 2270, + "time_per_iteration": 2.6566953659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172028, + "balance_loss_mlp": 1.08963013, + "epoch": 0.43689880723355135, + "flos": 1462271953920.0, + "grad_norm": 0.010844604855090543, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77651119, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.82421875, + "step": 2271, + "time_per_iteration": 4.794802904129028 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170244, + "balance_loss_mlp": 1.08593976, + "epoch": 0.43709118891881493, + "flos": 668873759232.0, + "grad_norm": 0.024959132899117664, + "language_loss": 0.91526961, + "learning_rate": 0.0006248397561781609, + "loss": 0.92697203, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.84375, + "step": 2272, + "time_per_iteration": 2.8676164150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170164, + "balance_loss_mlp": 1.08562064, + "epoch": 0.43728357060407846, + "flos": 545913847296.0, + "grad_norm": 0.033809863548240594, + "language_loss": 0.93834352, + "learning_rate": 0.0006245380566572482, + "loss": 0.95004517, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.84619141, + "step": 2273, + "time_per_iteration": 2.6419596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169646, + "balance_loss_mlp": 1.08519816, + "epoch": 0.43747595228934205, + "flos": 748183944192.0, + "grad_norm": 0.02624268387252208, + "language_loss": 0.83012575, + "learning_rate": 0.0006242363087863744, + "loss": 0.84182227, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.84521484, + "step": 2274, + "time_per_iteration": 2.9927828311920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165646, + "balance_loss_mlp": 1.08057845, + "epoch": 0.43766833397460564, + "flos": 632529094656.0, + "grad_norm": 0.025411969041571628, + "language_loss": 0.92234564, + "learning_rate": 0.0006239345126826878, + "loss": 0.9340021, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.8515625, + "step": 2275, + "time_per_iteration": 2.8180527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164237, + "balance_loss_mlp": 1.07931209, + "epoch": 0.43786071565986917, + "flos": 532098719232.0, + "grad_norm": 0.028730665522240066, + "language_loss": 0.90992379, + "learning_rate": 0.0006236326684633561, + "loss": 0.92156613, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.85009766, + "step": 2276, + "time_per_iteration": 2.828425168991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163177, + "balance_loss_mlp": 1.07810962, + "epoch": 0.43805309734513276, + "flos": 539557506048.0, + "grad_norm": 0.03648062799061939, + "language_loss": 0.82486773, + "learning_rate": 0.0006233307762455658, + "loss": 0.83649945, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.8515625, + "step": 2277, + "time_per_iteration": 2.608886957168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.07909381, + "epoch": 0.4382454790303963, + "flos": 865963820544.0, + "grad_norm": 0.025903790262040906, + "language_loss": 0.90223956, + "learning_rate": 0.0006230288361465216, + "loss": 0.91388112, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.8515625, + "step": 2278, + "time_per_iteration": 3.036163568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171688, + "balance_loss_mlp": 1.08638203, + "epoch": 0.4384378607156599, + "flos": 766801075200.0, + "grad_norm": 0.03187081568607536, + "language_loss": 0.92773926, + "learning_rate": 0.0006227268482834473, + "loss": 0.93945611, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.85400391, + "step": 2279, + "time_per_iteration": 2.9320731163024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176636, + "balance_loss_mlp": 1.09137762, + "epoch": 0.4386302424009234, + "flos": 669796283904.0, + "grad_norm": 0.028047353495827182, + "language_loss": 0.9305023, + "learning_rate": 0.000622424812773585, + "loss": 0.94226873, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.85351562, + "step": 2280, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.08901083, + "epoch": 0.438822624086187, + "flos": 486150048768.0, + "grad_norm": 0.03276492690852342, + "language_loss": 0.87875438, + "learning_rate": 0.000622122729734195, + "loss": 0.89049757, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.85400391, + "step": 2281, + "time_per_iteration": 2.5878114700317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.09008515, + "epoch": 0.4390150057714506, + "flos": 500258615808.0, + "grad_norm": 0.02649151217717187, + "language_loss": 0.92922705, + "learning_rate": 0.0006218205992825566, + "loss": 0.94098091, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.85400391, + "step": 2282, + "time_per_iteration": 2.6129069328308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.08652771, + "epoch": 0.4392073874567141, + "flos": 559351669248.0, + "grad_norm": 0.029077625047839704, + "language_loss": 0.88682199, + "learning_rate": 0.0006215184215359671, + "loss": 0.89853978, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.85351562, + "step": 2283, + "time_per_iteration": 2.7397634983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011712, + "balance_loss_mlp": 1.08594131, + "epoch": 0.4393997691419777, + "flos": 606422251008.0, + "grad_norm": 0.030174398524898192, + "language_loss": 0.92242193, + "learning_rate": 0.0006212161966117425, + "loss": 0.93413389, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.85351562, + "step": 2284, + "time_per_iteration": 2.710947275161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168513, + "balance_loss_mlp": 1.08349264, + "epoch": 0.43959215082724123, + "flos": 805483614720.0, + "grad_norm": 0.03159683391584848, + "language_loss": 0.8931039, + "learning_rate": 0.0006209139246272164, + "loss": 0.90478909, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.85107422, + "step": 2285, + "time_per_iteration": 2.9573750495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167409, + "balance_loss_mlp": 1.08229375, + "epoch": 0.4397845325125048, + "flos": 488607446016.0, + "grad_norm": 0.033192711624055064, + "language_loss": 0.89631027, + "learning_rate": 0.0006206116056997421, + "loss": 0.90798426, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.85205078, + "step": 2286, + "time_per_iteration": 2.5915918350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168495, + "balance_loss_mlp": 1.08380854, + "epoch": 0.43997691419776835, + "flos": 481784475648.0, + "grad_norm": 0.02920198010279229, + "language_loss": 0.88986552, + "learning_rate": 0.0006203092399466892, + "loss": 0.90155041, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.84765625, + "step": 2287, + "time_per_iteration": 2.6179182529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167372, + "balance_loss_mlp": 1.08282888, + "epoch": 0.44016929588303194, + "flos": 484129081344.0, + "grad_norm": 0.024305807708132735, + "language_loss": 0.91028094, + "learning_rate": 0.0006200068274854473, + "loss": 0.92195475, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.84619141, + "step": 2288, + "time_per_iteration": 2.6643898487091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168421, + "balance_loss_mlp": 1.08387816, + "epoch": 0.4403616775682955, + "flos": 573023806464.0, + "grad_norm": 0.025110382343061666, + "language_loss": 0.90969157, + "learning_rate": 0.0006197043684334229, + "loss": 0.92137575, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.84619141, + "step": 2289, + "time_per_iteration": 2.7810122966766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169529, + "balance_loss_mlp": 1.08503318, + "epoch": 0.44055405925355906, + "flos": 631999339008.0, + "grad_norm": 0.03160389670817918, + "language_loss": 0.85855997, + "learning_rate": 0.0006194018629080411, + "loss": 0.87025523, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.84570312, + "step": 2290, + "time_per_iteration": 2.7407448291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165877, + "balance_loss_mlp": 1.08147717, + "epoch": 0.44074644093882265, + "flos": 537825248256.0, + "grad_norm": 0.027939915930863316, + "language_loss": 0.87505877, + "learning_rate": 0.0006190993110267451, + "loss": 0.88671762, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.84472656, + "step": 2291, + "time_per_iteration": 2.7158915996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167062, + "balance_loss_mlp": 1.08280444, + "epoch": 0.4409388226240862, + "flos": 464165730816.0, + "grad_norm": 0.03127864863359821, + "language_loss": 0.91365832, + "learning_rate": 0.0006187967129069958, + "loss": 0.92532897, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.84326172, + "step": 2292, + "time_per_iteration": 2.506866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167494, + "balance_loss_mlp": 1.08337986, + "epoch": 0.44113120430934977, + "flos": 567160290816.0, + "grad_norm": 0.024295125434261364, + "language_loss": 0.92081046, + "learning_rate": 0.0006184940686662722, + "loss": 0.93248534, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.84179688, + "step": 2293, + "time_per_iteration": 2.7406985759735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168054, + "balance_loss_mlp": 1.084131, + "epoch": 0.4413235859946133, + "flos": 544674415104.0, + "grad_norm": 0.02998433601693185, + "language_loss": 0.95718068, + "learning_rate": 0.0006181913784220714, + "loss": 0.96886122, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.83984375, + "step": 2294, + "time_per_iteration": 2.7276971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186783, + "balance_loss_mlp": 1.1034317, + "epoch": 0.4415159676798769, + "flos": 1573302720000.0, + "grad_norm": 0.012177255736314117, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.8174057, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.83398438, + "step": 2295, + "time_per_iteration": 4.898420333862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174829, + "balance_loss_mlp": 1.0908581, + "epoch": 0.4417083493651404, + "flos": 660012357120.0, + "grad_norm": 0.02926637357686751, + "language_loss": 0.86549121, + "learning_rate": 0.0006175858603933146, + "loss": 0.87723947, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.84033203, + "step": 2296, + "time_per_iteration": 2.866745710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166372, + "balance_loss_mlp": 1.08225799, + "epoch": 0.441900731050404, + "flos": 741816869376.0, + "grad_norm": 0.028401827027787777, + "language_loss": 0.8638438, + "learning_rate": 0.0006172830328438416, + "loss": 0.87550759, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.84179688, + "step": 2297, + "time_per_iteration": 2.9731123447418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165335, + "balance_loss_mlp": 1.08088684, + "epoch": 0.44209311273566754, + "flos": 540595550208.0, + "grad_norm": 0.030114194292861593, + "language_loss": 0.93111193, + "learning_rate": 0.0006169801597610572, + "loss": 0.94276524, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.84521484, + "step": 2298, + "time_per_iteration": 2.777326822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163943, + "balance_loss_mlp": 1.07959104, + "epoch": 0.4422854944209311, + "flos": 622729704960.0, + "grad_norm": 0.030043302620551878, + "language_loss": 0.96779996, + "learning_rate": 0.0006166772412625469, + "loss": 0.97943938, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.84423828, + "step": 2299, + "time_per_iteration": 2.8143997192382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164367, + "balance_loss_mlp": 1.08006215, + "epoch": 0.4424778761061947, + "flos": 660060020736.0, + "grad_norm": 0.031086205360051855, + "language_loss": 0.88609374, + "learning_rate": 0.0006163742774659141, + "loss": 0.89773744, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.84375, + "step": 2300, + "time_per_iteration": 2.8234009742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116513, + "balance_loss_mlp": 1.08087325, + "epoch": 0.44267025779145824, + "flos": 569702281728.0, + "grad_norm": 0.02554920530971592, + "language_loss": 0.92150819, + "learning_rate": 0.0006160712684887801, + "loss": 0.93315947, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.84326172, + "step": 2301, + "time_per_iteration": 2.733370542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170443, + "balance_loss_mlp": 1.08623374, + "epoch": 0.44286263947672183, + "flos": 497818682880.0, + "grad_norm": 0.02788747598953172, + "language_loss": 0.88145387, + "learning_rate": 0.0006157682144487832, + "loss": 0.89315832, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.84277344, + "step": 2302, + "time_per_iteration": 2.766334295272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171189, + "balance_loss_mlp": 1.08697963, + "epoch": 0.44305502116198536, + "flos": 610607903232.0, + "grad_norm": 0.028872273370365097, + "language_loss": 0.89961743, + "learning_rate": 0.0006154651154635793, + "loss": 0.91132939, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.84277344, + "step": 2303, + "time_per_iteration": 2.844402313232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08776116, + "epoch": 0.44324740284724895, + "flos": 471742038528.0, + "grad_norm": 0.028372285588360545, + "language_loss": 0.91810459, + "learning_rate": 0.0006151619716508421, + "loss": 0.92982763, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.84619141, + "step": 2304, + "time_per_iteration": 2.545243263244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166666, + "balance_loss_mlp": 1.08197927, + "epoch": 0.4434397845325125, + "flos": 579811848192.0, + "grad_norm": 0.029138508250266412, + "language_loss": 0.93279153, + "learning_rate": 0.0006148587831282625, + "loss": 0.94445825, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.84765625, + "step": 2305, + "time_per_iteration": 2.6743574142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179131, + "balance_loss_mlp": 1.09654236, + "epoch": 0.44363216621777607, + "flos": 1499995038720.0, + "grad_norm": 0.011431210063158581, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80355197, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.82617188, + "step": 2306, + "time_per_iteration": 4.870469570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177668, + "balance_loss_mlp": 1.09298158, + "epoch": 0.44382454790303966, + "flos": 478285031424.0, + "grad_norm": 0.03377230518223979, + "language_loss": 0.94630158, + "learning_rate": 0.0006142522724244255, + "loss": 0.95807827, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.84765625, + "step": 2307, + "time_per_iteration": 2.5165300369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181709, + "balance_loss_mlp": 1.09912109, + "epoch": 0.4440169295883032, + "flos": 1547303938560.0, + "grad_norm": 0.010354849447395944, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77666426, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.82617188, + "step": 2308, + "time_per_iteration": 4.86593222618103 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168989, + "balance_loss_mlp": 1.0843029, + "epoch": 0.4442093112735668, + "flos": 592290215424.0, + "grad_norm": 0.030546908540126056, + "language_loss": 0.84313834, + "learning_rate": 0.000613645584293942, + "loss": 0.85482824, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.84765625, + "step": 2309, + "time_per_iteration": 2.9245197772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179296, + "balance_loss_mlp": 1.09465766, + "epoch": 0.4444016929588303, + "flos": 531327917568.0, + "grad_norm": 0.02954341623225009, + "language_loss": 0.89990199, + "learning_rate": 0.0006133421739881185, + "loss": 0.91169494, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.84716797, + "step": 2310, + "time_per_iteration": 2.6806466579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173935, + "balance_loss_mlp": 1.08958304, + "epoch": 0.4445940746440939, + "flos": 621388214784.0, + "grad_norm": 0.03132503362752706, + "language_loss": 0.89829159, + "learning_rate": 0.0006130387196789605, + "loss": 0.91003096, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.84423828, + "step": 2311, + "time_per_iteration": 2.7674410343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171768, + "balance_loss_mlp": 1.08751106, + "epoch": 0.4447864563293574, + "flos": 630375869952.0, + "grad_norm": 0.024389617188914626, + "language_loss": 0.89820284, + "learning_rate": 0.0006127352214842795, + "loss": 0.90992051, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.84326172, + "step": 2312, + "time_per_iteration": 3.0181000232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170174, + "balance_loss_mlp": 1.08591735, + "epoch": 0.444978838014621, + "flos": 652001620992.0, + "grad_norm": 0.03266392614581568, + "language_loss": 0.92178452, + "learning_rate": 0.0006124316795219041, + "loss": 0.93348622, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.84326172, + "step": 2313, + "time_per_iteration": 2.7772133350372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172939, + "balance_loss_mlp": 1.08911133, + "epoch": 0.44517121969988455, + "flos": 613588325376.0, + "grad_norm": 0.026148577301855224, + "language_loss": 0.88032007, + "learning_rate": 0.0006121280939096794, + "loss": 0.89204955, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.83886719, + "step": 2314, + "time_per_iteration": 2.7472517490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.09010756, + "epoch": 0.44536360138514813, + "flos": 489714620928.0, + "grad_norm": 0.031365562822013526, + "language_loss": 0.94548678, + "learning_rate": 0.000611824464765468, + "loss": 0.95722377, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.83642578, + "step": 2315, + "time_per_iteration": 2.5471882820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188843, + "balance_loss_mlp": 1.10758972, + "epoch": 0.4455559830704117, + "flos": 1519053877248.0, + "grad_norm": 0.020817362108823283, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79783785, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.8125, + "step": 2316, + "time_per_iteration": 4.660900831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.08663106, + "epoch": 0.44574836475567525, + "flos": 616816524288.0, + "grad_norm": 0.03088300803415325, + "language_loss": 0.9123913, + "learning_rate": 0.000611217076352619, + "loss": 0.92409492, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.83789062, + "step": 2317, + "time_per_iteration": 2.7556822299957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171317, + "balance_loss_mlp": 1.08772719, + "epoch": 0.44594074644093884, + "flos": 507433422336.0, + "grad_norm": 0.026331926721779163, + "language_loss": 0.8931551, + "learning_rate": 0.0006109133173197905, + "loss": 0.90486825, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.83642578, + "step": 2318, + "time_per_iteration": 2.720372200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172625, + "balance_loss_mlp": 1.08908355, + "epoch": 0.44613312812620237, + "flos": 728311918080.0, + "grad_norm": 0.030991917971638312, + "language_loss": 0.91262019, + "learning_rate": 0.0006106095152265935, + "loss": 0.92434645, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.8359375, + "step": 2319, + "time_per_iteration": 2.8956825733184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171779, + "balance_loss_mlp": 1.08776009, + "epoch": 0.44632550981146596, + "flos": 637057850880.0, + "grad_norm": 0.02763281666385245, + "language_loss": 0.90440875, + "learning_rate": 0.0006103056701909739, + "loss": 0.91612655, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.84082031, + "step": 2320, + "time_per_iteration": 2.9104726314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175182, + "balance_loss_mlp": 1.09116352, + "epoch": 0.4465178914967295, + "flos": 828616766976.0, + "grad_norm": 0.02413420043376393, + "language_loss": 0.88773656, + "learning_rate": 0.0006100017823308956, + "loss": 0.89948833, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.84082031, + "step": 2321, + "time_per_iteration": 3.1638107299804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176807, + "balance_loss_mlp": 1.0927887, + "epoch": 0.4467102731819931, + "flos": 667032712704.0, + "grad_norm": 0.03201581013716374, + "language_loss": 0.87315178, + "learning_rate": 0.0006096978517643377, + "loss": 0.88491988, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.84082031, + "step": 2322, + "time_per_iteration": 2.7875144481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182039, + "balance_loss_mlp": 1.09792459, + "epoch": 0.4469026548672566, + "flos": 513969684480.0, + "grad_norm": 0.032089815412588485, + "language_loss": 0.90642822, + "learning_rate": 0.0006093938786092968, + "loss": 0.91824853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.84179688, + "step": 2323, + "time_per_iteration": 2.6789090633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181311, + "balance_loss_mlp": 1.097054, + "epoch": 0.4470950365525202, + "flos": 685285272576.0, + "grad_norm": 0.032095192334159584, + "language_loss": 0.95970643, + "learning_rate": 0.0006090898629837857, + "loss": 0.97151959, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.84326172, + "step": 2324, + "time_per_iteration": 2.842829704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174335, + "balance_loss_mlp": 1.08993506, + "epoch": 0.4472874182377838, + "flos": 628534823424.0, + "grad_norm": 0.02542366781046337, + "language_loss": 0.93390518, + "learning_rate": 0.0006087858050058337, + "loss": 0.94564855, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.84472656, + "step": 2325, + "time_per_iteration": 2.798461675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173301, + "balance_loss_mlp": 1.08899629, + "epoch": 0.4474797999230473, + "flos": 548240988672.0, + "grad_norm": 0.026872235695321916, + "language_loss": 0.8790192, + "learning_rate": 0.0006084817047934866, + "loss": 0.8907522, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.84375, + "step": 2326, + "time_per_iteration": 2.6333069801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170357, + "balance_loss_mlp": 1.08552742, + "epoch": 0.4476721816083109, + "flos": 456756609024.0, + "grad_norm": 0.03263470786125086, + "language_loss": 0.9605242, + "learning_rate": 0.0006081775624648066, + "loss": 0.97222769, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.84912109, + "step": 2327, + "time_per_iteration": 2.506568431854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171196, + "balance_loss_mlp": 1.08660555, + "epoch": 0.44786456329357444, + "flos": 482500882944.0, + "grad_norm": 0.030530219610100114, + "language_loss": 0.89424241, + "learning_rate": 0.0006078733781378721, + "loss": 0.90595436, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.84667969, + "step": 2328, + "time_per_iteration": 2.5324759483337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174464, + "balance_loss_mlp": 1.09006357, + "epoch": 0.448056944978838, + "flos": 553236374016.0, + "grad_norm": 0.028423200188041658, + "language_loss": 0.87742424, + "learning_rate": 0.0006075691519307781, + "loss": 0.88916886, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.84472656, + "step": 2329, + "time_per_iteration": 2.8329951763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169966, + "balance_loss_mlp": 1.08580375, + "epoch": 0.44824932666410156, + "flos": 551916350976.0, + "grad_norm": 0.030957218182316032, + "language_loss": 0.88990253, + "learning_rate": 0.0006072648839616356, + "loss": 0.90160215, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.84228516, + "step": 2330, + "time_per_iteration": 2.6367061138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169901, + "balance_loss_mlp": 1.08612072, + "epoch": 0.44844170834936514, + "flos": 990271953408.0, + "grad_norm": 0.02484019388371453, + "language_loss": 0.87772298, + "learning_rate": 0.0006069605743485718, + "loss": 0.88942194, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.83837891, + "step": 2331, + "time_per_iteration": 3.3425865173339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177153, + "balance_loss_mlp": 1.09356356, + "epoch": 0.44863409003462873, + "flos": 592450670592.0, + "grad_norm": 0.02816420707323987, + "language_loss": 0.89319122, + "learning_rate": 0.0006066562232097303, + "loss": 0.90496272, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.83642578, + "step": 2332, + "time_per_iteration": 2.7754669189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.09473884, + "epoch": 0.44882647171989226, + "flos": 725984776704.0, + "grad_norm": 0.02840681089712515, + "language_loss": 0.91798162, + "learning_rate": 0.0006063518306632708, + "loss": 0.92976487, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.83642578, + "step": 2333, + "time_per_iteration": 2.9270272254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174339, + "balance_loss_mlp": 1.09065437, + "epoch": 0.44901885340515585, + "flos": 535990932480.0, + "grad_norm": 0.029373675588589353, + "language_loss": 0.88265771, + "learning_rate": 0.0006060473968273688, + "loss": 0.89440107, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.83740234, + "step": 2334, + "time_per_iteration": 2.6593613624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199905, + "balance_loss_mlp": 1.11693573, + "epoch": 0.4492112350904194, + "flos": 1558690593792.0, + "grad_norm": 0.016875691883268894, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79079443, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.83007812, + "step": 2335, + "time_per_iteration": 4.868390321731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182762, + "balance_loss_mlp": 1.10017395, + "epoch": 0.44940361677568297, + "flos": 1526700768768.0, + "grad_norm": 0.009982769528938305, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82187974, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.82617188, + "step": 2336, + "time_per_iteration": 4.8639936447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176453, + "balance_loss_mlp": 1.09286392, + "epoch": 0.4495959984609465, + "flos": 383320673280.0, + "grad_norm": 0.04017386378382665, + "language_loss": 0.95653474, + "learning_rate": 0.0006051338487650047, + "loss": 0.96829921, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.83642578, + "step": 2337, + "time_per_iteration": 2.451195240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177188, + "balance_loss_mlp": 1.09364605, + "epoch": 0.4497883801462101, + "flos": 498882196992.0, + "grad_norm": 0.03424215683733749, + "language_loss": 0.88682485, + "learning_rate": 0.0006048292509534095, + "loss": 0.89859676, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.8359375, + "step": 2338, + "time_per_iteration": 2.5799245834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174646, + "balance_loss_mlp": 1.09139061, + "epoch": 0.4499807618314736, + "flos": 615589827072.0, + "grad_norm": 0.03300851417215051, + "language_loss": 0.85045063, + "learning_rate": 0.0006045246124434895, + "loss": 0.86219716, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.83300781, + "step": 2339, + "time_per_iteration": 2.732715368270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170546, + "balance_loss_mlp": 1.08738542, + "epoch": 0.4501731435167372, + "flos": 1007067503616.0, + "grad_norm": 0.0319502465029259, + "language_loss": 0.92538428, + "learning_rate": 0.0006042199333535162, + "loss": 0.9370898, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.83203125, + "step": 2340, + "time_per_iteration": 3.3100435733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170678, + "balance_loss_mlp": 1.08742249, + "epoch": 0.4503655252020008, + "flos": 822327555072.0, + "grad_norm": 0.024782286149646622, + "language_loss": 0.88794839, + "learning_rate": 0.0006039152138017763, + "loss": 0.89965516, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.83300781, + "step": 2341, + "time_per_iteration": 3.0845420360565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117382, + "balance_loss_mlp": 1.09027839, + "epoch": 0.4505579068872643, + "flos": 487413676032.0, + "grad_norm": 0.028274686754151398, + "language_loss": 0.8912791, + "learning_rate": 0.0006036104539065726, + "loss": 0.90301728, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.8359375, + "step": 2342, + "time_per_iteration": 2.704869270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170482, + "balance_loss_mlp": 1.08679724, + "epoch": 0.4507502885725279, + "flos": 886335403008.0, + "grad_norm": 0.02767032513042878, + "language_loss": 0.89237905, + "learning_rate": 0.000603305653786223, + "loss": 0.90408385, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.83740234, + "step": 2343, + "time_per_iteration": 3.143308162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169453, + "balance_loss_mlp": 1.08576834, + "epoch": 0.45094267025779144, + "flos": 579421080576.0, + "grad_norm": 0.028420960086658186, + "language_loss": 0.90634954, + "learning_rate": 0.0006030008135590622, + "loss": 0.91804409, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.83740234, + "step": 2344, + "time_per_iteration": 2.7383973598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177198, + "balance_loss_mlp": 1.09332275, + "epoch": 0.45113505194305503, + "flos": 526441320960.0, + "grad_norm": 0.025225422820390885, + "language_loss": 0.85642457, + "learning_rate": 0.0006026959333434387, + "loss": 0.86819655, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.83935547, + "step": 2345, + "time_per_iteration": 2.7594330310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177326, + "balance_loss_mlp": 1.09316456, + "epoch": 0.45132743362831856, + "flos": 503115512832.0, + "grad_norm": 0.026356266791679354, + "language_loss": 0.83258432, + "learning_rate": 0.0006023910132577181, + "loss": 0.84435755, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.84228516, + "step": 2346, + "time_per_iteration": 2.6426072120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174296, + "balance_loss_mlp": 1.09051549, + "epoch": 0.45151981531358215, + "flos": 432835917312.0, + "grad_norm": 0.03747446326611767, + "language_loss": 0.91464496, + "learning_rate": 0.0006020860534202806, + "loss": 0.92638797, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.83837891, + "step": 2347, + "time_per_iteration": 2.5375916957855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08799899, + "epoch": 0.4517121969988457, + "flos": 713493674496.0, + "grad_norm": 0.026159040948808, + "language_loss": 0.86486131, + "learning_rate": 0.0006017810539495224, + "loss": 0.87658435, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.84375, + "step": 2348, + "time_per_iteration": 2.935776472091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172944, + "balance_loss_mlp": 1.0886873, + "epoch": 0.45190457868410927, + "flos": 580556453376.0, + "grad_norm": 0.02859512200307389, + "language_loss": 0.8919422, + "learning_rate": 0.0006014760149638547, + "loss": 0.90367162, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.84326172, + "step": 2349, + "time_per_iteration": 4.1359429359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117423, + "balance_loss_mlp": 1.08982956, + "epoch": 0.45209696036937286, + "flos": 483627523584.0, + "grad_norm": 0.04225699722465749, + "language_loss": 0.94155228, + "learning_rate": 0.000601170936581704, + "loss": 0.95329458, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.84472656, + "step": 2350, + "time_per_iteration": 2.551886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171893, + "balance_loss_mlp": 1.08739793, + "epoch": 0.4522893420546364, + "flos": 541259564544.0, + "grad_norm": 0.03047412078786442, + "language_loss": 0.90869355, + "learning_rate": 0.0006008658189215121, + "loss": 0.92041242, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.84570312, + "step": 2351, + "time_per_iteration": 2.6196951866149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176582, + "balance_loss_mlp": 1.09175217, + "epoch": 0.4524817237399, + "flos": 497690428416.0, + "grad_norm": 0.03573709607194862, + "language_loss": 0.8682127, + "learning_rate": 0.0006005606621017366, + "loss": 0.87997848, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.84912109, + "step": 2352, + "time_per_iteration": 2.5675714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174694, + "balance_loss_mlp": 1.09024608, + "epoch": 0.4526741054251635, + "flos": 653840666112.0, + "grad_norm": 0.027536817578414453, + "language_loss": 0.86718237, + "learning_rate": 0.0006002554662408496, + "loss": 0.87892926, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.84521484, + "step": 2353, + "time_per_iteration": 2.887061595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182285, + "balance_loss_mlp": 1.09774196, + "epoch": 0.4528664871104271, + "flos": 572003226624.0, + "grad_norm": 0.03098083736113463, + "language_loss": 0.96988797, + "learning_rate": 0.0005999502314573388, + "loss": 0.98171079, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.84619141, + "step": 2354, + "time_per_iteration": 2.6700878143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184968, + "balance_loss_mlp": 1.1005199, + "epoch": 0.45305886879569063, + "flos": 459678633984.0, + "grad_norm": 0.034884925425697356, + "language_loss": 0.93055832, + "learning_rate": 0.0005996449578697066, + "loss": 0.94240803, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.84521484, + "step": 2355, + "time_per_iteration": 2.6873598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180832, + "balance_loss_mlp": 1.09647942, + "epoch": 0.4532512504809542, + "flos": 506206725120.0, + "grad_norm": 0.028006133853455534, + "language_loss": 0.87364781, + "learning_rate": 0.0005993396455964709, + "loss": 0.88545609, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.84423828, + "step": 2356, + "time_per_iteration": 2.672428607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179518, + "balance_loss_mlp": 1.09545124, + "epoch": 0.4534436321662178, + "flos": 583311292416.0, + "grad_norm": 0.033764708533666976, + "language_loss": 0.88888013, + "learning_rate": 0.0005990342947561647, + "loss": 0.90067536, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.84130859, + "step": 2357, + "time_per_iteration": 2.7101337909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179265, + "balance_loss_mlp": 1.09529436, + "epoch": 0.45363601385148133, + "flos": 550772246016.0, + "grad_norm": 0.03168807299418994, + "language_loss": 0.84871709, + "learning_rate": 0.0005987289054673351, + "loss": 0.86050975, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.84033203, + "step": 2358, + "time_per_iteration": 2.6033973693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122184, + "balance_loss_mlp": 1.14096832, + "epoch": 0.4538283955367449, + "flos": 1477791141888.0, + "grad_norm": 0.02971290012878958, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.7779758, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.80859375, + "step": 2359, + "time_per_iteration": 4.841644525527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172875, + "balance_loss_mlp": 1.0889039, + "epoch": 0.45402077722200845, + "flos": 585796887552.0, + "grad_norm": 0.03208897744410929, + "language_loss": 0.98243296, + "learning_rate": 0.0005981180120183722, + "loss": 0.99416173, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.84033203, + "step": 2360, + "time_per_iteration": 2.76943302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183781, + "balance_loss_mlp": 1.09957135, + "epoch": 0.45421315890727204, + "flos": 532888986624.0, + "grad_norm": 0.026822351719262807, + "language_loss": 0.89930874, + "learning_rate": 0.0005978125080954089, + "loss": 0.91114652, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.84277344, + "step": 2361, + "time_per_iteration": 2.822767972946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180091, + "balance_loss_mlp": 1.09597707, + "epoch": 0.4544055405925356, + "flos": 786551577600.0, + "grad_norm": 0.034773976616178995, + "language_loss": 0.84516251, + "learning_rate": 0.000597506966198262, + "loss": 0.85696352, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.84179688, + "step": 2362, + "time_per_iteration": 2.952383518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177, + "balance_loss_mlp": 1.09288561, + "epoch": 0.45459792227779916, + "flos": 519201386496.0, + "grad_norm": 0.03664720273497137, + "language_loss": 0.91360861, + "learning_rate": 0.0005972013864455536, + "loss": 0.92537856, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.84179688, + "step": 2363, + "time_per_iteration": 2.6317927837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178521, + "balance_loss_mlp": 1.09450209, + "epoch": 0.4547903039630627, + "flos": 538598051328.0, + "grad_norm": 0.028772208334572696, + "language_loss": 0.91273308, + "learning_rate": 0.0005968957689559203, + "loss": 0.92451829, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.84082031, + "step": 2364, + "time_per_iteration": 2.6589906215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173596, + "balance_loss_mlp": 1.0895294, + "epoch": 0.4549826856483263, + "flos": 529690987008.0, + "grad_norm": 0.029727340486193105, + "language_loss": 0.95477283, + "learning_rate": 0.0005965901138480131, + "loss": 0.96650875, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.84130859, + "step": 2365, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171355, + "balance_loss_mlp": 1.08700228, + "epoch": 0.45517506733358987, + "flos": 521982422016.0, + "grad_norm": 0.030829958952989886, + "language_loss": 0.94295681, + "learning_rate": 0.0005962844212404982, + "loss": 0.95467031, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.84423828, + "step": 2366, + "time_per_iteration": 2.662235736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177016, + "balance_loss_mlp": 1.09271073, + "epoch": 0.4553674490188534, + "flos": 452009000448.0, + "grad_norm": 0.02436634770305822, + "language_loss": 0.92783928, + "learning_rate": 0.0005959786912520558, + "loss": 0.93960941, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.84375, + "step": 2367, + "time_per_iteration": 2.573124408721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117318, + "balance_loss_mlp": 1.08906567, + "epoch": 0.455559830704117, + "flos": 547744160256.0, + "grad_norm": 0.037205613753220755, + "language_loss": 0.90209919, + "learning_rate": 0.0005956729240013806, + "loss": 0.913831, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.84179688, + "step": 2368, + "time_per_iteration": 2.772557020187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173597, + "balance_loss_mlp": 1.08943486, + "epoch": 0.4557522123893805, + "flos": 584865630720.0, + "grad_norm": 0.026144628796570656, + "language_loss": 0.97770655, + "learning_rate": 0.0005953671196071824, + "loss": 0.98944247, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.84228516, + "step": 2369, + "time_per_iteration": 2.7082910537719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.08819652, + "epoch": 0.4559445940746441, + "flos": 527483367936.0, + "grad_norm": 0.0309922218143565, + "language_loss": 0.8751142, + "learning_rate": 0.0005950612781881846, + "loss": 0.8868373, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.84179688, + "step": 2370, + "time_per_iteration": 2.7258613109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172868, + "balance_loss_mlp": 1.08913577, + "epoch": 0.45613697575990764, + "flos": 653367306240.0, + "grad_norm": 0.03125586624235708, + "language_loss": 0.84058654, + "learning_rate": 0.0005947553998631259, + "loss": 0.85231519, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.83789062, + "step": 2371, + "time_per_iteration": 2.8463094234466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169614, + "balance_loss_mlp": 1.08626282, + "epoch": 0.4563293574451712, + "flos": 868623332352.0, + "grad_norm": 0.025158843177806284, + "language_loss": 0.84537494, + "learning_rate": 0.000594449484750758, + "loss": 0.85707104, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.83398438, + "step": 2372, + "time_per_iteration": 3.1793160438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165382, + "balance_loss_mlp": 1.08193552, + "epoch": 0.45652173913043476, + "flos": 499131975168.0, + "grad_norm": 0.03016735007152292, + "language_loss": 0.8953886, + "learning_rate": 0.0005941435329698484, + "loss": 0.90704238, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.83496094, + "step": 2373, + "time_per_iteration": 2.6885011196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168274, + "balance_loss_mlp": 1.08458936, + "epoch": 0.45671412081569834, + "flos": 561958788096.0, + "grad_norm": 0.029049495784182693, + "language_loss": 0.89830238, + "learning_rate": 0.0005938375446391778, + "loss": 0.90998513, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.83740234, + "step": 2374, + "time_per_iteration": 2.7694103717803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169785, + "balance_loss_mlp": 1.08605206, + "epoch": 0.45690650250096193, + "flos": 504122631168.0, + "grad_norm": 0.032895841438659715, + "language_loss": 0.95283711, + "learning_rate": 0.0005935315198775415, + "loss": 0.96453488, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.83789062, + "step": 2375, + "time_per_iteration": 2.6797261238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117336, + "balance_loss_mlp": 1.08967507, + "epoch": 0.45709888418622546, + "flos": 431598486528.0, + "grad_norm": 0.029217874962507603, + "language_loss": 0.93084061, + "learning_rate": 0.0005932254588037486, + "loss": 0.94257426, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.83740234, + "step": 2376, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170171, + "balance_loss_mlp": 1.08634305, + "epoch": 0.45729126587148905, + "flos": 526693100544.0, + "grad_norm": 0.033600967739372, + "language_loss": 0.91914618, + "learning_rate": 0.000592919361536623, + "loss": 0.93084788, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.83886719, + "step": 2377, + "time_per_iteration": 2.627753734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.08861363, + "epoch": 0.4574836475567526, + "flos": 639147949056.0, + "grad_norm": 0.02676395696709272, + "language_loss": 0.95213675, + "learning_rate": 0.0005926132281950017, + "loss": 0.9638592, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.83691406, + "step": 2378, + "time_per_iteration": 2.7404637336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171278, + "balance_loss_mlp": 1.08754539, + "epoch": 0.45767602924201617, + "flos": 650790386688.0, + "grad_norm": 0.03076010987013328, + "language_loss": 0.92175043, + "learning_rate": 0.0005923070588977367, + "loss": 0.93346316, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.83789062, + "step": 2379, + "time_per_iteration": 2.7948412895202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173225, + "balance_loss_mlp": 1.08944476, + "epoch": 0.4578684109272797, + "flos": 747962363904.0, + "grad_norm": 0.027484014603145524, + "language_loss": 0.92339164, + "learning_rate": 0.0005920008537636931, + "loss": 0.93512392, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.83837891, + "step": 2380, + "time_per_iteration": 2.903837203979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.08972311, + "epoch": 0.4580607926125433, + "flos": 642727984128.0, + "grad_norm": 0.029077527756171735, + "language_loss": 0.92490625, + "learning_rate": 0.0005916946129117504, + "loss": 0.93664026, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.83740234, + "step": 2381, + "time_per_iteration": 2.902449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169328, + "balance_loss_mlp": 1.08569121, + "epoch": 0.4582531742978069, + "flos": 803239065600.0, + "grad_norm": 0.02842187637415346, + "language_loss": 0.86509985, + "learning_rate": 0.0005913883364608017, + "loss": 0.87679315, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.83691406, + "step": 2382, + "time_per_iteration": 3.0474140644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171424, + "balance_loss_mlp": 1.0876435, + "epoch": 0.4584455559830704, + "flos": 685517586432.0, + "grad_norm": 0.02678099894990505, + "language_loss": 0.94194049, + "learning_rate": 0.0005910820245297542, + "loss": 0.95365477, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.83837891, + "step": 2383, + "time_per_iteration": 2.879652261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171015, + "balance_loss_mlp": 1.08718669, + "epoch": 0.458637937668334, + "flos": 519281977344.0, + "grad_norm": 0.03033035418174317, + "language_loss": 0.87193358, + "learning_rate": 0.000590775677237529, + "loss": 0.88364375, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.83886719, + "step": 2384, + "time_per_iteration": 2.718327045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116823, + "balance_loss_mlp": 1.08478332, + "epoch": 0.4588303193535975, + "flos": 506532364800.0, + "grad_norm": 0.028303891516217768, + "language_loss": 0.87188554, + "learning_rate": 0.0005904692947030601, + "loss": 0.88356787, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.83496094, + "step": 2385, + "time_per_iteration": 2.5850000381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166672, + "balance_loss_mlp": 1.08303475, + "epoch": 0.4590227010388611, + "flos": 496908893184.0, + "grad_norm": 0.031451346934425, + "language_loss": 0.9665041, + "learning_rate": 0.0005901628770452963, + "loss": 0.97817081, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.83691406, + "step": 2386, + "time_per_iteration": 2.5478482246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172991, + "balance_loss_mlp": 1.08964002, + "epoch": 0.45921508272412465, + "flos": 494601217536.0, + "grad_norm": 0.030858044337890404, + "language_loss": 0.93199378, + "learning_rate": 0.000589856424383199, + "loss": 0.94372368, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.83398438, + "step": 2387, + "time_per_iteration": 2.6889121532440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170845, + "balance_loss_mlp": 1.08744633, + "epoch": 0.45940746440938823, + "flos": 692592336384.0, + "grad_norm": 0.02985924743030105, + "language_loss": 0.89320701, + "learning_rate": 0.000589549936835744, + "loss": 0.90491545, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.83447266, + "step": 2388, + "time_per_iteration": 2.929584264755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167353, + "balance_loss_mlp": 1.08390617, + "epoch": 0.45959984609465176, + "flos": 504736980480.0, + "grad_norm": 0.026272627268038303, + "language_loss": 0.85652947, + "learning_rate": 0.0005892434145219202, + "loss": 0.86820304, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.83496094, + "step": 2389, + "time_per_iteration": 2.6049258708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.08593321, + "epoch": 0.45979222777991535, + "flos": 677839220736.0, + "grad_norm": 0.032142260667283734, + "language_loss": 0.89047158, + "learning_rate": 0.0005889368575607303, + "loss": 0.90216345, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.83300781, + "step": 2390, + "time_per_iteration": 2.8630926609039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170087, + "balance_loss_mlp": 1.08673584, + "epoch": 0.45998460946517894, + "flos": 779038396416.0, + "grad_norm": 0.02948026619685868, + "language_loss": 0.84149277, + "learning_rate": 0.00058863026607119, + "loss": 0.85319364, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.83398438, + "step": 2391, + "time_per_iteration": 3.0889787673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.08709574, + "epoch": 0.46017699115044247, + "flos": 853021552128.0, + "grad_norm": 0.028406278062058678, + "language_loss": 0.85429174, + "learning_rate": 0.0005883236401723287, + "loss": 0.8659972, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.83496094, + "step": 2392, + "time_per_iteration": 3.1613874435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167478, + "balance_loss_mlp": 1.08403194, + "epoch": 0.46036937283570606, + "flos": 576963683328.0, + "grad_norm": 0.029157836827012555, + "language_loss": 0.90157199, + "learning_rate": 0.0005880169799831893, + "loss": 0.91324675, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.83496094, + "step": 2393, + "time_per_iteration": 2.6974027156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117291, + "balance_loss_mlp": 1.08955884, + "epoch": 0.4605617545209696, + "flos": 613119694848.0, + "grad_norm": 0.028584885066092792, + "language_loss": 0.87511885, + "learning_rate": 0.0005877102856228278, + "loss": 0.88684797, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.83398438, + "step": 2394, + "time_per_iteration": 2.862462043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169553, + "balance_loss_mlp": 1.08591628, + "epoch": 0.4607541362062332, + "flos": 534158618112.0, + "grad_norm": 0.03156913659667245, + "language_loss": 0.91444194, + "learning_rate": 0.0005874035572103133, + "loss": 0.92613751, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.83691406, + "step": 2395, + "time_per_iteration": 2.66796612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171842, + "balance_loss_mlp": 1.08830035, + "epoch": 0.4609465178914967, + "flos": 648473978880.0, + "grad_norm": 0.039315545211924735, + "language_loss": 0.89278555, + "learning_rate": 0.0005870967948647288, + "loss": 0.90450394, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.8359375, + "step": 2396, + "time_per_iteration": 2.7669596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209076, + "balance_loss_mlp": 1.12553406, + "epoch": 0.4611388995767603, + "flos": 1469498426880.0, + "grad_norm": 0.015424486797259693, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.7551738, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.8359375, + "step": 2397, + "time_per_iteration": 5.5382936000823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.09377611, + "epoch": 0.46133128126202383, + "flos": 724476100608.0, + "grad_norm": 0.029375695907885992, + "language_loss": 0.91919947, + "learning_rate": 0.0005864831688507443, + "loss": 0.93097073, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.83398438, + "step": 2398, + "time_per_iteration": 2.95526123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171581, + "balance_loss_mlp": 1.08846855, + "epoch": 0.4615236629472874, + "flos": 549113848320.0, + "grad_norm": 0.030696537047505416, + "language_loss": 0.82409662, + "learning_rate": 0.0005861763054205754, + "loss": 0.83581245, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.83154297, + "step": 2399, + "time_per_iteration": 2.767615795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172709, + "balance_loss_mlp": 1.08973968, + "epoch": 0.461716044632551, + "flos": 603459293184.0, + "grad_norm": 0.02737063612292851, + "language_loss": 0.84976828, + "learning_rate": 0.0005858694085337976, + "loss": 0.86149538, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.83007812, + "step": 2400, + "time_per_iteration": 2.7964670658111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011724, + "balance_loss_mlp": 1.08966899, + "epoch": 0.46190842631781454, + "flos": 475436866560.0, + "grad_norm": 0.03229000781534058, + "language_loss": 0.9094255, + "learning_rate": 0.0005855624783095589, + "loss": 0.92114949, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.82763672, + "step": 2401, + "time_per_iteration": 2.534349203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170734, + "balance_loss_mlp": 1.08814597, + "epoch": 0.4621008080030781, + "flos": 438401991168.0, + "grad_norm": 0.027555285929390542, + "language_loss": 0.90607065, + "learning_rate": 0.00058525551486702, + "loss": 0.91777802, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.82617188, + "step": 2402, + "time_per_iteration": 2.5021228790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172463, + "balance_loss_mlp": 1.08987451, + "epoch": 0.46229318968834165, + "flos": 526497716736.0, + "grad_norm": 0.03262891309156314, + "language_loss": 0.88400978, + "learning_rate": 0.0005849485183253548, + "loss": 0.89573443, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.82617188, + "step": 2403, + "time_per_iteration": 2.6212213039398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165506, + "balance_loss_mlp": 1.08291745, + "epoch": 0.46248557137360524, + "flos": 440533748736.0, + "grad_norm": 0.02845192827842058, + "language_loss": 0.92361593, + "learning_rate": 0.0005846414888037501, + "loss": 0.93527102, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.82617188, + "step": 2404, + "time_per_iteration": 2.482285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166688, + "balance_loss_mlp": 1.08409953, + "epoch": 0.4626779530588688, + "flos": 618772363776.0, + "grad_norm": 0.03074329225106782, + "language_loss": 0.881423, + "learning_rate": 0.0005843344264214049, + "loss": 0.89308989, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.82617188, + "step": 2405, + "time_per_iteration": 2.746795415878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170811, + "balance_loss_mlp": 1.08803225, + "epoch": 0.46287033474413236, + "flos": 671359354368.0, + "grad_norm": 0.02816556419491645, + "language_loss": 0.904742, + "learning_rate": 0.0005840273312975317, + "loss": 0.91645014, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.828125, + "step": 2406, + "time_per_iteration": 2.866894483566284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168834, + "balance_loss_mlp": 1.08572149, + "epoch": 0.46306271642939595, + "flos": 481198324224.0, + "grad_norm": 0.027370741977369897, + "language_loss": 0.96141434, + "learning_rate": 0.0005837202035513555, + "loss": 0.97310269, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.83154297, + "step": 2407, + "time_per_iteration": 2.589233636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168547, + "balance_loss_mlp": 1.08562469, + "epoch": 0.4632550981146595, + "flos": 581857010688.0, + "grad_norm": 0.028787881065009197, + "language_loss": 0.87249482, + "learning_rate": 0.0005834130433021136, + "loss": 0.88418025, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.82958984, + "step": 2408, + "time_per_iteration": 2.77109432220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176276, + "balance_loss_mlp": 1.09311593, + "epoch": 0.46344747979992307, + "flos": 525017238528.0, + "grad_norm": 0.03139748973768327, + "language_loss": 0.79860151, + "learning_rate": 0.0005831058506690563, + "loss": 0.81036425, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.83203125, + "step": 2409, + "time_per_iteration": 2.6422629356384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175968, + "balance_loss_mlp": 1.0931412, + "epoch": 0.4636398614851866, + "flos": 747812642304.0, + "grad_norm": 0.02712568041794283, + "language_loss": 0.9122293, + "learning_rate": 0.0005827986257714464, + "loss": 0.92398894, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.82861328, + "step": 2410, + "time_per_iteration": 2.915513515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175895, + "balance_loss_mlp": 1.09254348, + "epoch": 0.4638322431704502, + "flos": 597645442560.0, + "grad_norm": 0.03337742182336422, + "language_loss": 0.94969916, + "learning_rate": 0.0005824913687285591, + "loss": 0.96145809, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.83398438, + "step": 2411, + "time_per_iteration": 2.7729153633117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174985, + "balance_loss_mlp": 1.09168148, + "epoch": 0.4640246248557137, + "flos": 540532423680.0, + "grad_norm": 0.028926449520475586, + "language_loss": 0.87762833, + "learning_rate": 0.0005821840796596821, + "loss": 0.88937813, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.83349609, + "step": 2412, + "time_per_iteration": 2.7454707622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174854, + "balance_loss_mlp": 1.09155095, + "epoch": 0.4642170065409773, + "flos": 563808566784.0, + "grad_norm": 0.027243427778446835, + "language_loss": 0.85983133, + "learning_rate": 0.0005818767586841158, + "loss": 0.87157989, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.83349609, + "step": 2413, + "time_per_iteration": 2.7634999752044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174726, + "balance_loss_mlp": 1.09161353, + "epoch": 0.46440938822624084, + "flos": 532061789184.0, + "grad_norm": 0.026139841130999073, + "language_loss": 0.91185576, + "learning_rate": 0.0005815694059211726, + "loss": 0.923603, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.83154297, + "step": 2414, + "time_per_iteration": 2.6814608573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193306, + "balance_loss_mlp": 1.11109924, + "epoch": 0.4646017699115044, + "flos": 1529624795136.0, + "grad_norm": 0.015412108289742382, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82066941, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.82226562, + "step": 2415, + "time_per_iteration": 4.867271184921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183273, + "balance_loss_mlp": 1.10163879, + "epoch": 0.464794151596768, + "flos": 1544171793408.0, + "grad_norm": 0.012751682226462524, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78128332, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.81640625, + "step": 2416, + "time_per_iteration": 5.0150392055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166548, + "balance_loss_mlp": 1.08391249, + "epoch": 0.46498653328203154, + "flos": 502538093568.0, + "grad_norm": 0.028765151082888876, + "language_loss": 0.92239797, + "learning_rate": 0.0005806471581013931, + "loss": 0.93406343, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.82666016, + "step": 2417, + "time_per_iteration": 2.6913554668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165917, + "balance_loss_mlp": 1.08332872, + "epoch": 0.46517891496729513, + "flos": 677300732928.0, + "grad_norm": 0.03431254801555697, + "language_loss": 0.85110676, + "learning_rate": 0.0005803396793823146, + "loss": 0.86276597, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.82617188, + "step": 2418, + "time_per_iteration": 2.8245232105255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169421, + "balance_loss_mlp": 1.08702314, + "epoch": 0.46537129665255866, + "flos": 586511293440.0, + "grad_norm": 0.03532488466841911, + "language_loss": 0.93255758, + "learning_rate": 0.0005800321694726065, + "loss": 0.94425178, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.82421875, + "step": 2419, + "time_per_iteration": 2.74255108833313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117097, + "balance_loss_mlp": 1.08866799, + "epoch": 0.46556367833782225, + "flos": 588820970496.0, + "grad_norm": 0.031254530654890866, + "language_loss": 0.92505676, + "learning_rate": 0.0005797246284916545, + "loss": 0.93676651, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.82324219, + "step": 2420, + "time_per_iteration": 2.6942667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182114, + "balance_loss_mlp": 1.10238647, + "epoch": 0.4657560600230858, + "flos": 1488582187008.0, + "grad_norm": 0.01896402624903705, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78687304, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.796875, + "step": 2421, + "time_per_iteration": 4.965069532394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179806, + "balance_loss_mlp": 1.09740829, + "epoch": 0.46594844170834937, + "flos": 581392382976.0, + "grad_norm": 0.035008146137172264, + "language_loss": 0.92618293, + "learning_rate": 0.0005791094537936233, + "loss": 0.93798101, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.82421875, + "step": 2422, + "time_per_iteration": 2.7509443759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116805, + "balance_loss_mlp": 1.08555722, + "epoch": 0.4661408233936129, + "flos": 513570184704.0, + "grad_norm": 0.03182837491947037, + "language_loss": 0.88539767, + "learning_rate": 0.0005788018203153762, + "loss": 0.89707822, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.82519531, + "step": 2423, + "time_per_iteration": 2.6291344165802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163454, + "balance_loss_mlp": 1.08038855, + "epoch": 0.4663332050788765, + "flos": 492033030144.0, + "grad_norm": 0.03147692461991822, + "language_loss": 0.92034245, + "learning_rate": 0.000578494156243549, + "loss": 0.93197691, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.83105469, + "step": 2424, + "time_per_iteration": 2.5616393089294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167158, + "balance_loss_mlp": 1.08390224, + "epoch": 0.4665255867641401, + "flos": 513707171328.0, + "grad_norm": 0.028174773974589257, + "language_loss": 0.94988501, + "learning_rate": 0.0005781864616975878, + "loss": 0.96155655, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.83300781, + "step": 2425, + "time_per_iteration": 2.67893648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178777, + "balance_loss_mlp": 1.09552157, + "epoch": 0.4667179684494036, + "flos": 425706772992.0, + "grad_norm": 0.03381525890081808, + "language_loss": 0.91298926, + "learning_rate": 0.0005778787367969502, + "loss": 0.92477703, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.83300781, + "step": 2426, + "time_per_iteration": 2.5708863735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180968, + "balance_loss_mlp": 1.09790349, + "epoch": 0.4669103501346672, + "flos": 709223428608.0, + "grad_norm": 0.031023375068471706, + "language_loss": 0.86979687, + "learning_rate": 0.0005775709816611053, + "loss": 0.88160658, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.83105469, + "step": 2427, + "time_per_iteration": 2.9488039016723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178737, + "balance_loss_mlp": 1.09543312, + "epoch": 0.4671027318199307, + "flos": 555945550848.0, + "grad_norm": 0.0268683026146142, + "language_loss": 0.8862977, + "learning_rate": 0.0005772631964095346, + "loss": 0.89808506, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.83349609, + "step": 2428, + "time_per_iteration": 2.6830828189849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176571, + "balance_loss_mlp": 1.09321952, + "epoch": 0.4672951135051943, + "flos": 568195607040.0, + "grad_norm": 0.029193722689313813, + "language_loss": 0.92024446, + "learning_rate": 0.000576955381161731, + "loss": 0.93201017, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.83398438, + "step": 2429, + "time_per_iteration": 2.7286531925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172919, + "balance_loss_mlp": 1.08956802, + "epoch": 0.46748749519045785, + "flos": 425418063360.0, + "grad_norm": 0.030194965591673555, + "language_loss": 0.93541706, + "learning_rate": 0.0005766475360371985, + "loss": 0.94714624, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.83398438, + "step": 2430, + "time_per_iteration": 2.5866243839263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171537, + "balance_loss_mlp": 1.08809078, + "epoch": 0.46767987687572143, + "flos": 539370854400.0, + "grad_norm": 0.031323302876694416, + "language_loss": 0.91645998, + "learning_rate": 0.0005763396611554536, + "loss": 0.92817533, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.83496094, + "step": 2431, + "time_per_iteration": 2.644538402557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169389, + "balance_loss_mlp": 1.08622885, + "epoch": 0.467872258560985, + "flos": 825075663360.0, + "grad_norm": 0.035112660876247544, + "language_loss": 0.8720994, + "learning_rate": 0.0005760317566360237, + "loss": 0.88379329, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.83203125, + "step": 2432, + "time_per_iteration": 2.9847497940063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169145, + "balance_loss_mlp": 1.08598459, + "epoch": 0.46806464024624855, + "flos": 662853791232.0, + "grad_norm": 0.03130586605287321, + "language_loss": 0.92657965, + "learning_rate": 0.000575723822598448, + "loss": 0.93827116, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.83203125, + "step": 2433, + "time_per_iteration": 2.7757930755615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166037, + "balance_loss_mlp": 1.08325768, + "epoch": 0.46825702193151214, + "flos": 757054078464.0, + "grad_norm": 0.025972857143736858, + "language_loss": 0.87588978, + "learning_rate": 0.0005754158591622773, + "loss": 0.88755012, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.828125, + "step": 2434, + "time_per_iteration": 2.9586892127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167751, + "balance_loss_mlp": 1.08482957, + "epoch": 0.4684494036167757, + "flos": 440310167040.0, + "grad_norm": 0.03095385887839679, + "language_loss": 0.89792037, + "learning_rate": 0.0005751078664470732, + "loss": 0.90959787, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.82958984, + "step": 2435, + "time_per_iteration": 2.5508580207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167106, + "balance_loss_mlp": 1.08446991, + "epoch": 0.46864178530203926, + "flos": 533748384768.0, + "grad_norm": 0.02784458934890301, + "language_loss": 0.91441107, + "learning_rate": 0.0005747998445724094, + "loss": 0.92608213, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.82666016, + "step": 2436, + "time_per_iteration": 2.6264078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166893, + "balance_loss_mlp": 1.08435297, + "epoch": 0.4688341669873028, + "flos": 577825809408.0, + "grad_norm": 0.028098929039846225, + "language_loss": 0.94501269, + "learning_rate": 0.0005744917936578707, + "loss": 0.95668173, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.82568359, + "step": 2437, + "time_per_iteration": 2.7923285961151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163054, + "balance_loss_mlp": 1.0805608, + "epoch": 0.4690265486725664, + "flos": 540717073920.0, + "grad_norm": 0.02510139841230761, + "language_loss": 0.88352144, + "learning_rate": 0.0005741837138230526, + "loss": 0.89515197, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.82519531, + "step": 2438, + "time_per_iteration": 2.720592737197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117104, + "balance_loss_mlp": 1.08849919, + "epoch": 0.4692189303578299, + "flos": 771881054208.0, + "grad_norm": 0.031043213179005578, + "language_loss": 0.91746414, + "learning_rate": 0.0005738756051875627, + "loss": 0.92917454, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.82568359, + "step": 2439, + "time_per_iteration": 3.0688676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179368, + "balance_loss_mlp": 1.09697056, + "epoch": 0.4694113120430935, + "flos": 572513516544.0, + "grad_norm": 0.031224617656339514, + "language_loss": 0.8895998, + "learning_rate": 0.0005735674678710192, + "loss": 0.90139341, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.82421875, + "step": 2440, + "time_per_iteration": 2.6647889614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180255, + "balance_loss_mlp": 1.09814322, + "epoch": 0.4696036937283571, + "flos": 750094121472.0, + "grad_norm": 0.03673041295896698, + "language_loss": 0.88509989, + "learning_rate": 0.0005732593019930517, + "loss": 0.89690244, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.82128906, + "step": 2441, + "time_per_iteration": 2.9219651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177833, + "balance_loss_mlp": 1.09553087, + "epoch": 0.4697960754136206, + "flos": 494442763776.0, + "grad_norm": 0.03186685029176949, + "language_loss": 0.93415046, + "learning_rate": 0.0005729511076733008, + "loss": 0.94592881, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.82324219, + "step": 2442, + "time_per_iteration": 2.6268982887268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163524, + "balance_loss_mlp": 1.08088803, + "epoch": 0.4699884570988842, + "flos": 726360081408.0, + "grad_norm": 0.03313850577325225, + "language_loss": 0.91418898, + "learning_rate": 0.000572642885031418, + "loss": 0.92582428, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.82666016, + "step": 2443, + "time_per_iteration": 2.847228527069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165965, + "balance_loss_mlp": 1.08337641, + "epoch": 0.47018083878414774, + "flos": 556577364480.0, + "grad_norm": 0.031620033102277616, + "language_loss": 0.86240256, + "learning_rate": 0.0005723346341870662, + "loss": 0.87406218, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.82617188, + "step": 2444, + "time_per_iteration": 2.7060024738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171889, + "balance_loss_mlp": 1.08944428, + "epoch": 0.4703732204694113, + "flos": 424962167808.0, + "grad_norm": 0.03469194433982127, + "language_loss": 0.92819834, + "learning_rate": 0.0005720263552599188, + "loss": 0.93991721, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.82470703, + "step": 2445, + "time_per_iteration": 2.486546754837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175277, + "balance_loss_mlp": 1.09307039, + "epoch": 0.47056560215467486, + "flos": 704755797504.0, + "grad_norm": 0.03273224664010927, + "language_loss": 0.86175644, + "learning_rate": 0.0005717180483696604, + "loss": 0.87350929, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.82226562, + "step": 2446, + "time_per_iteration": 2.8490843772888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173534, + "balance_loss_mlp": 1.09123182, + "epoch": 0.47075798383993844, + "flos": 556012680192.0, + "grad_norm": 0.030967943008195494, + "language_loss": 0.88733399, + "learning_rate": 0.0005714097136359862, + "loss": 0.89906937, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.82324219, + "step": 2447, + "time_per_iteration": 2.6790409088134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172662, + "balance_loss_mlp": 1.09035945, + "epoch": 0.470950365525202, + "flos": 565493160960.0, + "grad_norm": 0.028459673893144737, + "language_loss": 0.91199988, + "learning_rate": 0.0005711013511786027, + "loss": 0.92372644, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.82324219, + "step": 2448, + "time_per_iteration": 2.871711492538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169516, + "balance_loss_mlp": 1.08745217, + "epoch": 0.47114274721046556, + "flos": 535498106880.0, + "grad_norm": 0.02665313173872239, + "language_loss": 0.88226557, + "learning_rate": 0.0005707929611172263, + "loss": 0.89396071, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.82080078, + "step": 2449, + "time_per_iteration": 2.69319748878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166092, + "balance_loss_mlp": 1.08402824, + "epoch": 0.47133512889572915, + "flos": 474077912064.0, + "grad_norm": 0.0332447507442279, + "language_loss": 0.90459168, + "learning_rate": 0.000570484543571585, + "loss": 0.91625261, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.82080078, + "step": 2450, + "time_per_iteration": 2.5612680912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164305, + "balance_loss_mlp": 1.08228934, + "epoch": 0.4715275105809927, + "flos": 459967343616.0, + "grad_norm": 0.03392229050190778, + "language_loss": 0.90577096, + "learning_rate": 0.0005701760986614171, + "loss": 0.91741407, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.8203125, + "step": 2451, + "time_per_iteration": 2.5571579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166326, + "balance_loss_mlp": 1.08435798, + "epoch": 0.47171989226625627, + "flos": 422886806016.0, + "grad_norm": 0.028518751420243762, + "language_loss": 0.93793362, + "learning_rate": 0.0005698676265064714, + "loss": 0.94959688, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.81982422, + "step": 2452, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169062, + "balance_loss_mlp": 1.08680761, + "epoch": 0.4719122739515198, + "flos": 458376075264.0, + "grad_norm": 0.03301356479716476, + "language_loss": 0.95592558, + "learning_rate": 0.0005695591272265074, + "loss": 0.9676162, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.82275391, + "step": 2453, + "time_per_iteration": 2.512503147125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169417, + "balance_loss_mlp": 1.08730555, + "epoch": 0.4721046556367834, + "flos": 516016848384.0, + "grad_norm": 0.02961212180136774, + "language_loss": 0.87225032, + "learning_rate": 0.0005692506009412954, + "loss": 0.88394439, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.82128906, + "step": 2454, + "time_per_iteration": 2.673123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187157, + "balance_loss_mlp": 1.10609436, + "epoch": 0.4722970373220469, + "flos": 1575703721472.0, + "grad_norm": 0.017157731663316397, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78738415, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.81054688, + "step": 2455, + "time_per_iteration": 4.97356915473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164137, + "balance_loss_mlp": 1.08216834, + "epoch": 0.4724894190073105, + "flos": 587394886656.0, + "grad_norm": 0.02627427755104431, + "language_loss": 0.95142597, + "learning_rate": 0.0005686334678342593, + "loss": 0.96306741, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.81982422, + "step": 2456, + "time_per_iteration": 2.867849588394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165061, + "balance_loss_mlp": 1.08304489, + "epoch": 0.4726818006925741, + "flos": 869072497152.0, + "grad_norm": 0.03086214810478132, + "language_loss": 0.87917793, + "learning_rate": 0.0005683248612520274, + "loss": 0.89082849, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.8203125, + "step": 2457, + "time_per_iteration": 3.078068733215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08206928, + "epoch": 0.4728741823778376, + "flos": 754227380736.0, + "grad_norm": 0.03352301766800045, + "language_loss": 0.88896751, + "learning_rate": 0.0005680162281437321, + "loss": 0.90060842, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.8203125, + "step": 2458, + "time_per_iteration": 2.9237887859344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116424, + "balance_loss_mlp": 1.08260512, + "epoch": 0.4730665640631012, + "flos": 539657562624.0, + "grad_norm": 0.027635752733509208, + "language_loss": 0.89953935, + "learning_rate": 0.000567707568629195, + "loss": 0.91118181, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.81640625, + "step": 2459, + "time_per_iteration": 2.719519853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166645, + "balance_loss_mlp": 1.08505821, + "epoch": 0.47325894574836475, + "flos": 492682308096.0, + "grad_norm": 0.027667404433321316, + "language_loss": 0.88089126, + "learning_rate": 0.0005673988828282486, + "loss": 0.89255774, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.81591797, + "step": 2460, + "time_per_iteration": 2.71736216545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165583, + "balance_loss_mlp": 1.0839963, + "epoch": 0.47345132743362833, + "flos": 765830886912.0, + "grad_norm": 0.028127891455978875, + "language_loss": 0.87479305, + "learning_rate": 0.0005670901708607352, + "loss": 0.88644892, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.81591797, + "step": 2461, + "time_per_iteration": 2.9727017879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165349, + "balance_loss_mlp": 1.08371425, + "epoch": 0.47364370911889186, + "flos": 541168240128.0, + "grad_norm": 0.03987357596495419, + "language_loss": 0.90376979, + "learning_rate": 0.0005667814328465076, + "loss": 0.91542327, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.81640625, + "step": 2462, + "time_per_iteration": 2.632636547088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163463, + "balance_loss_mlp": 1.0815897, + "epoch": 0.47383609080415545, + "flos": 407091643392.0, + "grad_norm": 0.03654753942721471, + "language_loss": 0.88796914, + "learning_rate": 0.0005664726689054285, + "loss": 0.89960378, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.81884766, + "step": 2463, + "time_per_iteration": 2.466054916381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170123, + "balance_loss_mlp": 1.08867884, + "epoch": 0.474028472489419, + "flos": 454438199808.0, + "grad_norm": 0.03923165930345575, + "language_loss": 0.8627066, + "learning_rate": 0.0005661638791573704, + "loss": 0.87440789, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.81445312, + "step": 2464, + "time_per_iteration": 2.7042744159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166183, + "balance_loss_mlp": 1.08450055, + "epoch": 0.47422085417468257, + "flos": 493194599424.0, + "grad_norm": 0.026684931914484025, + "language_loss": 0.92592585, + "learning_rate": 0.0005658550637222164, + "loss": 0.93758774, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.81689453, + "step": 2465, + "time_per_iteration": 2.6058290004730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168127, + "balance_loss_mlp": 1.08611059, + "epoch": 0.47441323585994616, + "flos": 740125544448.0, + "grad_norm": 0.026202374072225774, + "language_loss": 0.87139833, + "learning_rate": 0.0005655462227198592, + "loss": 0.88307959, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.8203125, + "step": 2466, + "time_per_iteration": 2.8945796489715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167919, + "balance_loss_mlp": 1.08590269, + "epoch": 0.4746056175452097, + "flos": 485674687488.0, + "grad_norm": 0.02746668082221095, + "language_loss": 0.89712787, + "learning_rate": 0.0005652373562702016, + "loss": 0.90880704, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.8203125, + "step": 2467, + "time_per_iteration": 2.576364278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166795, + "balance_loss_mlp": 1.08463609, + "epoch": 0.4747979992304733, + "flos": 462005775360.0, + "grad_norm": 0.03040478239716322, + "language_loss": 0.95003092, + "learning_rate": 0.000564928464493156, + "loss": 0.96169889, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.82177734, + "step": 2468, + "time_per_iteration": 2.5468242168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168306, + "balance_loss_mlp": 1.08624196, + "epoch": 0.4749903809157368, + "flos": 865879226880.0, + "grad_norm": 0.029413898751956376, + "language_loss": 0.88262731, + "learning_rate": 0.000564619547508645, + "loss": 0.89431041, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.82080078, + "step": 2469, + "time_per_iteration": 3.042994260787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116966, + "balance_loss_mlp": 1.08764374, + "epoch": 0.4751827626010004, + "flos": 506551830528.0, + "grad_norm": 0.035426943126194606, + "language_loss": 0.90271819, + "learning_rate": 0.0005643106054366008, + "loss": 0.91441476, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.8203125, + "step": 2470, + "time_per_iteration": 2.5660367012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168432, + "balance_loss_mlp": 1.0863688, + "epoch": 0.47537514428626393, + "flos": 560452113408.0, + "grad_norm": 0.029652672624791387, + "language_loss": 0.85815179, + "learning_rate": 0.000564001638396965, + "loss": 0.86983615, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.82080078, + "step": 2471, + "time_per_iteration": 2.7345728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167677, + "balance_loss_mlp": 1.08566117, + "epoch": 0.4755675259715275, + "flos": 835676054016.0, + "grad_norm": 0.029111814859825738, + "language_loss": 0.87706691, + "learning_rate": 0.0005636926465096897, + "loss": 0.8887437, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.8203125, + "step": 2472, + "time_per_iteration": 3.0570740699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166306, + "balance_loss_mlp": 1.08424211, + "epoch": 0.47575990765679105, + "flos": 509232809472.0, + "grad_norm": 0.030849533450069865, + "language_loss": 0.93407679, + "learning_rate": 0.0005633836298947363, + "loss": 0.94573981, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.82080078, + "step": 2473, + "time_per_iteration": 2.6804757118225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167624, + "balance_loss_mlp": 1.08570302, + "epoch": 0.47595228934205464, + "flos": 592962961920.0, + "grad_norm": 0.0319092637225127, + "language_loss": 0.77122205, + "learning_rate": 0.000563074588672075, + "loss": 0.78289831, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.81933594, + "step": 2474, + "time_per_iteration": 2.7190651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166922, + "balance_loss_mlp": 1.08500123, + "epoch": 0.4761446710273182, + "flos": 581683094016.0, + "grad_norm": 0.028375010801601097, + "language_loss": 0.91505527, + "learning_rate": 0.0005627655229616868, + "loss": 0.92672449, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.81933594, + "step": 2475, + "time_per_iteration": 2.689652919769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164128, + "balance_loss_mlp": 1.08235061, + "epoch": 0.47633705271258175, + "flos": 674079264768.0, + "grad_norm": 0.024988633596495675, + "language_loss": 0.94898891, + "learning_rate": 0.0005624564328835616, + "loss": 0.96063018, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.81787109, + "step": 2476, + "time_per_iteration": 2.8038489818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169163, + "balance_loss_mlp": 1.08728969, + "epoch": 0.47652943439784534, + "flos": 542970355200.0, + "grad_norm": 0.0285977430554916, + "language_loss": 0.89680123, + "learning_rate": 0.0005621473185576986, + "loss": 0.90849286, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.81884766, + "step": 2477, + "time_per_iteration": 2.7568743228912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165779, + "balance_loss_mlp": 1.08433557, + "epoch": 0.4767218160831089, + "flos": 525846437376.0, + "grad_norm": 0.0316668482667046, + "language_loss": 0.93167424, + "learning_rate": 0.0005618381801041068, + "loss": 0.94333208, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.81445312, + "step": 2478, + "time_per_iteration": 2.612211227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167228, + "balance_loss_mlp": 1.08545041, + "epoch": 0.47691419776837246, + "flos": 569126863872.0, + "grad_norm": 0.03238452738028376, + "language_loss": 0.88936818, + "learning_rate": 0.0005615290176428044, + "loss": 0.90104043, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.81787109, + "step": 2479, + "time_per_iteration": 2.649019241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168128, + "balance_loss_mlp": 1.08668435, + "epoch": 0.477106579453636, + "flos": 532024859136.0, + "grad_norm": 0.027888492093205767, + "language_loss": 0.91917288, + "learning_rate": 0.0005612198312938187, + "loss": 0.93085408, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.81445312, + "step": 2480, + "time_per_iteration": 2.739767551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08839524, + "epoch": 0.4772989611388996, + "flos": 595500950016.0, + "grad_norm": 0.027931665483744535, + "language_loss": 0.84935582, + "learning_rate": 0.0005609106211771868, + "loss": 0.86105514, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.81542969, + "step": 2481, + "time_per_iteration": 2.850339651107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169841, + "balance_loss_mlp": 1.08835006, + "epoch": 0.4774913428241631, + "flos": 545707729920.0, + "grad_norm": 0.027660076347337716, + "language_loss": 0.94426548, + "learning_rate": 0.0005606013874129543, + "loss": 0.95596385, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.81494141, + "step": 2482, + "time_per_iteration": 2.7403533458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08829987, + "epoch": 0.4776837245094267, + "flos": 541129308672.0, + "grad_norm": 0.02810737401227857, + "language_loss": 0.86136961, + "learning_rate": 0.0005602921301211768, + "loss": 0.87306893, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.81640625, + "step": 2483, + "time_per_iteration": 2.6941261291503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171891, + "balance_loss_mlp": 1.09016109, + "epoch": 0.4778761061946903, + "flos": 472755887616.0, + "grad_norm": 0.029011275825861695, + "language_loss": 0.8832168, + "learning_rate": 0.0005599828494219185, + "loss": 0.89493567, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.81738281, + "step": 2484, + "time_per_iteration": 2.5801451206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116562, + "balance_loss_mlp": 1.08355606, + "epoch": 0.4780684878799538, + "flos": 727337000448.0, + "grad_norm": 0.03126301150284597, + "language_loss": 0.95766234, + "learning_rate": 0.0005596735454352527, + "loss": 0.96931851, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.82080078, + "step": 2485, + "time_per_iteration": 2.866809368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165282, + "balance_loss_mlp": 1.0832181, + "epoch": 0.4782608695652174, + "flos": 549953780736.0, + "grad_norm": 0.032811891631208345, + "language_loss": 0.91780031, + "learning_rate": 0.0005593642182812619, + "loss": 0.92945307, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.82080078, + "step": 2486, + "time_per_iteration": 2.6762824058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166087, + "balance_loss_mlp": 1.08388078, + "epoch": 0.47845325125048094, + "flos": 831401805312.0, + "grad_norm": 0.03291122574992765, + "language_loss": 0.91604954, + "learning_rate": 0.0005590548680800378, + "loss": 0.92771041, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.82226562, + "step": 2487, + "time_per_iteration": 3.1848442554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159859, + "balance_loss_mlp": 1.07765198, + "epoch": 0.4786456329357445, + "flos": 515270241792.0, + "grad_norm": 0.02977291399963519, + "language_loss": 0.8241533, + "learning_rate": 0.0005587454949516804, + "loss": 0.83575195, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.82226562, + "step": 2488, + "time_per_iteration": 2.728825330734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163121, + "balance_loss_mlp": 1.08077133, + "epoch": 0.47883801462100806, + "flos": 565729477632.0, + "grad_norm": 0.034122039627151275, + "language_loss": 0.9412536, + "learning_rate": 0.0005584360990162993, + "loss": 0.95288485, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.82373047, + "step": 2489, + "time_per_iteration": 2.65055251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162976, + "balance_loss_mlp": 1.08076906, + "epoch": 0.47903039630627164, + "flos": 580704173568.0, + "grad_norm": 0.025976014522421025, + "language_loss": 0.89770818, + "learning_rate": 0.0005581266803940124, + "loss": 0.90933788, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.82226562, + "step": 2490, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164709, + "balance_loss_mlp": 1.08250248, + "epoch": 0.47922277799153523, + "flos": 620085656064.0, + "grad_norm": 0.030357385002024635, + "language_loss": 0.93398184, + "learning_rate": 0.0005578172392049471, + "loss": 0.94562888, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.82226562, + "step": 2491, + "time_per_iteration": 2.7492756843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.08214724, + "epoch": 0.47941515967679876, + "flos": 640858739712.0, + "grad_norm": 0.03220406636162171, + "language_loss": 0.9124878, + "learning_rate": 0.0005575077755692386, + "loss": 0.92413139, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.82226562, + "step": 2492, + "time_per_iteration": 2.8061015605926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166388, + "balance_loss_mlp": 1.08437181, + "epoch": 0.47960754136206235, + "flos": 520875247104.0, + "grad_norm": 0.02527329704122564, + "language_loss": 0.91187584, + "learning_rate": 0.0005571982896070316, + "loss": 0.92353964, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.8203125, + "step": 2493, + "time_per_iteration": 4.094395160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116615, + "balance_loss_mlp": 1.08399141, + "epoch": 0.4797999230473259, + "flos": 476031750144.0, + "grad_norm": 0.03303640593992076, + "language_loss": 0.95932508, + "learning_rate": 0.0005568887814384792, + "loss": 0.97098666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.82177734, + "step": 2494, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011645, + "balance_loss_mlp": 1.08229315, + "epoch": 0.47999230473258947, + "flos": 533068907520.0, + "grad_norm": 0.028664161711311382, + "language_loss": 0.92573094, + "learning_rate": 0.000556579251183743, + "loss": 0.93737602, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.82226562, + "step": 2495, + "time_per_iteration": 2.6538801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162424, + "balance_loss_mlp": 1.08036053, + "epoch": 0.480184686417853, + "flos": 602605899264.0, + "grad_norm": 0.03331899292815792, + "language_loss": 0.86056805, + "learning_rate": 0.0005562696989629936, + "loss": 0.87219226, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.82080078, + "step": 2496, + "time_per_iteration": 2.687903881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162213, + "balance_loss_mlp": 1.08019686, + "epoch": 0.4803770681031166, + "flos": 529261287936.0, + "grad_norm": 0.02923998603568501, + "language_loss": 0.88484073, + "learning_rate": 0.0005559601248964095, + "loss": 0.89646292, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.8203125, + "step": 2497, + "time_per_iteration": 2.6282827854156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161296, + "balance_loss_mlp": 1.07918417, + "epoch": 0.4805694497883801, + "flos": 512228694528.0, + "grad_norm": 0.02922528152793709, + "language_loss": 0.91127884, + "learning_rate": 0.0005556505291041783, + "loss": 0.92289186, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.82128906, + "step": 2498, + "time_per_iteration": 2.662783622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161007, + "balance_loss_mlp": 1.07899094, + "epoch": 0.4807618314736437, + "flos": 601605511680.0, + "grad_norm": 0.02724196548061384, + "language_loss": 0.8966158, + "learning_rate": 0.0005553409117064954, + "loss": 0.90822583, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.8203125, + "step": 2499, + "time_per_iteration": 2.898850917816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164849, + "balance_loss_mlp": 1.08245122, + "epoch": 0.4809542131589073, + "flos": 570029922816.0, + "grad_norm": 0.028349491645904, + "language_loss": 0.91357303, + "learning_rate": 0.0005550312728235654, + "loss": 0.92522144, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.82421875, + "step": 2500, + "time_per_iteration": 2.754187822341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164619, + "balance_loss_mlp": 1.08217347, + "epoch": 0.4811465948441708, + "flos": 577165797888.0, + "grad_norm": 0.034664680835738745, + "language_loss": 0.91214681, + "learning_rate": 0.0005547216125756003, + "loss": 0.92379302, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.82470703, + "step": 2501, + "time_per_iteration": 2.778639078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164023, + "balance_loss_mlp": 1.08143485, + "epoch": 0.4813389765294344, + "flos": 825297243648.0, + "grad_norm": 0.028167486861350455, + "language_loss": 0.87736559, + "learning_rate": 0.0005544119310828211, + "loss": 0.88900584, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.82617188, + "step": 2502, + "time_per_iteration": 3.0756351947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164093, + "balance_loss_mlp": 1.08174348, + "epoch": 0.48153135821469795, + "flos": 636699283968.0, + "grad_norm": 0.030410217991048386, + "language_loss": 0.91046345, + "learning_rate": 0.0005541022284654568, + "loss": 0.92210436, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.82373047, + "step": 2503, + "time_per_iteration": 2.892679214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163382, + "balance_loss_mlp": 1.08103192, + "epoch": 0.48172373989996153, + "flos": 504708782592.0, + "grad_norm": 0.02826951852510112, + "language_loss": 0.89667141, + "learning_rate": 0.0005537925048437446, + "loss": 0.90830529, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.82373047, + "step": 2504, + "time_per_iteration": 2.5750081539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179108, + "balance_loss_mlp": 1.09918976, + "epoch": 0.48191612158522507, + "flos": 1535566173696.0, + "grad_norm": 0.017261305400491866, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76930583, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.79882812, + "step": 2505, + "time_per_iteration": 4.912463426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162522, + "balance_loss_mlp": 1.07988608, + "epoch": 0.48210850327048865, + "flos": 703811805696.0, + "grad_norm": 0.027104005826713556, + "language_loss": 0.93955028, + "learning_rate": 0.0005531729950682664, + "loss": 0.95117545, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.82666016, + "step": 2506, + "time_per_iteration": 3.000925064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162538, + "balance_loss_mlp": 1.07999802, + "epoch": 0.4823008849557522, + "flos": 440700934656.0, + "grad_norm": 0.03451729562062639, + "language_loss": 0.91777337, + "learning_rate": 0.000552863209155015, + "loss": 0.92939872, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.82568359, + "step": 2507, + "time_per_iteration": 2.478809118270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159773, + "balance_loss_mlp": 1.07737529, + "epoch": 0.48249326664101577, + "flos": 472812283392.0, + "grad_norm": 0.02691149649688828, + "language_loss": 0.87363136, + "learning_rate": 0.0005525534027184461, + "loss": 0.88522899, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.82421875, + "step": 2508, + "time_per_iteration": 2.54645037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161526, + "balance_loss_mlp": 1.07951045, + "epoch": 0.48268564832627936, + "flos": 564314127360.0, + "grad_norm": 0.023137570540037285, + "language_loss": 0.88137501, + "learning_rate": 0.0005522435758788365, + "loss": 0.89299035, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.8203125, + "step": 2509, + "time_per_iteration": 2.700540542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160536, + "balance_loss_mlp": 1.07842445, + "epoch": 0.4828780300115429, + "flos": 630842499072.0, + "grad_norm": 0.03372990027790351, + "language_loss": 0.86188895, + "learning_rate": 0.0005519337287564721, + "loss": 0.87349427, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.82128906, + "step": 2510, + "time_per_iteration": 2.8127758502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161519, + "balance_loss_mlp": 1.07945526, + "epoch": 0.4830704116968065, + "flos": 633004455936.0, + "grad_norm": 0.029001937113396697, + "language_loss": 0.88535267, + "learning_rate": 0.000551623861471646, + "loss": 0.89696789, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.82080078, + "step": 2511, + "time_per_iteration": 2.7925469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166962, + "balance_loss_mlp": 1.08647156, + "epoch": 0.48326279338207, + "flos": 1572616512000.0, + "grad_norm": 0.009161484988790693, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79985785, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.8046875, + "step": 2512, + "time_per_iteration": 4.850747108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159851, + "balance_loss_mlp": 1.07783449, + "epoch": 0.4834551750673336, + "flos": 510237926400.0, + "grad_norm": 0.028933780257729795, + "language_loss": 0.92768925, + "learning_rate": 0.0005510040668958211, + "loss": 0.93928778, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.8203125, + "step": 2513, + "time_per_iteration": 2.56387996673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165955, + "balance_loss_mlp": 1.08546448, + "epoch": 0.48364755675259713, + "flos": 1531825683456.0, + "grad_norm": 0.007133010503999018, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78926539, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.8046875, + "step": 2514, + "time_per_iteration": 4.836379289627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160977, + "balance_loss_mlp": 1.07938981, + "epoch": 0.4838399384378607, + "flos": 566046385152.0, + "grad_norm": 0.029153045334521625, + "language_loss": 0.89274001, + "learning_rate": 0.0005503841931138645, + "loss": 0.9043498, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.81591797, + "step": 2515, + "time_per_iteration": 2.6633048057556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160148, + "balance_loss_mlp": 1.07846582, + "epoch": 0.4840323201231243, + "flos": 388541641728.0, + "grad_norm": 0.03187042626689644, + "language_loss": 0.88861662, + "learning_rate": 0.0005500742268214025, + "loss": 0.90021807, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.81689453, + "step": 2516, + "time_per_iteration": 2.4762659072875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160045, + "balance_loss_mlp": 1.07845843, + "epoch": 0.48422470180838784, + "flos": 632175257088.0, + "grad_norm": 0.026732605532440536, + "language_loss": 0.9007901, + "learning_rate": 0.0005497642410884014, + "loss": 0.91239059, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.81591797, + "step": 2517, + "time_per_iteration": 2.7693819999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164478, + "balance_loss_mlp": 1.08246255, + "epoch": 0.4844170834936514, + "flos": 500313010176.0, + "grad_norm": 0.028128961210665323, + "language_loss": 0.90248644, + "learning_rate": 0.0005494542360352085, + "loss": 0.91413122, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.8203125, + "step": 2518, + "time_per_iteration": 2.6704978942871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163589, + "balance_loss_mlp": 1.08152497, + "epoch": 0.48460946517891496, + "flos": 552194327040.0, + "grad_norm": 0.02893400906180164, + "language_loss": 0.92442286, + "learning_rate": 0.0005491442117821783, + "loss": 0.93605876, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.82080078, + "step": 2519, + "time_per_iteration": 2.691898822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167118, + "balance_loss_mlp": 1.08491123, + "epoch": 0.48480184686417854, + "flos": 530461788672.0, + "grad_norm": 0.03488173137086134, + "language_loss": 0.937814, + "learning_rate": 0.0005488341684496732, + "loss": 0.94948518, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.82226562, + "step": 2520, + "time_per_iteration": 2.6527535915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165597, + "balance_loss_mlp": 1.08343804, + "epoch": 0.4849942285494421, + "flos": 533047440384.0, + "grad_norm": 0.028537304261499467, + "language_loss": 0.97065389, + "learning_rate": 0.0005485241061580624, + "loss": 0.98230994, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.82177734, + "step": 2521, + "time_per_iteration": 2.7213969230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166792, + "balance_loss_mlp": 1.08463287, + "epoch": 0.48518661023470566, + "flos": 723972541440.0, + "grad_norm": 0.02938300657957885, + "language_loss": 0.90224278, + "learning_rate": 0.0005482140250277228, + "loss": 0.91391075, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.82177734, + "step": 2522, + "time_per_iteration": 2.9924206733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08592129, + "epoch": 0.4853789919199692, + "flos": 507155446272.0, + "grad_norm": 0.030604201389603965, + "language_loss": 0.93692237, + "learning_rate": 0.0005479039251790387, + "loss": 0.94860315, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.82177734, + "step": 2523, + "time_per_iteration": 2.7099061012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167569, + "balance_loss_mlp": 1.08541012, + "epoch": 0.4855713736052328, + "flos": 661698952704.0, + "grad_norm": 0.03222198223164457, + "language_loss": 0.90574634, + "learning_rate": 0.0005475938067324014, + "loss": 0.917422, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.82177734, + "step": 2524, + "time_per_iteration": 2.8379342555999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117016, + "balance_loss_mlp": 1.08823884, + "epoch": 0.48576375529049637, + "flos": 437889699840.0, + "grad_norm": 0.03297241328571355, + "language_loss": 0.89402866, + "learning_rate": 0.0005472836698082098, + "loss": 0.90573025, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.81933594, + "step": 2525, + "time_per_iteration": 2.5135462284088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165117, + "balance_loss_mlp": 1.08300531, + "epoch": 0.4859561369757599, + "flos": 582844663296.0, + "grad_norm": 0.028434138704400515, + "language_loss": 0.88848263, + "learning_rate": 0.0005469735145268694, + "loss": 0.90013373, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.82128906, + "step": 2526, + "time_per_iteration": 2.7137279510498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162635, + "balance_loss_mlp": 1.08066678, + "epoch": 0.4861485186610235, + "flos": 488933085696.0, + "grad_norm": 0.028544121185286958, + "language_loss": 0.86922419, + "learning_rate": 0.0005466633410087933, + "loss": 0.88085049, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.81982422, + "step": 2527, + "time_per_iteration": 2.7106595039367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116584, + "balance_loss_mlp": 1.08554077, + "epoch": 0.486340900346287, + "flos": 1561111060992.0, + "grad_norm": 0.005447093154513016, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78426665, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.80273438, + "step": 2528, + "time_per_iteration": 4.841828346252441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162423, + "balance_loss_mlp": 1.08069348, + "epoch": 0.4865332820315506, + "flos": 483990093312.0, + "grad_norm": 0.026581719305211308, + "language_loss": 0.93869209, + "learning_rate": 0.0005460429397441214, + "loss": 0.95031631, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.81738281, + "step": 2529, + "time_per_iteration": 2.553438425064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.08296263, + "epoch": 0.48672566371681414, + "flos": 536857061376.0, + "grad_norm": 0.02943507577689114, + "language_loss": 0.92893845, + "learning_rate": 0.0005457327122383866, + "loss": 0.94058347, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.81542969, + "step": 2530, + "time_per_iteration": 2.628859043121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167305, + "balance_loss_mlp": 1.08795929, + "epoch": 0.4869180454020777, + "flos": 1415830457856.0, + "grad_norm": 0.01207374103656724, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75803792, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.79296875, + "step": 2531, + "time_per_iteration": 4.798464775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163612, + "balance_loss_mlp": 1.08212042, + "epoch": 0.48711042708734126, + "flos": 574226308608.0, + "grad_norm": 0.027593185975689192, + "language_loss": 0.81384307, + "learning_rate": 0.0005451122040823244, + "loss": 0.82547921, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.81494141, + "step": 2532, + "time_per_iteration": 2.7749013900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116272, + "balance_loss_mlp": 1.08118057, + "epoch": 0.48730280877260485, + "flos": 627816414720.0, + "grad_norm": 0.02591805781842408, + "language_loss": 0.82129884, + "learning_rate": 0.0005448019236728997, + "loss": 0.83292603, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.81542969, + "step": 2533, + "time_per_iteration": 2.865239381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164787, + "balance_loss_mlp": 1.08315206, + "epoch": 0.48749519045786843, + "flos": 513468126720.0, + "grad_norm": 0.03027053938911928, + "language_loss": 0.91336226, + "learning_rate": 0.0005444916258698255, + "loss": 0.92501009, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.81640625, + "step": 2534, + "time_per_iteration": 2.5986597537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08259368, + "epoch": 0.48768757214313196, + "flos": 526478251008.0, + "grad_norm": 0.02699578070604874, + "language_loss": 0.90958095, + "learning_rate": 0.0005441813107935704, + "loss": 0.92122173, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.81494141, + "step": 2535, + "time_per_iteration": 2.685478925704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162911, + "balance_loss_mlp": 1.08137167, + "epoch": 0.48787995382839555, + "flos": 506030807040.0, + "grad_norm": 0.02902824988643181, + "language_loss": 0.91504169, + "learning_rate": 0.0005438709785646091, + "loss": 0.92667079, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.81542969, + "step": 2536, + "time_per_iteration": 2.563302755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164758, + "balance_loss_mlp": 1.08302808, + "epoch": 0.4880723355136591, + "flos": 576247276032.0, + "grad_norm": 0.028837521239882914, + "language_loss": 0.92468232, + "learning_rate": 0.0005435606293034234, + "loss": 0.93632984, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.81738281, + "step": 2537, + "time_per_iteration": 2.6447930335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117327, + "balance_loss_mlp": 1.09163582, + "epoch": 0.48826471719892267, + "flos": 562536207360.0, + "grad_norm": 0.0312247117460979, + "language_loss": 0.90714639, + "learning_rate": 0.0005432502631305016, + "loss": 0.91887903, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.81640625, + "step": 2538, + "time_per_iteration": 2.6652588844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173314, + "balance_loss_mlp": 1.09163225, + "epoch": 0.4884570988841862, + "flos": 727547847168.0, + "grad_norm": 0.027646073497336384, + "language_loss": 0.88003767, + "learning_rate": 0.0005429398801663386, + "loss": 0.89177084, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.81689453, + "step": 2539, + "time_per_iteration": 2.9378042221069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163435, + "balance_loss_mlp": 1.08180094, + "epoch": 0.4886494805694498, + "flos": 431924126208.0, + "grad_norm": 0.03488087397138866, + "language_loss": 0.90234458, + "learning_rate": 0.0005426294805314355, + "loss": 0.91397893, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.81640625, + "step": 2540, + "time_per_iteration": 2.538275718688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161042, + "balance_loss_mlp": 1.07935977, + "epoch": 0.4888418622547134, + "flos": 674344505856.0, + "grad_norm": 0.02710942555690322, + "language_loss": 0.8497895, + "learning_rate": 0.0005423190643463003, + "loss": 0.86139989, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.81689453, + "step": 2541, + "time_per_iteration": 2.9786784648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163064, + "balance_loss_mlp": 1.08133411, + "epoch": 0.4890342439399769, + "flos": 542935426560.0, + "grad_norm": 0.02908053911836938, + "language_loss": 0.88889569, + "learning_rate": 0.0005420086317314473, + "loss": 0.90052634, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.81738281, + "step": 2542, + "time_per_iteration": 2.650505781173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163957, + "balance_loss_mlp": 1.08198881, + "epoch": 0.4892266256252405, + "flos": 591862517760.0, + "grad_norm": 0.032456825889771945, + "language_loss": 0.86421382, + "learning_rate": 0.0005416981828073971, + "loss": 0.87585342, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.81982422, + "step": 2543, + "time_per_iteration": 2.756906032562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167862, + "balance_loss_mlp": 1.08718109, + "epoch": 0.48941900731050403, + "flos": 1519654216704.0, + "grad_norm": 0.009398242691954228, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78282875, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.80664062, + "step": 2544, + "time_per_iteration": 4.826622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163984, + "balance_loss_mlp": 1.08225381, + "epoch": 0.4896113889957676, + "flos": 471518456832.0, + "grad_norm": 0.03564931489131084, + "language_loss": 0.92759442, + "learning_rate": 0.000541077236513819, + "loss": 0.93923426, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.81738281, + "step": 2545, + "time_per_iteration": 2.5047078132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169885, + "balance_loss_mlp": 1.08848882, + "epoch": 0.48980377068103115, + "flos": 497551440384.0, + "grad_norm": 0.02644804149278648, + "language_loss": 0.87771875, + "learning_rate": 0.0005407667393853638, + "loss": 0.88941759, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.81396484, + "step": 2546, + "time_per_iteration": 2.615182876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172802, + "balance_loss_mlp": 1.09116721, + "epoch": 0.48999615236629473, + "flos": 694107743232.0, + "grad_norm": 0.032384144791382644, + "language_loss": 0.89844877, + "learning_rate": 0.0005404562264298569, + "loss": 0.91017681, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.81640625, + "step": 2547, + "time_per_iteration": 2.8694136142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164836, + "balance_loss_mlp": 1.08310628, + "epoch": 0.49018853405155827, + "flos": 542748774912.0, + "grad_norm": 0.02932030725962162, + "language_loss": 0.90206313, + "learning_rate": 0.0005401456977678498, + "loss": 0.91371155, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.81738281, + "step": 2548, + "time_per_iteration": 2.644604444503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158708, + "balance_loss_mlp": 1.07702553, + "epoch": 0.49038091573682185, + "flos": 697108357632.0, + "grad_norm": 0.0348486432591887, + "language_loss": 0.83939159, + "learning_rate": 0.0005398351535199008, + "loss": 0.85097861, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.81689453, + "step": 2549, + "time_per_iteration": 3.064962863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158664, + "balance_loss_mlp": 1.07693398, + "epoch": 0.49057329742208544, + "flos": 598062406656.0, + "grad_norm": 0.028343941430048352, + "language_loss": 0.89488542, + "learning_rate": 0.0005395245938065735, + "loss": 0.90647209, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.81738281, + "step": 2550, + "time_per_iteration": 2.8023993968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162826, + "balance_loss_mlp": 1.08119094, + "epoch": 0.490765679107349, + "flos": 514416847872.0, + "grad_norm": 0.036438353865587, + "language_loss": 0.8920716, + "learning_rate": 0.0005392140187484379, + "loss": 0.90369982, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.81640625, + "step": 2551, + "time_per_iteration": 2.5544004440307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160822, + "balance_loss_mlp": 1.07928288, + "epoch": 0.49095806079261256, + "flos": 630842499072.0, + "grad_norm": 0.02833803159801528, + "language_loss": 0.95730108, + "learning_rate": 0.0005389034284660701, + "loss": 0.96890926, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.81542969, + "step": 2552, + "time_per_iteration": 2.787997245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156735, + "balance_loss_mlp": 1.07524312, + "epoch": 0.4911504424778761, + "flos": 916792356864.0, + "grad_norm": 0.03441290589053542, + "language_loss": 0.8892417, + "learning_rate": 0.000538592823080052, + "loss": 0.90080899, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.81494141, + "step": 2553, + "time_per_iteration": 3.1353423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159599, + "balance_loss_mlp": 1.07858455, + "epoch": 0.4913428241631397, + "flos": 439854271488.0, + "grad_norm": 0.03215354145178159, + "language_loss": 0.91146123, + "learning_rate": 0.000538282202710971, + "loss": 0.9230572, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.81005859, + "step": 2554, + "time_per_iteration": 2.524106025695801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158745, + "balance_loss_mlp": 1.0776825, + "epoch": 0.4915352058484032, + "flos": 637239773184.0, + "grad_norm": 0.03412299335020121, + "language_loss": 0.8861627, + "learning_rate": 0.000537971567479421, + "loss": 0.8977502, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.81054688, + "step": 2555, + "time_per_iteration": 2.750051736831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162188, + "balance_loss_mlp": 1.08107841, + "epoch": 0.4917275875336668, + "flos": 505509783552.0, + "grad_norm": 0.03289434989172404, + "language_loss": 0.93214262, + "learning_rate": 0.0005376609175060011, + "loss": 0.94376451, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.81103516, + "step": 2556, + "time_per_iteration": 2.588437557220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160453, + "balance_loss_mlp": 1.07924759, + "epoch": 0.49191996921893033, + "flos": 655733379072.0, + "grad_norm": 0.02731850736189593, + "language_loss": 0.86463559, + "learning_rate": 0.0005373502529113162, + "loss": 0.87624013, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.81201172, + "step": 2557, + "time_per_iteration": 2.775529146194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160279, + "balance_loss_mlp": 1.07897866, + "epoch": 0.4921123509041939, + "flos": 493398715392.0, + "grad_norm": 0.02896728411720768, + "language_loss": 0.88084292, + "learning_rate": 0.0005370395738159773, + "loss": 0.8924458, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.81298828, + "step": 2558, + "time_per_iteration": 2.638489007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162432, + "balance_loss_mlp": 1.08084488, + "epoch": 0.4923047325894575, + "flos": 547207673856.0, + "grad_norm": 0.030679841284503157, + "language_loss": 0.90182674, + "learning_rate": 0.0005367288803406003, + "loss": 0.91345102, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.81591797, + "step": 2559, + "time_per_iteration": 2.655319929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166456, + "balance_loss_mlp": 1.08477354, + "epoch": 0.49249711427472104, + "flos": 597589046784.0, + "grad_norm": 0.03258957792314928, + "language_loss": 0.88157088, + "learning_rate": 0.0005364181726058073, + "loss": 0.89323545, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.81689453, + "step": 2560, + "time_per_iteration": 2.7416017055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116275, + "balance_loss_mlp": 1.08111596, + "epoch": 0.4926894959599846, + "flos": 498808336896.0, + "grad_norm": 0.03132101057916933, + "language_loss": 0.88768357, + "learning_rate": 0.0005361074507322261, + "loss": 0.89931107, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.81640625, + "step": 2561, + "time_per_iteration": 2.6130712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165857, + "balance_loss_mlp": 1.08446133, + "epoch": 0.49288187764524816, + "flos": 537182701056.0, + "grad_norm": 0.03057631912079697, + "language_loss": 0.88031554, + "learning_rate": 0.000535796714840489, + "loss": 0.89197409, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.81396484, + "step": 2562, + "time_per_iteration": 2.6463782787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167526, + "balance_loss_mlp": 1.08584368, + "epoch": 0.49307425933051174, + "flos": 642712521216.0, + "grad_norm": 0.037191189532270505, + "language_loss": 0.90339726, + "learning_rate": 0.0005354859650512348, + "loss": 0.91507256, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.81689453, + "step": 2563, + "time_per_iteration": 2.807185649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08831811, + "epoch": 0.4932666410157753, + "flos": 517265012736.0, + "grad_norm": 0.033499096438589164, + "language_loss": 0.92994809, + "learning_rate": 0.0005351752014851074, + "loss": 0.94164765, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.81640625, + "step": 2564, + "time_per_iteration": 2.574969530105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164544, + "balance_loss_mlp": 1.08310056, + "epoch": 0.49345902270103886, + "flos": 602651561472.0, + "grad_norm": 0.03279756121209128, + "language_loss": 0.89816988, + "learning_rate": 0.0005348644242627553, + "loss": 0.90981531, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.81445312, + "step": 2565, + "time_per_iteration": 2.718763828277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170387, + "balance_loss_mlp": 1.0912323, + "epoch": 0.49365140438630245, + "flos": 1496981689344.0, + "grad_norm": 0.010263800536892794, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76457012, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.79101562, + "step": 2566, + "time_per_iteration": 4.933185815811157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116695, + "balance_loss_mlp": 1.08588743, + "epoch": 0.493843786071566, + "flos": 630788104704.0, + "grad_norm": 0.030129730382445888, + "language_loss": 0.87054515, + "learning_rate": 0.0005342428293320013, + "loss": 0.88221461, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.81054688, + "step": 2567, + "time_per_iteration": 2.7435762882232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167603, + "balance_loss_mlp": 1.08635032, + "epoch": 0.49403616775682957, + "flos": 618689771520.0, + "grad_norm": 0.03756496493147188, + "language_loss": 0.89032316, + "learning_rate": 0.0005339320118649238, + "loss": 0.90199912, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.8125, + "step": 2568, + "time_per_iteration": 2.732135057449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162688, + "balance_loss_mlp": 1.08148313, + "epoch": 0.4942285494420931, + "flos": 578813462016.0, + "grad_norm": 0.027001968550623295, + "language_loss": 0.91260755, + "learning_rate": 0.000533621181224271, + "loss": 0.92423451, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.81201172, + "step": 2569, + "time_per_iteration": 2.79868483543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164198, + "balance_loss_mlp": 1.08304083, + "epoch": 0.4944209311273567, + "flos": 631465580544.0, + "grad_norm": 0.0320565630919746, + "language_loss": 0.86978823, + "learning_rate": 0.0005333103375307182, + "loss": 0.88143021, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.81152344, + "step": 2570, + "time_per_iteration": 2.850125551223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159265, + "balance_loss_mlp": 1.07825053, + "epoch": 0.4946133128126202, + "flos": 588718912512.0, + "grad_norm": 0.030887982554767154, + "language_loss": 0.91666126, + "learning_rate": 0.0005329994809049451, + "loss": 0.92825389, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.81005859, + "step": 2571, + "time_per_iteration": 2.716823101043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115557, + "balance_loss_mlp": 1.07460296, + "epoch": 0.4948056944978838, + "flos": 584846164992.0, + "grad_norm": 0.031743542415023744, + "language_loss": 0.93336749, + "learning_rate": 0.0005326886114676375, + "loss": 0.94492316, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.80957031, + "step": 2572, + "time_per_iteration": 2.7895162105560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160915, + "balance_loss_mlp": 1.08004355, + "epoch": 0.49499807618314734, + "flos": 482780860416.0, + "grad_norm": 0.03097072525481985, + "language_loss": 0.93359911, + "learning_rate": 0.0005323777293394854, + "loss": 0.94520825, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.80859375, + "step": 2573, + "time_per_iteration": 2.5428624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161628, + "balance_loss_mlp": 1.08089912, + "epoch": 0.4951904578684109, + "flos": 520037316096.0, + "grad_norm": 0.029847836155631635, + "language_loss": 0.87235224, + "learning_rate": 0.000532066834641184, + "loss": 0.88396853, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.80712891, + "step": 2574, + "time_per_iteration": 2.666405439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116292, + "balance_loss_mlp": 1.08195353, + "epoch": 0.4953828395536745, + "flos": 536577083904.0, + "grad_norm": 0.029607666498307577, + "language_loss": 0.91085738, + "learning_rate": 0.0005317559274934334, + "loss": 0.92248654, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.80957031, + "step": 2575, + "time_per_iteration": 2.694953441619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161488, + "balance_loss_mlp": 1.08056831, + "epoch": 0.49557522123893805, + "flos": 529606393344.0, + "grad_norm": 0.03416750639658743, + "language_loss": 0.87365144, + "learning_rate": 0.0005314450080169382, + "loss": 0.8852663, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.80908203, + "step": 2576, + "time_per_iteration": 2.6648805141448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160753, + "balance_loss_mlp": 1.07973826, + "epoch": 0.49576760292420163, + "flos": 428917507584.0, + "grad_norm": 0.028909192983869472, + "language_loss": 0.86833698, + "learning_rate": 0.0005311340763324083, + "loss": 0.87994456, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.81005859, + "step": 2577, + "time_per_iteration": 2.563143014907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.07945204, + "epoch": 0.49595998460946517, + "flos": 566315629056.0, + "grad_norm": 0.02703431344264104, + "language_loss": 0.87897325, + "learning_rate": 0.0005308231325605578, + "loss": 0.8905803, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.8125, + "step": 2578, + "time_per_iteration": 2.690247058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159003, + "balance_loss_mlp": 1.07746387, + "epoch": 0.49615236629472875, + "flos": 703813807104.0, + "grad_norm": 0.02447176932933424, + "language_loss": 0.81124884, + "learning_rate": 0.0005305121768221061, + "loss": 0.8228389, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.81542969, + "step": 2579, + "time_per_iteration": 3.1026089191436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011698, + "balance_loss_mlp": 1.08969116, + "epoch": 0.4963447479799923, + "flos": 1444752539136.0, + "grad_norm": 0.010536082657862093, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76208121, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.80078125, + "step": 2580, + "time_per_iteration": 4.814293146133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160566, + "balance_loss_mlp": 1.07912242, + "epoch": 0.49653712966525587, + "flos": 538663179264.0, + "grad_norm": 0.027995208065503225, + "language_loss": 0.97084171, + "learning_rate": 0.0005298902299282984, + "loss": 0.98244739, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.81445312, + "step": 2581, + "time_per_iteration": 2.6197092533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115749, + "balance_loss_mlp": 1.07609439, + "epoch": 0.4967295113505194, + "flos": 608395554816.0, + "grad_norm": 0.029727926282221828, + "language_loss": 0.90264994, + "learning_rate": 0.0005295792390144033, + "loss": 0.91422486, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.81396484, + "step": 2582, + "time_per_iteration": 2.6830005645751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156586, + "balance_loss_mlp": 1.07528532, + "epoch": 0.496921893035783, + "flos": 475530192384.0, + "grad_norm": 0.034235181262718475, + "language_loss": 0.90576661, + "learning_rate": 0.0005292682366168294, + "loss": 0.91733253, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.81298828, + "step": 2583, + "time_per_iteration": 2.5291895866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158052, + "balance_loss_mlp": 1.07694244, + "epoch": 0.4971142747210466, + "flos": 598602895872.0, + "grad_norm": 0.029240794220739816, + "language_loss": 0.86485231, + "learning_rate": 0.0005289572228563181, + "loss": 0.8764329, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.81103516, + "step": 2584, + "time_per_iteration": 2.777571678161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159994, + "balance_loss_mlp": 1.0788368, + "epoch": 0.4973066564063101, + "flos": 600734653440.0, + "grad_norm": 0.030481884249605188, + "language_loss": 0.889974, + "learning_rate": 0.000528646197853616, + "loss": 0.90157396, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.81152344, + "step": 2585, + "time_per_iteration": 2.767935276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.08162796, + "epoch": 0.4974990380915737, + "flos": 650768919552.0, + "grad_norm": 0.027212373173769577, + "language_loss": 0.90572929, + "learning_rate": 0.0005283351617294735, + "loss": 0.91735625, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.81054688, + "step": 2586, + "time_per_iteration": 2.890571117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167969, + "balance_loss_mlp": 1.08862305, + "epoch": 0.49769141977683723, + "flos": 1532440032768.0, + "grad_norm": 0.00993779830792852, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77804637, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.79296875, + "step": 2587, + "time_per_iteration": 4.995927095413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116898, + "balance_loss_mlp": 1.08791721, + "epoch": 0.4978838014621008, + "flos": 537397550592.0, + "grad_norm": 0.03215658272946184, + "language_loss": 0.92911154, + "learning_rate": 0.0005277130565998916, + "loss": 0.94080132, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.81054688, + "step": 2588, + "time_per_iteration": 2.717165946960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162431, + "balance_loss_mlp": 1.08122599, + "epoch": 0.49807618314736435, + "flos": 540745271808.0, + "grad_norm": 0.02720148099542, + "language_loss": 0.86777204, + "learning_rate": 0.0005274019878359748, + "loss": 0.87939632, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.81201172, + "step": 2589, + "time_per_iteration": 2.71560001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162135, + "balance_loss_mlp": 1.08088183, + "epoch": 0.49826856483262794, + "flos": 543521577984.0, + "grad_norm": 0.03624054616449923, + "language_loss": 0.92995536, + "learning_rate": 0.0005270909084336628, + "loss": 0.94157672, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.8125, + "step": 2590, + "time_per_iteration": 2.6439368724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165371, + "balance_loss_mlp": 1.08435619, + "epoch": 0.4984609465178915, + "flos": 523360842240.0, + "grad_norm": 0.02994333023587166, + "language_loss": 0.94466031, + "learning_rate": 0.0005267798185137276, + "loss": 0.95631397, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.81005859, + "step": 2591, + "time_per_iteration": 2.6229867935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159677, + "balance_loss_mlp": 1.07851899, + "epoch": 0.49865332820315506, + "flos": 575704785408.0, + "grad_norm": 0.030323117469882623, + "language_loss": 0.94773531, + "learning_rate": 0.0005264687181969444, + "loss": 0.95933211, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.81152344, + "step": 2592, + "time_per_iteration": 2.7226686477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164286, + "balance_loss_mlp": 1.08303344, + "epoch": 0.49884570988841864, + "flos": 1015210497024.0, + "grad_norm": 0.0376584975450282, + "language_loss": 0.82159829, + "learning_rate": 0.0005261576076040937, + "loss": 0.83324111, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.8125, + "step": 2593, + "time_per_iteration": 3.2477946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169307, + "balance_loss_mlp": 1.08843529, + "epoch": 0.4990380915736822, + "flos": 560647497216.0, + "grad_norm": 0.03227625840551658, + "language_loss": 0.90092522, + "learning_rate": 0.0005258464868559591, + "loss": 0.91261828, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.80859375, + "step": 2594, + "time_per_iteration": 2.650367259979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167043, + "balance_loss_mlp": 1.08588493, + "epoch": 0.49923047325894576, + "flos": 499943709696.0, + "grad_norm": 0.030210069947970843, + "language_loss": 0.94528484, + "learning_rate": 0.0005255353560733284, + "loss": 0.95695531, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.81152344, + "step": 2595, + "time_per_iteration": 2.6242079734802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174149, + "balance_loss_mlp": 1.09518433, + "epoch": 0.4994228549442093, + "flos": 1499788194816.0, + "grad_norm": 0.015118012466641684, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76752794, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.7890625, + "step": 2596, + "time_per_iteration": 4.820875883102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116628, + "balance_loss_mlp": 1.08521724, + "epoch": 0.4996152366294729, + "flos": 558513738240.0, + "grad_norm": 0.031441861478263874, + "language_loss": 0.89123356, + "learning_rate": 0.0005249130648877492, + "loss": 0.9028964, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.81054688, + "step": 2597, + "time_per_iteration": 2.71932053565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158102, + "balance_loss_mlp": 1.07699203, + "epoch": 0.4998076183147364, + "flos": 416482801152.0, + "grad_norm": 0.03314289919132309, + "language_loss": 0.90550959, + "learning_rate": 0.0005246019047263953, + "loss": 0.91709059, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.81103516, + "step": 2598, + "time_per_iteration": 2.4899134635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158453, + "balance_loss_mlp": 1.07739091, + "epoch": 0.5, + "flos": 468325186560.0, + "grad_norm": 0.03341299307449988, + "language_loss": 0.88387024, + "learning_rate": 0.0005242907350137353, + "loss": 0.89545476, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.81054688, + "step": 2599, + "time_per_iteration": 2.553997039794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164809, + "balance_loss_mlp": 1.08369899, + "epoch": 0.5001923816852636, + "flos": 483755778048.0, + "grad_norm": 0.03321709561705903, + "language_loss": 0.85543942, + "learning_rate": 0.0005239795558705754, + "loss": 0.86708754, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.81103516, + "step": 2600, + "time_per_iteration": 2.6166868209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164506, + "balance_loss_mlp": 1.08339632, + "epoch": 0.5003847633705272, + "flos": 534855559680.0, + "grad_norm": 0.030012173683065246, + "language_loss": 0.95093107, + "learning_rate": 0.0005236683674177264, + "loss": 0.96257615, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.81103516, + "step": 2601, + "time_per_iteration": 2.6404433250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162684, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5005771450557907, + "flos": 739055299584.0, + "grad_norm": 0.032030290781944436, + "language_loss": 0.88311857, + "learning_rate": 0.0005233571697760021, + "loss": 0.89474535, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.81103516, + "step": 2602, + "time_per_iteration": 2.8534095287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160577, + "balance_loss_mlp": 1.07937133, + "epoch": 0.5007695267410542, + "flos": 780306026496.0, + "grad_norm": 0.036141348793487994, + "language_loss": 0.90016913, + "learning_rate": 0.0005230459630662203, + "loss": 0.91177493, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.81201172, + "step": 2603, + "time_per_iteration": 2.952563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162299, + "balance_loss_mlp": 1.0812366, + "epoch": 0.5009619084263178, + "flos": 624618415104.0, + "grad_norm": 0.03600647163377571, + "language_loss": 0.88813984, + "learning_rate": 0.0005227347474092022, + "loss": 0.89976281, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.81054688, + "step": 2604, + "time_per_iteration": 2.70975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166549, + "balance_loss_mlp": 1.08543897, + "epoch": 0.5011542901115814, + "flos": 532192045056.0, + "grad_norm": 0.023202845192485378, + "language_loss": 0.88172328, + "learning_rate": 0.0005224235229257724, + "loss": 0.89338881, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.81103516, + "step": 2605, + "time_per_iteration": 2.6811788082122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165049, + "balance_loss_mlp": 1.08393872, + "epoch": 0.5013466717968449, + "flos": 528627472896.0, + "grad_norm": 0.02710312658737552, + "language_loss": 0.91735983, + "learning_rate": 0.0005221122897367589, + "loss": 0.92901027, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.81103516, + "step": 2606, + "time_per_iteration": 2.7866344451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115755, + "balance_loss_mlp": 1.07644022, + "epoch": 0.5015390534821085, + "flos": 567088432128.0, + "grad_norm": 0.035852557706828735, + "language_loss": 0.88253903, + "learning_rate": 0.0005218010479629932, + "loss": 0.89411449, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.81103516, + "step": 2607, + "time_per_iteration": 2.7290749549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.07594728, + "epoch": 0.5017314351673721, + "flos": 567767909376.0, + "grad_norm": 0.03266328125205783, + "language_loss": 0.88539654, + "learning_rate": 0.0005214897977253102, + "loss": 0.89696807, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.81201172, + "step": 2608, + "time_per_iteration": 2.695686101913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158723, + "balance_loss_mlp": 1.07751739, + "epoch": 0.5019238168526357, + "flos": 523387038720.0, + "grad_norm": 0.02584859781626205, + "language_loss": 0.88962579, + "learning_rate": 0.0005211785391445473, + "loss": 0.90121305, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.81201172, + "step": 2609, + "time_per_iteration": 2.7320780754089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.07674336, + "epoch": 0.5021161985378992, + "flos": 642636659712.0, + "grad_norm": 0.03213074952610081, + "language_loss": 0.85809815, + "learning_rate": 0.0005208672723415467, + "loss": 0.86967611, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.81054688, + "step": 2610, + "time_per_iteration": 2.8137152194976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115836, + "balance_loss_mlp": 1.07729781, + "epoch": 0.5023085802231627, + "flos": 592422472704.0, + "grad_norm": 0.03276582898634011, + "language_loss": 0.85898113, + "learning_rate": 0.0005205559974371525, + "loss": 0.8705647, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.81054688, + "step": 2611, + "time_per_iteration": 2.7611584663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158271, + "balance_loss_mlp": 1.07720828, + "epoch": 0.5025009619084263, + "flos": 473333306880.0, + "grad_norm": 0.02842666355233711, + "language_loss": 0.86990851, + "learning_rate": 0.0005202447145522123, + "loss": 0.88149118, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.81054688, + "step": 2612, + "time_per_iteration": 2.6646487712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161741, + "balance_loss_mlp": 1.08067882, + "epoch": 0.5026933435936899, + "flos": 456077131776.0, + "grad_norm": 0.031223796902704184, + "language_loss": 0.84174728, + "learning_rate": 0.0005199334238075769, + "loss": 0.85336471, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.81054688, + "step": 2613, + "time_per_iteration": 2.567990779876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163025, + "balance_loss_mlp": 1.08229649, + "epoch": 0.5028857252789535, + "flos": 492721239552.0, + "grad_norm": 0.02841040015147714, + "language_loss": 0.97840261, + "learning_rate": 0.0005196221253241, + "loss": 0.99003285, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.80712891, + "step": 2614, + "time_per_iteration": 2.5584659576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160421, + "balance_loss_mlp": 1.07988286, + "epoch": 0.503078106964217, + "flos": 626730706944.0, + "grad_norm": 0.03241817920698289, + "language_loss": 0.88891315, + "learning_rate": 0.0005193108192226383, + "loss": 0.90051734, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.80517578, + "step": 2615, + "time_per_iteration": 2.7840871810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164078, + "balance_loss_mlp": 1.0830152, + "epoch": 0.5032704886494805, + "flos": 580137487872.0, + "grad_norm": 0.02867464613296787, + "language_loss": 0.91759968, + "learning_rate": 0.000518999505624052, + "loss": 0.92924047, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.81054688, + "step": 2616, + "time_per_iteration": 2.6807193756103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161331, + "balance_loss_mlp": 1.08017337, + "epoch": 0.5034628703347441, + "flos": 472845210624.0, + "grad_norm": 0.027070743385767714, + "language_loss": 0.8816672, + "learning_rate": 0.000518688184649203, + "loss": 0.89328051, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.81152344, + "step": 2617, + "time_per_iteration": 2.7943994998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159886, + "balance_loss_mlp": 1.07877576, + "epoch": 0.5036552520200077, + "flos": 490813063680.0, + "grad_norm": 0.03074056287258418, + "language_loss": 0.88926733, + "learning_rate": 0.0005183768564189577, + "loss": 0.90086615, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.81103516, + "step": 2618, + "time_per_iteration": 2.549255609512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.07860434, + "epoch": 0.5038476337052713, + "flos": 495215566848.0, + "grad_norm": 0.030783318052010424, + "language_loss": 0.87459326, + "learning_rate": 0.0005180655210541838, + "loss": 0.88619089, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.81152344, + "step": 2619, + "time_per_iteration": 2.5555741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157127, + "balance_loss_mlp": 1.0759213, + "epoch": 0.5040400153905348, + "flos": 601739770368.0, + "grad_norm": 0.036447475930772646, + "language_loss": 0.89893603, + "learning_rate": 0.0005177541786757527, + "loss": 0.91050732, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.81201172, + "step": 2620, + "time_per_iteration": 2.75068998336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157658, + "balance_loss_mlp": 1.07621455, + "epoch": 0.5042323970757984, + "flos": 812918932992.0, + "grad_norm": 0.03476449221513998, + "language_loss": 0.90274507, + "learning_rate": 0.000517442829404538, + "loss": 0.91432166, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.81445312, + "step": 2621, + "time_per_iteration": 2.981661558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.07854116, + "epoch": 0.504424778761062, + "flos": 628606682112.0, + "grad_norm": 0.030074963346690586, + "language_loss": 0.92839754, + "learning_rate": 0.0005171314733614166, + "loss": 0.93999791, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.81494141, + "step": 2622, + "time_per_iteration": 2.942354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160933, + "balance_loss_mlp": 1.07934618, + "epoch": 0.5046171604463255, + "flos": 516956837376.0, + "grad_norm": 0.029806335990833818, + "language_loss": 0.84097135, + "learning_rate": 0.0005168201106672671, + "loss": 0.85258067, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.81591797, + "step": 2623, + "time_per_iteration": 2.7703733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160048, + "balance_loss_mlp": 1.07841325, + "epoch": 0.504809542131589, + "flos": 528853056000.0, + "grad_norm": 0.03248441490058616, + "language_loss": 0.91679412, + "learning_rate": 0.0005165087414429717, + "loss": 0.92839456, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.81640625, + "step": 2624, + "time_per_iteration": 2.620872974395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116106, + "balance_loss_mlp": 1.07937741, + "epoch": 0.5050019238168526, + "flos": 555174749184.0, + "grad_norm": 0.03119977790816051, + "language_loss": 0.88980711, + "learning_rate": 0.0005161973658094144, + "loss": 0.90141767, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.81689453, + "step": 2625, + "time_per_iteration": 2.640408754348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161049, + "balance_loss_mlp": 1.07955778, + "epoch": 0.5051943055021162, + "flos": 575928367104.0, + "grad_norm": 0.024986408688213266, + "language_loss": 0.88551366, + "learning_rate": 0.000515885983887482, + "loss": 0.89712417, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.81494141, + "step": 2626, + "time_per_iteration": 2.7737276554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161913, + "balance_loss_mlp": 1.08066046, + "epoch": 0.5053866871873798, + "flos": 497681696256.0, + "grad_norm": 0.03126501141119064, + "language_loss": 0.91551393, + "learning_rate": 0.0005155745957980636, + "loss": 0.92713308, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.8125, + "step": 2627, + "time_per_iteration": 2.5588245391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159267, + "balance_loss_mlp": 1.07801354, + "epoch": 0.5055790688726434, + "flos": 503219572224.0, + "grad_norm": 0.028407663328603422, + "language_loss": 0.94095421, + "learning_rate": 0.000515263201662051, + "loss": 0.95254695, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.8125, + "step": 2628, + "time_per_iteration": 2.6333348751068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115977, + "balance_loss_mlp": 1.07851708, + "epoch": 0.5057714505579068, + "flos": 846767268864.0, + "grad_norm": 0.025627158908879104, + "language_loss": 0.8802768, + "learning_rate": 0.0005149518016003378, + "loss": 0.89187449, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.8125, + "step": 2629, + "time_per_iteration": 3.159515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115843, + "balance_loss_mlp": 1.07722509, + "epoch": 0.5059638322431704, + "flos": 498808336896.0, + "grad_norm": 0.032654832965012745, + "language_loss": 0.88445461, + "learning_rate": 0.0005146403957338206, + "loss": 0.89603889, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.81201172, + "step": 2630, + "time_per_iteration": 2.569671154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166774, + "balance_loss_mlp": 1.08571208, + "epoch": 0.506156213928434, + "flos": 619113466368.0, + "grad_norm": 0.027165343024338446, + "language_loss": 0.86742038, + "learning_rate": 0.0005143289841833975, + "loss": 0.8790881, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.81054688, + "step": 2631, + "time_per_iteration": 2.8505327701568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169752, + "balance_loss_mlp": 1.08911932, + "epoch": 0.5063485956136976, + "flos": 425789365248.0, + "grad_norm": 0.03495904047465476, + "language_loss": 0.89354646, + "learning_rate": 0.0005140175670699696, + "loss": 0.90524399, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.80615234, + "step": 2632, + "time_per_iteration": 2.5920779705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174002, + "balance_loss_mlp": 1.09341669, + "epoch": 0.5065409772989612, + "flos": 571069968384.0, + "grad_norm": 0.02494402323857881, + "language_loss": 0.86924809, + "learning_rate": 0.0005137061445144395, + "loss": 0.88098812, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.80566406, + "step": 2633, + "time_per_iteration": 2.8890433311462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172992, + "balance_loss_mlp": 1.09250152, + "epoch": 0.5067333589842247, + "flos": 629969639424.0, + "grad_norm": 0.03395805639170181, + "language_loss": 0.93242514, + "learning_rate": 0.000513394716637712, + "loss": 0.94415504, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.8046875, + "step": 2634, + "time_per_iteration": 2.7772305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171906, + "balance_loss_mlp": 1.09217834, + "epoch": 0.5069257406694883, + "flos": 1451096145408.0, + "grad_norm": 0.011960900894201355, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80363613, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.796875, + "step": 2635, + "time_per_iteration": 4.93586802482605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116392, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5071181223547518, + "flos": 640057738752.0, + "grad_norm": 0.03273720191955115, + "language_loss": 0.86367166, + "learning_rate": 0.0005127718454042958, + "loss": 0.87531078, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.80664062, + "step": 2636, + "time_per_iteration": 2.8407700061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115771, + "balance_loss_mlp": 1.07683849, + "epoch": 0.5073105040400154, + "flos": 714872094720.0, + "grad_norm": 0.03167408399625075, + "language_loss": 0.89809334, + "learning_rate": 0.0005124604022894269, + "loss": 0.90967047, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.80859375, + "step": 2637, + "time_per_iteration": 2.9438648223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.08575439, + "epoch": 0.5075028857252789, + "flos": 1439612161536.0, + "grad_norm": 0.009234713476178756, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78353328, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.79296875, + "step": 2638, + "time_per_iteration": 4.855467319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170042, + "balance_loss_mlp": 1.08950412, + "epoch": 0.5076952674105425, + "flos": 572307399168.0, + "grad_norm": 0.033371281415520225, + "language_loss": 0.89923447, + "learning_rate": 0.0005118375016679325, + "loss": 0.91093493, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.80517578, + "step": 2639, + "time_per_iteration": 2.7761123180389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168126, + "balance_loss_mlp": 1.08735013, + "epoch": 0.5078876490958061, + "flos": 517712176128.0, + "grad_norm": 0.04218063889538898, + "language_loss": 0.87796986, + "learning_rate": 0.0005115260444031382, + "loss": 0.88965112, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.80761719, + "step": 2640, + "time_per_iteration": 2.5914742946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164741, + "balance_loss_mlp": 1.08596802, + "epoch": 0.5080800307810697, + "flos": 1587619405824.0, + "grad_norm": 0.012463066852979446, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79896557, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.78710938, + "step": 2641, + "time_per_iteration": 4.9428391456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164884, + "balance_loss_mlp": 1.08420289, + "epoch": 0.5082724124663333, + "flos": 486186978816.0, + "grad_norm": 0.039006057605032056, + "language_loss": 0.93060952, + "learning_rate": 0.0005109031165700483, + "loss": 0.94225836, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.80664062, + "step": 2642, + "time_per_iteration": 2.5630409717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164249, + "balance_loss_mlp": 1.08318675, + "epoch": 0.5084647941515967, + "flos": 683442224640.0, + "grad_norm": 0.03324563219825503, + "language_loss": 0.88873887, + "learning_rate": 0.0005105916462435945, + "loss": 0.90038145, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.81054688, + "step": 2643, + "time_per_iteration": 2.8135592937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165445, + "balance_loss_mlp": 1.08438289, + "epoch": 0.5086571758368603, + "flos": 549812791296.0, + "grad_norm": 0.031221131167697595, + "language_loss": 0.92092431, + "learning_rate": 0.0005102801718050989, + "loss": 0.93257874, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.81054688, + "step": 2644, + "time_per_iteration": 2.684957981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.08413339, + "epoch": 0.5088495575221239, + "flos": 565078198272.0, + "grad_norm": 0.032204925975490975, + "language_loss": 0.95189679, + "learning_rate": 0.0005099686933754867, + "loss": 0.96354735, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.80908203, + "step": 2645, + "time_per_iteration": 2.6721112728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167263, + "balance_loss_mlp": 1.08620095, + "epoch": 0.5090419392073875, + "flos": 552511234560.0, + "grad_norm": 0.03332524240735616, + "language_loss": 0.90223062, + "learning_rate": 0.0005096572110756845, + "loss": 0.9139033, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.81054688, + "step": 2646, + "time_per_iteration": 2.6559739112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.08686149, + "epoch": 0.509234320892651, + "flos": 568883816448.0, + "grad_norm": 0.029529111031728714, + "language_loss": 0.90596855, + "learning_rate": 0.0005093457250266205, + "loss": 0.91764688, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.80957031, + "step": 2647, + "time_per_iteration": 2.7653987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167889, + "balance_loss_mlp": 1.08673143, + "epoch": 0.5094267025779146, + "flos": 583693327872.0, + "grad_norm": 0.03457257756125772, + "language_loss": 0.89727396, + "learning_rate": 0.000509034235349224, + "loss": 0.90895277, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.81152344, + "step": 2648, + "time_per_iteration": 2.690363645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159753, + "balance_loss_mlp": 1.07854819, + "epoch": 0.5096190842631781, + "flos": 593138880000.0, + "grad_norm": 0.0341546457293008, + "language_loss": 0.88255095, + "learning_rate": 0.0005087227421644266, + "loss": 0.89414853, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.81201172, + "step": 2649, + "time_per_iteration": 2.6982481479644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.07891166, + "epoch": 0.5098114659484417, + "flos": 514584033792.0, + "grad_norm": 0.030485361797949893, + "language_loss": 0.92298341, + "learning_rate": 0.0005084112455931602, + "loss": 0.93458325, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.81054688, + "step": 2650, + "time_per_iteration": 2.5739448070526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162291, + "balance_loss_mlp": 1.08170521, + "epoch": 0.5100038476337053, + "flos": 485600827392.0, + "grad_norm": 0.03052985498468287, + "language_loss": 0.91529775, + "learning_rate": 0.0005080997457563586, + "loss": 0.92692065, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.80566406, + "step": 2651, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165638, + "balance_loss_mlp": 1.08514845, + "epoch": 0.5101962293189688, + "flos": 462554996736.0, + "grad_norm": 0.037278277228963375, + "language_loss": 0.86181092, + "learning_rate": 0.0005077882427749569, + "loss": 0.87346727, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.8046875, + "step": 2652, + "time_per_iteration": 2.490943670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158092, + "balance_loss_mlp": 1.07745898, + "epoch": 0.5103886110042324, + "flos": 588132761088.0, + "grad_norm": 0.03182463194953253, + "language_loss": 0.91334021, + "learning_rate": 0.0005074767367698913, + "loss": 0.9249211, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.80615234, + "step": 2653, + "time_per_iteration": 2.6900839805603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.07847476, + "epoch": 0.510580992689496, + "flos": 846677945856.0, + "grad_norm": 0.027057922805634398, + "language_loss": 0.89024949, + "learning_rate": 0.0005071652278620988, + "loss": 0.90184009, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.80566406, + "step": 2654, + "time_per_iteration": 3.044296979904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115973, + "balance_loss_mlp": 1.07919204, + "epoch": 0.5107733743747596, + "flos": 659810242560.0, + "grad_norm": 0.0315385737613105, + "language_loss": 0.89305294, + "learning_rate": 0.0005068537161725186, + "loss": 0.90465021, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.80517578, + "step": 2655, + "time_per_iteration": 2.770669937133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160641, + "balance_loss_mlp": 1.08000755, + "epoch": 0.510965756060023, + "flos": 702960413184.0, + "grad_norm": 0.03531630249392906, + "language_loss": 0.91070223, + "learning_rate": 0.0005065422018220893, + "loss": 0.92230862, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.80615234, + "step": 2656, + "time_per_iteration": 2.833031177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165495, + "balance_loss_mlp": 1.08490956, + "epoch": 0.5111581377452866, + "flos": 560940936192.0, + "grad_norm": 0.03615724120857576, + "language_loss": 0.85921729, + "learning_rate": 0.0005062306849317521, + "loss": 0.87087226, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.80566406, + "step": 2657, + "time_per_iteration": 2.800971031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.0873909, + "epoch": 0.5113505194305502, + "flos": 610145276928.0, + "grad_norm": 0.029932060678028026, + "language_loss": 0.88435352, + "learning_rate": 0.0005059191656224487, + "loss": 0.89603281, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.80517578, + "step": 2658, + "time_per_iteration": 2.7075443267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159414, + "balance_loss_mlp": 1.07882822, + "epoch": 0.5115429011158138, + "flos": 535535036928.0, + "grad_norm": 0.028231439832000826, + "language_loss": 0.94975483, + "learning_rate": 0.0005056076440151212, + "loss": 0.96134901, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.80566406, + "step": 2659, + "time_per_iteration": 2.6906392574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162323, + "balance_loss_mlp": 1.0835495, + "epoch": 0.5117352828010774, + "flos": 1365273166848.0, + "grad_norm": 0.00971890017277948, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77450442, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.78515625, + "step": 2660, + "time_per_iteration": 4.880187273025513 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160968, + "balance_loss_mlp": 1.07990551, + "epoch": 0.5119276644863409, + "flos": 634930096128.0, + "grad_norm": 0.027317751888226913, + "language_loss": 0.91815728, + "learning_rate": 0.0005049845943901691, + "loss": 0.92976695, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.81054688, + "step": 2661, + "time_per_iteration": 2.8184986114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160969, + "balance_loss_mlp": 1.08004987, + "epoch": 0.5121200461716044, + "flos": 586780537344.0, + "grad_norm": 0.02944382500923868, + "language_loss": 0.91654462, + "learning_rate": 0.0005046730666144338, + "loss": 0.92815423, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.80908203, + "step": 2662, + "time_per_iteration": 2.755974769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.0798161, + "epoch": 0.512312427856868, + "flos": 1034223124992.0, + "grad_norm": 0.029507171441845153, + "language_loss": 0.93013144, + "learning_rate": 0.0005043615370244532, + "loss": 0.94174021, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.81054688, + "step": 2663, + "time_per_iteration": 3.3488211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177849, + "balance_loss_mlp": 1.09907532, + "epoch": 0.5125048095421316, + "flos": 1540899207168.0, + "grad_norm": 0.013662934984579522, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79422235, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.78710938, + "step": 2664, + "time_per_iteration": 4.6237993240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162223, + "balance_loss_mlp": 1.08130419, + "epoch": 0.5126971912273951, + "flos": 592327145472.0, + "grad_norm": 0.024418914459260154, + "language_loss": 0.89686567, + "learning_rate": 0.0005037384728855425, + "loss": 0.90848792, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.80908203, + "step": 2665, + "time_per_iteration": 2.8003761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163774, + "balance_loss_mlp": 1.08299828, + "epoch": 0.5128895729126587, + "flos": 552717351936.0, + "grad_norm": 0.03867267783646357, + "language_loss": 0.9114759, + "learning_rate": 0.0005034269385785075, + "loss": 0.9231137, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.80761719, + "step": 2666, + "time_per_iteration": 2.664607286453247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161159, + "balance_loss_mlp": 1.08047831, + "epoch": 0.5130819545979223, + "flos": 482231639040.0, + "grad_norm": 0.037339426134761385, + "language_loss": 0.92204285, + "learning_rate": 0.0005031154029410168, + "loss": 0.93365449, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.80664062, + "step": 2667, + "time_per_iteration": 2.5419206619262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157708, + "balance_loss_mlp": 1.0769316, + "epoch": 0.5132743362831859, + "flos": 476767623168.0, + "grad_norm": 0.03576788906651519, + "language_loss": 0.93073893, + "learning_rate": 0.0005028038660940197, + "loss": 0.942316, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.80761719, + "step": 2668, + "time_per_iteration": 2.5499191284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166102, + "balance_loss_mlp": 1.08542132, + "epoch": 0.5134667179684494, + "flos": 504902164992.0, + "grad_norm": 0.02981054719592371, + "language_loss": 0.89144588, + "learning_rate": 0.0005024923281584648, + "loss": 0.90310693, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.80664062, + "step": 2669, + "time_per_iteration": 2.6367011070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165888, + "balance_loss_mlp": 1.08496881, + "epoch": 0.5136590996537129, + "flos": 505004222976.0, + "grad_norm": 0.029270286325536108, + "language_loss": 0.87695622, + "learning_rate": 0.0005021807892553026, + "loss": 0.88861501, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.80908203, + "step": 2670, + "time_per_iteration": 2.697326421737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165807, + "balance_loss_mlp": 1.08522201, + "epoch": 0.5138514813389765, + "flos": 625799450112.0, + "grad_norm": 0.029434336289691197, + "language_loss": 0.8977018, + "learning_rate": 0.0005018692495054828, + "loss": 0.90935987, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.80566406, + "step": 2671, + "time_per_iteration": 2.848576784133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154912, + "balance_loss_mlp": 1.07394516, + "epoch": 0.5140438630242401, + "flos": 584633316864.0, + "grad_norm": 0.027486728027613972, + "language_loss": 0.85466325, + "learning_rate": 0.0005015577090299561, + "loss": 0.86621237, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.80957031, + "step": 2672, + "time_per_iteration": 2.698976993560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155424, + "balance_loss_mlp": 1.0744096, + "epoch": 0.5142362447095037, + "flos": 488904887808.0, + "grad_norm": 0.030629892529963922, + "language_loss": 0.92615306, + "learning_rate": 0.0005012461679496729, + "loss": 0.9377073, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.81005859, + "step": 2673, + "time_per_iteration": 2.5998294353485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115671, + "balance_loss_mlp": 1.07564759, + "epoch": 0.5144286263947672, + "flos": 527884869120.0, + "grad_norm": 0.029257555563523763, + "language_loss": 0.93652987, + "learning_rate": 0.0005009346263855848, + "loss": 0.94809699, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.81054688, + "step": 2674, + "time_per_iteration": 2.702364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156546, + "balance_loss_mlp": 1.07548332, + "epoch": 0.5146210080800308, + "flos": 487589594112.0, + "grad_norm": 0.025826040346785265, + "language_loss": 0.88576883, + "learning_rate": 0.0005006230844586422, + "loss": 0.89733428, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.81054688, + "step": 2675, + "time_per_iteration": 2.7889058589935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159564, + "balance_loss_mlp": 1.07845449, + "epoch": 0.5148133897652943, + "flos": 516974301696.0, + "grad_norm": 0.025127862595781116, + "language_loss": 0.83195055, + "learning_rate": 0.0005003115422897968, + "loss": 0.84354615, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.81103516, + "step": 2676, + "time_per_iteration": 2.7474374771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165139, + "balance_loss_mlp": 1.08436286, + "epoch": 0.5150057714505579, + "flos": 512211230208.0, + "grad_norm": 0.02805317572608274, + "language_loss": 0.92311704, + "learning_rate": 0.0005, + "loss": 0.93476844, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.80761719, + "step": 2677, + "time_per_iteration": 2.635801076889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167536, + "balance_loss_mlp": 1.08652139, + "epoch": 0.5151981531358215, + "flos": 912389853696.0, + "grad_norm": 0.03671017270530106, + "language_loss": 0.86270726, + "learning_rate": 0.0004996884577102033, + "loss": 0.87438262, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.81005859, + "step": 2678, + "time_per_iteration": 3.1016898155212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116356, + "balance_loss_mlp": 1.08264065, + "epoch": 0.515390534821085, + "flos": 472929804288.0, + "grad_norm": 0.02746999857609634, + "language_loss": 0.90178144, + "learning_rate": 0.000499376915541358, + "loss": 0.91341698, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.80908203, + "step": 2679, + "time_per_iteration": 2.7041540145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163327, + "balance_loss_mlp": 1.0826937, + "epoch": 0.5155829165063486, + "flos": 651357072384.0, + "grad_norm": 0.02786171231522906, + "language_loss": 0.85589147, + "learning_rate": 0.0004990653736144155, + "loss": 0.86752468, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.80615234, + "step": 2680, + "time_per_iteration": 2.883392572402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163916, + "balance_loss_mlp": 1.08280623, + "epoch": 0.5157752981916122, + "flos": 415160776704.0, + "grad_norm": 0.030701546031170052, + "language_loss": 0.92331398, + "learning_rate": 0.0004987538320503271, + "loss": 0.93495315, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.81103516, + "step": 2681, + "time_per_iteration": 2.4719676971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169918, + "balance_loss_mlp": 1.0890938, + "epoch": 0.5159676798768758, + "flos": 554931701760.0, + "grad_norm": 0.03041903817165714, + "language_loss": 0.89793313, + "learning_rate": 0.0004984422909700442, + "loss": 0.90963233, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.80810547, + "step": 2682, + "time_per_iteration": 2.7486019134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168893, + "balance_loss_mlp": 1.08816493, + "epoch": 0.5161600615621393, + "flos": 587620469760.0, + "grad_norm": 0.02833679783776788, + "language_loss": 0.89197505, + "learning_rate": 0.0004981307504945173, + "loss": 0.90366399, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.80712891, + "step": 2683, + "time_per_iteration": 2.6918153762817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161385, + "balance_loss_mlp": 1.08060837, + "epoch": 0.5163524432474028, + "flos": 589947611136.0, + "grad_norm": 0.03153559446680845, + "language_loss": 0.9527353, + "learning_rate": 0.0004978192107446976, + "loss": 0.96434915, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.80761719, + "step": 2684, + "time_per_iteration": 2.7622218132019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.07906806, + "epoch": 0.5165448249326664, + "flos": 504904166400.0, + "grad_norm": 0.029863924033148703, + "language_loss": 0.92634213, + "learning_rate": 0.0004975076718415353, + "loss": 0.93793911, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.80615234, + "step": 2685, + "time_per_iteration": 2.644228219985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172411, + "balance_loss_mlp": 1.09220684, + "epoch": 0.51673720661793, + "flos": 417646371840.0, + "grad_norm": 0.031084732221220036, + "language_loss": 0.95470178, + "learning_rate": 0.0004971961339059806, + "loss": 0.96642584, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.80175781, + "step": 2686, + "time_per_iteration": 2.469081401824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160009, + "balance_loss_mlp": 1.0795666, + "epoch": 0.5169295883031936, + "flos": 600074641920.0, + "grad_norm": 0.03147701291149863, + "language_loss": 0.89665824, + "learning_rate": 0.0004968845970589832, + "loss": 0.90825832, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.80419922, + "step": 2687, + "time_per_iteration": 2.7054736614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159105, + "balance_loss_mlp": 1.07847178, + "epoch": 0.517121969988457, + "flos": 557910122496.0, + "grad_norm": 0.03772331123991374, + "language_loss": 0.90882772, + "learning_rate": 0.0004965730614214926, + "loss": 0.92041886, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.80615234, + "step": 2688, + "time_per_iteration": 2.6433985233306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159068, + "balance_loss_mlp": 1.0787214, + "epoch": 0.5173143516737206, + "flos": 470374351872.0, + "grad_norm": 0.031353493154565384, + "language_loss": 0.9113276, + "learning_rate": 0.0004962615271144576, + "loss": 0.92291832, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.80322266, + "step": 2689, + "time_per_iteration": 2.5081796646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159566, + "balance_loss_mlp": 1.07912409, + "epoch": 0.5175067333589842, + "flos": 721378157568.0, + "grad_norm": 0.03531118205346665, + "language_loss": 0.88785195, + "learning_rate": 0.0004959499942588264, + "loss": 0.89944768, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.80419922, + "step": 2690, + "time_per_iteration": 2.8977034091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165977, + "balance_loss_mlp": 1.08682251, + "epoch": 0.5176991150442478, + "flos": 1469341974528.0, + "grad_norm": 0.00940812354228104, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79365999, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.79101562, + "step": 2691, + "time_per_iteration": 4.744166851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162816, + "balance_loss_mlp": 1.08227849, + "epoch": 0.5178914967295114, + "flos": 613783709184.0, + "grad_norm": 0.0285194405600695, + "language_loss": 0.91181535, + "learning_rate": 0.0004953269333855661, + "loss": 0.92344356, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.80517578, + "step": 2692, + "time_per_iteration": 2.7305634021759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164372, + "balance_loss_mlp": 1.0839293, + "epoch": 0.5180838784147749, + "flos": 501980140032.0, + "grad_norm": 0.03457473418848995, + "language_loss": 0.89626956, + "learning_rate": 0.0004950154056098309, + "loss": 0.90791321, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.80419922, + "step": 2693, + "time_per_iteration": 2.7358009815216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162458, + "balance_loss_mlp": 1.08215868, + "epoch": 0.5182762601000385, + "flos": 690041613312.0, + "grad_norm": 0.03333155233389222, + "language_loss": 0.90543425, + "learning_rate": 0.0004947038797692867, + "loss": 0.91705889, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.80273438, + "step": 2694, + "time_per_iteration": 2.8636367321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178055, + "balance_loss_mlp": 1.09775615, + "epoch": 0.518468641785302, + "flos": 666800398848.0, + "grad_norm": 0.03410817354988479, + "language_loss": 0.8335048, + "learning_rate": 0.0004943923559848789, + "loss": 0.84528536, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.80273438, + "step": 2695, + "time_per_iteration": 2.797072172164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117824, + "balance_loss_mlp": 1.09794104, + "epoch": 0.5186610234705656, + "flos": 567813571584.0, + "grad_norm": 0.02729227458516312, + "language_loss": 0.95474803, + "learning_rate": 0.0004940808343775515, + "loss": 0.96653044, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.80273438, + "step": 2696, + "time_per_iteration": 2.6839044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162534, + "balance_loss_mlp": 1.08204436, + "epoch": 0.5188534051558291, + "flos": 429792368640.0, + "grad_norm": 0.03355790964159957, + "language_loss": 0.87542081, + "learning_rate": 0.0004937693150682479, + "loss": 0.88704622, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.8046875, + "step": 2697, + "time_per_iteration": 2.5123825073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08045113, + "epoch": 0.5190457868410927, + "flos": 547411789824.0, + "grad_norm": 0.031455242836056954, + "language_loss": 0.81813598, + "learning_rate": 0.0004934577981779107, + "loss": 0.82974923, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.80859375, + "step": 2698, + "time_per_iteration": 2.662545919418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117247, + "balance_loss_mlp": 1.09159839, + "epoch": 0.5192381685263563, + "flos": 549745661952.0, + "grad_norm": 0.02804159255629041, + "language_loss": 0.86178321, + "learning_rate": 0.0004931462838274817, + "loss": 0.87350786, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.80859375, + "step": 2699, + "time_per_iteration": 2.877682685852051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172435, + "balance_loss_mlp": 1.09156311, + "epoch": 0.5194305502116199, + "flos": 576349334016.0, + "grad_norm": 0.03885998177020277, + "language_loss": 0.90400088, + "learning_rate": 0.0004928347721379011, + "loss": 0.91572523, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.80859375, + "step": 2700, + "time_per_iteration": 2.671849489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169932, + "balance_loss_mlp": 1.08906007, + "epoch": 0.5196229318968835, + "flos": 435217453056.0, + "grad_norm": 0.030583901836551724, + "language_loss": 0.87633044, + "learning_rate": 0.0004925232632301089, + "loss": 0.88802975, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.80859375, + "step": 2701, + "time_per_iteration": 2.57857608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166407, + "balance_loss_mlp": 1.08558309, + "epoch": 0.5198153135821469, + "flos": 559985484288.0, + "grad_norm": 0.03187287566803064, + "language_loss": 0.85556304, + "learning_rate": 0.0004922117572250431, + "loss": 0.86722708, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.80810547, + "step": 2702, + "time_per_iteration": 2.7037737369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166773, + "balance_loss_mlp": 1.08618808, + "epoch": 0.5200076952674105, + "flos": 566834651136.0, + "grad_norm": 0.03219739559056917, + "language_loss": 0.8641057, + "learning_rate": 0.0004919002542436414, + "loss": 0.87577343, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.80566406, + "step": 2703, + "time_per_iteration": 2.8919363021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08965361, + "epoch": 0.5202000769526741, + "flos": 572272470528.0, + "grad_norm": 0.0327510509858114, + "language_loss": 0.87948251, + "learning_rate": 0.0004915887544068399, + "loss": 0.89118207, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.80273438, + "step": 2704, + "time_per_iteration": 2.6497535705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169089, + "balance_loss_mlp": 1.08869386, + "epoch": 0.5203924586379377, + "flos": 695466697728.0, + "grad_norm": 0.02924473313894461, + "language_loss": 0.83824521, + "learning_rate": 0.0004912772578355736, + "loss": 0.84993607, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.80371094, + "step": 2705, + "time_per_iteration": 2.8862009048461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163429, + "balance_loss_mlp": 1.08274853, + "epoch": 0.5205848403232012, + "flos": 567690046464.0, + "grad_norm": 0.031189936278329552, + "language_loss": 0.88606453, + "learning_rate": 0.000490965764650776, + "loss": 0.89769882, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.80664062, + "step": 2706, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163571, + "balance_loss_mlp": 1.08308065, + "epoch": 0.5207772220084648, + "flos": 1216204231680.0, + "grad_norm": 0.03053180986383906, + "language_loss": 0.8816222, + "learning_rate": 0.0004906542749733798, + "loss": 0.89325786, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.8046875, + "step": 2707, + "time_per_iteration": 3.6396875381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162365, + "balance_loss_mlp": 1.08197033, + "epoch": 0.5209696036937284, + "flos": 594031205376.0, + "grad_norm": 0.027334962594272247, + "language_loss": 0.90568572, + "learning_rate": 0.0004903427889243156, + "loss": 0.91730928, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.80371094, + "step": 2708, + "time_per_iteration": 2.853013753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116169, + "balance_loss_mlp": 1.08129489, + "epoch": 0.5211619853789919, + "flos": 523955725824.0, + "grad_norm": 0.032301377197285666, + "language_loss": 0.91200471, + "learning_rate": 0.0004900313066245134, + "loss": 0.92362165, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.80371094, + "step": 2709, + "time_per_iteration": 2.706407070159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161577, + "balance_loss_mlp": 1.08146846, + "epoch": 0.5213543670642555, + "flos": 503860118016.0, + "grad_norm": 0.02918491733204221, + "language_loss": 0.86683327, + "learning_rate": 0.0004897198281949012, + "loss": 0.87844902, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.80078125, + "step": 2710, + "time_per_iteration": 2.6603598594665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.07654023, + "epoch": 0.521546748749519, + "flos": 587071248384.0, + "grad_norm": 0.0328837537508598, + "language_loss": 0.84538651, + "learning_rate": 0.0004894083537564057, + "loss": 0.85695493, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.80273438, + "step": 2711, + "time_per_iteration": 2.740659236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159708, + "balance_loss_mlp": 1.07955158, + "epoch": 0.5217391304347826, + "flos": 571265352192.0, + "grad_norm": 0.028894041826031003, + "language_loss": 0.85799223, + "learning_rate": 0.0004890968834299519, + "loss": 0.86958933, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.80126953, + "step": 2712, + "time_per_iteration": 2.7206225395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157432, + "balance_loss_mlp": 1.077371, + "epoch": 0.5219315121200462, + "flos": 543919076352.0, + "grad_norm": 0.029763432747936528, + "language_loss": 0.83741677, + "learning_rate": 0.0004887854173364633, + "loss": 0.84899104, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.80029297, + "step": 2713, + "time_per_iteration": 2.737755060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160512, + "balance_loss_mlp": 1.08097565, + "epoch": 0.5221238938053098, + "flos": 551530312704.0, + "grad_norm": 0.028214516718367867, + "language_loss": 0.86704654, + "learning_rate": 0.0004884739555968617, + "loss": 0.87865162, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.79492188, + "step": 2714, + "time_per_iteration": 2.872819185256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168823, + "balance_loss_mlp": 1.09100342, + "epoch": 0.5223162754905732, + "flos": 1358389797888.0, + "grad_norm": 0.012476009787944744, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80145878, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.77539062, + "step": 2715, + "time_per_iteration": 4.96741795539856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170398, + "balance_loss_mlp": 1.09028971, + "epoch": 0.5225086571758368, + "flos": 568973139456.0, + "grad_norm": 0.03267804467904664, + "language_loss": 0.92675197, + "learning_rate": 0.0004878510456629992, + "loss": 0.93845594, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.80078125, + "step": 2716, + "time_per_iteration": 2.9626121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160651, + "balance_loss_mlp": 1.08054268, + "epoch": 0.5227010388611004, + "flos": 501135478272.0, + "grad_norm": 0.033781088666230946, + "language_loss": 0.9089278, + "learning_rate": 0.00048753959771057314, + "loss": 0.92053425, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.80078125, + "step": 2717, + "time_per_iteration": 2.611691951751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157135, + "balance_loss_mlp": 1.07702601, + "epoch": 0.522893420546364, + "flos": 598798279680.0, + "grad_norm": 0.032963356718883376, + "language_loss": 0.88626194, + "learning_rate": 0.0004872281545957044, + "loss": 0.89783323, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.80078125, + "step": 2718, + "time_per_iteration": 2.7218518257141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116303, + "balance_loss_mlp": 1.08287394, + "epoch": 0.5230858022316276, + "flos": 665921534976.0, + "grad_norm": 0.02884991307967795, + "language_loss": 0.91186881, + "learning_rate": 0.0004869167164393055, + "loss": 0.92349917, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.80126953, + "step": 2719, + "time_per_iteration": 2.932335376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164195, + "balance_loss_mlp": 1.08403885, + "epoch": 0.5232781839168911, + "flos": 605033097216.0, + "grad_norm": 0.02708280335676697, + "language_loss": 0.94493294, + "learning_rate": 0.00048660528336228793, + "loss": 0.95657486, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.80126953, + "step": 2720, + "time_per_iteration": 2.8030405044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158564, + "balance_loss_mlp": 1.07840788, + "epoch": 0.5234705656021547, + "flos": 551840489472.0, + "grad_norm": 0.028885887647779437, + "language_loss": 0.95077229, + "learning_rate": 0.0004862938554855606, + "loss": 0.96235794, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.80126953, + "step": 2721, + "time_per_iteration": 2.797297716140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.0790993, + "epoch": 0.5236629472874182, + "flos": 505294934016.0, + "grad_norm": 0.03214550067861962, + "language_loss": 0.91548902, + "learning_rate": 0.0004859824329300304, + "loss": 0.92708111, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.80078125, + "step": 2722, + "time_per_iteration": 2.589529037475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.08444893, + "epoch": 0.5238553289726818, + "flos": 548696884224.0, + "grad_norm": 0.029959051591606282, + "language_loss": 0.88512689, + "learning_rate": 0.00048567101581660244, + "loss": 0.89677346, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.80175781, + "step": 2723, + "time_per_iteration": 2.6637237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160999, + "balance_loss_mlp": 1.08065164, + "epoch": 0.5240477106579453, + "flos": 533003779584.0, + "grad_norm": 0.031636293719806106, + "language_loss": 0.92529982, + "learning_rate": 0.00048535960426617956, + "loss": 0.93690991, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.80322266, + "step": 2724, + "time_per_iteration": 2.6061489582061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156405, + "balance_loss_mlp": 1.07620108, + "epoch": 0.5242400923432089, + "flos": 619089271296.0, + "grad_norm": 0.028230181756235023, + "language_loss": 0.87247139, + "learning_rate": 0.0004850481983996621, + "loss": 0.88403541, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.80175781, + "step": 2725, + "time_per_iteration": 2.7699060440063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.07787168, + "epoch": 0.5244324740284725, + "flos": 417589976064.0, + "grad_norm": 0.03201067328997522, + "language_loss": 0.93398654, + "learning_rate": 0.0004847367983379492, + "loss": 0.94556582, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.80029297, + "step": 2726, + "time_per_iteration": 2.521516799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07599604, + "epoch": 0.5246248557137361, + "flos": 627731821056.0, + "grad_norm": 0.028083517097400017, + "language_loss": 0.83866012, + "learning_rate": 0.00048442540420193643, + "loss": 0.8502202, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.79980469, + "step": 2727, + "time_per_iteration": 2.8968660831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155443, + "balance_loss_mlp": 1.07547724, + "epoch": 0.5248172373989997, + "flos": 1250401675776.0, + "grad_norm": 0.032601939018394276, + "language_loss": 0.85122609, + "learning_rate": 0.0004841140161125182, + "loss": 0.86278045, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.79931641, + "step": 2728, + "time_per_iteration": 3.585556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156303, + "balance_loss_mlp": 1.0764327, + "epoch": 0.5250096190842631, + "flos": 507882587136.0, + "grad_norm": 0.02942710549962748, + "language_loss": 0.90605354, + "learning_rate": 0.0004838026341905857, + "loss": 0.91761655, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.79833984, + "step": 2729, + "time_per_iteration": 2.7116506099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157734, + "balance_loss_mlp": 1.07781577, + "epoch": 0.5252020007695267, + "flos": 612507346944.0, + "grad_norm": 0.029260311632026755, + "language_loss": 0.9089191, + "learning_rate": 0.00048349125855702844, + "loss": 0.92049646, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.79882812, + "step": 2730, + "time_per_iteration": 2.772508144378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157575, + "balance_loss_mlp": 1.07780039, + "epoch": 0.5253943824547903, + "flos": 540291377664.0, + "grad_norm": 0.027039643287400304, + "language_loss": 0.86249292, + "learning_rate": 0.00048317988933273287, + "loss": 0.87406862, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.79736328, + "step": 2731, + "time_per_iteration": 2.7501025199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159148, + "balance_loss_mlp": 1.07918203, + "epoch": 0.5255867641400539, + "flos": 699337443840.0, + "grad_norm": 0.030025626211663315, + "language_loss": 0.87967253, + "learning_rate": 0.00048286852663858367, + "loss": 0.89126396, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.79931641, + "step": 2732, + "time_per_iteration": 2.9441256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156016, + "balance_loss_mlp": 1.07604992, + "epoch": 0.5257791458253175, + "flos": 668548119552.0, + "grad_norm": 0.03127119397180798, + "language_loss": 0.89405584, + "learning_rate": 0.000482557170595462, + "loss": 0.90561604, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.79931641, + "step": 2733, + "time_per_iteration": 2.875559091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158055, + "balance_loss_mlp": 1.07813704, + "epoch": 0.525971527510581, + "flos": 484604442624.0, + "grad_norm": 0.02914442262172993, + "language_loss": 0.93156296, + "learning_rate": 0.0004822458213242475, + "loss": 0.94314349, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.79882812, + "step": 2734, + "time_per_iteration": 2.5386509895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157288, + "balance_loss_mlp": 1.07737029, + "epoch": 0.5261639091958445, + "flos": 831347410944.0, + "grad_norm": 0.025020932409653307, + "language_loss": 0.90545583, + "learning_rate": 0.00048193447894581627, + "loss": 0.91702867, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.79882812, + "step": 2735, + "time_per_iteration": 3.087679862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07853508, + "epoch": 0.5263562908811081, + "flos": 521732643840.0, + "grad_norm": 0.03948252554958876, + "language_loss": 0.93270254, + "learning_rate": 0.00048162314358104243, + "loss": 0.94428849, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.80029297, + "step": 2736, + "time_per_iteration": 2.601278305053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156345, + "balance_loss_mlp": 1.07633209, + "epoch": 0.5265486725663717, + "flos": 576097554432.0, + "grad_norm": 0.032044906976615765, + "language_loss": 0.89525604, + "learning_rate": 0.0004813118153507969, + "loss": 0.90681952, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.79980469, + "step": 2737, + "time_per_iteration": 2.7360177040100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160461, + "balance_loss_mlp": 1.0820694, + "epoch": 0.5267410542516352, + "flos": 1550558333952.0, + "grad_norm": 0.008730383218555248, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.8360759, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.78320312, + "step": 2738, + "time_per_iteration": 4.80830717086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160505, + "balance_loss_mlp": 1.08039653, + "epoch": 0.5269334359368988, + "flos": 931460878848.0, + "grad_norm": 0.03056162512939441, + "language_loss": 0.89627469, + "learning_rate": 0.00048068918077736163, + "loss": 0.90787971, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.80078125, + "step": 2739, + "time_per_iteration": 3.228745222091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160328, + "balance_loss_mlp": 1.08021903, + "epoch": 0.5271258176221624, + "flos": 656634436608.0, + "grad_norm": 0.03221347808604687, + "language_loss": 0.87126762, + "learning_rate": 0.0004803778746759001, + "loss": 0.88287091, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.80078125, + "step": 2740, + "time_per_iteration": 2.888040542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161897, + "balance_loss_mlp": 1.08217001, + "epoch": 0.527318199307426, + "flos": 544062067200.0, + "grad_norm": 0.03125376981830108, + "language_loss": 0.87138033, + "learning_rate": 0.00048006657619242317, + "loss": 0.8829993, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.796875, + "step": 2741, + "time_per_iteration": 2.6788547039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156491, + "balance_loss_mlp": 1.07662046, + "epoch": 0.5275105809926895, + "flos": 448898322432.0, + "grad_norm": 0.035204553781932095, + "language_loss": 0.84527659, + "learning_rate": 0.00047975528544778775, + "loss": 0.8568415, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.79833984, + "step": 2742, + "time_per_iteration": 2.5953187942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156742, + "balance_loss_mlp": 1.07677603, + "epoch": 0.527702962677953, + "flos": 580052894208.0, + "grad_norm": 0.031790657619887884, + "language_loss": 0.9544906, + "learning_rate": 0.00047944400256284754, + "loss": 0.96605802, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.79931641, + "step": 2743, + "time_per_iteration": 2.6874876022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158128, + "balance_loss_mlp": 1.07821035, + "epoch": 0.5278953443632166, + "flos": 654009853440.0, + "grad_norm": 0.028533864641999515, + "language_loss": 0.84914398, + "learning_rate": 0.0004791327276584532, + "loss": 0.86072528, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.79882812, + "step": 2744, + "time_per_iteration": 2.851484537124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159082, + "balance_loss_mlp": 1.07902145, + "epoch": 0.5280877260484802, + "flos": 515048661504.0, + "grad_norm": 0.02936794285447426, + "language_loss": 0.85631824, + "learning_rate": 0.00047882146085545264, + "loss": 0.86790907, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.80029297, + "step": 2745, + "time_per_iteration": 2.6376991271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159996, + "balance_loss_mlp": 1.081604, + "epoch": 0.5282801077337438, + "flos": 1448712608256.0, + "grad_norm": 0.005116949586401208, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76562381, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.78125, + "step": 2746, + "time_per_iteration": 4.958376169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158639, + "balance_loss_mlp": 1.0789119, + "epoch": 0.5284724894190073, + "flos": 605966355456.0, + "grad_norm": 0.03386849685542916, + "language_loss": 0.85558748, + "learning_rate": 0.00047819895203700684, + "loss": 0.86717391, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.796875, + "step": 2747, + "time_per_iteration": 2.7103474140167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.08326721, + "epoch": 0.5286648711042709, + "flos": 1498103600640.0, + "grad_norm": 0.005524480658063938, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76674092, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.78125, + "step": 2748, + "time_per_iteration": 4.636225938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.077088, + "epoch": 0.5288572527895344, + "flos": 598833208320.0, + "grad_norm": 0.030227845431380972, + "language_loss": 0.94071984, + "learning_rate": 0.0004775764770742277, + "loss": 0.95228899, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.79785156, + "step": 2749, + "time_per_iteration": 2.7894628047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154981, + "balance_loss_mlp": 1.07496762, + "epoch": 0.529049634474798, + "flos": 558439878144.0, + "grad_norm": 0.038921610012438906, + "language_loss": 0.92515904, + "learning_rate": 0.00047726525259079777, + "loss": 0.93670887, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.79980469, + "step": 2750, + "time_per_iteration": 2.8399362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156308, + "balance_loss_mlp": 1.07643819, + "epoch": 0.5292420161600616, + "flos": 582434429952.0, + "grad_norm": 0.03493339209419754, + "language_loss": 0.94807124, + "learning_rate": 0.0004769540369337798, + "loss": 0.9596343, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.79833984, + "step": 2751, + "time_per_iteration": 2.7520663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171177, + "balance_loss_mlp": 1.09097254, + "epoch": 0.5294343978453251, + "flos": 609563854848.0, + "grad_norm": 0.029200425139457874, + "language_loss": 0.90377945, + "learning_rate": 0.00047664283022399794, + "loss": 0.91549122, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.80175781, + "step": 2752, + "time_per_iteration": 2.827075719833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.08904481, + "epoch": 0.5296267795305887, + "flos": 647709907968.0, + "grad_norm": 0.03322281077035965, + "language_loss": 0.85670567, + "learning_rate": 0.00047633163258227376, + "loss": 0.86839902, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.80273438, + "step": 2753, + "time_per_iteration": 2.8684630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168156, + "balance_loss_mlp": 1.08790445, + "epoch": 0.5298191612158523, + "flos": 560805950976.0, + "grad_norm": 0.0355054677596956, + "language_loss": 0.92337191, + "learning_rate": 0.0004760204441294247, + "loss": 0.93505347, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.80224609, + "step": 2754, + "time_per_iteration": 2.6347973346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162052, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5300115429011159, + "flos": 515131253760.0, + "grad_norm": 0.03178410473183971, + "language_loss": 0.90992713, + "learning_rate": 0.00047570926498626486, + "loss": 0.92154765, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.80078125, + "step": 2755, + "time_per_iteration": 2.6713931560516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165293, + "balance_loss_mlp": 1.08513677, + "epoch": 0.5302039245863793, + "flos": 674049065472.0, + "grad_norm": 0.025883205751119107, + "language_loss": 0.86624229, + "learning_rate": 0.00047539809527360474, + "loss": 0.87789524, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.80126953, + "step": 2756, + "time_per_iteration": 2.855339765548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163284, + "balance_loss_mlp": 1.08312809, + "epoch": 0.5303963062716429, + "flos": 732156467712.0, + "grad_norm": 0.025616439830169112, + "language_loss": 0.86757731, + "learning_rate": 0.0004750869351122511, + "loss": 0.87921017, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.80126953, + "step": 2757, + "time_per_iteration": 2.9861788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157792, + "balance_loss_mlp": 1.07773066, + "epoch": 0.5305886879569065, + "flos": 574551948288.0, + "grad_norm": 0.030995691560080724, + "language_loss": 0.87564695, + "learning_rate": 0.00047477578462300685, + "loss": 0.88722491, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.80029297, + "step": 2758, + "time_per_iteration": 2.711434841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158236, + "balance_loss_mlp": 1.07817531, + "epoch": 0.5307810696421701, + "flos": 696728323584.0, + "grad_norm": 0.030944173565867344, + "language_loss": 0.85500729, + "learning_rate": 0.0004744646439266718, + "loss": 0.86658955, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.80029297, + "step": 2759, + "time_per_iteration": 3.012730121612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159665, + "balance_loss_mlp": 1.07965159, + "epoch": 0.5309734513274337, + "flos": 650202233856.0, + "grad_norm": 0.02922555436454367, + "language_loss": 0.9794637, + "learning_rate": 0.000474153513144041, + "loss": 0.99106038, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.79980469, + "step": 2760, + "time_per_iteration": 2.9069197177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158721, + "balance_loss_mlp": 1.07866037, + "epoch": 0.5311658330126972, + "flos": 606055678464.0, + "grad_norm": 0.0324154212137011, + "language_loss": 0.92613202, + "learning_rate": 0.00047384239239590633, + "loss": 0.93771923, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.80029297, + "step": 2761, + "time_per_iteration": 2.8556571006774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159506, + "balance_loss_mlp": 1.07949257, + "epoch": 0.5313582146979607, + "flos": 559316740608.0, + "grad_norm": 0.03061440617121834, + "language_loss": 0.94290936, + "learning_rate": 0.0004735312818030556, + "loss": 0.95450437, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.79980469, + "step": 2762, + "time_per_iteration": 2.6934847831726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157514, + "balance_loss_mlp": 1.07764399, + "epoch": 0.5315505963832243, + "flos": 509445657600.0, + "grad_norm": 0.029953313176207894, + "language_loss": 0.88601178, + "learning_rate": 0.0004732201814862727, + "loss": 0.89758694, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.79833984, + "step": 2763, + "time_per_iteration": 2.7555651664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156859, + "balance_loss_mlp": 1.0773226, + "epoch": 0.5317429780684879, + "flos": 627668694528.0, + "grad_norm": 0.030098925618691368, + "language_loss": 0.87074947, + "learning_rate": 0.0004729090915663373, + "loss": 0.88231808, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.79492188, + "step": 2764, + "time_per_iteration": 2.83986496925354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157289, + "balance_loss_mlp": 1.07751369, + "epoch": 0.5319353597537514, + "flos": 477698880000.0, + "grad_norm": 0.035256009305486516, + "language_loss": 0.9145658, + "learning_rate": 0.00047259801216402534, + "loss": 0.92613864, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.79736328, + "step": 2765, + "time_per_iteration": 2.49153208732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158926, + "balance_loss_mlp": 1.07934201, + "epoch": 0.532127741439015, + "flos": 502633420800.0, + "grad_norm": 0.031216360034414494, + "language_loss": 0.91137969, + "learning_rate": 0.00047228694340010845, + "loss": 0.92296898, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.79541016, + "step": 2766, + "time_per_iteration": 2.5491669178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08385968, + "epoch": 0.5323201231242786, + "flos": 1166482870272.0, + "grad_norm": 0.028947902109049614, + "language_loss": 0.91277415, + "learning_rate": 0.0004719758853953544, + "loss": 0.92440999, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.796875, + "step": 2767, + "time_per_iteration": 3.576573610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167049, + "balance_loss_mlp": 1.08694029, + "epoch": 0.5325125048095422, + "flos": 379541251584.0, + "grad_norm": 0.04259356627609034, + "language_loss": 0.91498351, + "learning_rate": 0.00047166483827052645, + "loss": 0.92665404, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.80078125, + "step": 2768, + "time_per_iteration": 2.3893725872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172249, + "balance_loss_mlp": 1.09423828, + "epoch": 0.5327048864948057, + "flos": 1544747211264.0, + "grad_norm": 0.007240897484727242, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78250736, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.77929688, + "step": 2769, + "time_per_iteration": 4.972010374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167053, + "balance_loss_mlp": 1.08737326, + "epoch": 0.5328972681800692, + "flos": 912861212160.0, + "grad_norm": 0.03027786850862354, + "language_loss": 0.8989411, + "learning_rate": 0.000471042777143682, + "loss": 0.91061163, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.79638672, + "step": 2770, + "time_per_iteration": 3.1992523670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116085, + "balance_loss_mlp": 1.08126593, + "epoch": 0.5330896498653328, + "flos": 474850715136.0, + "grad_norm": 0.032478463467180745, + "language_loss": 0.85492694, + "learning_rate": 0.0004707317633831707, + "loss": 0.86653543, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.79541016, + "step": 2771, + "time_per_iteration": 2.636418342590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159417, + "balance_loss_mlp": 1.07983315, + "epoch": 0.5332820315505964, + "flos": 502633420800.0, + "grad_norm": 0.034509360784450445, + "language_loss": 0.84931278, + "learning_rate": 0.00047042076098559673, + "loss": 0.86090696, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.79541016, + "step": 2772, + "time_per_iteration": 2.587954521179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155982, + "balance_loss_mlp": 1.07615912, + "epoch": 0.53347441323586, + "flos": 926031791616.0, + "grad_norm": 0.036007721663536225, + "language_loss": 0.8042109, + "learning_rate": 0.00047010977007170174, + "loss": 0.81577075, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.79785156, + "step": 2773, + "time_per_iteration": 3.207517623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154797, + "balance_loss_mlp": 1.07497442, + "epoch": 0.5336667949211235, + "flos": 575539600896.0, + "grad_norm": 0.032460813123339774, + "language_loss": 0.88737571, + "learning_rate": 0.00046979879076222334, + "loss": 0.89892364, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.79785156, + "step": 2774, + "time_per_iteration": 2.711036443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154367, + "balance_loss_mlp": 1.07459235, + "epoch": 0.533859176606387, + "flos": 1066390869504.0, + "grad_norm": 0.02757600625184913, + "language_loss": 0.88843602, + "learning_rate": 0.0004694878231778939, + "loss": 0.89997971, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.79736328, + "step": 2775, + "time_per_iteration": 3.3735690116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154523, + "balance_loss_mlp": 1.07512975, + "epoch": 0.5340515582916506, + "flos": 747905968128.0, + "grad_norm": 0.025749810309272533, + "language_loss": 0.89188796, + "learning_rate": 0.0004691768674394423, + "loss": 0.9034332, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.79345703, + "step": 2776, + "time_per_iteration": 2.9947128295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171234, + "balance_loss_mlp": 1.09341431, + "epoch": 0.5342439399769142, + "flos": 1448818669056.0, + "grad_norm": 0.018487467205991936, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85655242, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.77734375, + "step": 2777, + "time_per_iteration": 4.765547275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166931, + "balance_loss_mlp": 1.08872986, + "epoch": 0.5344363216621778, + "flos": 1430696365056.0, + "grad_norm": 0.01490962088780182, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77820462, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.77929688, + "step": 2778, + "time_per_iteration": 4.979669570922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156329, + "balance_loss_mlp": 1.07636368, + "epoch": 0.5346287033474413, + "flos": 528675136512.0, + "grad_norm": 0.028255812601682327, + "language_loss": 0.84707999, + "learning_rate": 0.00046824407250656676, + "loss": 0.85864329, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.79931641, + "step": 2779, + "time_per_iteration": 2.6169135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161852, + "balance_loss_mlp": 1.08183897, + "epoch": 0.5348210850327049, + "flos": 511755334656.0, + "grad_norm": 0.02960487915529887, + "language_loss": 0.89552319, + "learning_rate": 0.0004679331653588161, + "loss": 0.90714169, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.79980469, + "step": 2780, + "time_per_iteration": 2.651503562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165567, + "balance_loss_mlp": 1.08536327, + "epoch": 0.5350134667179685, + "flos": 463625241600.0, + "grad_norm": 0.0331551624405392, + "language_loss": 0.91242051, + "learning_rate": 0.0004676222706605147, + "loss": 0.9240762, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.80175781, + "step": 2781, + "time_per_iteration": 2.609180450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171695, + "balance_loss_mlp": 1.09149086, + "epoch": 0.535205848403232, + "flos": 710117755392.0, + "grad_norm": 0.03114563748345981, + "language_loss": 0.9013232, + "learning_rate": 0.0004673113885323626, + "loss": 0.91304016, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.80175781, + "step": 2782, + "time_per_iteration": 2.889096736907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167285, + "balance_loss_mlp": 1.08708084, + "epoch": 0.5353982300884956, + "flos": 895791688704.0, + "grad_norm": 0.029628425021764316, + "language_loss": 0.840244, + "learning_rate": 0.00046700051909505494, + "loss": 0.85191679, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.80175781, + "step": 2783, + "time_per_iteration": 3.1921920776367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161558, + "balance_loss_mlp": 1.08130586, + "epoch": 0.5355906117737591, + "flos": 537024247296.0, + "grad_norm": 0.03383499561986932, + "language_loss": 0.89968938, + "learning_rate": 0.000466689662469282, + "loss": 0.91130495, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.80224609, + "step": 2784, + "time_per_iteration": 2.644693613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08009481, + "epoch": 0.5357829934590227, + "flos": 870327392256.0, + "grad_norm": 0.02956685166305249, + "language_loss": 0.89793074, + "learning_rate": 0.00046637881877572917, + "loss": 0.90953422, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.80224609, + "step": 2785, + "time_per_iteration": 3.134896755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.0797224, + "epoch": 0.5359753751442863, + "flos": 554445606912.0, + "grad_norm": 0.027747995864539122, + "language_loss": 0.88820761, + "learning_rate": 0.0004660679881350764, + "loss": 0.89980739, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.80224609, + "step": 2786, + "time_per_iteration": 2.7258269786834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186935, + "balance_loss_mlp": 1.10682678, + "epoch": 0.5361677568295499, + "flos": 1483756715520.0, + "grad_norm": 0.018012162763561924, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76795077, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.80078125, + "step": 2787, + "time_per_iteration": 5.011500835418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163662, + "balance_loss_mlp": 1.08345807, + "epoch": 0.5363601385148133, + "flos": 807641568768.0, + "grad_norm": 0.03200093229385197, + "language_loss": 0.83718783, + "learning_rate": 0.0004654463664951667, + "loss": 0.84882444, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.80175781, + "step": 2788, + "time_per_iteration": 3.0044353008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162852, + "balance_loss_mlp": 1.08274364, + "epoch": 0.5365525202000769, + "flos": 508878971904.0, + "grad_norm": 0.03055357919616021, + "language_loss": 0.89048028, + "learning_rate": 0.0004651355757372447, + "loss": 0.90210879, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.80078125, + "step": 2789, + "time_per_iteration": 2.6024739742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011626, + "balance_loss_mlp": 1.08277702, + "epoch": 0.5367449018853405, + "flos": 530014625280.0, + "grad_norm": 0.03243837084279447, + "language_loss": 0.90724301, + "learning_rate": 0.00046482479851489274, + "loss": 0.91886902, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.79785156, + "step": 2790, + "time_per_iteration": 2.7023818492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168089, + "balance_loss_mlp": 1.08840978, + "epoch": 0.5369372835706041, + "flos": 651216082944.0, + "grad_norm": 0.035661652748611536, + "language_loss": 0.83603406, + "learning_rate": 0.00046451403494876525, + "loss": 0.84771496, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.79443359, + "step": 2791, + "time_per_iteration": 2.9009790420532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169917, + "balance_loss_mlp": 1.09033263, + "epoch": 0.5371296652558677, + "flos": 585627700224.0, + "grad_norm": 0.03267915449635738, + "language_loss": 0.90313196, + "learning_rate": 0.0004642032851595111, + "loss": 0.91483116, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.79345703, + "step": 2792, + "time_per_iteration": 2.743093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.09196496, + "epoch": 0.5373220469411312, + "flos": 597083486208.0, + "grad_norm": 0.03226534649155799, + "language_loss": 0.89917493, + "learning_rate": 0.00046389254926777404, + "loss": 0.91089034, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.79345703, + "step": 2793, + "time_per_iteration": 2.816979169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162732, + "balance_loss_mlp": 1.08319557, + "epoch": 0.5375144286263948, + "flos": 1116277415424.0, + "grad_norm": 0.030732828924726157, + "language_loss": 0.83480382, + "learning_rate": 0.0004635818273941926, + "loss": 0.84643114, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.79443359, + "step": 2794, + "time_per_iteration": 3.538351058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156539, + "balance_loss_mlp": 1.07704997, + "epoch": 0.5377068103116583, + "flos": 596768580096.0, + "grad_norm": 0.03686105726392354, + "language_loss": 0.88212651, + "learning_rate": 0.0004632711196593997, + "loss": 0.8936919, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.79443359, + "step": 2795, + "time_per_iteration": 2.7304327487945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153271, + "balance_loss_mlp": 1.07383037, + "epoch": 0.5378991919969219, + "flos": 885649195008.0, + "grad_norm": 0.031821277780470766, + "language_loss": 0.90781128, + "learning_rate": 0.00046296042618402297, + "loss": 0.91934395, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.79394531, + "step": 2796, + "time_per_iteration": 3.117605447769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154822, + "balance_loss_mlp": 1.07523799, + "epoch": 0.5380915736821854, + "flos": 711950069760.0, + "grad_norm": 0.03181223121167454, + "language_loss": 0.84282267, + "learning_rate": 0.0004626497470886839, + "loss": 0.85437095, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.79541016, + "step": 2797, + "time_per_iteration": 2.943110704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154308, + "balance_loss_mlp": 1.07439017, + "epoch": 0.538283955367449, + "flos": 558114238464.0, + "grad_norm": 0.03131439333064892, + "language_loss": 0.87165904, + "learning_rate": 0.00046233908249399897, + "loss": 0.88320208, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.79882812, + "step": 2798, + "time_per_iteration": 2.753664970397949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156214, + "balance_loss_mlp": 1.0763911, + "epoch": 0.5384763370527126, + "flos": 514481975808.0, + "grad_norm": 0.02763164557850803, + "language_loss": 0.84223002, + "learning_rate": 0.00046202843252057905, + "loss": 0.85379213, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.79785156, + "step": 2799, + "time_per_iteration": 2.5850727558135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157398, + "balance_loss_mlp": 1.07767105, + "epoch": 0.5386687187379762, + "flos": 490719737856.0, + "grad_norm": 0.033199019667933, + "language_loss": 0.8910532, + "learning_rate": 0.00046171779728902896, + "loss": 0.90262723, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.796875, + "step": 2800, + "time_per_iteration": 2.54720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157318, + "balance_loss_mlp": 1.07730448, + "epoch": 0.5388611004232398, + "flos": 483627523584.0, + "grad_norm": 0.041719681603307614, + "language_loss": 0.92617553, + "learning_rate": 0.000461407176919948, + "loss": 0.93774867, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.79980469, + "step": 2801, + "time_per_iteration": 2.5201830863952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158799, + "balance_loss_mlp": 1.07868993, + "epoch": 0.5390534821085032, + "flos": 562089043968.0, + "grad_norm": 0.03196091571695152, + "language_loss": 0.90337479, + "learning_rate": 0.00046109657153392997, + "loss": 0.91496283, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.80078125, + "step": 2802, + "time_per_iteration": 2.694173574447632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160257, + "balance_loss_mlp": 1.08014798, + "epoch": 0.5392458637937668, + "flos": 489360783360.0, + "grad_norm": 0.039860159596143786, + "language_loss": 0.89760619, + "learning_rate": 0.0004607859812515622, + "loss": 0.90920877, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.80078125, + "step": 2803, + "time_per_iteration": 2.585549831390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164203, + "balance_loss_mlp": 1.08404684, + "epoch": 0.5394382454790304, + "flos": 513049161216.0, + "grad_norm": 0.03534563174473093, + "language_loss": 0.94152969, + "learning_rate": 0.00046047540619342667, + "loss": 0.95317167, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.80126953, + "step": 2804, + "time_per_iteration": 2.589845895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08835244, + "epoch": 0.539630627164294, + "flos": 568688432640.0, + "grad_norm": 0.02864783436473809, + "language_loss": 0.85705817, + "learning_rate": 0.00046016484648009933, + "loss": 0.86873901, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.796875, + "step": 2805, + "time_per_iteration": 2.687539577484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162339, + "balance_loss_mlp": 1.08246911, + "epoch": 0.5398230088495575, + "flos": 527502833664.0, + "grad_norm": 0.03312242512211549, + "language_loss": 0.8782742, + "learning_rate": 0.0004598543022321501, + "loss": 0.88989753, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.79833984, + "step": 2806, + "time_per_iteration": 2.6111719608306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159262, + "balance_loss_mlp": 1.07910562, + "epoch": 0.5400153905348211, + "flos": 539852946432.0, + "grad_norm": 0.03059923694994547, + "language_loss": 0.85068846, + "learning_rate": 0.0004595437735701433, + "loss": 0.86228108, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.80126953, + "step": 2807, + "time_per_iteration": 2.668133020401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158376, + "balance_loss_mlp": 1.07826769, + "epoch": 0.5402077722200846, + "flos": 514664624640.0, + "grad_norm": 0.03937747929323063, + "language_loss": 0.88849455, + "learning_rate": 0.00045923326061463623, + "loss": 0.90007836, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.80078125, + "step": 2808, + "time_per_iteration": 2.76680588722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152678, + "balance_loss_mlp": 1.07261717, + "epoch": 0.5404001539053482, + "flos": 677565974016.0, + "grad_norm": 0.030976456011377742, + "language_loss": 0.87454319, + "learning_rate": 0.00045892276348618113, + "loss": 0.88606995, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.80029297, + "step": 2809, + "time_per_iteration": 2.9939539432525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173447, + "balance_loss_mlp": 1.09410095, + "epoch": 0.5405925355906118, + "flos": 1558189036032.0, + "grad_norm": 0.015961767794208704, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79434276, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.79296875, + "step": 2810, + "time_per_iteration": 4.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157913, + "balance_loss_mlp": 1.07818568, + "epoch": 0.5407849172758753, + "flos": 648537105408.0, + "grad_norm": 0.02696900388574031, + "language_loss": 0.85372365, + "learning_rate": 0.000458301817192603, + "loss": 0.8653028, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.796875, + "step": 2811, + "time_per_iteration": 2.8575778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118454, + "balance_loss_mlp": 1.1057663, + "epoch": 0.5409772989611389, + "flos": 1410481234944.0, + "grad_norm": 0.012734794042181983, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.82026327, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.78710938, + "step": 2812, + "time_per_iteration": 4.809651613235474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163077, + "balance_loss_mlp": 1.0835402, + "epoch": 0.5411696806464025, + "flos": 555544049664.0, + "grad_norm": 0.031759632467193835, + "language_loss": 0.91974443, + "learning_rate": 0.00045768093565369983, + "loss": 0.93137515, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.79492188, + "step": 2813, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164131, + "balance_loss_mlp": 1.0847373, + "epoch": 0.5413620623316661, + "flos": 529204892160.0, + "grad_norm": 0.03127565438509195, + "language_loss": 0.8788538, + "learning_rate": 0.0004573705194685646, + "loss": 0.89049512, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.79199219, + "step": 2814, + "time_per_iteration": 2.645961284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164839, + "balance_loss_mlp": 1.08544588, + "epoch": 0.5415544440169295, + "flos": 599851060224.0, + "grad_norm": 0.03485280634812332, + "language_loss": 0.91058564, + "learning_rate": 0.00045706011983366157, + "loss": 0.92223406, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.79199219, + "step": 2815, + "time_per_iteration": 2.6676552295684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161812, + "balance_loss_mlp": 1.08237088, + "epoch": 0.5417468257021931, + "flos": 471713840640.0, + "grad_norm": 0.03625185410953689, + "language_loss": 0.88930029, + "learning_rate": 0.00045674973686949847, + "loss": 0.90091836, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.79199219, + "step": 2816, + "time_per_iteration": 2.51118540763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116016, + "balance_loss_mlp": 1.08076715, + "epoch": 0.5419392073874567, + "flos": 682190057472.0, + "grad_norm": 0.02856526912727588, + "language_loss": 0.90316737, + "learning_rate": 0.0004564393706965766, + "loss": 0.91476899, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.79199219, + "step": 2817, + "time_per_iteration": 2.9563546180725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160311, + "balance_loss_mlp": 1.0809654, + "epoch": 0.5421315890727203, + "flos": 463336531968.0, + "grad_norm": 0.032507832188727104, + "language_loss": 0.87249088, + "learning_rate": 0.00045612902143539116, + "loss": 0.884094, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.79199219, + "step": 2818, + "time_per_iteration": 2.5383646488189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162479, + "balance_loss_mlp": 1.08294284, + "epoch": 0.5423239707579839, + "flos": 437889699840.0, + "grad_norm": 0.03622660962153638, + "language_loss": 0.8863132, + "learning_rate": 0.00045581868920642986, + "loss": 0.89793801, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.79296875, + "step": 2819, + "time_per_iteration": 2.4692800045013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163903, + "balance_loss_mlp": 1.08441401, + "epoch": 0.5425163524432474, + "flos": 459305330688.0, + "grad_norm": 0.036307438946012835, + "language_loss": 0.86308074, + "learning_rate": 0.00045550837413017457, + "loss": 0.8747198, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.79296875, + "step": 2820, + "time_per_iteration": 2.59252667427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160476, + "balance_loss_mlp": 1.08089161, + "epoch": 0.542708734128511, + "flos": 420409943040.0, + "grad_norm": 0.028561818537522772, + "language_loss": 0.89964175, + "learning_rate": 0.0004551980763271005, + "loss": 0.91124654, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.79394531, + "step": 2821, + "time_per_iteration": 2.64975643157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158342, + "balance_loss_mlp": 1.07880592, + "epoch": 0.5429011158137745, + "flos": 679708465152.0, + "grad_norm": 0.03014006642218495, + "language_loss": 0.89564693, + "learning_rate": 0.0004548877959176756, + "loss": 0.90723038, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.79345703, + "step": 2822, + "time_per_iteration": 2.881334066390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166233, + "balance_loss_mlp": 1.08693492, + "epoch": 0.5430934974990381, + "flos": 541967239680.0, + "grad_norm": 0.03201888254331298, + "language_loss": 0.91779578, + "learning_rate": 0.00045457753302236166, + "loss": 0.92945808, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.79150391, + "step": 2823, + "time_per_iteration": 2.615506887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160293, + "balance_loss_mlp": 1.08075619, + "epoch": 0.5432858791843016, + "flos": 659643056640.0, + "grad_norm": 0.03397006228821556, + "language_loss": 0.93680996, + "learning_rate": 0.00045426728776161353, + "loss": 0.94841284, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.79443359, + "step": 2824, + "time_per_iteration": 2.815668821334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160478, + "balance_loss_mlp": 1.08084619, + "epoch": 0.5434782608695652, + "flos": 532966849536.0, + "grad_norm": 0.030340926449950675, + "language_loss": 0.86484039, + "learning_rate": 0.00045395706025587863, + "loss": 0.87644517, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.79589844, + "step": 2825, + "time_per_iteration": 2.677969455718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159818, + "balance_loss_mlp": 1.0802815, + "epoch": 0.5436706425548288, + "flos": 609632985600.0, + "grad_norm": 0.032758454025991736, + "language_loss": 0.88250875, + "learning_rate": 0.00045364685062559843, + "loss": 0.89410686, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.79492188, + "step": 2826, + "time_per_iteration": 2.7975664138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160655, + "balance_loss_mlp": 1.08111823, + "epoch": 0.5438630242400924, + "flos": 706772762112.0, + "grad_norm": 0.047560346967580276, + "language_loss": 0.96112239, + "learning_rate": 0.0004533366589912067, + "loss": 0.97272885, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.79492188, + "step": 2827, + "time_per_iteration": 2.9455690383911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.08232152, + "epoch": 0.544055405925356, + "flos": 857838291456.0, + "grad_norm": 0.035082604549872, + "language_loss": 0.84527165, + "learning_rate": 0.0004530264854731306, + "loss": 0.8568902, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.79492188, + "step": 2828, + "time_per_iteration": 3.0149006843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161186, + "balance_loss_mlp": 1.08160186, + "epoch": 0.5442477876106194, + "flos": 572967410688.0, + "grad_norm": 0.029506216108961765, + "language_loss": 0.89973861, + "learning_rate": 0.00045271633019179034, + "loss": 0.91135049, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.79541016, + "step": 2829, + "time_per_iteration": 2.7735414505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162764, + "balance_loss_mlp": 1.08313203, + "epoch": 0.544440169295883, + "flos": 626802565632.0, + "grad_norm": 0.028700635940731967, + "language_loss": 0.92908496, + "learning_rate": 0.0004524061932675986, + "loss": 0.94071257, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.79589844, + "step": 2830, + "time_per_iteration": 2.828461170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116197, + "balance_loss_mlp": 1.08224237, + "epoch": 0.5446325509811466, + "flos": 837640625664.0, + "grad_norm": 0.03503891147687097, + "language_loss": 0.92219722, + "learning_rate": 0.00045209607482096125, + "loss": 0.93381691, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.79541016, + "step": 2831, + "time_per_iteration": 3.0058434009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162561, + "balance_loss_mlp": 1.08292878, + "epoch": 0.5448249326664102, + "flos": 484389593088.0, + "grad_norm": 0.03287703969217422, + "language_loss": 0.89665288, + "learning_rate": 0.0004517859749722772, + "loss": 0.90827847, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.79443359, + "step": 2832, + "time_per_iteration": 2.6527607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116426, + "balance_loss_mlp": 1.08453321, + "epoch": 0.5450173143516738, + "flos": 562345552896.0, + "grad_norm": 0.03300449363670703, + "language_loss": 0.84396762, + "learning_rate": 0.0004514758938419376, + "loss": 0.85561025, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.79541016, + "step": 2833, + "time_per_iteration": 2.799923896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176773, + "balance_loss_mlp": 1.09971619, + "epoch": 0.5452096960369373, + "flos": 1473586023936.0, + "grad_norm": 0.016868588983801922, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78097355, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.76953125, + "step": 2834, + "time_per_iteration": 4.904434442520142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116397, + "balance_loss_mlp": 1.08414805, + "epoch": 0.5454020777222008, + "flos": 466017510912.0, + "grad_norm": 0.028290923396431526, + "language_loss": 0.88719809, + "learning_rate": 0.00045085578821782175, + "loss": 0.8988378, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.79589844, + "step": 2835, + "time_per_iteration": 2.5375516414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116069, + "balance_loss_mlp": 1.08325195, + "epoch": 0.5455944594074644, + "flos": 1472615109120.0, + "grad_norm": 0.00840245760684232, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77295429, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.7734375, + "step": 2836, + "time_per_iteration": 4.908621549606323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161179, + "balance_loss_mlp": 1.08121371, + "epoch": 0.545786841092728, + "flos": 534304336896.0, + "grad_norm": 0.026675001792915147, + "language_loss": 0.85451794, + "learning_rate": 0.00045023575891159866, + "loss": 0.86612976, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.79931641, + "step": 2837, + "time_per_iteration": 2.77382230758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167343, + "balance_loss_mlp": 1.08952332, + "epoch": 0.5459792227779915, + "flos": 1355426113536.0, + "grad_norm": 0.010026273514264956, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75931144, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.77734375, + "step": 2838, + "time_per_iteration": 4.8985395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163141, + "balance_loss_mlp": 1.08322346, + "epoch": 0.5461716044632551, + "flos": 639072087552.0, + "grad_norm": 0.03170534586871267, + "language_loss": 0.83100337, + "learning_rate": 0.0004496158068861354, + "loss": 0.8426348, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.79833984, + "step": 2839, + "time_per_iteration": 2.8032078742980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163887, + "balance_loss_mlp": 1.08396888, + "epoch": 0.5463639861485187, + "flos": 603925922304.0, + "grad_norm": 0.031486344316249366, + "language_loss": 0.85257053, + "learning_rate": 0.00044930586015455207, + "loss": 0.86420941, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.79833984, + "step": 2840, + "time_per_iteration": 2.780024290084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168265, + "balance_loss_mlp": 1.08834755, + "epoch": 0.5465563678337823, + "flos": 643752566784.0, + "grad_norm": 0.02832807598538896, + "language_loss": 0.93569458, + "learning_rate": 0.000448995933104179, + "loss": 0.9473772, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.79736328, + "step": 2841, + "time_per_iteration": 2.848741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168336, + "balance_loss_mlp": 1.08841801, + "epoch": 0.5467487495190458, + "flos": 615364243968.0, + "grad_norm": 0.03451251764660495, + "language_loss": 0.86641318, + "learning_rate": 0.00044868602585534077, + "loss": 0.87809658, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.796875, + "step": 2842, + "time_per_iteration": 2.8590362071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166404, + "balance_loss_mlp": 1.08677208, + "epoch": 0.5469411312043093, + "flos": 462127299072.0, + "grad_norm": 0.03329693034046033, + "language_loss": 0.9437651, + "learning_rate": 0.0004483761385283541, + "loss": 0.95542908, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.79443359, + "step": 2843, + "time_per_iteration": 2.523390769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116664, + "balance_loss_mlp": 1.08691323, + "epoch": 0.5471335128895729, + "flos": 562266963456.0, + "grad_norm": 0.03201679454384124, + "language_loss": 0.87509483, + "learning_rate": 0.0004480662712435281, + "loss": 0.88676119, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.79492188, + "step": 2844, + "time_per_iteration": 2.7186124324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162399, + "balance_loss_mlp": 1.08286297, + "epoch": 0.5473258945748365, + "flos": 519685479936.0, + "grad_norm": 0.032165214678065886, + "language_loss": 0.93768156, + "learning_rate": 0.0004477564241211635, + "loss": 0.94930553, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.79345703, + "step": 2845, + "time_per_iteration": 2.5637102127075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159503, + "balance_loss_mlp": 1.08034766, + "epoch": 0.5475182762601001, + "flos": 434744093184.0, + "grad_norm": 0.03138398317411523, + "language_loss": 0.92521811, + "learning_rate": 0.0004474465972815541, + "loss": 0.93681312, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.79101562, + "step": 2846, + "time_per_iteration": 2.470494508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162403, + "balance_loss_mlp": 1.08348668, + "epoch": 0.5477106579453636, + "flos": 512573799936.0, + "grad_norm": 0.02767233380819538, + "language_loss": 0.92665255, + "learning_rate": 0.000447136790844985, + "loss": 0.93827659, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.78759766, + "step": 2847, + "time_per_iteration": 2.7123520374298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164922, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5479030396306271, + "flos": 677140277760.0, + "grad_norm": 0.030326073882101023, + "language_loss": 0.85917926, + "learning_rate": 0.00044682700493173385, + "loss": 0.87082845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.78710938, + "step": 2848, + "time_per_iteration": 2.826556921005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166552, + "balance_loss_mlp": 1.08787405, + "epoch": 0.5480954213158907, + "flos": 877578060288.0, + "grad_norm": 0.033676298977630685, + "language_loss": 0.86673969, + "learning_rate": 0.00044651723966207004, + "loss": 0.87840521, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.78564453, + "step": 2849, + "time_per_iteration": 3.192443370819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164243, + "balance_loss_mlp": 1.08556521, + "epoch": 0.5482878030011543, + "flos": 623174866944.0, + "grad_norm": 0.03042847520175512, + "language_loss": 0.83109522, + "learning_rate": 0.00044620749515625536, + "loss": 0.84273762, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.78564453, + "step": 2850, + "time_per_iteration": 2.7753841876983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164392, + "balance_loss_mlp": 1.08528447, + "epoch": 0.5484801846864179, + "flos": 498257114112.0, + "grad_norm": 0.03264010932273605, + "language_loss": 0.90008557, + "learning_rate": 0.00044589777153454334, + "loss": 0.91172945, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.78857422, + "step": 2851, + "time_per_iteration": 2.7295939922332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162977, + "balance_loss_mlp": 1.08391714, + "epoch": 0.5486725663716814, + "flos": 443353715712.0, + "grad_norm": 0.029420479903708215, + "language_loss": 0.88820338, + "learning_rate": 0.00044558806891717895, + "loss": 0.8998332, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.78808594, + "step": 2852, + "time_per_iteration": 2.4784035682678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164311, + "balance_loss_mlp": 1.08548951, + "epoch": 0.548864948056945, + "flos": 656347728384.0, + "grad_norm": 0.02822438724303185, + "language_loss": 0.84744209, + "learning_rate": 0.0004452783874243998, + "loss": 0.8590852, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.78759766, + "step": 2853, + "time_per_iteration": 2.821592092514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159975, + "balance_loss_mlp": 1.08105898, + "epoch": 0.5490573297422086, + "flos": 547140544512.0, + "grad_norm": 0.03150495246723179, + "language_loss": 0.90787637, + "learning_rate": 0.00044496872717643475, + "loss": 0.91947615, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.78710938, + "step": 2854, + "time_per_iteration": 2.6908938884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011614, + "balance_loss_mlp": 1.08415222, + "epoch": 0.5492497114274721, + "flos": 1593760897536.0, + "grad_norm": 0.006862097523809848, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78250694, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.77148438, + "step": 2855, + "time_per_iteration": 4.92158579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159374, + "balance_loss_mlp": 1.08036256, + "epoch": 0.5494420931127356, + "flos": 752269539840.0, + "grad_norm": 0.030842116299214104, + "language_loss": 0.87009478, + "learning_rate": 0.0004443494708958217, + "loss": 0.88168848, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.78759766, + "step": 2856, + "time_per_iteration": 2.952693223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155384, + "balance_loss_mlp": 1.07627714, + "epoch": 0.5496344747979992, + "flos": 627304123392.0, + "grad_norm": 0.026887140123268247, + "language_loss": 0.85396117, + "learning_rate": 0.0004440398751035906, + "loss": 0.86551499, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.79052734, + "step": 2857, + "time_per_iteration": 2.8657121658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156313, + "balance_loss_mlp": 1.07691979, + "epoch": 0.5498268564832628, + "flos": 524124913152.0, + "grad_norm": 0.03681476772579859, + "language_loss": 0.90347362, + "learning_rate": 0.00044373030103700645, + "loss": 0.9150368, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.79248047, + "step": 2858, + "time_per_iteration": 2.6372759342193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161253, + "balance_loss_mlp": 1.08185947, + "epoch": 0.5500192381685264, + "flos": 605777702400.0, + "grad_norm": 0.027579474955625485, + "language_loss": 0.8405782, + "learning_rate": 0.000443420748816257, + "loss": 0.85219079, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.79248047, + "step": 2859, + "time_per_iteration": 2.832864999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08395553, + "epoch": 0.55021161985379, + "flos": 521654780928.0, + "grad_norm": 0.03409053016014856, + "language_loss": 0.84214079, + "learning_rate": 0.0004431112185615208, + "loss": 0.85377669, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.79443359, + "step": 2860, + "time_per_iteration": 2.7533481121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165675, + "balance_loss_mlp": 1.0862813, + "epoch": 0.5504040015390534, + "flos": 490654609920.0, + "grad_norm": 0.028251427239966796, + "language_loss": 0.84584463, + "learning_rate": 0.00044280171039296845, + "loss": 0.85750139, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.79296875, + "step": 2861, + "time_per_iteration": 2.6798369884490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116251, + "balance_loss_mlp": 1.08306909, + "epoch": 0.550596383224317, + "flos": 576861625344.0, + "grad_norm": 0.030462386563617952, + "language_loss": 0.93688512, + "learning_rate": 0.0004424922244307616, + "loss": 0.94851023, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.79296875, + "step": 2862, + "time_per_iteration": 2.7042698860168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164147, + "balance_loss_mlp": 1.08461094, + "epoch": 0.5507887649095806, + "flos": 643633044480.0, + "grad_norm": 0.03244616812289036, + "language_loss": 0.87943101, + "learning_rate": 0.00044218276079505315, + "loss": 0.89107251, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.79296875, + "step": 2863, + "time_per_iteration": 2.869657278060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116435, + "balance_loss_mlp": 1.08490932, + "epoch": 0.5509811465948442, + "flos": 532864791552.0, + "grad_norm": 0.03309127401700594, + "language_loss": 0.80069649, + "learning_rate": 0.0004418733196059876, + "loss": 0.81234002, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.79248047, + "step": 2864, + "time_per_iteration": 2.694439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164051, + "balance_loss_mlp": 1.08489633, + "epoch": 0.5511735282801077, + "flos": 655983157248.0, + "grad_norm": 0.031218908498787497, + "language_loss": 0.85167533, + "learning_rate": 0.0004415639009837008, + "loss": 0.86331582, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.79101562, + "step": 2865, + "time_per_iteration": 2.8214035034179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160959, + "balance_loss_mlp": 1.08175683, + "epoch": 0.5513659099653713, + "flos": 530609508864.0, + "grad_norm": 0.029306479659861318, + "language_loss": 0.87106019, + "learning_rate": 0.00044125450504831955, + "loss": 0.88266975, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.79150391, + "step": 2866, + "time_per_iteration": 2.7755370140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157699, + "balance_loss_mlp": 1.0782584, + "epoch": 0.5515582916506349, + "flos": 555973748736.0, + "grad_norm": 0.03358668454464356, + "language_loss": 0.88577026, + "learning_rate": 0.0004409451319199622, + "loss": 0.89734721, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.79248047, + "step": 2867, + "time_per_iteration": 2.700601577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160497, + "balance_loss_mlp": 1.08105552, + "epoch": 0.5517506733358984, + "flos": 736771819008.0, + "grad_norm": 0.033780629576782226, + "language_loss": 0.90037191, + "learning_rate": 0.0004406357817187381, + "loss": 0.91197693, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.79248047, + "step": 2868, + "time_per_iteration": 2.9809505939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160757, + "balance_loss_mlp": 1.0816493, + "epoch": 0.551943055021162, + "flos": 1117189206528.0, + "grad_norm": 0.02667902344135768, + "language_loss": 0.86254233, + "learning_rate": 0.0004403264545647474, + "loss": 0.87414992, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.79052734, + "step": 2869, + "time_per_iteration": 3.5932819843292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156378, + "balance_loss_mlp": 1.07727027, + "epoch": 0.5521354367064255, + "flos": 545501612544.0, + "grad_norm": 0.024843999573841903, + "language_loss": 0.89363241, + "learning_rate": 0.00044001715057808154, + "loss": 0.90519619, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.79052734, + "step": 2870, + "time_per_iteration": 2.7333626747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159059, + "balance_loss_mlp": 1.07999909, + "epoch": 0.5523278183916891, + "flos": 937871614464.0, + "grad_norm": 0.027996488517333572, + "language_loss": 0.86652702, + "learning_rate": 0.0004397078698788232, + "loss": 0.87811756, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.79003906, + "step": 2871, + "time_per_iteration": 3.199366807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168602, + "balance_loss_mlp": 1.0909729, + "epoch": 0.5525202000769527, + "flos": 1469098927104.0, + "grad_norm": 0.009568898658781464, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81610966, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.77539062, + "step": 2872, + "time_per_iteration": 4.912739515304565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163231, + "balance_loss_mlp": 1.08426642, + "epoch": 0.5527125817622163, + "flos": 490784865792.0, + "grad_norm": 0.03313805620558485, + "language_loss": 0.83656394, + "learning_rate": 0.00043908937882281343, + "loss": 0.84819627, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.78808594, + "step": 2873, + "time_per_iteration": 2.6517224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163059, + "balance_loss_mlp": 1.08409429, + "epoch": 0.5529049634474797, + "flos": 636148061184.0, + "grad_norm": 0.033554896267230024, + "language_loss": 0.87775517, + "learning_rate": 0.0004387801687061814, + "loss": 0.88938576, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.78710938, + "step": 2874, + "time_per_iteration": 2.8159070014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159743, + "balance_loss_mlp": 1.08073115, + "epoch": 0.5530973451327433, + "flos": 582434429952.0, + "grad_norm": 0.02986403100144585, + "language_loss": 0.86760765, + "learning_rate": 0.0004384709823571958, + "loss": 0.87920505, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.78857422, + "step": 2875, + "time_per_iteration": 2.755831480026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158961, + "balance_loss_mlp": 1.08004439, + "epoch": 0.5532897268180069, + "flos": 1124329084416.0, + "grad_norm": 0.02992932493519035, + "language_loss": 0.88625169, + "learning_rate": 0.0004381618198958932, + "loss": 0.89784127, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.78662109, + "step": 2876, + "time_per_iteration": 3.504112720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157815, + "balance_loss_mlp": 1.0788027, + "epoch": 0.5534821085032705, + "flos": 638512132608.0, + "grad_norm": 0.032170459842753865, + "language_loss": 0.89321101, + "learning_rate": 0.00043785268144230137, + "loss": 0.90478921, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.78808594, + "step": 2877, + "time_per_iteration": 2.889683961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158076, + "balance_loss_mlp": 1.07911134, + "epoch": 0.5536744901885341, + "flos": 572216074752.0, + "grad_norm": 0.0339903958733494, + "language_loss": 0.87417912, + "learning_rate": 0.00043754356711643837, + "loss": 0.88575995, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.78759766, + "step": 2878, + "time_per_iteration": 2.6604373455047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115856, + "balance_loss_mlp": 1.07950056, + "epoch": 0.5538668718737976, + "flos": 596916300288.0, + "grad_norm": 0.029580626213001865, + "language_loss": 0.88473797, + "learning_rate": 0.0004372344770383132, + "loss": 0.89632356, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.78808594, + "step": 2879, + "time_per_iteration": 2.7906830310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011565, + "balance_loss_mlp": 1.07753599, + "epoch": 0.5540592535590612, + "flos": 533718185472.0, + "grad_norm": 0.030293675767491222, + "language_loss": 0.88174736, + "learning_rate": 0.00043692541132792507, + "loss": 0.89331234, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.78710938, + "step": 2880, + "time_per_iteration": 2.7152342796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156751, + "balance_loss_mlp": 1.07764363, + "epoch": 0.5542516352443247, + "flos": 413504380416.0, + "grad_norm": 0.03343546183057337, + "language_loss": 0.89203489, + "learning_rate": 0.00043661637010526384, + "loss": 0.90360242, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.78857422, + "step": 2881, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156889, + "balance_loss_mlp": 1.07792521, + "epoch": 0.5544440169295883, + "flos": 548677418496.0, + "grad_norm": 0.03944129006740139, + "language_loss": 0.89678496, + "learning_rate": 0.00043630735349031025, + "loss": 0.90835381, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.78759766, + "step": 2882, + "time_per_iteration": 2.6376428604125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157119, + "balance_loss_mlp": 1.07815528, + "epoch": 0.5546363986148518, + "flos": 623033877504.0, + "grad_norm": 0.025659357486645176, + "language_loss": 0.85712773, + "learning_rate": 0.00043599836160303495, + "loss": 0.86869895, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.78710938, + "step": 2883, + "time_per_iteration": 2.861966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155488, + "balance_loss_mlp": 1.07633352, + "epoch": 0.5548287803001154, + "flos": 706579379712.0, + "grad_norm": 0.03141972013571756, + "language_loss": 0.82934201, + "learning_rate": 0.0004356893945633995, + "loss": 0.8408969, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.7890625, + "step": 2884, + "time_per_iteration": 2.9471499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.07534921, + "epoch": 0.555021161985379, + "flos": 505184143872.0, + "grad_norm": 0.031430850490502316, + "language_loss": 0.85807753, + "learning_rate": 0.0004353804524913551, + "loss": 0.86962205, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.78857422, + "step": 2885, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154918, + "balance_loss_mlp": 1.07576323, + "epoch": 0.5552135436706426, + "flos": 617209293312.0, + "grad_norm": 0.033803824808406595, + "language_loss": 0.88278472, + "learning_rate": 0.0004350715355068441, + "loss": 0.89433384, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.7890625, + "step": 2886, + "time_per_iteration": 2.815993547439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154719, + "balance_loss_mlp": 1.07556415, + "epoch": 0.5554059253559062, + "flos": 464817010176.0, + "grad_norm": 0.03994579560883884, + "language_loss": 0.85848737, + "learning_rate": 0.00043476264372979847, + "loss": 0.87003452, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.7890625, + "step": 2887, + "time_per_iteration": 2.5898871421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154914, + "balance_loss_mlp": 1.07618785, + "epoch": 0.5555983070411696, + "flos": 1564874841600.0, + "grad_norm": 0.03588081892536478, + "language_loss": 0.85341823, + "learning_rate": 0.0004344537772801408, + "loss": 0.86496735, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.78613281, + "step": 2888, + "time_per_iteration": 3.880375385284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158798, + "balance_loss_mlp": 1.0821228, + "epoch": 0.5557906887264332, + "flos": 1471226681856.0, + "grad_norm": 0.005822600355857551, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74581254, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.76757812, + "step": 2889, + "time_per_iteration": 4.9117255210876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155617, + "balance_loss_mlp": 1.07670069, + "epoch": 0.5559830704116968, + "flos": 530863289856.0, + "grad_norm": 0.03666523888945824, + "language_loss": 0.89283395, + "learning_rate": 0.0004338361208426298, + "loss": 0.90439016, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.78710938, + "step": 2890, + "time_per_iteration": 2.6093485355377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155534, + "balance_loss_mlp": 1.07671309, + "epoch": 0.5561754520969604, + "flos": 652518641664.0, + "grad_norm": 0.027207956668339604, + "language_loss": 0.85981715, + "learning_rate": 0.00043352733109457164, + "loss": 0.87137252, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.78710938, + "step": 2891, + "time_per_iteration": 2.929133892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155522, + "balance_loss_mlp": 1.07670057, + "epoch": 0.556367833782224, + "flos": 735618981888.0, + "grad_norm": 0.028477777137297752, + "language_loss": 0.89055073, + "learning_rate": 0.00043321856715349244, + "loss": 0.90210593, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.78662109, + "step": 2892, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154528, + "balance_loss_mlp": 1.0758971, + "epoch": 0.5565602154674875, + "flos": 673640833536.0, + "grad_norm": 0.028305708839331062, + "language_loss": 0.85380936, + "learning_rate": 0.00043290982913926466, + "loss": 0.8653546, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.78564453, + "step": 2893, + "time_per_iteration": 2.797816038131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.07449973, + "epoch": 0.556752597152751, + "flos": 587503675392.0, + "grad_norm": 0.03108865563447884, + "language_loss": 0.90100253, + "learning_rate": 0.0004326011171717514, + "loss": 0.91253483, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.78613281, + "step": 2894, + "time_per_iteration": 2.885183334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153367, + "balance_loss_mlp": 1.07426023, + "epoch": 0.5569449788380146, + "flos": 438690700800.0, + "grad_norm": 0.03571349027789826, + "language_loss": 0.87187707, + "learning_rate": 0.0004322924313708051, + "loss": 0.88341075, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.78857422, + "step": 2895, + "time_per_iteration": 2.505321502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115508, + "balance_loss_mlp": 1.07635403, + "epoch": 0.5571373605232782, + "flos": 503247770112.0, + "grad_norm": 0.03410983593663488, + "language_loss": 0.90630054, + "learning_rate": 0.0004319837718562681, + "loss": 0.91785133, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.78613281, + "step": 2896, + "time_per_iteration": 2.6243269443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154122, + "balance_loss_mlp": 1.07530081, + "epoch": 0.5573297422085417, + "flos": 578589880320.0, + "grad_norm": 0.033933273128928194, + "language_loss": 0.88206899, + "learning_rate": 0.0004316751387479726, + "loss": 0.89361024, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.78662109, + "step": 2897, + "time_per_iteration": 2.7566635608673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.074579, + "epoch": 0.5575221238938053, + "flos": 1346047512576.0, + "grad_norm": 0.03456307454544867, + "language_loss": 0.88955474, + "learning_rate": 0.0004313665321657409, + "loss": 0.90108681, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.78564453, + "step": 2898, + "time_per_iteration": 3.766465187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155323, + "balance_loss_mlp": 1.07616794, + "epoch": 0.5577145055790689, + "flos": 603098724864.0, + "grad_norm": 0.03371138021934881, + "language_loss": 0.86232543, + "learning_rate": 0.00043105795222938436, + "loss": 0.8738786, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.7890625, + "step": 2899, + "time_per_iteration": 2.7334022521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155497, + "balance_loss_mlp": 1.07658088, + "epoch": 0.5579068872643325, + "flos": 563691045888.0, + "grad_norm": 0.045182395108838744, + "language_loss": 0.86075807, + "learning_rate": 0.00043074939905870467, + "loss": 0.87231296, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.78759766, + "step": 2900, + "time_per_iteration": 2.696669340133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155611, + "balance_loss_mlp": 1.0766468, + "epoch": 0.558099268949596, + "flos": 545588207616.0, + "grad_norm": 0.03640236345196184, + "language_loss": 0.86178941, + "learning_rate": 0.0004304408727734927, + "loss": 0.87334555, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.78759766, + "step": 2901, + "time_per_iteration": 2.62982439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115605, + "balance_loss_mlp": 1.07727695, + "epoch": 0.5582916506348595, + "flos": 553852724736.0, + "grad_norm": 0.027303392187282394, + "language_loss": 0.9274894, + "learning_rate": 0.0004301323734935288, + "loss": 0.93904984, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.78613281, + "step": 2902, + "time_per_iteration": 2.705291986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164959, + "balance_loss_mlp": 1.08632815, + "epoch": 0.5584840323201231, + "flos": 544424636928.0, + "grad_norm": 0.032065850930778406, + "language_loss": 0.92794406, + "learning_rate": 0.000429823901338583, + "loss": 0.93959367, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.78564453, + "step": 2903, + "time_per_iteration": 2.620115041732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162843, + "balance_loss_mlp": 1.08421218, + "epoch": 0.5586764140053867, + "flos": 817021992960.0, + "grad_norm": 0.03266293414683286, + "language_loss": 0.92888266, + "learning_rate": 0.00042951545642841513, + "loss": 0.94051105, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.78564453, + "step": 2904, + "time_per_iteration": 3.066140651702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160381, + "balance_loss_mlp": 1.08165538, + "epoch": 0.5588687956906503, + "flos": 487415677440.0, + "grad_norm": 0.02932995016233391, + "language_loss": 0.91419339, + "learning_rate": 0.0004292070388827737, + "loss": 0.92579722, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.78613281, + "step": 2905, + "time_per_iteration": 2.5493688583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153401, + "balance_loss_mlp": 1.07453251, + "epoch": 0.5590611773759138, + "flos": 453068511744.0, + "grad_norm": 0.02745082882239035, + "language_loss": 0.85835731, + "learning_rate": 0.00042889864882139753, + "loss": 0.86989129, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.78710938, + "step": 2906, + "time_per_iteration": 2.572270631790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115253, + "balance_loss_mlp": 1.07347012, + "epoch": 0.5592535590611774, + "flos": 521956225536.0, + "grad_norm": 0.03525028250709423, + "language_loss": 0.87143886, + "learning_rate": 0.0004285902863640139, + "loss": 0.88296419, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.78857422, + "step": 2907, + "time_per_iteration": 2.657799482345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.07448292, + "epoch": 0.5594459407464409, + "flos": 553600945152.0, + "grad_norm": 0.02873947635122419, + "language_loss": 0.90871602, + "learning_rate": 0.00042828195163033966, + "loss": 0.92024809, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.78613281, + "step": 2908, + "time_per_iteration": 2.6421632766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152251, + "balance_loss_mlp": 1.07323921, + "epoch": 0.5596383224317045, + "flos": 485787479040.0, + "grad_norm": 0.030747286656696786, + "language_loss": 0.84394485, + "learning_rate": 0.0004279736447400812, + "loss": 0.85546738, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.78808594, + "step": 2909, + "time_per_iteration": 2.571681022644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152122, + "balance_loss_mlp": 1.07344413, + "epoch": 0.5598307041169681, + "flos": 612379092480.0, + "grad_norm": 0.030942423142950287, + "language_loss": 0.83957374, + "learning_rate": 0.00042766536581293385, + "loss": 0.85109496, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.78613281, + "step": 2910, + "time_per_iteration": 2.7282116413116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155729, + "balance_loss_mlp": 1.07662177, + "epoch": 0.5600230858022316, + "flos": 489916735488.0, + "grad_norm": 0.03226747500803281, + "language_loss": 0.85277241, + "learning_rate": 0.0004273571149685819, + "loss": 0.86432964, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.78857422, + "step": 2911, + "time_per_iteration": 2.787032127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154593, + "balance_loss_mlp": 1.0759151, + "epoch": 0.5602154674874952, + "flos": 599981316096.0, + "grad_norm": 0.03215276166374932, + "language_loss": 0.88704693, + "learning_rate": 0.00042704889232669937, + "loss": 0.89859283, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.78613281, + "step": 2912, + "time_per_iteration": 2.686586856842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.07611275, + "epoch": 0.5604078491727588, + "flos": 587062516224.0, + "grad_norm": 0.032254540051477425, + "language_loss": 0.9111523, + "learning_rate": 0.0004267406980069484, + "loss": 0.92269969, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.78466797, + "step": 2913, + "time_per_iteration": 2.6899847984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154041, + "balance_loss_mlp": 1.07545817, + "epoch": 0.5606002308580224, + "flos": 542327808000.0, + "grad_norm": 0.028324891167666608, + "language_loss": 0.8452785, + "learning_rate": 0.0004264325321289808, + "loss": 0.85681891, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.78515625, + "step": 2914, + "time_per_iteration": 2.770299196243286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151899, + "balance_loss_mlp": 1.07331622, + "epoch": 0.5607926125432858, + "flos": 585078478848.0, + "grad_norm": 0.03365993170310601, + "language_loss": 0.91764051, + "learning_rate": 0.00042612439481243736, + "loss": 0.92915952, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.78515625, + "step": 2915, + "time_per_iteration": 2.7451834678649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162837, + "balance_loss_mlp": 1.08406377, + "epoch": 0.5609849942285494, + "flos": 628630150656.0, + "grad_norm": 0.03395322139017605, + "language_loss": 0.95402431, + "learning_rate": 0.00042581628617694735, + "loss": 0.96565264, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.78613281, + "step": 2916, + "time_per_iteration": 2.7379772663116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157541, + "balance_loss_mlp": 1.07871938, + "epoch": 0.561177375913813, + "flos": 589454785536.0, + "grad_norm": 0.03197816551531196, + "language_loss": 0.86920869, + "learning_rate": 0.0004255082063421296, + "loss": 0.88078409, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.78759766, + "step": 2917, + "time_per_iteration": 2.7153422832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161631, + "balance_loss_mlp": 1.08285797, + "epoch": 0.5613697575990766, + "flos": 528143379456.0, + "grad_norm": 0.03128753614155992, + "language_loss": 0.89917612, + "learning_rate": 0.00042520015542759065, + "loss": 0.91079247, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.78710938, + "step": 2918, + "time_per_iteration": 2.8688042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165136, + "balance_loss_mlp": 1.08636212, + "epoch": 0.5615621392843402, + "flos": 643874090496.0, + "grad_norm": 0.03249260096588731, + "language_loss": 0.93211949, + "learning_rate": 0.00042489213355292687, + "loss": 0.94377089, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.78613281, + "step": 2919, + "time_per_iteration": 2.8982832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167734, + "balance_loss_mlp": 1.08900821, + "epoch": 0.5617545209696037, + "flos": 428656995840.0, + "grad_norm": 0.034334958581954525, + "language_loss": 0.87036526, + "learning_rate": 0.00042458414083772276, + "loss": 0.88204259, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.78466797, + "step": 2920, + "time_per_iteration": 2.5067636966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164187, + "balance_loss_mlp": 1.08536625, + "epoch": 0.5619469026548672, + "flos": 569589490176.0, + "grad_norm": 0.025989129211014445, + "language_loss": 0.89547098, + "learning_rate": 0.000424276177401552, + "loss": 0.90711284, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.78710938, + "step": 2921, + "time_per_iteration": 2.810723304748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.07975173, + "epoch": 0.5621392843401308, + "flos": 506243655168.0, + "grad_norm": 0.03554030610259364, + "language_loss": 0.91916943, + "learning_rate": 0.0004239682433639763, + "loss": 0.93075705, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.7890625, + "step": 2922, + "time_per_iteration": 2.6607391834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159452, + "balance_loss_mlp": 1.08034527, + "epoch": 0.5623316660253944, + "flos": 518009617920.0, + "grad_norm": 0.03283867999662062, + "language_loss": 0.91225737, + "learning_rate": 0.0004236603388445467, + "loss": 0.92385185, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.78955078, + "step": 2923, + "time_per_iteration": 2.586524248123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159206, + "balance_loss_mlp": 1.08043242, + "epoch": 0.5625240477106579, + "flos": 607138658304.0, + "grad_norm": 0.07898356089021562, + "language_loss": 0.87176222, + "learning_rate": 0.00042335246396280166, + "loss": 0.88335431, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.78710938, + "step": 2924, + "time_per_iteration": 2.7597639560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.08004844, + "epoch": 0.5627164293959215, + "flos": 451340256768.0, + "grad_norm": 0.0302800933285396, + "language_loss": 0.96241242, + "learning_rate": 0.0004230446188382693, + "loss": 0.97400308, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.7890625, + "step": 2925, + "time_per_iteration": 2.573899030685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07977474, + "epoch": 0.5629088110811851, + "flos": 743436335616.0, + "grad_norm": 0.03229142562201564, + "language_loss": 0.85888505, + "learning_rate": 0.0004227368035904654, + "loss": 0.87047106, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.78759766, + "step": 2926, + "time_per_iteration": 2.9811575412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161727, + "balance_loss_mlp": 1.08295333, + "epoch": 0.5631011927664487, + "flos": 497979138048.0, + "grad_norm": 0.030188812186764755, + "language_loss": 0.88692701, + "learning_rate": 0.00042242901833889474, + "loss": 0.89854425, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.78710938, + "step": 2927, + "time_per_iteration": 2.6326565742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160764, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5632935744517122, + "flos": 887594300928.0, + "grad_norm": 0.033144673445412554, + "language_loss": 0.91819888, + "learning_rate": 0.0004221212632030501, + "loss": 0.92980659, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.78759766, + "step": 2928, + "time_per_iteration": 3.0669453144073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115887, + "balance_loss_mlp": 1.08014381, + "epoch": 0.5634859561369757, + "flos": 605901227520.0, + "grad_norm": 0.03167965641147859, + "language_loss": 0.85548306, + "learning_rate": 0.0004218135383024124, + "loss": 0.86707169, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.78662109, + "step": 2929, + "time_per_iteration": 2.704127788543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154152, + "balance_loss_mlp": 1.07542574, + "epoch": 0.5636783378222393, + "flos": 454902827520.0, + "grad_norm": 0.0331862396137692, + "language_loss": 0.91072655, + "learning_rate": 0.0004215058437564511, + "loss": 0.92226809, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.78662109, + "step": 2930, + "time_per_iteration": 2.5648486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07496285, + "epoch": 0.5638707195075029, + "flos": 519461898240.0, + "grad_norm": 0.030026295980520465, + "language_loss": 0.87243164, + "learning_rate": 0.00042119817968462397, + "loss": 0.88396895, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.78613281, + "step": 2931, + "time_per_iteration": 2.596165895462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.07572603, + "epoch": 0.5640631011927665, + "flos": 565844270592.0, + "grad_norm": 0.035813464167598875, + "language_loss": 0.92307299, + "learning_rate": 0.0004208905462063766, + "loss": 0.934618, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.78564453, + "step": 2932, + "time_per_iteration": 2.6596782207489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161524, + "balance_loss_mlp": 1.0827024, + "epoch": 0.56425548287803, + "flos": 518037815808.0, + "grad_norm": 0.03163601566095553, + "language_loss": 0.90576756, + "learning_rate": 0.00042058294344114315, + "loss": 0.91738278, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.78564453, + "step": 2933, + "time_per_iteration": 2.6681416034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5644478645632935, + "flos": 855669603840.0, + "grad_norm": 0.031443670044009366, + "language_loss": 0.83703303, + "learning_rate": 0.0004202753715083456, + "loss": 0.84863651, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.78515625, + "step": 2934, + "time_per_iteration": 3.1047325134277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159543, + "balance_loss_mlp": 1.08081746, + "epoch": 0.5646402462485571, + "flos": 554495271936.0, + "grad_norm": 0.034946601892201584, + "language_loss": 0.87802339, + "learning_rate": 0.0004199678305273936, + "loss": 0.88961881, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.78613281, + "step": 2935, + "time_per_iteration": 2.649768352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159598, + "balance_loss_mlp": 1.08092046, + "epoch": 0.5648326279338207, + "flos": 687310969344.0, + "grad_norm": 0.04027660967531297, + "language_loss": 0.86366433, + "learning_rate": 0.0004196603206176854, + "loss": 0.87526035, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.78613281, + "step": 2936, + "time_per_iteration": 2.916745662689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.08003819, + "epoch": 0.5650250096190843, + "flos": 804682613760.0, + "grad_norm": 0.03045212290633188, + "language_loss": 0.89034498, + "learning_rate": 0.000419352841898607, + "loss": 0.9019326, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.78662109, + "step": 2937, + "time_per_iteration": 3.019742250442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154573, + "balance_loss_mlp": 1.07541847, + "epoch": 0.5652173913043478, + "flos": 583144106496.0, + "grad_norm": 0.0352415717236192, + "language_loss": 0.82975399, + "learning_rate": 0.000419045394489532, + "loss": 0.84129971, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.79003906, + "step": 2938, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155775, + "balance_loss_mlp": 1.07661998, + "epoch": 0.5654097729896114, + "flos": 822167099904.0, + "grad_norm": 0.030545896529673648, + "language_loss": 0.81679785, + "learning_rate": 0.0004187379785098224, + "loss": 0.82835561, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.7890625, + "step": 2939, + "time_per_iteration": 3.125208854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155934, + "balance_loss_mlp": 1.07682657, + "epoch": 0.565602154674875, + "flos": 785481332736.0, + "grad_norm": 0.038076573598017076, + "language_loss": 0.89879513, + "learning_rate": 0.00041843059407882744, + "loss": 0.9103545, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.78857422, + "step": 2940, + "time_per_iteration": 2.9577417373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.07814884, + "epoch": 0.5657945363601385, + "flos": 550744048128.0, + "grad_norm": 0.03292975836505615, + "language_loss": 0.88439214, + "learning_rate": 0.0004181232413158842, + "loss": 0.89596379, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.78759766, + "step": 2941, + "time_per_iteration": 2.636016845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156819, + "balance_loss_mlp": 1.07771146, + "epoch": 0.5659869180454021, + "flos": 669331656192.0, + "grad_norm": 0.0384606105275957, + "language_loss": 0.88344961, + "learning_rate": 0.0004178159203403179, + "loss": 0.89501786, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.78857422, + "step": 2942, + "time_per_iteration": 2.873724937438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157408, + "balance_loss_mlp": 1.07839596, + "epoch": 0.5661792997306656, + "flos": 500948826624.0, + "grad_norm": 0.031907837289758996, + "language_loss": 0.86677325, + "learning_rate": 0.0004175086312714409, + "loss": 0.8783474, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.78808594, + "step": 2943, + "time_per_iteration": 2.553450107574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.08138418, + "epoch": 0.5663716814159292, + "flos": 602362851840.0, + "grad_norm": 0.02897032807353051, + "language_loss": 0.8872959, + "learning_rate": 0.00041720137422855366, + "loss": 0.89889991, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.78759766, + "step": 2944, + "time_per_iteration": 2.7116591930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159876, + "balance_loss_mlp": 1.08095932, + "epoch": 0.5665640631011928, + "flos": 542032367616.0, + "grad_norm": 0.031139658556859174, + "language_loss": 0.83964241, + "learning_rate": 0.00041689414933094383, + "loss": 0.85124123, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.78710938, + "step": 2945, + "time_per_iteration": 2.638216495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158364, + "balance_loss_mlp": 1.07968628, + "epoch": 0.5667564447864564, + "flos": 603061794816.0, + "grad_norm": 0.037847476611961306, + "language_loss": 0.8757143, + "learning_rate": 0.00041658695669788653, + "loss": 0.88729787, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.78613281, + "step": 2946, + "time_per_iteration": 2.736724615097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159515, + "balance_loss_mlp": 1.08074152, + "epoch": 0.5669488264717198, + "flos": 660722033664.0, + "grad_norm": 0.03809672024086723, + "language_loss": 0.87564874, + "learning_rate": 0.00041627979644864453, + "loss": 0.88724387, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.78662109, + "step": 2947, + "time_per_iteration": 2.787102460861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160652, + "balance_loss_mlp": 1.08192623, + "epoch": 0.5671412081569834, + "flos": 486382362624.0, + "grad_norm": 0.028726289994514737, + "language_loss": 0.86769605, + "learning_rate": 0.0004159726687024683, + "loss": 0.87930262, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.78662109, + "step": 2948, + "time_per_iteration": 2.627268075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157953, + "balance_loss_mlp": 1.07917941, + "epoch": 0.567333589842247, + "flos": 731060026368.0, + "grad_norm": 0.031224685517340662, + "language_loss": 0.85094821, + "learning_rate": 0.00041566557357859506, + "loss": 0.86252779, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.78710938, + "step": 2949, + "time_per_iteration": 2.903480052947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115639, + "balance_loss_mlp": 1.07737851, + "epoch": 0.5675259715275106, + "flos": 970558381056.0, + "grad_norm": 0.02889906202993953, + "language_loss": 0.84761345, + "learning_rate": 0.0004153585111962502, + "loss": 0.85917735, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.78857422, + "step": 2950, + "time_per_iteration": 3.327157497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155395, + "balance_loss_mlp": 1.07638264, + "epoch": 0.5677183532127742, + "flos": 566213571072.0, + "grad_norm": 0.036221800053715905, + "language_loss": 0.90357536, + "learning_rate": 0.0004150514816746453, + "loss": 0.9151293, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.78857422, + "step": 2951, + "time_per_iteration": 2.664881467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155178, + "balance_loss_mlp": 1.07640433, + "epoch": 0.5679107348980377, + "flos": 552745549824.0, + "grad_norm": 0.032718571293428464, + "language_loss": 0.90599716, + "learning_rate": 0.0004147444851329802, + "loss": 0.91754901, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.78710938, + "step": 2952, + "time_per_iteration": 2.659607410430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156293, + "balance_loss_mlp": 1.07752001, + "epoch": 0.5681031165833013, + "flos": 820840346112.0, + "grad_norm": 0.029462667986489877, + "language_loss": 0.91018391, + "learning_rate": 0.00041443752169044126, + "loss": 0.92174685, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.78710938, + "step": 2953, + "time_per_iteration": 3.0214719772338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115648, + "balance_loss_mlp": 1.07775402, + "epoch": 0.5682954982685648, + "flos": 619145667072.0, + "grad_norm": 0.03021657930021912, + "language_loss": 0.89565808, + "learning_rate": 0.0004141305914662025, + "loss": 0.90722287, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.78662109, + "step": 2954, + "time_per_iteration": 2.7215545177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154854, + "balance_loss_mlp": 1.07608008, + "epoch": 0.5684878799538284, + "flos": 649251511296.0, + "grad_norm": 0.03170231797387521, + "language_loss": 0.85884857, + "learning_rate": 0.0004138236945794246, + "loss": 0.87039715, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.78613281, + "step": 2955, + "time_per_iteration": 2.896960496902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154548, + "balance_loss_mlp": 1.07587004, + "epoch": 0.5686802616390919, + "flos": 807352859136.0, + "grad_norm": 0.03477888356704498, + "language_loss": 0.88849628, + "learning_rate": 0.00041351683114925576, + "loss": 0.90004176, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.78564453, + "step": 2956, + "time_per_iteration": 3.056138753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155475, + "balance_loss_mlp": 1.07698798, + "epoch": 0.5688726433243555, + "flos": 548175860736.0, + "grad_norm": 0.02988071875067647, + "language_loss": 0.91774637, + "learning_rate": 0.0004132100012948308, + "loss": 0.92930108, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.78320312, + "step": 2957, + "time_per_iteration": 2.620039701461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153148, + "balance_loss_mlp": 1.07475579, + "epoch": 0.5690650250096191, + "flos": 487545933312.0, + "grad_norm": 0.03388139796228596, + "language_loss": 0.90210378, + "learning_rate": 0.00041290320513527145, + "loss": 0.91363525, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.78222656, + "step": 2958, + "time_per_iteration": 2.5424137115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158065, + "balance_loss_mlp": 1.07953036, + "epoch": 0.5692574066948827, + "flos": 578554951680.0, + "grad_norm": 0.03065337308060062, + "language_loss": 0.9014492, + "learning_rate": 0.0004125964427896867, + "loss": 0.91302985, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.78369141, + "step": 2959, + "time_per_iteration": 2.6540746688842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157828, + "balance_loss_mlp": 1.07924569, + "epoch": 0.5694497883801463, + "flos": 455219735040.0, + "grad_norm": 0.03288997710459115, + "language_loss": 0.8486557, + "learning_rate": 0.0004122897143771723, + "loss": 0.86023396, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.78515625, + "step": 2960, + "time_per_iteration": 2.5677952766418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157581, + "balance_loss_mlp": 1.07899833, + "epoch": 0.5696421700654097, + "flos": 560582369280.0, + "grad_norm": 0.029260680521972587, + "language_loss": 0.86686659, + "learning_rate": 0.0004119830200168109, + "loss": 0.87844241, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.78515625, + "step": 2961, + "time_per_iteration": 2.661398410797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116102, + "balance_loss_mlp": 1.08243668, + "epoch": 0.5698345517506733, + "flos": 466501604352.0, + "grad_norm": 0.06131137217333051, + "language_loss": 0.93434393, + "learning_rate": 0.0004116763598276714, + "loss": 0.94595408, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.78515625, + "step": 2962, + "time_per_iteration": 2.5421509742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161307, + "balance_loss_mlp": 1.08267653, + "epoch": 0.5700269334359369, + "flos": 607191051264.0, + "grad_norm": 0.033090735660708526, + "language_loss": 0.8645342, + "learning_rate": 0.00041136973392881017, + "loss": 0.87614727, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.78515625, + "step": 2963, + "time_per_iteration": 2.826312303543091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116111, + "balance_loss_mlp": 1.08233654, + "epoch": 0.5702193151212005, + "flos": 563856230400.0, + "grad_norm": 0.029371137494056676, + "language_loss": 0.87366056, + "learning_rate": 0.00041106314243926983, + "loss": 0.88527167, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.78613281, + "step": 2964, + "time_per_iteration": 2.729848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163001, + "balance_loss_mlp": 1.08432257, + "epoch": 0.570411696806464, + "flos": 524309563392.0, + "grad_norm": 0.030081020285570834, + "language_loss": 0.91922152, + "learning_rate": 0.0004107565854780798, + "loss": 0.93085158, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.78564453, + "step": 2965, + "time_per_iteration": 2.6243247985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162398, + "balance_loss_mlp": 1.08348167, + "epoch": 0.5706040784917276, + "flos": 719471983104.0, + "grad_norm": 0.03134673766290682, + "language_loss": 0.86833286, + "learning_rate": 0.000410450063164256, + "loss": 0.87995684, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.78710938, + "step": 2966, + "time_per_iteration": 2.8488268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160157, + "balance_loss_mlp": 1.08109784, + "epoch": 0.5707964601769911, + "flos": 477670682112.0, + "grad_norm": 0.03469711129941245, + "language_loss": 0.88420385, + "learning_rate": 0.00041014357561680115, + "loss": 0.89580548, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.78808594, + "step": 2967, + "time_per_iteration": 2.531399965286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158843, + "balance_loss_mlp": 1.07997382, + "epoch": 0.5709888418622547, + "flos": 581216464896.0, + "grad_norm": 0.0299141756983156, + "language_loss": 0.91230297, + "learning_rate": 0.0004098371229547039, + "loss": 0.92389137, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.78662109, + "step": 2968, + "time_per_iteration": 2.7010715007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166153, + "balance_loss_mlp": 1.08947754, + "epoch": 0.5711812235475183, + "flos": 1583192707584.0, + "grad_norm": 0.007250174551889785, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8117696, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.76757812, + "step": 2969, + "time_per_iteration": 4.720959663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158975, + "balance_loss_mlp": 1.08001077, + "epoch": 0.5713736052327818, + "flos": 469497489408.0, + "grad_norm": 0.030927251593918268, + "language_loss": 0.85219097, + "learning_rate": 0.00040922432276247107, + "loss": 0.86378068, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.78710938, + "step": 2970, + "time_per_iteration": 2.5976855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155112, + "balance_loss_mlp": 1.07610035, + "epoch": 0.5715659869180454, + "flos": 538754503680.0, + "grad_norm": 0.02782082883725602, + "language_loss": 0.88734138, + "learning_rate": 0.0004089179754702457, + "loss": 0.89889252, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.78759766, + "step": 2971, + "time_per_iteration": 2.735511064529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155002, + "balance_loss_mlp": 1.07608509, + "epoch": 0.571758368603309, + "flos": 657250787328.0, + "grad_norm": 0.03021364085019089, + "language_loss": 0.86246514, + "learning_rate": 0.00040861166353919843, + "loss": 0.87401509, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.78710938, + "step": 2972, + "time_per_iteration": 2.784243583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156758, + "balance_loss_mlp": 1.07808018, + "epoch": 0.5719507502885726, + "flos": 669099342336.0, + "grad_norm": 0.04093131787913085, + "language_loss": 0.87037605, + "learning_rate": 0.00040830538708824983, + "loss": 0.8819437, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.78564453, + "step": 2973, + "time_per_iteration": 2.847334861755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156641, + "balance_loss_mlp": 1.07815385, + "epoch": 0.572143131973836, + "flos": 477279914496.0, + "grad_norm": 0.029260532033913305, + "language_loss": 0.87478364, + "learning_rate": 0.000407999146236307, + "loss": 0.88635004, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.78417969, + "step": 2974, + "time_per_iteration": 2.5809874534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156886, + "balance_loss_mlp": 1.07849395, + "epoch": 0.5723355136590996, + "flos": 540534425088.0, + "grad_norm": 0.03484414683288605, + "language_loss": 0.89636898, + "learning_rate": 0.0004076929411022634, + "loss": 0.90793782, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.78320312, + "step": 2975, + "time_per_iteration": 2.631016969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156314, + "balance_loss_mlp": 1.07782686, + "epoch": 0.5725278953443632, + "flos": 825649079808.0, + "grad_norm": 0.03393435544828211, + "language_loss": 0.84972572, + "learning_rate": 0.0004073867718049982, + "loss": 0.86128891, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.78369141, + "step": 2976, + "time_per_iteration": 3.09523606300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158881, + "balance_loss_mlp": 1.08044088, + "epoch": 0.5727202770296268, + "flos": 588569190912.0, + "grad_norm": 0.031011693938846972, + "language_loss": 0.87586653, + "learning_rate": 0.00040708063846337704, + "loss": 0.88745534, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.78222656, + "step": 2977, + "time_per_iteration": 2.7148561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159545, + "balance_loss_mlp": 1.08100963, + "epoch": 0.5729126587148904, + "flos": 447940869120.0, + "grad_norm": 0.0318916011479424, + "language_loss": 0.87124234, + "learning_rate": 0.00040677454119625143, + "loss": 0.88283777, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.78320312, + "step": 2978, + "time_per_iteration": 2.6003363132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.0804776, + "epoch": 0.5731050404001539, + "flos": 520467015168.0, + "grad_norm": 0.03318988951179658, + "language_loss": 0.88396186, + "learning_rate": 0.0004064684801224587, + "loss": 0.89555109, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.78173828, + "step": 2979, + "time_per_iteration": 2.6103272438049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160698, + "balance_loss_mlp": 1.08225846, + "epoch": 0.5732974220854175, + "flos": 505770295296.0, + "grad_norm": 0.029710652762807207, + "language_loss": 0.85663891, + "learning_rate": 0.00040616245536082224, + "loss": 0.86824596, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.78222656, + "step": 2980, + "time_per_iteration": 2.5594868659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.08078313, + "epoch": 0.573489803770681, + "flos": 593677367808.0, + "grad_norm": 0.027966372317681742, + "language_loss": 0.86258745, + "learning_rate": 0.00040585646703015165, + "loss": 0.87417924, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.78320312, + "step": 2981, + "time_per_iteration": 2.789937734603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.07878125, + "epoch": 0.5736821854559446, + "flos": 490869459456.0, + "grad_norm": 0.031111464824263694, + "language_loss": 0.83780992, + "learning_rate": 0.0004055505152492419, + "loss": 0.84938312, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.78466797, + "step": 2982, + "time_per_iteration": 2.6471428871154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158265, + "balance_loss_mlp": 1.07963431, + "epoch": 0.5738745671412081, + "flos": 459201271296.0, + "grad_norm": 0.03311000411840089, + "language_loss": 0.79528159, + "learning_rate": 0.00040524460013687425, + "loss": 0.80686426, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.78564453, + "step": 2983, + "time_per_iteration": 2.708540678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155372, + "balance_loss_mlp": 1.07650268, + "epoch": 0.5740669488264717, + "flos": 581620694016.0, + "grad_norm": 0.028109694322635652, + "language_loss": 0.86855406, + "learning_rate": 0.0004049387218118155, + "loss": 0.88010776, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.78759766, + "step": 2984, + "time_per_iteration": 2.926750421524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155283, + "balance_loss_mlp": 1.07622325, + "epoch": 0.5742593305117353, + "flos": 525573190656.0, + "grad_norm": 0.03395381439898354, + "language_loss": 0.91635472, + "learning_rate": 0.00040463288039281777, + "loss": 0.92790747, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.78857422, + "step": 2985, + "time_per_iteration": 2.704287528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162964, + "balance_loss_mlp": 1.08666992, + "epoch": 0.5744517121969989, + "flos": 1557266511360.0, + "grad_norm": 0.007878379047691413, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.79039383, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.76367188, + "step": 2986, + "time_per_iteration": 4.989194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155742, + "balance_loss_mlp": 1.07677734, + "epoch": 0.5746440938822625, + "flos": 753202798080.0, + "grad_norm": 0.03402997808137808, + "language_loss": 0.87620312, + "learning_rate": 0.0004040213087479444, + "loss": 0.88776052, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.78759766, + "step": 2987, + "time_per_iteration": 2.9275078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163311, + "balance_loss_mlp": 1.08453715, + "epoch": 0.5748364755675259, + "flos": 502857002496.0, + "grad_norm": 0.03361733343242669, + "language_loss": 0.90824878, + "learning_rate": 0.0004037155787595018, + "loss": 0.91988194, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.78710938, + "step": 2988, + "time_per_iteration": 2.576448440551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160011, + "balance_loss_mlp": 1.08109498, + "epoch": 0.5750288572527895, + "flos": 505197605376.0, + "grad_norm": 0.02880586923954642, + "language_loss": 0.85724807, + "learning_rate": 0.000403409886151987, + "loss": 0.86884815, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.78759766, + "step": 2989, + "time_per_iteration": 2.916322946548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157013, + "balance_loss_mlp": 1.08033752, + "epoch": 0.5752212389380531, + "flos": 1544675352576.0, + "grad_norm": 0.005932241765552608, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83156121, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.765625, + "step": 2990, + "time_per_iteration": 4.758445978164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.08269501, + "epoch": 0.5754136206233167, + "flos": 1570671406080.0, + "grad_norm": 0.005822498768858246, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.7935797, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.765625, + "step": 2991, + "time_per_iteration": 4.785308122634888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163075, + "balance_loss_mlp": 1.08420658, + "epoch": 0.5756060023085803, + "flos": 799561701888.0, + "grad_norm": 0.0320241684810352, + "language_loss": 0.81581879, + "learning_rate": 0.00040249303380173807, + "loss": 0.82744956, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.78808594, + "step": 2992, + "time_per_iteration": 3.060910940170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160943, + "balance_loss_mlp": 1.08202648, + "epoch": 0.5757983839938438, + "flos": 589033818624.0, + "grad_norm": 0.033230938583522406, + "language_loss": 0.85061818, + "learning_rate": 0.00040218749190459126, + "loss": 0.86222756, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.78857422, + "step": 2993, + "time_per_iteration": 2.722538948059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159067, + "balance_loss_mlp": 1.08029306, + "epoch": 0.5759907656791073, + "flos": 517851164160.0, + "grad_norm": 0.036503805232005304, + "language_loss": 0.88598883, + "learning_rate": 0.00040188198798162775, + "loss": 0.89757949, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.78662109, + "step": 2994, + "time_per_iteration": 2.626763105392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157444, + "balance_loss_mlp": 1.078861, + "epoch": 0.5761831473643709, + "flos": 588289213440.0, + "grad_norm": 0.030677551313055676, + "language_loss": 0.90523088, + "learning_rate": 0.000401576522151455, + "loss": 0.91680533, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.78466797, + "step": 2995, + "time_per_iteration": 2.8290417194366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156979, + "balance_loss_mlp": 1.07839644, + "epoch": 0.5763755290496345, + "flos": 545008786944.0, + "grad_norm": 0.030026851509959627, + "language_loss": 0.87201327, + "learning_rate": 0.0004012710945326651, + "loss": 0.88358307, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.78515625, + "step": 2996, + "time_per_iteration": 2.78725004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156215, + "balance_loss_mlp": 1.07767999, + "epoch": 0.576567910734898, + "flos": 627427648512.0, + "grad_norm": 0.03065527687354923, + "language_loss": 0.86651611, + "learning_rate": 0.0004009657052438355, + "loss": 0.87807822, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.78271484, + "step": 2997, + "time_per_iteration": 2.8221359252929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156096, + "balance_loss_mlp": 1.07756102, + "epoch": 0.5767602924201616, + "flos": 539277528576.0, + "grad_norm": 0.032463443859892846, + "language_loss": 0.9117527, + "learning_rate": 0.00040066035440352904, + "loss": 0.92331362, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.78271484, + "step": 2998, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169762, + "balance_loss_mlp": 1.09403992, + "epoch": 0.5769526741054252, + "flos": 1563023239680.0, + "grad_norm": 0.012552051598097233, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80462897, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.7578125, + "step": 2999, + "time_per_iteration": 4.9131574630737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.0844425, + "epoch": 0.5771450557906888, + "flos": 469171849728.0, + "grad_norm": 0.03695219944655869, + "language_loss": 0.82297212, + "learning_rate": 0.00040004976854266145, + "loss": 0.83459759, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.78027344, + "step": 3000, + "time_per_iteration": 2.599562406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08321714, + "epoch": 0.5773374374759523, + "flos": 575632926720.0, + "grad_norm": 0.03253250172707863, + "language_loss": 0.86701882, + "learning_rate": 0.0003997445337591505, + "loss": 0.87863207, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.78027344, + "step": 3001, + "time_per_iteration": 2.651052951812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161912, + "balance_loss_mlp": 1.08380568, + "epoch": 0.5775298191612158, + "flos": 529504335360.0, + "grad_norm": 0.030455172240490772, + "language_loss": 0.78589356, + "learning_rate": 0.0003994393378982635, + "loss": 0.79751271, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.78027344, + "step": 3002, + "time_per_iteration": 2.6081488132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162445, + "balance_loss_mlp": 1.08576965, + "epoch": 0.5777222008464794, + "flos": 1306896520704.0, + "grad_norm": 0.00976162227486582, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80700445, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.765625, + "step": 3003, + "time_per_iteration": 4.794616460800171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154088, + "balance_loss_mlp": 1.07550502, + "epoch": 0.577914582531743, + "flos": 604792051200.0, + "grad_norm": 0.035927509548420514, + "language_loss": 0.93844306, + "learning_rate": 0.0003988290634182961, + "loss": 0.94998395, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.78417969, + "step": 3004, + "time_per_iteration": 2.7580206394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152956, + "balance_loss_mlp": 1.07465923, + "epoch": 0.5781069642170066, + "flos": 487832641536.0, + "grad_norm": 0.03166140659951907, + "language_loss": 0.85788441, + "learning_rate": 0.0003985239850361453, + "loss": 0.86941397, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.78173828, + "step": 3005, + "time_per_iteration": 2.5811102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148402, + "balance_loss_mlp": 1.0700103, + "epoch": 0.5782993459022701, + "flos": 507413956608.0, + "grad_norm": 0.03361154868402879, + "language_loss": 0.90845788, + "learning_rate": 0.0003982189460504777, + "loss": 0.9199419, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.78271484, + "step": 3006, + "time_per_iteration": 2.701486349105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.07208133, + "epoch": 0.5784917275875336, + "flos": 603294108672.0, + "grad_norm": 0.03266847587020217, + "language_loss": 0.84488243, + "learning_rate": 0.00039791394657971935, + "loss": 0.85638666, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.78222656, + "step": 3007, + "time_per_iteration": 2.7029902935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114812, + "balance_loss_mlp": 1.06953716, + "epoch": 0.5786841092727972, + "flos": 522588039168.0, + "grad_norm": 0.03327041662205967, + "language_loss": 0.89717233, + "learning_rate": 0.00039760898674228205, + "loss": 0.90865356, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.78466797, + "step": 3008, + "time_per_iteration": 2.6650431156158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163782, + "balance_loss_mlp": 1.08510339, + "epoch": 0.5788764909580608, + "flos": 768835504128.0, + "grad_norm": 0.02880825356575122, + "language_loss": 0.85863519, + "learning_rate": 0.0003973040666565613, + "loss": 0.87027305, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.78515625, + "step": 3009, + "time_per_iteration": 3.0480079650878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165461, + "balance_loss_mlp": 1.08668745, + "epoch": 0.5790688726433244, + "flos": 600331150848.0, + "grad_norm": 0.03153140111016463, + "language_loss": 0.87491179, + "learning_rate": 0.000396999186440938, + "loss": 0.8865664, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.78515625, + "step": 3010, + "time_per_iteration": 2.866971254348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.08517945, + "epoch": 0.5792612543285879, + "flos": 524105447424.0, + "grad_norm": 0.03493307290908607, + "language_loss": 0.90569246, + "learning_rate": 0.000396694346213777, + "loss": 0.91733146, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.78564453, + "step": 3011, + "time_per_iteration": 2.6576690673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160972, + "balance_loss_mlp": 1.08234167, + "epoch": 0.5794536360138515, + "flos": 878079618048.0, + "grad_norm": 0.028681737588389107, + "language_loss": 0.88734698, + "learning_rate": 0.0003963895460934276, + "loss": 0.89895672, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.78369141, + "step": 3012, + "time_per_iteration": 3.1439104080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159333, + "balance_loss_mlp": 1.08065438, + "epoch": 0.5796460176991151, + "flos": 402298372608.0, + "grad_norm": 0.038884721414284784, + "language_loss": 0.92029333, + "learning_rate": 0.00039608478619822376, + "loss": 0.93188667, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.78613281, + "step": 3013, + "time_per_iteration": 2.4331459999084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115895, + "balance_loss_mlp": 1.08032, + "epoch": 0.5798383993843786, + "flos": 619675422720.0, + "grad_norm": 0.029275699876953817, + "language_loss": 0.87518513, + "learning_rate": 0.00039578006664648394, + "loss": 0.88677466, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.78417969, + "step": 3014, + "time_per_iteration": 2.770930290222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157872, + "balance_loss_mlp": 1.07928884, + "epoch": 0.5800307810696421, + "flos": 845792351232.0, + "grad_norm": 0.03304881172222658, + "language_loss": 0.8676393, + "learning_rate": 0.0003954753875565105, + "loss": 0.87921804, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.78320312, + "step": 3015, + "time_per_iteration": 3.08627986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155618, + "balance_loss_mlp": 1.0769875, + "epoch": 0.5802231627549057, + "flos": 570364294656.0, + "grad_norm": 0.02949140039649942, + "language_loss": 0.86755216, + "learning_rate": 0.00039517074904659057, + "loss": 0.87910825, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.78369141, + "step": 3016, + "time_per_iteration": 2.685842990875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155954, + "balance_loss_mlp": 1.07732403, + "epoch": 0.5804155444401693, + "flos": 661662022656.0, + "grad_norm": 0.030068480846806175, + "language_loss": 0.90490985, + "learning_rate": 0.00039486615123499535, + "loss": 0.91646945, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.78369141, + "step": 3017, + "time_per_iteration": 2.8422367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158277, + "balance_loss_mlp": 1.07950318, + "epoch": 0.5806079261254329, + "flos": 515057393664.0, + "grad_norm": 0.0339975061302382, + "language_loss": 0.90716887, + "learning_rate": 0.00039456159423997996, + "loss": 0.91875166, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.78515625, + "step": 3018, + "time_per_iteration": 2.6301286220550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159604, + "balance_loss_mlp": 1.08116388, + "epoch": 0.5808003078106965, + "flos": 529717183488.0, + "grad_norm": 0.035522237622510534, + "language_loss": 0.94178265, + "learning_rate": 0.00039425707817978406, + "loss": 0.95337874, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.78320312, + "step": 3019, + "time_per_iteration": 2.6516103744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159065, + "balance_loss_mlp": 1.08033943, + "epoch": 0.58099268949596, + "flos": 477996321792.0, + "grad_norm": 0.033660479575399194, + "language_loss": 0.88736534, + "learning_rate": 0.00039395260317263124, + "loss": 0.89895594, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.78466797, + "step": 3020, + "time_per_iteration": 2.5736000537872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158964, + "balance_loss_mlp": 1.08033383, + "epoch": 0.5811850711812235, + "flos": 518687093760.0, + "grad_norm": 0.032372571582398105, + "language_loss": 0.90171605, + "learning_rate": 0.0003936481693367291, + "loss": 0.9133057, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.78417969, + "step": 3021, + "time_per_iteration": 2.655585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152938, + "balance_loss_mlp": 1.07416463, + "epoch": 0.5813774528664871, + "flos": 617626257408.0, + "grad_norm": 0.037353178472421755, + "language_loss": 0.94038713, + "learning_rate": 0.0003933437767902697, + "loss": 0.95191658, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.78564453, + "step": 3022, + "time_per_iteration": 2.7785356044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155749, + "balance_loss_mlp": 1.07707083, + "epoch": 0.5815698345517507, + "flos": 568603838976.0, + "grad_norm": 0.03237494754713459, + "language_loss": 0.83540273, + "learning_rate": 0.00039303942565142825, + "loss": 0.84696019, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.78466797, + "step": 3023, + "time_per_iteration": 2.8082921504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115966, + "balance_loss_mlp": 1.08122075, + "epoch": 0.5817622162370142, + "flos": 564303393792.0, + "grad_norm": 0.030406133972166762, + "language_loss": 0.81602162, + "learning_rate": 0.0003927351160383644, + "loss": 0.82761824, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.78369141, + "step": 3024, + "time_per_iteration": 2.8258216381073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115841, + "balance_loss_mlp": 1.07992303, + "epoch": 0.5819545979222778, + "flos": 460153995264.0, + "grad_norm": 0.0330231934286986, + "language_loss": 0.82985759, + "learning_rate": 0.000392430848069222, + "loss": 0.84144175, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.78369141, + "step": 3025, + "time_per_iteration": 2.552351713180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155737, + "balance_loss_mlp": 1.0769639, + "epoch": 0.5821469796075414, + "flos": 542516461056.0, + "grad_norm": 0.03445814315346002, + "language_loss": 0.88443869, + "learning_rate": 0.00039212662186212795, + "loss": 0.89599597, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.78515625, + "step": 3026, + "time_per_iteration": 2.6369402408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157395, + "balance_loss_mlp": 1.07890785, + "epoch": 0.582339361292805, + "flos": 553340433408.0, + "grad_norm": 0.029462079730168216, + "language_loss": 0.82325065, + "learning_rate": 0.0003918224375351934, + "loss": 0.83482456, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.78369141, + "step": 3027, + "time_per_iteration": 2.698915958404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116249, + "balance_loss_mlp": 1.08386004, + "epoch": 0.5825317429780685, + "flos": 497447380992.0, + "grad_norm": 0.03190253080273137, + "language_loss": 0.83360291, + "learning_rate": 0.0003915182952065135, + "loss": 0.84522784, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.78417969, + "step": 3028, + "time_per_iteration": 2.6572346687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160994, + "balance_loss_mlp": 1.08265007, + "epoch": 0.582724124663332, + "flos": 565254116352.0, + "grad_norm": 0.030478660984130428, + "language_loss": 0.92836106, + "learning_rate": 0.0003912141949941664, + "loss": 0.93997103, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.78271484, + "step": 3029, + "time_per_iteration": 2.683072090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153282, + "balance_loss_mlp": 1.07484198, + "epoch": 0.5829165063485956, + "flos": 493112007168.0, + "grad_norm": 0.03294557051603365, + "language_loss": 0.89173961, + "learning_rate": 0.0003909101370162143, + "loss": 0.90327239, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.78369141, + "step": 3030, + "time_per_iteration": 2.575670003890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160767, + "balance_loss_mlp": 1.08370972, + "epoch": 0.5831088880338592, + "flos": 1531877349888.0, + "grad_norm": 0.012849020092446796, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7359466, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.76953125, + "step": 3031, + "time_per_iteration": 4.9284889698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.07370639, + "epoch": 0.5833012697191228, + "flos": 619208793600.0, + "grad_norm": 0.02929875839371022, + "language_loss": 0.87939668, + "learning_rate": 0.0003903021482356622, + "loss": 0.89092004, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.78466797, + "step": 3032, + "time_per_iteration": 2.8254482746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152205, + "balance_loss_mlp": 1.07362223, + "epoch": 0.5834936514043862, + "flos": 769293401088.0, + "grad_norm": 0.02695668391828596, + "language_loss": 0.87565535, + "learning_rate": 0.00038999821766910465, + "loss": 0.88717741, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.78417969, + "step": 3033, + "time_per_iteration": 3.006687641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156362, + "balance_loss_mlp": 1.07796979, + "epoch": 0.5836860330896498, + "flos": 459316064256.0, + "grad_norm": 0.030677066462792797, + "language_loss": 0.91205192, + "learning_rate": 0.00038969432980902606, + "loss": 0.92361552, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.78320312, + "step": 3034, + "time_per_iteration": 2.550684690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011586, + "balance_loss_mlp": 1.08192444, + "epoch": 0.5838784147749134, + "flos": 1364196191232.0, + "grad_norm": 0.008170267563240248, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80943102, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.765625, + "step": 3035, + "time_per_iteration": 4.859564304351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154223, + "balance_loss_mlp": 1.07592607, + "epoch": 0.584070796460177, + "flos": 568288932864.0, + "grad_norm": 0.030253680936045732, + "language_loss": 0.87217242, + "learning_rate": 0.00038908668268020953, + "loss": 0.88371468, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.78222656, + "step": 3036, + "time_per_iteration": 2.7140538692474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154776, + "balance_loss_mlp": 1.07624114, + "epoch": 0.5842631781454406, + "flos": 612665800704.0, + "grad_norm": 0.02904438680956131, + "language_loss": 0.90014827, + "learning_rate": 0.00038878292364738097, + "loss": 0.91169608, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.78271484, + "step": 3037, + "time_per_iteration": 2.787289619445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157819, + "balance_loss_mlp": 1.07923615, + "epoch": 0.5844555598307041, + "flos": 464332916736.0, + "grad_norm": 0.03338514659593435, + "language_loss": 0.93144816, + "learning_rate": 0.0003884792077928508, + "loss": 0.94302636, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.78320312, + "step": 3038, + "time_per_iteration": 2.513655185699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155243, + "balance_loss_mlp": 1.07666051, + "epoch": 0.5846479415159677, + "flos": 411057716736.0, + "grad_norm": 0.039769663121131886, + "language_loss": 0.82121253, + "learning_rate": 0.0003881755352345322, + "loss": 0.83276498, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.78320312, + "step": 3039, + "time_per_iteration": 2.5270330905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154802, + "balance_loss_mlp": 1.07641041, + "epoch": 0.5848403232012312, + "flos": 492265344000.0, + "grad_norm": 0.02801571871014385, + "language_loss": 0.90901846, + "learning_rate": 0.0003878719060903207, + "loss": 0.9205665, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.78222656, + "step": 3040, + "time_per_iteration": 2.5588507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.07644928, + "epoch": 0.5850327048864948, + "flos": 585508177920.0, + "grad_norm": 0.037771067006053156, + "language_loss": 0.89005375, + "learning_rate": 0.0003875683204780961, + "loss": 0.90160316, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.78271484, + "step": 3041, + "time_per_iteration": 2.668827533721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152572, + "balance_loss_mlp": 1.07408428, + "epoch": 0.5852250865717584, + "flos": 652718028288.0, + "grad_norm": 0.037622145269810676, + "language_loss": 0.92115968, + "learning_rate": 0.00038726477851572043, + "loss": 0.93268543, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.78271484, + "step": 3042, + "time_per_iteration": 2.813145160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152742, + "balance_loss_mlp": 1.07434952, + "epoch": 0.5854174682570219, + "flos": 535619630592.0, + "grad_norm": 0.034632487357399135, + "language_loss": 0.85911977, + "learning_rate": 0.0003869612803210395, + "loss": 0.87064719, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.78222656, + "step": 3043, + "time_per_iteration": 2.6411526203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150207, + "balance_loss_mlp": 1.07176721, + "epoch": 0.5856098499422855, + "flos": 510758949888.0, + "grad_norm": 0.03364322076393535, + "language_loss": 0.8838582, + "learning_rate": 0.0003866578260118817, + "loss": 0.89536023, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.78271484, + "step": 3044, + "time_per_iteration": 2.59216570854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160339, + "balance_loss_mlp": 1.08228123, + "epoch": 0.5858022316275491, + "flos": 594992661504.0, + "grad_norm": 0.03592243508466687, + "language_loss": 0.87963545, + "learning_rate": 0.0003863544157060581, + "loss": 0.89123881, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.77978516, + "step": 3045, + "time_per_iteration": 2.6693618297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159373, + "balance_loss_mlp": 1.08131468, + "epoch": 0.5859946133128127, + "flos": 560317854720.0, + "grad_norm": 0.029657376615259006, + "language_loss": 0.86909235, + "learning_rate": 0.0003860510495213634, + "loss": 0.88068604, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.77978516, + "step": 3046, + "time_per_iteration": 2.799967050552368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159061, + "balance_loss_mlp": 1.08085966, + "epoch": 0.5861869949980761, + "flos": 554755783680.0, + "grad_norm": 0.03663253930872626, + "language_loss": 0.84493214, + "learning_rate": 0.0003857477275755746, + "loss": 0.85652274, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.78125, + "step": 3047, + "time_per_iteration": 2.6989481449127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116382, + "balance_loss_mlp": 1.08566678, + "epoch": 0.5863793766833397, + "flos": 720054131712.0, + "grad_norm": 0.029238524404730352, + "language_loss": 0.89394152, + "learning_rate": 0.00038544444998645167, + "loss": 0.90557969, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.78076172, + "step": 3048, + "time_per_iteration": 3.0829827785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162492, + "balance_loss_mlp": 1.0843389, + "epoch": 0.5865717583686033, + "flos": 473285643264.0, + "grad_norm": 0.03316519352776713, + "language_loss": 0.8619799, + "learning_rate": 0.00038514121687173767, + "loss": 0.87360477, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.78076172, + "step": 3049, + "time_per_iteration": 2.575395107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157324, + "balance_loss_mlp": 1.07897997, + "epoch": 0.5867641400538669, + "flos": 814846574592.0, + "grad_norm": 0.0318856413902076, + "language_loss": 0.87874395, + "learning_rate": 0.00038483802834915807, + "loss": 0.8903172, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.78271484, + "step": 3050, + "time_per_iteration": 2.973144292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153006, + "balance_loss_mlp": 1.07461429, + "epoch": 0.5869565217391305, + "flos": 487517735424.0, + "grad_norm": 0.034960474960603255, + "language_loss": 0.8386789, + "learning_rate": 0.00038453488453642074, + "loss": 0.85020894, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.78320312, + "step": 3051, + "time_per_iteration": 2.7100586891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152299, + "balance_loss_mlp": 1.0736686, + "epoch": 0.587148903424394, + "flos": 570512014848.0, + "grad_norm": 0.03111841936731719, + "language_loss": 0.91899282, + "learning_rate": 0.00038423178555121697, + "loss": 0.93051583, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.78466797, + "step": 3052, + "time_per_iteration": 2.713294744491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151746, + "balance_loss_mlp": 1.07316351, + "epoch": 0.5873412851096576, + "flos": 748694234112.0, + "grad_norm": 0.039836143626506074, + "language_loss": 0.90698159, + "learning_rate": 0.00038392873151121994, + "loss": 0.91849899, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.78466797, + "step": 3053, + "time_per_iteration": 3.0334441661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151743, + "balance_loss_mlp": 1.07320774, + "epoch": 0.5875336667949211, + "flos": 529187427840.0, + "grad_norm": 0.03304313685691396, + "language_loss": 0.89048851, + "learning_rate": 0.0003836257225340859, + "loss": 0.90200597, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.78417969, + "step": 3054, + "time_per_iteration": 2.612002372741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152089, + "balance_loss_mlp": 1.07360125, + "epoch": 0.5877260484801847, + "flos": 825640347648.0, + "grad_norm": 0.04168388263761463, + "language_loss": 0.87033945, + "learning_rate": 0.00038332275873745336, + "loss": 0.88186038, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.78369141, + "step": 3055, + "time_per_iteration": 3.0469071865081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07472539, + "epoch": 0.5879184301654482, + "flos": 592693718016.0, + "grad_norm": 0.028534237237830384, + "language_loss": 0.87091875, + "learning_rate": 0.0003830198402389431, + "loss": 0.88245273, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.78466797, + "step": 3056, + "time_per_iteration": 2.7129743099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116227, + "balance_loss_mlp": 1.08635712, + "epoch": 0.5881108118507118, + "flos": 1549223574528.0, + "grad_norm": 0.013735077759529469, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78511202, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.75976562, + "step": 3057, + "time_per_iteration": 4.971419334411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155251, + "balance_loss_mlp": 1.0767163, + "epoch": 0.5883031935359754, + "flos": 490598214144.0, + "grad_norm": 0.03703880470659913, + "language_loss": 0.88891268, + "learning_rate": 0.0003824141396066855, + "loss": 0.90046519, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.78417969, + "step": 3058, + "time_per_iteration": 2.5657668113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153431, + "balance_loss_mlp": 1.0749433, + "epoch": 0.588495575221239, + "flos": 583980036096.0, + "grad_norm": 0.04132288833299083, + "language_loss": 0.89364433, + "learning_rate": 0.000382111357708092, + "loss": 0.90517867, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.78417969, + "step": 3059, + "time_per_iteration": 2.7690227031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152167, + "balance_loss_mlp": 1.07377541, + "epoch": 0.5886879569065026, + "flos": 662239441920.0, + "grad_norm": 0.03195995960407152, + "language_loss": 0.89352429, + "learning_rate": 0.00038180862157792864, + "loss": 0.90504599, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.78320312, + "step": 3060, + "time_per_iteration": 2.797255039215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149069, + "balance_loss_mlp": 1.07048619, + "epoch": 0.588880338591766, + "flos": 563719243776.0, + "grad_norm": 0.031223560866560994, + "language_loss": 0.86781317, + "learning_rate": 0.0003815059313337279, + "loss": 0.87930381, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.78369141, + "step": 3061, + "time_per_iteration": 2.6690454483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149002, + "balance_loss_mlp": 1.07056284, + "epoch": 0.5890727202770296, + "flos": 555852225024.0, + "grad_norm": 0.029451906852367885, + "language_loss": 0.83063936, + "learning_rate": 0.00038120328709300436, + "loss": 0.84212935, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.78271484, + "step": 3062, + "time_per_iteration": 2.902662515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149399, + "balance_loss_mlp": 1.07095897, + "epoch": 0.5892651019622932, + "flos": 656701565952.0, + "grad_norm": 0.028569643240873292, + "language_loss": 0.89099294, + "learning_rate": 0.0003809006889732549, + "loss": 0.90248692, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.78320312, + "step": 3063, + "time_per_iteration": 2.8155622482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150242, + "balance_loss_mlp": 1.07185006, + "epoch": 0.5894574836475568, + "flos": 454132025856.0, + "grad_norm": 0.03219128848339896, + "language_loss": 0.93056011, + "learning_rate": 0.0003805981370919589, + "loss": 0.9420625, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.78173828, + "step": 3064, + "time_per_iteration": 2.533978223800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156603, + "balance_loss_mlp": 1.07840204, + "epoch": 0.5896498653328203, + "flos": 520111176192.0, + "grad_norm": 0.0315116121131164, + "language_loss": 0.89031386, + "learning_rate": 0.0003802956315665771, + "loss": 0.90187985, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.78125, + "step": 3065, + "time_per_iteration": 2.6914567947387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151617, + "balance_loss_mlp": 1.07341576, + "epoch": 0.5898422470180839, + "flos": 550084036608.0, + "grad_norm": 0.037269486879405754, + "language_loss": 0.87739515, + "learning_rate": 0.0003799931725145529, + "loss": 0.88891131, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.78125, + "step": 3066, + "time_per_iteration": 2.6040141582489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151797, + "balance_loss_mlp": 1.07359576, + "epoch": 0.5900346287033474, + "flos": 525379808256.0, + "grad_norm": 0.03210441330274425, + "language_loss": 0.90831029, + "learning_rate": 0.00037969076005331083, + "loss": 0.9198283, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.78125, + "step": 3067, + "time_per_iteration": 2.773045301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151142, + "balance_loss_mlp": 1.07298875, + "epoch": 0.590227010388611, + "flos": 568215072768.0, + "grad_norm": 0.03944068050463326, + "language_loss": 0.93933421, + "learning_rate": 0.00037938839430025817, + "loss": 0.9508456, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.78076172, + "step": 3068, + "time_per_iteration": 2.6502816677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.07148039, + "epoch": 0.5904193920738746, + "flos": 584455397376.0, + "grad_norm": 0.029602074998044806, + "language_loss": 0.90136111, + "learning_rate": 0.0003790860753727835, + "loss": 0.91285884, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.78173828, + "step": 3069, + "time_per_iteration": 2.8173305988311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148373, + "balance_loss_mlp": 1.07007682, + "epoch": 0.5906117737591381, + "flos": 530796160512.0, + "grad_norm": 0.03761421694137887, + "language_loss": 0.88493633, + "learning_rate": 0.00037878380338825766, + "loss": 0.89642012, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.78173828, + "step": 3070, + "time_per_iteration": 2.6682841777801514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148557, + "balance_loss_mlp": 1.07059419, + "epoch": 0.5908041554444017, + "flos": 685515585024.0, + "grad_norm": 0.029847469423829834, + "language_loss": 0.85616612, + "learning_rate": 0.00037848157846403287, + "loss": 0.86765176, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.77880859, + "step": 3071, + "time_per_iteration": 2.942607879638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148313, + "balance_loss_mlp": 1.07015908, + "epoch": 0.5909965371296653, + "flos": 551132814336.0, + "grad_norm": 0.030659229377642858, + "language_loss": 0.88636756, + "learning_rate": 0.0003781794007174435, + "loss": 0.89785063, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.78076172, + "step": 3072, + "time_per_iteration": 2.7619588375091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159439, + "balance_loss_mlp": 1.08276367, + "epoch": 0.5911889188149289, + "flos": 1495642200576.0, + "grad_norm": 0.009662354088300913, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75233972, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.765625, + "step": 3073, + "time_per_iteration": 4.855187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115096, + "balance_loss_mlp": 1.07275867, + "epoch": 0.5913813005001923, + "flos": 488885422080.0, + "grad_norm": 0.030913240812320716, + "language_loss": 0.86239564, + "learning_rate": 0.0003775751872264152, + "loss": 0.87390518, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.78125, + "step": 3074, + "time_per_iteration": 2.7676284313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150595, + "balance_loss_mlp": 1.0724895, + "epoch": 0.5915736821854559, + "flos": 574521748992.0, + "grad_norm": 0.02774902568268271, + "language_loss": 0.91979122, + "learning_rate": 0.0003772731517165527, + "loss": 0.93129718, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.78027344, + "step": 3075, + "time_per_iteration": 2.7969858646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146907, + "balance_loss_mlp": 1.06884861, + "epoch": 0.5917660638707195, + "flos": 790860754944.0, + "grad_norm": 0.032083383212934545, + "language_loss": 0.88416231, + "learning_rate": 0.0003769711638534784, + "loss": 0.89563137, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.77978516, + "step": 3076, + "time_per_iteration": 2.966887950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147265, + "balance_loss_mlp": 1.06915915, + "epoch": 0.5919584455559831, + "flos": 529756114944.0, + "grad_norm": 0.039188776409307895, + "language_loss": 0.84855187, + "learning_rate": 0.00037666922375443446, + "loss": 0.86002445, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.78027344, + "step": 3077, + "time_per_iteration": 2.6466495990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146857, + "balance_loss_mlp": 1.06889355, + "epoch": 0.5921508272412467, + "flos": 561752670720.0, + "grad_norm": 0.03396925526876144, + "language_loss": 0.87058771, + "learning_rate": 0.00037636733153664396, + "loss": 0.88205624, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.77880859, + "step": 3078, + "time_per_iteration": 2.868244171142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147912, + "balance_loss_mlp": 1.06980658, + "epoch": 0.5923432089265102, + "flos": 564333593088.0, + "grad_norm": 0.03405949699736924, + "language_loss": 0.86518288, + "learning_rate": 0.0003760654873173124, + "loss": 0.87666202, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.78027344, + "step": 3079, + "time_per_iteration": 2.665978193283081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148871, + "balance_loss_mlp": 1.07095611, + "epoch": 0.5925355906117737, + "flos": 496750439424.0, + "grad_norm": 0.031078530741144403, + "language_loss": 0.87091482, + "learning_rate": 0.00037576369121362566, + "loss": 0.88240349, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.77832031, + "step": 3080, + "time_per_iteration": 2.5879437923431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.07483089, + "epoch": 0.5927279722970373, + "flos": 567492661248.0, + "grad_norm": 0.029886004026783125, + "language_loss": 0.86116624, + "learning_rate": 0.0003754619433427516, + "loss": 0.87269318, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.77783203, + "step": 3081, + "time_per_iteration": 2.911530017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149482, + "balance_loss_mlp": 1.07156706, + "epoch": 0.5929203539823009, + "flos": 668159353344.0, + "grad_norm": 0.03611880785888225, + "language_loss": 0.84511012, + "learning_rate": 0.0003751602438218392, + "loss": 0.85660493, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.77832031, + "step": 3082, + "time_per_iteration": 2.767104148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148924, + "balance_loss_mlp": 1.07105672, + "epoch": 0.5931127356675644, + "flos": 556785483264.0, + "grad_norm": 0.03271098535749721, + "language_loss": 0.89783478, + "learning_rate": 0.0003748585927680186, + "loss": 0.90932405, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.77783203, + "step": 3083, + "time_per_iteration": 2.6630167961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148966, + "balance_loss_mlp": 1.07100332, + "epoch": 0.593305117352828, + "flos": 536242712064.0, + "grad_norm": 0.03028975884774044, + "language_loss": 0.88271487, + "learning_rate": 0.00037455699029840086, + "loss": 0.89420456, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.77880859, + "step": 3084, + "time_per_iteration": 2.647643566131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.07020473, + "epoch": 0.5934974990380916, + "flos": 595057789440.0, + "grad_norm": 0.028668930156423956, + "language_loss": 0.89615595, + "learning_rate": 0.0003742554365300787, + "loss": 0.9076376, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.77880859, + "step": 3085, + "time_per_iteration": 2.743479013442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148026, + "balance_loss_mlp": 1.07015836, + "epoch": 0.5936898807233552, + "flos": 714014697984.0, + "grad_norm": 0.030266517596009415, + "language_loss": 0.84002471, + "learning_rate": 0.0003739539315801255, + "loss": 0.85150492, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.77783203, + "step": 3086, + "time_per_iteration": 2.9327478408813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147359, + "balance_loss_mlp": 1.06944346, + "epoch": 0.5938822624086187, + "flos": 392748761088.0, + "grad_norm": 0.030603721844952317, + "language_loss": 0.96139234, + "learning_rate": 0.000373652475565596, + "loss": 0.97286594, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.77832031, + "step": 3087, + "time_per_iteration": 2.471726417541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146572, + "balance_loss_mlp": 1.06860876, + "epoch": 0.5940746440938822, + "flos": 481335310848.0, + "grad_norm": 0.033612762678092996, + "language_loss": 0.86454874, + "learning_rate": 0.00037335106860352587, + "loss": 0.87601447, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.77880859, + "step": 3088, + "time_per_iteration": 2.692692756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148512, + "balance_loss_mlp": 1.07045376, + "epoch": 0.5942670257791458, + "flos": 484307000832.0, + "grad_norm": 0.031191733120893732, + "language_loss": 0.87924445, + "learning_rate": 0.00037304971081093146, + "loss": 0.89072955, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.77978516, + "step": 3089, + "time_per_iteration": 2.568676710128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149298, + "balance_loss_mlp": 1.071383, + "epoch": 0.5944594074644094, + "flos": 549057452544.0, + "grad_norm": 0.027833968511861495, + "language_loss": 0.85559821, + "learning_rate": 0.00037274840230481024, + "loss": 0.86709118, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.77832031, + "step": 3090, + "time_per_iteration": 2.7224090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.07009256, + "epoch": 0.594651789149673, + "flos": 450129022464.0, + "grad_norm": 0.03399265003555819, + "language_loss": 0.85464221, + "learning_rate": 0.00037244714320214077, + "loss": 0.86612326, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.77929688, + "step": 3091, + "time_per_iteration": 2.545518398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07034016, + "epoch": 0.5948441708349365, + "flos": 597465521664.0, + "grad_norm": 0.029759995876706483, + "language_loss": 0.88336015, + "learning_rate": 0.000372145933619882, + "loss": 0.89484322, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.77880859, + "step": 3092, + "time_per_iteration": 2.8612496852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147536, + "balance_loss_mlp": 1.06952572, + "epoch": 0.5950365525202, + "flos": 549580477440.0, + "grad_norm": 0.03567164883764641, + "language_loss": 0.87935793, + "learning_rate": 0.000371844773674974, + "loss": 0.89083326, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.77929688, + "step": 3093, + "time_per_iteration": 2.6431939601898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147858, + "balance_loss_mlp": 1.06980002, + "epoch": 0.5952289342054636, + "flos": 655963691520.0, + "grad_norm": 0.03489323159702664, + "language_loss": 0.87669003, + "learning_rate": 0.0003715436634843375, + "loss": 0.88816857, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.77978516, + "step": 3094, + "time_per_iteration": 2.889326572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115074, + "balance_loss_mlp": 1.07268155, + "epoch": 0.5954213158907272, + "flos": 604603398144.0, + "grad_norm": 0.02937888511977547, + "language_loss": 0.85120195, + "learning_rate": 0.00037124260316487355, + "loss": 0.86270934, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.77978516, + "step": 3095, + "time_per_iteration": 2.8256890773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011487, + "balance_loss_mlp": 1.07064188, + "epoch": 0.5956136975759908, + "flos": 487267957248.0, + "grad_norm": 0.03289727477229571, + "language_loss": 0.94411993, + "learning_rate": 0.0003709415928334643, + "loss": 0.95560694, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.77978516, + "step": 3096, + "time_per_iteration": 2.587526559829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148362, + "balance_loss_mlp": 1.07025576, + "epoch": 0.5958060792612543, + "flos": 660040555008.0, + "grad_norm": 0.03760653483237211, + "language_loss": 0.8629458, + "learning_rate": 0.00037064063260697233, + "loss": 0.8744294, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.78027344, + "step": 3097, + "time_per_iteration": 2.8921737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149624, + "balance_loss_mlp": 1.07170904, + "epoch": 0.5959984609465179, + "flos": 724995122688.0, + "grad_norm": 0.02933465569925715, + "language_loss": 0.84228349, + "learning_rate": 0.0003703397226022407, + "loss": 0.85377973, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.77832031, + "step": 3098, + "time_per_iteration": 3.0898213386535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115181, + "balance_loss_mlp": 1.07627869, + "epoch": 0.5961908426317815, + "flos": 1523218788864.0, + "grad_norm": 0.004520881067607934, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7665168, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.75585938, + "step": 3099, + "time_per_iteration": 4.9205827713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148148, + "balance_loss_mlp": 1.07023323, + "epoch": 0.596383224317045, + "flos": 533646326784.0, + "grad_norm": 0.03064762726337019, + "language_loss": 0.87394881, + "learning_rate": 0.0003697380537253339, + "loss": 0.88543034, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.77832031, + "step": 3100, + "time_per_iteration": 2.6238889694213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07065213, + "epoch": 0.5965756060023086, + "flos": 592366076928.0, + "grad_norm": 0.03279417600266174, + "language_loss": 0.87095284, + "learning_rate": 0.0003694372950867471, + "loss": 0.88243759, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.77734375, + "step": 3101, + "time_per_iteration": 2.754004955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.0715313, + "epoch": 0.5967679876875721, + "flos": 863469493248.0, + "grad_norm": 0.096940863219985, + "language_loss": 0.82642257, + "learning_rate": 0.0003691365871370976, + "loss": 0.83791614, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.77734375, + "step": 3102, + "time_per_iteration": 3.027898073196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148471, + "balance_loss_mlp": 1.07065165, + "epoch": 0.5969603693728357, + "flos": 554877307392.0, + "grad_norm": 0.03194116769832037, + "language_loss": 0.90513253, + "learning_rate": 0.00036883592999313093, + "loss": 0.91661727, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.77734375, + "step": 3103, + "time_per_iteration": 2.6555323600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114931, + "balance_loss_mlp": 1.07158601, + "epoch": 0.5971527510580993, + "flos": 719936610816.0, + "grad_norm": 0.037867869271097296, + "language_loss": 0.85018742, + "learning_rate": 0.0003685353237715722, + "loss": 0.86168051, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.77636719, + "step": 3104, + "time_per_iteration": 2.88739013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115032, + "balance_loss_mlp": 1.07245219, + "epoch": 0.5973451327433629, + "flos": 648862745088.0, + "grad_norm": 0.032062315519195535, + "language_loss": 0.86408043, + "learning_rate": 0.0003682347685891274, + "loss": 0.87558353, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.77783203, + "step": 3105, + "time_per_iteration": 2.8420920372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.07162631, + "epoch": 0.5975375144286263, + "flos": 723088948224.0, + "grad_norm": 0.03318206210872103, + "language_loss": 0.86870039, + "learning_rate": 0.0003679342645624822, + "loss": 0.88019389, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.77636719, + "step": 3106, + "time_per_iteration": 2.995124578475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150114, + "balance_loss_mlp": 1.07248521, + "epoch": 0.5977298961138899, + "flos": 752343399936.0, + "grad_norm": 0.029134934835651077, + "language_loss": 0.86725187, + "learning_rate": 0.0003676338118083025, + "loss": 0.87875295, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.77539062, + "step": 3107, + "time_per_iteration": 2.972302198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150139, + "balance_loss_mlp": 1.07251036, + "epoch": 0.5979222777991535, + "flos": 531998662656.0, + "grad_norm": 0.035100601373903646, + "language_loss": 0.857481, + "learning_rate": 0.0003673334104432347, + "loss": 0.86898237, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.77539062, + "step": 3108, + "time_per_iteration": 2.6626758575439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149837, + "balance_loss_mlp": 1.07230318, + "epoch": 0.5981146594844171, + "flos": 622914355200.0, + "grad_norm": 0.0316193314504938, + "language_loss": 0.88024735, + "learning_rate": 0.0003670330605839048, + "loss": 0.89174569, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.77441406, + "step": 3109, + "time_per_iteration": 2.8445565700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149651, + "balance_loss_mlp": 1.07216513, + "epoch": 0.5983070411696807, + "flos": 604709458944.0, + "grad_norm": 0.030685816325192888, + "language_loss": 0.81470084, + "learning_rate": 0.0003667327623469191, + "loss": 0.82619739, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.77392578, + "step": 3110, + "time_per_iteration": 2.7507362365722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151646, + "balance_loss_mlp": 1.07406473, + "epoch": 0.5984994228549442, + "flos": 634669584384.0, + "grad_norm": 0.03251456811802211, + "language_loss": 0.83321273, + "learning_rate": 0.00036643251584886333, + "loss": 0.84472924, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.77490234, + "step": 3111, + "time_per_iteration": 2.816390037536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156112, + "balance_loss_mlp": 1.07848299, + "epoch": 0.5986918045402078, + "flos": 526293600768.0, + "grad_norm": 0.03439308421341756, + "language_loss": 0.88026524, + "learning_rate": 0.00036613232120630393, + "loss": 0.89182639, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.77539062, + "step": 3112, + "time_per_iteration": 2.610931396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151048, + "balance_loss_mlp": 1.07332325, + "epoch": 0.5988841862254713, + "flos": 484139814912.0, + "grad_norm": 0.040537518995664656, + "language_loss": 0.85835981, + "learning_rate": 0.00036583217853578643, + "loss": 0.86987036, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.77636719, + "step": 3113, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.07369328, + "epoch": 0.5990765679107349, + "flos": 1142121745920.0, + "grad_norm": 0.03045218931470109, + "language_loss": 0.82758361, + "learning_rate": 0.000365532087953837, + "loss": 0.83909732, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.77587891, + "step": 3114, + "time_per_iteration": 3.635089159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150692, + "balance_loss_mlp": 1.07282436, + "epoch": 0.5992689495959984, + "flos": 518018350080.0, + "grad_norm": 0.03475345450765353, + "language_loss": 0.94564217, + "learning_rate": 0.00036523204957696065, + "loss": 0.95714909, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.77783203, + "step": 3115, + "time_per_iteration": 2.6130504608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150235, + "balance_loss_mlp": 1.07231951, + "epoch": 0.599461331281262, + "flos": 745941396480.0, + "grad_norm": 0.03954805443520273, + "language_loss": 0.86356986, + "learning_rate": 0.00036493206352164324, + "loss": 0.87507224, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.77832031, + "step": 3116, + "time_per_iteration": 2.902606964111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115079, + "balance_loss_mlp": 1.07282686, + "epoch": 0.5996537129665256, + "flos": 593483985408.0, + "grad_norm": 0.030263025154964335, + "language_loss": 0.90265405, + "learning_rate": 0.000364632129904349, + "loss": 0.91416192, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.77880859, + "step": 3117, + "time_per_iteration": 2.728739023208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148018, + "balance_loss_mlp": 1.0701983, + "epoch": 0.5998460946517892, + "flos": 560115740160.0, + "grad_norm": 0.03726043771871862, + "language_loss": 0.8256759, + "learning_rate": 0.00036433224884152283, + "loss": 0.83715606, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.77734375, + "step": 3118, + "time_per_iteration": 2.7763798236846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146842, + "balance_loss_mlp": 1.06897449, + "epoch": 0.6000384763370528, + "flos": 485535699456.0, + "grad_norm": 0.03789921911219481, + "language_loss": 0.83006287, + "learning_rate": 0.00036403242044958875, + "loss": 0.84153128, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.77783203, + "step": 3119, + "time_per_iteration": 2.549102783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156248, + "balance_loss_mlp": 1.07842839, + "epoch": 0.6002308580223162, + "flos": 597877756416.0, + "grad_norm": 0.03490542571663494, + "language_loss": 0.96794367, + "learning_rate": 0.0003637326448449507, + "loss": 0.97950613, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.77734375, + "step": 3120, + "time_per_iteration": 2.7004034519195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153861, + "balance_loss_mlp": 1.07608855, + "epoch": 0.6004232397075798, + "flos": 546220021248.0, + "grad_norm": 0.03097014244858331, + "language_loss": 0.90828121, + "learning_rate": 0.00036343292214399177, + "loss": 0.91981983, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.77685547, + "step": 3121, + "time_per_iteration": 2.7137558460235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149916, + "balance_loss_mlp": 1.07195354, + "epoch": 0.6006156213928434, + "flos": 631150674432.0, + "grad_norm": 0.035271472923777164, + "language_loss": 0.82629979, + "learning_rate": 0.00036313325246307456, + "loss": 0.83779889, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.77880859, + "step": 3122, + "time_per_iteration": 2.7764761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149471, + "balance_loss_mlp": 1.07179451, + "epoch": 0.600808003078107, + "flos": 583404618240.0, + "grad_norm": 0.03572948741638757, + "language_loss": 0.92888528, + "learning_rate": 0.0003628336359185411, + "loss": 0.94037998, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.77587891, + "step": 3123, + "time_per_iteration": 2.658597707748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149832, + "balance_loss_mlp": 1.07215571, + "epoch": 0.6010003847633705, + "flos": 636438772224.0, + "grad_norm": 0.033415641646833916, + "language_loss": 0.81693363, + "learning_rate": 0.000362534072626713, + "loss": 0.8284319, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.77587891, + "step": 3124, + "time_per_iteration": 2.7385804653167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.06857181, + "epoch": 0.6011927664486341, + "flos": 720029936640.0, + "grad_norm": 0.0314556326919405, + "language_loss": 0.85929549, + "learning_rate": 0.00036223456270389093, + "loss": 0.87075609, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.77392578, + "step": 3125, + "time_per_iteration": 2.9184412956237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148457, + "balance_loss_mlp": 1.0710187, + "epoch": 0.6013851481338977, + "flos": 500054499840.0, + "grad_norm": 0.03211121673376429, + "language_loss": 0.85866034, + "learning_rate": 0.00036193510626635517, + "loss": 0.87014484, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.7734375, + "step": 3126, + "time_per_iteration": 2.6580941677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151789, + "balance_loss_mlp": 1.07439816, + "epoch": 0.6015775298191612, + "flos": 750875656704.0, + "grad_norm": 0.03289877663507899, + "language_loss": 0.86000574, + "learning_rate": 0.0003616357034303649, + "loss": 0.87152362, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.77294922, + "step": 3127, + "time_per_iteration": 2.925900459289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154509, + "balance_loss_mlp": 1.07730949, + "epoch": 0.6017699115044248, + "flos": 594263519232.0, + "grad_norm": 0.026386451784686567, + "language_loss": 0.83912927, + "learning_rate": 0.0003613363543121584, + "loss": 0.85067433, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.77099609, + "step": 3128, + "time_per_iteration": 2.8285086154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149104, + "balance_loss_mlp": 1.07185686, + "epoch": 0.6019622931896883, + "flos": 516201498624.0, + "grad_norm": 0.032335523729292034, + "language_loss": 0.89489174, + "learning_rate": 0.00036103705902795357, + "loss": 0.90638286, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.77148438, + "step": 3129, + "time_per_iteration": 2.7369625568389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153149, + "balance_loss_mlp": 1.0759964, + "epoch": 0.6021546748749519, + "flos": 491473075200.0, + "grad_norm": 0.037053521707819316, + "language_loss": 0.86282051, + "learning_rate": 0.0003607378176939471, + "loss": 0.87435198, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.77050781, + "step": 3130, + "time_per_iteration": 2.6015982627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155234, + "balance_loss_mlp": 1.07832015, + "epoch": 0.6023470565602155, + "flos": 542114959872.0, + "grad_norm": 0.03769359789833061, + "language_loss": 0.87922359, + "learning_rate": 0.00036043863042631465, + "loss": 0.89077592, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.76806641, + "step": 3131, + "time_per_iteration": 2.870999813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151659, + "balance_loss_mlp": 1.07436335, + "epoch": 0.6025394382454791, + "flos": 846463096320.0, + "grad_norm": 0.03206429015818981, + "language_loss": 0.81416667, + "learning_rate": 0.00036013949734121133, + "loss": 0.82568324, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.77197266, + "step": 3132, + "time_per_iteration": 3.1543962955474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115191, + "balance_loss_mlp": 1.0745194, + "epoch": 0.6027318199307425, + "flos": 578257509888.0, + "grad_norm": 0.03267549496137676, + "language_loss": 0.87371534, + "learning_rate": 0.00035984041855477043, + "loss": 0.88523442, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.77294922, + "step": 3133, + "time_per_iteration": 2.7443673610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143524, + "balance_loss_mlp": 1.06837463, + "epoch": 0.6029242016160061, + "flos": 1474252766208.0, + "grad_norm": 0.006811691070041734, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79853421, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.75195312, + "step": 3134, + "time_per_iteration": 4.92242431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145401, + "balance_loss_mlp": 1.06810546, + "epoch": 0.6031165833012697, + "flos": 481782474240.0, + "grad_norm": 0.029444679170183622, + "language_loss": 0.84435833, + "learning_rate": 0.00035924242434230637, + "loss": 0.85581231, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.77197266, + "step": 3135, + "time_per_iteration": 2.6391186714172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154302, + "balance_loss_mlp": 1.07700658, + "epoch": 0.6033089649865333, + "flos": 500464733184.0, + "grad_norm": 0.036345783287305373, + "language_loss": 0.85093319, + "learning_rate": 0.00035894350914844516, + "loss": 0.86247623, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.77197266, + "step": 3136, + "time_per_iteration": 2.6352477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150224, + "balance_loss_mlp": 1.07259464, + "epoch": 0.6035013466717969, + "flos": 557723470848.0, + "grad_norm": 0.0365408898732846, + "language_loss": 0.89268684, + "learning_rate": 0.0003586446487175703, + "loss": 0.90418905, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.77539062, + "step": 3137, + "time_per_iteration": 2.693071126937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149547, + "balance_loss_mlp": 1.07215679, + "epoch": 0.6036937283570604, + "flos": 595995777024.0, + "grad_norm": 0.02904364912520073, + "language_loss": 0.90167797, + "learning_rate": 0.0003583458431657099, + "loss": 0.91317338, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.77294922, + "step": 3138, + "time_per_iteration": 2.738223075866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.07178628, + "epoch": 0.603886110042324, + "flos": 542058564096.0, + "grad_norm": 0.037255533971674665, + "language_loss": 0.87546921, + "learning_rate": 0.00035804709260887056, + "loss": 0.88696241, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.77441406, + "step": 3139, + "time_per_iteration": 2.6814053058624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_mlp": 1.07072818, + "epoch": 0.6040784917275875, + "flos": 519655280640.0, + "grad_norm": 0.02881429249122551, + "language_loss": 0.93902391, + "learning_rate": 0.0003577483971630373, + "loss": 0.95050937, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.77734375, + "step": 3140, + "time_per_iteration": 2.6691088676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011483, + "balance_loss_mlp": 1.07052839, + "epoch": 0.6042708734128511, + "flos": 662013858816.0, + "grad_norm": 0.0304544298908833, + "language_loss": 0.89555264, + "learning_rate": 0.00035744975694417414, + "loss": 0.90703559, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.77685547, + "step": 3141, + "time_per_iteration": 2.872135877609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148852, + "balance_loss_mlp": 1.07107973, + "epoch": 0.6044632550981146, + "flos": 573516632064.0, + "grad_norm": 0.03378277324120908, + "language_loss": 0.88105464, + "learning_rate": 0.00035715117206822344, + "loss": 0.89254314, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.77685547, + "step": 3142, + "time_per_iteration": 2.790640354156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150783, + "balance_loss_mlp": 1.07315397, + "epoch": 0.6046556367833782, + "flos": 547728697344.0, + "grad_norm": 0.0341385163456541, + "language_loss": 0.86351824, + "learning_rate": 0.0003568526426511065, + "loss": 0.87502599, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.77539062, + "step": 3143, + "time_per_iteration": 2.622870683670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150768, + "balance_loss_mlp": 1.07318711, + "epoch": 0.6048480184686418, + "flos": 778174268928.0, + "grad_norm": 0.03443143260722225, + "language_loss": 0.88285363, + "learning_rate": 0.000356554168808722, + "loss": 0.89436138, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.77490234, + "step": 3144, + "time_per_iteration": 2.9785499572753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151515, + "balance_loss_mlp": 1.07393324, + "epoch": 0.6050404001539054, + "flos": 658375426560.0, + "grad_norm": 0.03050523278027174, + "language_loss": 0.89547616, + "learning_rate": 0.00035625575065694837, + "loss": 0.9069913, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.77490234, + "step": 3145, + "time_per_iteration": 2.893160343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151106, + "balance_loss_mlp": 1.07347679, + "epoch": 0.605232781839169, + "flos": 550082035200.0, + "grad_norm": 0.03434592875619572, + "language_loss": 0.82820475, + "learning_rate": 0.0003559573883116415, + "loss": 0.83971578, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.77539062, + "step": 3146, + "time_per_iteration": 2.703378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152026, + "balance_loss_mlp": 1.07434905, + "epoch": 0.6054251635244324, + "flos": 606641829888.0, + "grad_norm": 0.028306929425565355, + "language_loss": 0.90180922, + "learning_rate": 0.00035565908188863604, + "loss": 0.91332948, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.77587891, + "step": 3147, + "time_per_iteration": 2.8178632259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149693, + "balance_loss_mlp": 1.07201612, + "epoch": 0.605617545209696, + "flos": 614808291840.0, + "grad_norm": 0.03167283444801755, + "language_loss": 0.85591269, + "learning_rate": 0.00035536083150374464, + "loss": 0.86740971, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.77587891, + "step": 3148, + "time_per_iteration": 2.7630088329315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151665, + "balance_loss_mlp": 1.07613373, + "epoch": 0.6058099268949596, + "flos": 1501607774208.0, + "grad_norm": 0.006039709216806875, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75899613, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.75585938, + "step": 3149, + "time_per_iteration": 4.826624870300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148811, + "balance_loss_mlp": 1.07108641, + "epoch": 0.6060023085802232, + "flos": 671704459776.0, + "grad_norm": 0.03325996872858785, + "language_loss": 0.90532559, + "learning_rate": 0.0003547644993114475, + "loss": 0.91681373, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.77636719, + "step": 3150, + "time_per_iteration": 2.802644729614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149051, + "balance_loss_mlp": 1.07127893, + "epoch": 0.6061946902654868, + "flos": 607305844224.0, + "grad_norm": 0.03277875295758358, + "language_loss": 0.85509253, + "learning_rate": 0.00035446641773555806, + "loss": 0.86658305, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.77685547, + "step": 3151, + "time_per_iteration": 2.7055504322052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148261, + "balance_loss_mlp": 1.07082272, + "epoch": 0.6063870719507503, + "flos": 558952169472.0, + "grad_norm": 0.029065175404624204, + "language_loss": 0.91512465, + "learning_rate": 0.000354168392660816, + "loss": 0.92660725, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.7734375, + "step": 3152, + "time_per_iteration": 2.7494730949401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145734, + "balance_loss_mlp": 1.06829596, + "epoch": 0.6065794536360138, + "flos": 558281424384.0, + "grad_norm": 0.03244852665251002, + "language_loss": 0.88397223, + "learning_rate": 0.0003538704242029252, + "loss": 0.89542961, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.7734375, + "step": 3153, + "time_per_iteration": 2.675692558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146957, + "balance_loss_mlp": 1.06932831, + "epoch": 0.6067718353212774, + "flos": 691381102080.0, + "grad_norm": 0.033220307719005866, + "language_loss": 0.83031321, + "learning_rate": 0.0003535725124775672, + "loss": 0.84178281, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.77539062, + "step": 3154, + "time_per_iteration": 2.843881607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156617, + "balance_loss_mlp": 1.07903516, + "epoch": 0.606964217006541, + "flos": 522902945280.0, + "grad_norm": 0.035561743978846455, + "language_loss": 0.91791475, + "learning_rate": 0.00035327465760040126, + "loss": 0.92948091, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.77490234, + "step": 3155, + "time_per_iteration": 2.684056043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158513, + "balance_loss_mlp": 1.08112192, + "epoch": 0.6071565986918045, + "flos": 642712521216.0, + "grad_norm": 0.03594986649837803, + "language_loss": 0.89308429, + "learning_rate": 0.00035297685968706526, + "loss": 0.9046694, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.77294922, + "step": 3156, + "time_per_iteration": 2.7834246158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160171, + "balance_loss_mlp": 1.08278084, + "epoch": 0.6073489803770681, + "flos": 561652614144.0, + "grad_norm": 0.034893913409009325, + "language_loss": 0.88205332, + "learning_rate": 0.00035267911885317454, + "loss": 0.89365506, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.77294922, + "step": 3157, + "time_per_iteration": 2.669710397720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158828, + "balance_loss_mlp": 1.08143747, + "epoch": 0.6075413620623317, + "flos": 587201504256.0, + "grad_norm": 0.030643892610273542, + "language_loss": 0.86383843, + "learning_rate": 0.0003523814352143222, + "loss": 0.87542671, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.77294922, + "step": 3158, + "time_per_iteration": 2.822089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154763, + "balance_loss_mlp": 1.07741952, + "epoch": 0.6077337437475953, + "flos": 631971141120.0, + "grad_norm": 0.03639599054768475, + "language_loss": 0.96294606, + "learning_rate": 0.00035208380888607937, + "loss": 0.97449374, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.77246094, + "step": 3159, + "time_per_iteration": 2.7675912380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156998, + "balance_loss_mlp": 1.08184814, + "epoch": 0.6079261254328588, + "flos": 1471623453696.0, + "grad_norm": 0.01008994969394602, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80618984, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.75195312, + "step": 3160, + "time_per_iteration": 4.839691638946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155136, + "balance_loss_mlp": 1.07998657, + "epoch": 0.6081185071181223, + "flos": 1526203213824.0, + "grad_norm": 0.005930182573689796, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76847368, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.75195312, + "step": 3161, + "time_per_iteration": 4.991135835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154947, + "balance_loss_mlp": 1.07746089, + "epoch": 0.6083108888033859, + "flos": 557434761216.0, + "grad_norm": 0.030736279817991784, + "language_loss": 0.86955488, + "learning_rate": 0.00035119127492038446, + "loss": 0.88110441, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.77392578, + "step": 3162, + "time_per_iteration": 2.8129284381866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115361, + "balance_loss_mlp": 1.07631505, + "epoch": 0.6085032704886495, + "flos": 842555420160.0, + "grad_norm": 0.033332341835850446, + "language_loss": 0.88169372, + "learning_rate": 0.00035089387898984436, + "loss": 0.89322984, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.77197266, + "step": 3163, + "time_per_iteration": 3.0287744998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151661, + "balance_loss_mlp": 1.07412744, + "epoch": 0.6086956521739131, + "flos": 685992947712.0, + "grad_norm": 0.03500074735075155, + "language_loss": 0.87286401, + "learning_rate": 0.0003505965409474343, + "loss": 0.88438058, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.77441406, + "step": 3164, + "time_per_iteration": 2.8668415546417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155467, + "balance_loss_mlp": 1.07802904, + "epoch": 0.6088880338591766, + "flos": 536865793536.0, + "grad_norm": 0.03207560682458212, + "language_loss": 0.90936065, + "learning_rate": 0.0003502992609085913, + "loss": 0.92091525, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.7734375, + "step": 3165, + "time_per_iteration": 2.6344704627990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152911, + "balance_loss_mlp": 1.07552052, + "epoch": 0.6090804155444401, + "flos": 732881607168.0, + "grad_norm": 0.03068132972373785, + "language_loss": 0.86756754, + "learning_rate": 0.00035000203898872954, + "loss": 0.87909669, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.77294922, + "step": 3166, + "time_per_iteration": 3.007883071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151975, + "balance_loss_mlp": 1.07458472, + "epoch": 0.6092727972297037, + "flos": 700242504192.0, + "grad_norm": 0.033743959402083586, + "language_loss": 0.89530504, + "learning_rate": 0.0003497048753032406, + "loss": 0.90682483, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.77294922, + "step": 3167, + "time_per_iteration": 2.903841018676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150053, + "balance_loss_mlp": 1.07285297, + "epoch": 0.6094651789149673, + "flos": 1053676185600.0, + "grad_norm": 0.029535454603069295, + "language_loss": 0.85045445, + "learning_rate": 0.000349407769967494, + "loss": 0.86195493, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.77099609, + "step": 3168, + "time_per_iteration": 3.4178872108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155901, + "balance_loss_mlp": 1.07860577, + "epoch": 0.6096575606002309, + "flos": 504094433280.0, + "grad_norm": 0.02941914211290898, + "language_loss": 0.89039332, + "learning_rate": 0.0003491107230968361, + "loss": 0.90195233, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.77197266, + "step": 3169, + "time_per_iteration": 2.6551673412323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156463, + "balance_loss_mlp": 1.07921588, + "epoch": 0.6098499422854944, + "flos": 586863129600.0, + "grad_norm": 0.02719917666416643, + "language_loss": 0.85504711, + "learning_rate": 0.00034881373480659085, + "loss": 0.86661172, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.77148438, + "step": 3170, + "time_per_iteration": 2.851252317428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157705, + "balance_loss_mlp": 1.08040965, + "epoch": 0.610042323970758, + "flos": 470159502336.0, + "grad_norm": 0.06140035445399593, + "language_loss": 0.85159725, + "learning_rate": 0.0003485168052120594, + "loss": 0.86317426, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.77197266, + "step": 3171, + "time_per_iteration": 2.5498504638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156725, + "balance_loss_mlp": 1.07938242, + "epoch": 0.6102347056560216, + "flos": 515198383104.0, + "grad_norm": 0.03549166492948706, + "language_loss": 0.85369307, + "learning_rate": 0.00034821993442851973, + "loss": 0.86526036, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.77246094, + "step": 3172, + "time_per_iteration": 2.571030378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153351, + "balance_loss_mlp": 1.07600832, + "epoch": 0.6104270873412851, + "flos": 469964118528.0, + "grad_norm": 0.03723847696421654, + "language_loss": 0.87251568, + "learning_rate": 0.00034792312257122735, + "loss": 0.88404918, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.77246094, + "step": 3173, + "time_per_iteration": 2.601289987564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153196, + "balance_loss_mlp": 1.07580578, + "epoch": 0.6106194690265486, + "flos": 550939431936.0, + "grad_norm": 0.03428989424028707, + "language_loss": 0.85585618, + "learning_rate": 0.00034762636975541506, + "loss": 0.86738813, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.77294922, + "step": 3174, + "time_per_iteration": 2.623203754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155286, + "balance_loss_mlp": 1.07784736, + "epoch": 0.6108118507118122, + "flos": 473880526848.0, + "grad_norm": 0.03492975408157665, + "language_loss": 0.85685778, + "learning_rate": 0.0003473296760962923, + "loss": 0.86841059, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.7734375, + "step": 3175, + "time_per_iteration": 2.6674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157349, + "balance_loss_mlp": 1.08181763, + "epoch": 0.6110042323970758, + "flos": 1448180124672.0, + "grad_norm": 0.011972836775056764, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79691088, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.75585938, + "step": 3176, + "time_per_iteration": 4.719567060470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150349, + "balance_loss_mlp": 1.07286298, + "epoch": 0.6111966140823394, + "flos": 795541234176.0, + "grad_norm": 0.03714406101939167, + "language_loss": 0.87063801, + "learning_rate": 0.00034673646670883976, + "loss": 0.88214147, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.77392578, + "step": 3177, + "time_per_iteration": 2.973940134048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155601, + "balance_loss_mlp": 1.0800705, + "epoch": 0.611388995767603, + "flos": 1561063397376.0, + "grad_norm": 0.00949552405530534, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76870626, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.75585938, + "step": 3178, + "time_per_iteration": 5.061004400253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152416, + "balance_loss_mlp": 1.07488239, + "epoch": 0.6115813774528664, + "flos": 713484942336.0, + "grad_norm": 0.03541902083866898, + "language_loss": 0.87553525, + "learning_rate": 0.0003461434953300865, + "loss": 0.88705945, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.77441406, + "step": 3179, + "time_per_iteration": 2.916708469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153239, + "balance_loss_mlp": 1.07556212, + "epoch": 0.61177375913813, + "flos": 685689501696.0, + "grad_norm": 0.028499371872006348, + "language_loss": 0.85970306, + "learning_rate": 0.0003458470991817515, + "loss": 0.87123549, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.77587891, + "step": 3180, + "time_per_iteration": 2.9950902462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115431, + "balance_loss_mlp": 1.07677627, + "epoch": 0.6119661408233936, + "flos": 512667125760.0, + "grad_norm": 0.035557395139189776, + "language_loss": 0.89999539, + "learning_rate": 0.0003455507628808802, + "loss": 0.91153848, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.77441406, + "step": 3181, + "time_per_iteration": 2.5897092819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07629788, + "epoch": 0.6121585225086572, + "flos": 557855728128.0, + "grad_norm": 0.03617294918278912, + "language_loss": 0.90379083, + "learning_rate": 0.00034525448654252076, + "loss": 0.9153282, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.7734375, + "step": 3182, + "time_per_iteration": 2.636446714401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157583, + "balance_loss_mlp": 1.08047891, + "epoch": 0.6123509041939207, + "flos": 562909510656.0, + "grad_norm": 0.037973624968581914, + "language_loss": 0.88617527, + "learning_rate": 0.0003449582702816976, + "loss": 0.89775109, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.77001953, + "step": 3183, + "time_per_iteration": 2.6636195182800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155826, + "balance_loss_mlp": 1.0786258, + "epoch": 0.6125432858791843, + "flos": 559130088960.0, + "grad_norm": 0.03254272947638904, + "language_loss": 0.87538117, + "learning_rate": 0.0003446621142134122, + "loss": 0.88693941, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.77099609, + "step": 3184, + "time_per_iteration": 2.6456782817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154505, + "balance_loss_mlp": 1.07711458, + "epoch": 0.6127356675644479, + "flos": 415896649728.0, + "grad_norm": 0.03534541862410296, + "language_loss": 0.89029509, + "learning_rate": 0.0003443660184526424, + "loss": 0.90184009, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.77294922, + "step": 3185, + "time_per_iteration": 2.4446170330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153301, + "balance_loss_mlp": 1.07586265, + "epoch": 0.6129280492497114, + "flos": 605033097216.0, + "grad_norm": 0.03004060948026975, + "language_loss": 0.92148149, + "learning_rate": 0.0003440699831143429, + "loss": 0.93301451, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.7734375, + "step": 3186, + "time_per_iteration": 2.738818407058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114756, + "balance_loss_mlp": 1.07007372, + "epoch": 0.613120430934975, + "flos": 520864513536.0, + "grad_norm": 0.031842648163895024, + "language_loss": 0.87123644, + "learning_rate": 0.0003437740083134449, + "loss": 0.88271207, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.77392578, + "step": 3187, + "time_per_iteration": 0.013826608657836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145943, + "balance_loss_mlp": 1.06850421, + "epoch": 0.6133128126202385, + "flos": 512080974336.0, + "grad_norm": 0.03697103993803325, + "language_loss": 0.8916111, + "learning_rate": 0.00034347809416485574, + "loss": 0.90307051, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.7734375, + "step": 3188, + "time_per_iteration": 2.626657724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152707, + "balance_loss_mlp": 1.07517374, + "epoch": 0.6135051943055021, + "flos": 608756123136.0, + "grad_norm": 0.032275068446110486, + "language_loss": 0.8676489, + "learning_rate": 0.0003431822407834597, + "loss": 0.87917596, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.77441406, + "step": 3189, + "time_per_iteration": 2.784728765487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153516, + "balance_loss_mlp": 1.07588649, + "epoch": 0.6136975759907657, + "flos": 1162008508416.0, + "grad_norm": 0.035345487562752465, + "language_loss": 0.90027606, + "learning_rate": 0.00034288644828411706, + "loss": 0.91181111, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.77539062, + "step": 3190, + "time_per_iteration": 3.453296661376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147033, + "balance_loss_mlp": 1.06959414, + "epoch": 0.6138899576760293, + "flos": 708172649472.0, + "grad_norm": 0.033974370465757506, + "language_loss": 0.80322051, + "learning_rate": 0.0003425907167816649, + "loss": 0.81469083, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.7734375, + "step": 3191, + "time_per_iteration": 2.9247496128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147023, + "balance_loss_mlp": 1.0697751, + "epoch": 0.6140823393612928, + "flos": 587618468352.0, + "grad_norm": 0.031154822121678163, + "language_loss": 0.89756465, + "learning_rate": 0.00034229504639091623, + "loss": 0.90903485, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.77148438, + "step": 3192, + "time_per_iteration": 2.772437810897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150486, + "balance_loss_mlp": 1.07342911, + "epoch": 0.6142747210465563, + "flos": 805618599936.0, + "grad_norm": 0.03412621705623903, + "language_loss": 0.84789693, + "learning_rate": 0.0003419994372266606, + "loss": 0.85940182, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.76953125, + "step": 3193, + "time_per_iteration": 3.096266984939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148388, + "balance_loss_mlp": 1.07094979, + "epoch": 0.6144671027318199, + "flos": 530544380928.0, + "grad_norm": 0.028061755795717326, + "language_loss": 0.86464483, + "learning_rate": 0.00034170388940366335, + "loss": 0.87612873, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.7734375, + "step": 3194, + "time_per_iteration": 2.6779158115386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152877, + "balance_loss_mlp": 1.07539093, + "epoch": 0.6146594844170835, + "flos": 806912426496.0, + "grad_norm": 0.030674949388275172, + "language_loss": 0.8474896, + "learning_rate": 0.0003414084030366667, + "loss": 0.85901833, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.77392578, + "step": 3195, + "time_per_iteration": 3.106736898422241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153391, + "balance_loss_mlp": 1.07590497, + "epoch": 0.6148518661023471, + "flos": 502761675264.0, + "grad_norm": 0.03337820573482111, + "language_loss": 0.87897015, + "learning_rate": 0.0003411129782403883, + "loss": 0.89050412, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.77392578, + "step": 3196, + "time_per_iteration": 2.643308639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154002, + "balance_loss_mlp": 1.07642102, + "epoch": 0.6150442477876106, + "flos": 511698938880.0, + "grad_norm": 0.038534572595061774, + "language_loss": 0.91158688, + "learning_rate": 0.0003408176151295225, + "loss": 0.92312694, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.77490234, + "step": 3197, + "time_per_iteration": 2.5714070796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157848, + "balance_loss_mlp": 1.08040917, + "epoch": 0.6152366294728742, + "flos": 527997660672.0, + "grad_norm": 0.045085971427018416, + "language_loss": 0.83155811, + "learning_rate": 0.00034052231381873944, + "loss": 0.84313661, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.7734375, + "step": 3198, + "time_per_iteration": 2.607335329055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158567, + "balance_loss_mlp": 1.0808903, + "epoch": 0.6154290111581378, + "flos": 474282028032.0, + "grad_norm": 0.03501094506345523, + "language_loss": 0.90176225, + "learning_rate": 0.00034022707442268494, + "loss": 0.91334796, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.77587891, + "step": 3199, + "time_per_iteration": 2.541625499725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160121, + "balance_loss_mlp": 1.08244419, + "epoch": 0.6156213928434013, + "flos": 551933815296.0, + "grad_norm": 0.028863713644250544, + "language_loss": 0.85985374, + "learning_rate": 0.0003399318970559813, + "loss": 0.87145495, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.77587891, + "step": 3200, + "time_per_iteration": 2.796062707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156186, + "balance_loss_mlp": 1.07850885, + "epoch": 0.6158137745286649, + "flos": 752360864256.0, + "grad_norm": 0.02911689008620782, + "language_loss": 0.8882643, + "learning_rate": 0.00033963678183322656, + "loss": 0.89982617, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.77587891, + "step": 3201, + "time_per_iteration": 3.0142765045166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150573, + "balance_loss_mlp": 1.07313454, + "epoch": 0.6160061562139284, + "flos": 556905005568.0, + "grad_norm": 0.026867696213324778, + "language_loss": 0.87175548, + "learning_rate": 0.0003393417288689945, + "loss": 0.8832612, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.7734375, + "step": 3202, + "time_per_iteration": 2.655984401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149993, + "balance_loss_mlp": 1.07250667, + "epoch": 0.616198537899192, + "flos": 743466534912.0, + "grad_norm": 0.03671255454087467, + "language_loss": 0.83013773, + "learning_rate": 0.00033904673827783504, + "loss": 0.84163767, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.77392578, + "step": 3203, + "time_per_iteration": 2.937826633453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148812, + "balance_loss_mlp": 1.07142162, + "epoch": 0.6163909195844556, + "flos": 479774241792.0, + "grad_norm": 0.030568222552849134, + "language_loss": 0.8708697, + "learning_rate": 0.00033875181017427357, + "loss": 0.88235784, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.77294922, + "step": 3204, + "time_per_iteration": 2.6731438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.07325339, + "epoch": 0.6165833012697192, + "flos": 532665404928.0, + "grad_norm": 0.031792873085422224, + "language_loss": 0.85750729, + "learning_rate": 0.00033845694467281133, + "loss": 0.86901325, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.77246094, + "step": 3205, + "time_per_iteration": 2.876248598098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.07268083, + "epoch": 0.6167756829549826, + "flos": 809293962240.0, + "grad_norm": 0.03236962907615372, + "language_loss": 0.88327932, + "learning_rate": 0.00033816214188792516, + "loss": 0.89477909, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.77197266, + "step": 3206, + "time_per_iteration": 3.1564157009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151124, + "balance_loss_mlp": 1.07378113, + "epoch": 0.6169680646402462, + "flos": 489910004736.0, + "grad_norm": 0.03290410688193805, + "language_loss": 0.91087395, + "learning_rate": 0.00033786740193406784, + "loss": 0.92238522, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.77246094, + "step": 3207, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149659, + "balance_loss_mlp": 1.07236373, + "epoch": 0.6171604463255098, + "flos": 620203176960.0, + "grad_norm": 0.032558146678985676, + "language_loss": 0.86120403, + "learning_rate": 0.00033757272492566736, + "loss": 0.87270063, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.77197266, + "step": 3208, + "time_per_iteration": 2.915374994277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150778, + "balance_loss_mlp": 1.07333994, + "epoch": 0.6173528280107734, + "flos": 529895102976.0, + "grad_norm": 0.029217733611236158, + "language_loss": 0.91618085, + "learning_rate": 0.0003372781109771278, + "loss": 0.9276886, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.7734375, + "step": 3209, + "time_per_iteration": 2.7093894481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158751, + "balance_loss_mlp": 1.08155119, + "epoch": 0.617545209696037, + "flos": 597736766976.0, + "grad_norm": 0.03128870869992161, + "language_loss": 0.81418395, + "learning_rate": 0.0003369835602028281, + "loss": 0.82577139, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.77099609, + "step": 3210, + "time_per_iteration": 2.7591042518615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156243, + "balance_loss_mlp": 1.07885218, + "epoch": 0.6177375913813005, + "flos": 476105610240.0, + "grad_norm": 0.03246928186554176, + "language_loss": 0.85136282, + "learning_rate": 0.0003366890727171232, + "loss": 0.86292523, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.77294922, + "step": 3211, + "time_per_iteration": 2.663344144821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155968, + "balance_loss_mlp": 1.07881546, + "epoch": 0.617929973066564, + "flos": 530880754176.0, + "grad_norm": 0.03620138157042922, + "language_loss": 0.83830607, + "learning_rate": 0.00033639464863434313, + "loss": 0.84986579, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.77050781, + "step": 3212, + "time_per_iteration": 2.6296675205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117601, + "balance_loss_mlp": 1.10105133, + "epoch": 0.6181223547518276, + "flos": 1422832622592.0, + "grad_norm": 0.023588472816246354, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79618478, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.75, + "step": 3213, + "time_per_iteration": 4.6863789558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148522, + "balance_loss_mlp": 1.07122719, + "epoch": 0.6183147364370912, + "flos": 741695345664.0, + "grad_norm": 0.0331085707194938, + "language_loss": 0.84652448, + "learning_rate": 0.00033580599113475543, + "loss": 0.8580097, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.77197266, + "step": 3214, + "time_per_iteration": 2.9692540168762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148587, + "balance_loss_mlp": 1.07138717, + "epoch": 0.6185071181223547, + "flos": 382482742272.0, + "grad_norm": 0.030292285906144818, + "language_loss": 0.9191429, + "learning_rate": 0.00033551175794648507, + "loss": 0.93062878, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.77099609, + "step": 3215, + "time_per_iteration": 2.4922029972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157146, + "balance_loss_mlp": 1.07970774, + "epoch": 0.6186994998076183, + "flos": 464304718848.0, + "grad_norm": 0.029842780568851025, + "language_loss": 0.8691783, + "learning_rate": 0.00033521758861821365, + "loss": 0.88074982, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.7734375, + "step": 3216, + "time_per_iteration": 2.599022150039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152488, + "balance_loss_mlp": 1.07485938, + "epoch": 0.6188918814928819, + "flos": 486252106752.0, + "grad_norm": 0.03103316495727489, + "language_loss": 0.9338237, + "learning_rate": 0.0003349234832641479, + "loss": 0.94534856, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.77539062, + "step": 3217, + "time_per_iteration": 2.602800130844116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152031, + "balance_loss_mlp": 1.0744493, + "epoch": 0.6190842631781455, + "flos": 658597006848.0, + "grad_norm": 0.03734469861973323, + "language_loss": 0.85810769, + "learning_rate": 0.00033462944199846975, + "loss": 0.86962795, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.77490234, + "step": 3218, + "time_per_iteration": 3.070335626602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151884, + "balance_loss_mlp": 1.07425499, + "epoch": 0.619276644863409, + "flos": 404467060224.0, + "grad_norm": 0.03666199268188377, + "language_loss": 0.91774654, + "learning_rate": 0.00033433546493533606, + "loss": 0.92926538, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.77539062, + "step": 3219, + "time_per_iteration": 2.468400716781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149767, + "balance_loss_mlp": 1.07223368, + "epoch": 0.6194690265486725, + "flos": 584240547840.0, + "grad_norm": 0.03534009375651296, + "language_loss": 0.89686239, + "learning_rate": 0.00033404155218887897, + "loss": 0.90836006, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.77441406, + "step": 3220, + "time_per_iteration": 2.695805788040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150834, + "balance_loss_mlp": 1.07329988, + "epoch": 0.6196614082339361, + "flos": 505384257024.0, + "grad_norm": 0.028059763946118966, + "language_loss": 0.91884506, + "learning_rate": 0.00033374770387320534, + "loss": 0.93035334, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.77441406, + "step": 3221, + "time_per_iteration": 2.7483606338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151577, + "balance_loss_mlp": 1.07409084, + "epoch": 0.6198537899191997, + "flos": 576525252096.0, + "grad_norm": 0.031050662157407424, + "language_loss": 0.90087008, + "learning_rate": 0.00033345392010239737, + "loss": 0.91238588, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.77392578, + "step": 3222, + "time_per_iteration": 2.714914560317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114984, + "balance_loss_mlp": 1.07249725, + "epoch": 0.6200461716044633, + "flos": 594302450688.0, + "grad_norm": 0.03255490958660124, + "language_loss": 0.88128847, + "learning_rate": 0.0003331602009905118, + "loss": 0.89278692, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.77246094, + "step": 3223, + "time_per_iteration": 2.7981505393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148595, + "balance_loss_mlp": 1.0711087, + "epoch": 0.6202385532897268, + "flos": 667410745344.0, + "grad_norm": 0.028478674888367996, + "language_loss": 0.88510197, + "learning_rate": 0.00033286654665158085, + "loss": 0.89658791, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.77392578, + "step": 3224, + "time_per_iteration": 2.950357437133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147753, + "balance_loss_mlp": 1.07045746, + "epoch": 0.6204309349749904, + "flos": 485926467072.0, + "grad_norm": 0.03296106773090735, + "language_loss": 0.92470849, + "learning_rate": 0.0003325729571996109, + "loss": 0.93618602, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.77197266, + "step": 3225, + "time_per_iteration": 2.632589340209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150501, + "balance_loss_mlp": 1.07325304, + "epoch": 0.6206233166602539, + "flos": 585217466880.0, + "grad_norm": 0.0318626759985495, + "language_loss": 0.89139777, + "learning_rate": 0.000332279432748584, + "loss": 0.90290284, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.77148438, + "step": 3226, + "time_per_iteration": 2.704615592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149408, + "balance_loss_mlp": 1.07235157, + "epoch": 0.6208156983455175, + "flos": 477911728128.0, + "grad_norm": 0.029634304247413663, + "language_loss": 0.91940343, + "learning_rate": 0.00033198597341245576, + "loss": 0.93089747, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.76953125, + "step": 3227, + "time_per_iteration": 2.582554340362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149434, + "balance_loss_mlp": 1.07228148, + "epoch": 0.6210080800307811, + "flos": 790467985920.0, + "grad_norm": 0.031063189419047472, + "language_loss": 0.86885202, + "learning_rate": 0.00033169257930515763, + "loss": 0.88034642, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.77050781, + "step": 3228, + "time_per_iteration": 3.0251591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152373, + "balance_loss_mlp": 1.07526827, + "epoch": 0.6212004617160446, + "flos": 608916578304.0, + "grad_norm": 0.037247869916732776, + "language_loss": 0.87339175, + "learning_rate": 0.0003313992505405951, + "loss": 0.88491547, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.77001953, + "step": 3229, + "time_per_iteration": 2.697026014328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149107, + "balance_loss_mlp": 1.07209802, + "epoch": 0.6213928434013082, + "flos": 587611737600.0, + "grad_norm": 0.03555615318912057, + "language_loss": 0.87367719, + "learning_rate": 0.0003311059872326487, + "loss": 0.88516825, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.76904297, + "step": 3230, + "time_per_iteration": 2.7712976932525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.08017004, + "epoch": 0.6215852250865718, + "flos": 537108840960.0, + "grad_norm": 0.03130868556859839, + "language_loss": 0.84262764, + "learning_rate": 0.0003308127894951734, + "loss": 0.85419852, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.76806641, + "step": 3231, + "time_per_iteration": 2.6406192779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114749, + "balance_loss_mlp": 1.07038534, + "epoch": 0.6217776067718354, + "flos": 619312852992.0, + "grad_norm": 0.034917389789924605, + "language_loss": 0.91667497, + "learning_rate": 0.00033051965744199834, + "loss": 0.92814988, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.77001953, + "step": 3232, + "time_per_iteration": 2.750717878341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147898, + "balance_loss_mlp": 1.07084131, + "epoch": 0.6219699884570988, + "flos": 547099611648.0, + "grad_norm": 0.02871355385068571, + "language_loss": 0.9457683, + "learning_rate": 0.0003302265911869276, + "loss": 0.95724726, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.76953125, + "step": 3233, + "time_per_iteration": 2.930553436279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147703, + "balance_loss_mlp": 1.07059801, + "epoch": 0.6221623701423624, + "flos": 482155777536.0, + "grad_norm": 0.03278824818574476, + "language_loss": 0.89681149, + "learning_rate": 0.0003299335908437397, + "loss": 0.90828854, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.77001953, + "step": 3234, + "time_per_iteration": 2.5631237030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_mlp": 1.07211912, + "epoch": 0.622354751827626, + "flos": 380872008192.0, + "grad_norm": 0.04189689360611541, + "language_loss": 0.86520332, + "learning_rate": 0.0003296406565261873, + "loss": 0.8766942, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.76855469, + "step": 3235, + "time_per_iteration": 2.457258701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.07129693, + "epoch": 0.6225471335128896, + "flos": 669071144448.0, + "grad_norm": 0.03023362442836584, + "language_loss": 0.89682841, + "learning_rate": 0.0003293477883479978, + "loss": 0.90831059, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.76806641, + "step": 3236, + "time_per_iteration": 2.8200809955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148687, + "balance_loss_mlp": 1.07172537, + "epoch": 0.6227395151981532, + "flos": 772627660800.0, + "grad_norm": 0.038353629459733245, + "language_loss": 0.85627455, + "learning_rate": 0.0003290549864228727, + "loss": 0.86776143, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.76855469, + "step": 3237, + "time_per_iteration": 2.9402804374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151344, + "balance_loss_mlp": 1.07419205, + "epoch": 0.6229318968834167, + "flos": 485357779968.0, + "grad_norm": 0.030356371486713406, + "language_loss": 0.91371596, + "learning_rate": 0.0003287622508644875, + "loss": 0.92522943, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.77050781, + "step": 3238, + "time_per_iteration": 2.761613368988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152584, + "balance_loss_mlp": 1.07543159, + "epoch": 0.6231242785686802, + "flos": 463877021184.0, + "grad_norm": 0.03773116735562404, + "language_loss": 0.92044532, + "learning_rate": 0.0003284695817864923, + "loss": 0.93197119, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.77050781, + "step": 3239, + "time_per_iteration": 2.496115207672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152871, + "balance_loss_mlp": 1.07562304, + "epoch": 0.6233166602539438, + "flos": 610210404864.0, + "grad_norm": 0.04001521730964561, + "language_loss": 0.91216815, + "learning_rate": 0.0003281769793025116, + "loss": 0.92369688, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.77148438, + "step": 3240, + "time_per_iteration": 2.737149953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153333, + "balance_loss_mlp": 1.07613325, + "epoch": 0.6235090419392074, + "flos": 440114783232.0, + "grad_norm": 0.039001077055099004, + "language_loss": 0.95066154, + "learning_rate": 0.00032788444352614346, + "loss": 0.9621948, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.77099609, + "step": 3241, + "time_per_iteration": 2.5000274181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152362, + "balance_loss_mlp": 1.07520986, + "epoch": 0.6237014236244709, + "flos": 505900551168.0, + "grad_norm": 0.03351386174888394, + "language_loss": 0.86000109, + "learning_rate": 0.0003275919745709606, + "loss": 0.87152469, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.77050781, + "step": 3242, + "time_per_iteration": 2.5560779571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150845, + "balance_loss_mlp": 1.07359755, + "epoch": 0.6238938053097345, + "flos": 513995880960.0, + "grad_norm": 0.02989991495254077, + "language_loss": 0.86827087, + "learning_rate": 0.00032729957255050936, + "loss": 0.87977934, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.77148438, + "step": 3243, + "time_per_iteration": 2.7240655422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151606, + "balance_loss_mlp": 1.07440567, + "epoch": 0.6240861869949981, + "flos": 738021984768.0, + "grad_norm": 0.03287270457650662, + "language_loss": 0.87638962, + "learning_rate": 0.0003270072375783102, + "loss": 0.88790572, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.77099609, + "step": 3244, + "time_per_iteration": 2.9896130561828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151989, + "balance_loss_mlp": 1.07469356, + "epoch": 0.6242785686802617, + "flos": 495708392448.0, + "grad_norm": 0.032661081616998364, + "language_loss": 0.84373832, + "learning_rate": 0.00032671496976785774, + "loss": 0.85525823, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.77197266, + "step": 3245, + "time_per_iteration": 2.635254144668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152307, + "balance_loss_mlp": 1.0751549, + "epoch": 0.6244709503655252, + "flos": 747233221632.0, + "grad_norm": 0.0292375931838659, + "language_loss": 0.80339247, + "learning_rate": 0.0003264227692326205, + "loss": 0.81491554, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.77050781, + "step": 3246, + "time_per_iteration": 3.037773609161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.07523346, + "epoch": 0.6246633320507887, + "flos": 493550438400.0, + "grad_norm": 0.03477244782189641, + "language_loss": 0.90644753, + "learning_rate": 0.00032613063608604055, + "loss": 0.91797233, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.77148438, + "step": 3247, + "time_per_iteration": 2.537938117980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151924, + "balance_loss_mlp": 1.07462883, + "epoch": 0.6248557137360523, + "flos": 518391653376.0, + "grad_norm": 0.03220304016525991, + "language_loss": 0.89104807, + "learning_rate": 0.0003258385704415343, + "loss": 0.90256733, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.77197266, + "step": 3248, + "time_per_iteration": 2.6050169467926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.08005083, + "epoch": 0.6250480954213159, + "flos": 520428083712.0, + "grad_norm": 0.030644735245645434, + "language_loss": 0.87455463, + "learning_rate": 0.0003255465724124915, + "loss": 0.88612568, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.76953125, + "step": 3249, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152527, + "balance_loss_mlp": 1.07532752, + "epoch": 0.6252404771065795, + "flos": 517069628928.0, + "grad_norm": 0.031780137669166014, + "language_loss": 0.87919134, + "learning_rate": 0.00032525464211227587, + "loss": 0.89071667, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.77099609, + "step": 3250, + "time_per_iteration": 2.601846933364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150948, + "balance_loss_mlp": 1.07403469, + "epoch": 0.6254328587918431, + "flos": 577996998144.0, + "grad_norm": 0.033725560308058275, + "language_loss": 0.90909386, + "learning_rate": 0.0003249627796542249, + "loss": 0.92060328, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.76806641, + "step": 3251, + "time_per_iteration": 2.653550148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152607, + "balance_loss_mlp": 1.07578814, + "epoch": 0.6256252404771065, + "flos": 599104453632.0, + "grad_norm": 0.030197281894512866, + "language_loss": 0.89177507, + "learning_rate": 0.00032467098515164943, + "loss": 0.90330118, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.76708984, + "step": 3252, + "time_per_iteration": 2.896319627761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153096, + "balance_loss_mlp": 1.07622945, + "epoch": 0.6258176221623701, + "flos": 509361063936.0, + "grad_norm": 0.03670659852857571, + "language_loss": 0.90126091, + "learning_rate": 0.00032437925871783456, + "loss": 0.91279185, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.76757812, + "step": 3253, + "time_per_iteration": 2.6326792240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151923, + "balance_loss_mlp": 1.07500935, + "epoch": 0.6260100038476337, + "flos": 640804345344.0, + "grad_norm": 0.03617334498196145, + "language_loss": 0.90267026, + "learning_rate": 0.00032408760046603803, + "loss": 0.91418946, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.76806641, + "step": 3254, + "time_per_iteration": 2.803849697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.07458711, + "epoch": 0.6262023855328973, + "flos": 842451360768.0, + "grad_norm": 0.034269487661108974, + "language_loss": 0.82522523, + "learning_rate": 0.00032379601050949193, + "loss": 0.83674121, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.76904297, + "step": 3255, + "time_per_iteration": 3.1005427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150946, + "balance_loss_mlp": 1.07422304, + "epoch": 0.6263947672181608, + "flos": 523156726272.0, + "grad_norm": 0.032816276182318284, + "language_loss": 0.93856758, + "learning_rate": 0.0003235044889614013, + "loss": 0.950077, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.76611328, + "step": 3256, + "time_per_iteration": 2.6180245876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151059, + "balance_loss_mlp": 1.07419276, + "epoch": 0.6265871489034244, + "flos": 608289494016.0, + "grad_norm": 0.03305761610211967, + "language_loss": 0.8896969, + "learning_rate": 0.0003232130359349451, + "loss": 0.90120745, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.76757812, + "step": 3257, + "time_per_iteration": 2.845158576965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152664, + "balance_loss_mlp": 1.07579827, + "epoch": 0.626779530588688, + "flos": 589593773568.0, + "grad_norm": 0.030590175923720698, + "language_loss": 0.86119747, + "learning_rate": 0.0003229216515432751, + "loss": 0.87272418, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.76757812, + "step": 3258, + "time_per_iteration": 2.776336193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151745, + "balance_loss_mlp": 1.07473612, + "epoch": 0.6269719122739515, + "flos": 439537363968.0, + "grad_norm": 0.03493081590414929, + "language_loss": 0.86540627, + "learning_rate": 0.0003226303358995174, + "loss": 0.87692368, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.76904297, + "step": 3259, + "time_per_iteration": 2.589393377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151325, + "balance_loss_mlp": 1.07431602, + "epoch": 0.6271642939592151, + "flos": 564014684160.0, + "grad_norm": 0.02751327310294224, + "language_loss": 0.92896867, + "learning_rate": 0.00032233908911677, + "loss": 0.9404819, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.76904297, + "step": 3260, + "time_per_iteration": 2.834845781326294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_mlp": 1.07153916, + "epoch": 0.6273566756444786, + "flos": 515652277248.0, + "grad_norm": 0.03305165048168085, + "language_loss": 0.86257023, + "learning_rate": 0.0003220479113081053, + "loss": 0.87405574, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.76904297, + "step": 3261, + "time_per_iteration": 2.7153472900390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151278, + "balance_loss_mlp": 1.07431674, + "epoch": 0.6275490573297422, + "flos": 586587154944.0, + "grad_norm": 0.03255760599660819, + "language_loss": 0.84347677, + "learning_rate": 0.00032175680258656836, + "loss": 0.85498953, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.76855469, + "step": 3262, + "time_per_iteration": 2.7178304195404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153298, + "balance_loss_mlp": 1.07638431, + "epoch": 0.6277414390150058, + "flos": 560543437824.0, + "grad_norm": 0.03084786969473793, + "language_loss": 0.84701777, + "learning_rate": 0.00032146576306517794, + "loss": 0.85855073, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.76806641, + "step": 3263, + "time_per_iteration": 2.730602502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153924, + "balance_loss_mlp": 1.07686687, + "epoch": 0.6279338207002694, + "flos": 613840104960.0, + "grad_norm": 0.03145910939226107, + "language_loss": 0.86918247, + "learning_rate": 0.0003211747928569255, + "loss": 0.88072169, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.76953125, + "step": 3264, + "time_per_iteration": 2.724712371826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155736, + "balance_loss_mlp": 1.07882273, + "epoch": 0.6281262023855329, + "flos": 626932821504.0, + "grad_norm": 0.028624354652689574, + "language_loss": 0.87177598, + "learning_rate": 0.0003208838920747754, + "loss": 0.88333333, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.76806641, + "step": 3265, + "time_per_iteration": 2.830962896347046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.07405066, + "epoch": 0.6283185840707964, + "flos": 1125418795008.0, + "grad_norm": 0.03154411123335471, + "language_loss": 0.82117403, + "learning_rate": 0.0003205930608316656, + "loss": 0.83268464, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.76904297, + "step": 3266, + "time_per_iteration": 3.4846274852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152648, + "balance_loss_mlp": 1.07573402, + "epoch": 0.62851096575606, + "flos": 516331754496.0, + "grad_norm": 0.032694316072136534, + "language_loss": 0.89774895, + "learning_rate": 0.00032030229924050673, + "loss": 0.90927541, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.76806641, + "step": 3267, + "time_per_iteration": 2.6537904739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150261, + "balance_loss_mlp": 1.07320464, + "epoch": 0.6287033474413236, + "flos": 405061943808.0, + "grad_norm": 0.03610764341116815, + "language_loss": 0.86379248, + "learning_rate": 0.00032001160741418247, + "loss": 0.8752951, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.76953125, + "step": 3268, + "time_per_iteration": 2.6072278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.0729655, + "epoch": 0.6288957291265872, + "flos": 526758228480.0, + "grad_norm": 0.03519251125136882, + "language_loss": 0.87577492, + "learning_rate": 0.0003197209854655494, + "loss": 0.88727468, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.76904297, + "step": 3269, + "time_per_iteration": 2.624221086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151498, + "balance_loss_mlp": 1.07458413, + "epoch": 0.6290881108118507, + "flos": 604957235712.0, + "grad_norm": 0.03303529236450534, + "language_loss": 0.79662859, + "learning_rate": 0.0003194304335074371, + "loss": 0.80814356, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.76806641, + "step": 3270, + "time_per_iteration": 2.842299461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153904, + "balance_loss_mlp": 1.07703781, + "epoch": 0.6292804924971143, + "flos": 438597374976.0, + "grad_norm": 0.03323676651467279, + "language_loss": 0.93520898, + "learning_rate": 0.0003191399516526475, + "loss": 0.94674796, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.76757812, + "step": 3271, + "time_per_iteration": 2.534921169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151771, + "balance_loss_mlp": 1.07500029, + "epoch": 0.6294728741823779, + "flos": 607844332032.0, + "grad_norm": 0.029188592887849887, + "language_loss": 0.84005713, + "learning_rate": 0.0003188495400139559, + "loss": 0.8515749, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.76660156, + "step": 3272, + "time_per_iteration": 2.783825397491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149208, + "balance_loss_mlp": 1.07229424, + "epoch": 0.6296652558676414, + "flos": 702773761536.0, + "grad_norm": 0.03427526038841549, + "language_loss": 0.89267194, + "learning_rate": 0.00031855919870411013, + "loss": 0.90416408, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.76806641, + "step": 3273, + "time_per_iteration": 2.8276174068450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148805, + "balance_loss_mlp": 1.07189095, + "epoch": 0.6298576375529049, + "flos": 524943378432.0, + "grad_norm": 0.029237647029809653, + "language_loss": 0.89991713, + "learning_rate": 0.0003182689278358305, + "loss": 0.91140521, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.76806641, + "step": 3274, + "time_per_iteration": 2.706908941268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148394, + "balance_loss_mlp": 1.07143247, + "epoch": 0.6300500192381685, + "flos": 476926076928.0, + "grad_norm": 0.034587260543346605, + "language_loss": 0.85421312, + "learning_rate": 0.0003179787275218105, + "loss": 0.86569709, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.76855469, + "step": 3275, + "time_per_iteration": 2.537382125854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147946, + "balance_loss_mlp": 1.07117569, + "epoch": 0.6302424009234321, + "flos": 521891097600.0, + "grad_norm": 0.02794771765960627, + "language_loss": 0.8894403, + "learning_rate": 0.0003176885978747155, + "loss": 0.9009198, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.76660156, + "step": 3276, + "time_per_iteration": 2.6045258045196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148975, + "balance_loss_mlp": 1.07225204, + "epoch": 0.6304347826086957, + "flos": 695857465344.0, + "grad_norm": 0.03251661514625025, + "language_loss": 0.87684363, + "learning_rate": 0.0003173985390071839, + "loss": 0.88833332, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.76611328, + "step": 3277, + "time_per_iteration": 2.858759641647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167786, + "balance_loss_mlp": 1.09187317, + "epoch": 0.6306271642939593, + "flos": 1470030183936.0, + "grad_norm": 0.015221211739027024, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.79068244, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.7578125, + "step": 3278, + "time_per_iteration": 4.767859220504761 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148, + "balance_loss_mlp": 1.07122958, + "epoch": 0.6308195459792227, + "flos": 602929537536.0, + "grad_norm": 0.03309702536338572, + "language_loss": 0.87110293, + "learning_rate": 0.00031681863406122704, + "loss": 0.8825829, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.76660156, + "step": 3279, + "time_per_iteration": 2.7526352405548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151009, + "balance_loss_mlp": 1.0742383, + "epoch": 0.6310119276644863, + "flos": 728236056576.0, + "grad_norm": 0.03127249771985471, + "language_loss": 0.90830934, + "learning_rate": 0.00031652878820794087, + "loss": 0.91981947, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.76660156, + "step": 3280, + "time_per_iteration": 2.980374813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152955, + "balance_loss_mlp": 1.07623196, + "epoch": 0.6312043093497499, + "flos": 520818851328.0, + "grad_norm": 0.035871108010903825, + "language_loss": 0.91415131, + "learning_rate": 0.00031623901358449627, + "loss": 0.92568088, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.76611328, + "step": 3281, + "time_per_iteration": 2.6661479473114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153626, + "balance_loss_mlp": 1.07685518, + "epoch": 0.6313966910350135, + "flos": 532222244352.0, + "grad_norm": 0.03104696980992861, + "language_loss": 0.93473637, + "learning_rate": 0.0003159493103033936, + "loss": 0.94627267, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.76660156, + "step": 3282, + "time_per_iteration": 2.7015254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156189, + "balance_loss_mlp": 1.08065796, + "epoch": 0.631589072720277, + "flos": 1382993969664.0, + "grad_norm": 0.006807831796281711, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.81075245, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.75585938, + "step": 3283, + "time_per_iteration": 4.893282890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153518, + "balance_loss_mlp": 1.07674742, + "epoch": 0.6317814544055406, + "flos": 625873310208.0, + "grad_norm": 0.03000778283215098, + "language_loss": 0.87091964, + "learning_rate": 0.0003153701182180776, + "loss": 0.88245487, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.76660156, + "step": 3284, + "time_per_iteration": 2.785921335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153113, + "balance_loss_mlp": 1.07643747, + "epoch": 0.6319738360908042, + "flos": 499097046528.0, + "grad_norm": 0.030580966863201303, + "language_loss": 0.86424339, + "learning_rate": 0.00031508062963872655, + "loss": 0.8757745, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.765625, + "step": 3285, + "time_per_iteration": 2.6083192825317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152348, + "balance_loss_mlp": 1.07567286, + "epoch": 0.6321662177760677, + "flos": 580908289536.0, + "grad_norm": 0.03249956938477427, + "language_loss": 0.84091449, + "learning_rate": 0.0003147912128514423, + "loss": 0.85243797, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.765625, + "step": 3286, + "time_per_iteration": 2.7065303325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114775, + "balance_loss_mlp": 1.07107508, + "epoch": 0.6323585994613313, + "flos": 602605899264.0, + "grad_norm": 0.03060189068927108, + "language_loss": 0.92241961, + "learning_rate": 0.0003145018679685859, + "loss": 0.93389714, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.765625, + "step": 3287, + "time_per_iteration": 2.724647045135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147567, + "balance_loss_mlp": 1.07093954, + "epoch": 0.6325509811465948, + "flos": 529632589824.0, + "grad_norm": 0.026442764297463384, + "language_loss": 0.9133988, + "learning_rate": 0.00031421259510249134, + "loss": 0.92487442, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.76513672, + "step": 3288, + "time_per_iteration": 2.7890970706939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146868, + "balance_loss_mlp": 1.07019234, + "epoch": 0.6327433628318584, + "flos": 575344217088.0, + "grad_norm": 0.03165563146125425, + "language_loss": 0.8638919, + "learning_rate": 0.00031392339436546414, + "loss": 0.87536061, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.765625, + "step": 3289, + "time_per_iteration": 2.8359181880950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147105, + "balance_loss_mlp": 1.07042992, + "epoch": 0.632935744517122, + "flos": 518111675904.0, + "grad_norm": 0.040669622782204255, + "language_loss": 0.87612778, + "learning_rate": 0.00031363426586978205, + "loss": 0.88759887, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.765625, + "step": 3290, + "time_per_iteration": 2.755444288253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148795, + "balance_loss_mlp": 1.07216728, + "epoch": 0.6331281262023856, + "flos": 618596445696.0, + "grad_norm": 0.029293061792341625, + "language_loss": 0.89532119, + "learning_rate": 0.0003133452097276947, + "loss": 0.90680915, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.76513672, + "step": 3291, + "time_per_iteration": 2.731522560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153422, + "balance_loss_mlp": 1.07674634, + "epoch": 0.633320507887649, + "flos": 594115799040.0, + "grad_norm": 0.032525593419921936, + "language_loss": 0.88528687, + "learning_rate": 0.0003130562260514238, + "loss": 0.89682108, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.765625, + "step": 3292, + "time_per_iteration": 2.7816312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150685, + "balance_loss_mlp": 1.07396197, + "epoch": 0.6335128895729126, + "flos": 583495942656.0, + "grad_norm": 0.0277750610234457, + "language_loss": 0.86754191, + "learning_rate": 0.0003127673149531626, + "loss": 0.87904876, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.76611328, + "step": 3293, + "time_per_iteration": 2.7256717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151215, + "balance_loss_mlp": 1.0744915, + "epoch": 0.6337052712581762, + "flos": 453973572096.0, + "grad_norm": 0.0366063114700609, + "language_loss": 0.89718056, + "learning_rate": 0.0003124784765450762, + "loss": 0.90869272, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.76611328, + "step": 3294, + "time_per_iteration": 2.557979106903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152531, + "balance_loss_mlp": 1.07585573, + "epoch": 0.6338976529434398, + "flos": 574515018240.0, + "grad_norm": 0.03914872981780459, + "language_loss": 0.86348414, + "learning_rate": 0.0003121897109393017, + "loss": 0.87500942, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.765625, + "step": 3295, + "time_per_iteration": 2.7648093700408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150647, + "balance_loss_mlp": 1.0738759, + "epoch": 0.6340900346287034, + "flos": 509808227328.0, + "grad_norm": 0.03170073477682662, + "language_loss": 0.93116355, + "learning_rate": 0.0003119010182479481, + "loss": 0.94267005, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.76660156, + "step": 3296, + "time_per_iteration": 2.6290597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152399, + "balance_loss_mlp": 1.07562852, + "epoch": 0.6342824163139669, + "flos": 480714230784.0, + "grad_norm": 0.034261076448020254, + "language_loss": 0.8817153, + "learning_rate": 0.00031161239858309563, + "loss": 0.89323932, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.76660156, + "step": 3297, + "time_per_iteration": 2.5535776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152393, + "balance_loss_mlp": 1.07571757, + "epoch": 0.6344747979992305, + "flos": 573110401536.0, + "grad_norm": 0.038934995330749234, + "language_loss": 0.89182544, + "learning_rate": 0.0003113238520567964, + "loss": 0.9033494, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.765625, + "step": 3298, + "time_per_iteration": 2.6296586990356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.07486486, + "epoch": 0.634667179684494, + "flos": 607045332480.0, + "grad_norm": 0.035281643877612956, + "language_loss": 0.86709571, + "learning_rate": 0.00031103537878107403, + "loss": 0.87861156, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.76611328, + "step": 3299, + "time_per_iteration": 2.7374937534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156066, + "balance_loss_mlp": 1.07934332, + "epoch": 0.6348595613697576, + "flos": 648128873472.0, + "grad_norm": 0.04012685096431152, + "language_loss": 0.85757369, + "learning_rate": 0.0003107469788679238, + "loss": 0.86913437, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.76611328, + "step": 3300, + "time_per_iteration": 2.763896942138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150162, + "balance_loss_mlp": 1.07329571, + "epoch": 0.6350519430550212, + "flos": 640272588288.0, + "grad_norm": 0.03353321054785192, + "language_loss": 0.91748559, + "learning_rate": 0.00031045865242931267, + "loss": 0.92898715, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.76757812, + "step": 3301, + "time_per_iteration": 2.775559186935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115092, + "balance_loss_mlp": 1.07405412, + "epoch": 0.6352443247402847, + "flos": 687829991424.0, + "grad_norm": 0.033769350364135475, + "language_loss": 0.89046073, + "learning_rate": 0.00031017039957717877, + "loss": 0.90196997, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.76757812, + "step": 3302, + "time_per_iteration": 2.9990227222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150719, + "balance_loss_mlp": 1.07399607, + "epoch": 0.6354367064255483, + "flos": 560525973504.0, + "grad_norm": 0.03207500130867294, + "language_loss": 0.93455017, + "learning_rate": 0.0003098822204234318, + "loss": 0.94605732, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.76611328, + "step": 3303, + "time_per_iteration": 2.6589555740356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149713, + "balance_loss_mlp": 1.07294202, + "epoch": 0.6356290881108119, + "flos": 981060716544.0, + "grad_norm": 0.03119033938257745, + "language_loss": 0.92425978, + "learning_rate": 0.00030959411507995273, + "loss": 0.93575692, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.76660156, + "step": 3304, + "time_per_iteration": 3.2027275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156004, + "balance_loss_mlp": 1.07932901, + "epoch": 0.6358214697960755, + "flos": 529372078080.0, + "grad_norm": 0.037691107664773085, + "language_loss": 0.88209277, + "learning_rate": 0.00030930608365859407, + "loss": 0.8936528, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.765625, + "step": 3305, + "time_per_iteration": 2.672909736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153167, + "balance_loss_mlp": 1.07663476, + "epoch": 0.6360138514813389, + "flos": 517868628480.0, + "grad_norm": 0.0314628318508628, + "language_loss": 0.93278992, + "learning_rate": 0.00030901812627117943, + "loss": 0.94432157, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.76416016, + "step": 3306, + "time_per_iteration": 2.6096842288970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152004, + "balance_loss_mlp": 1.07556736, + "epoch": 0.6362062331666025, + "flos": 467469791232.0, + "grad_norm": 0.03698857716885425, + "language_loss": 0.90082693, + "learning_rate": 0.000308730243029504, + "loss": 0.91234696, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.76318359, + "step": 3307, + "time_per_iteration": 2.625368595123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148049, + "balance_loss_mlp": 1.07151699, + "epoch": 0.6363986148518661, + "flos": 550772246016.0, + "grad_norm": 0.03499213724407888, + "language_loss": 0.85284883, + "learning_rate": 0.0003084424340453339, + "loss": 0.86432934, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.76416016, + "step": 3308, + "time_per_iteration": 2.79801082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154866, + "balance_loss_mlp": 1.07842863, + "epoch": 0.6365909965371297, + "flos": 584157955584.0, + "grad_norm": 0.034280921655294554, + "language_loss": 0.87936795, + "learning_rate": 0.0003081546994304064, + "loss": 0.89091659, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.76318359, + "step": 3309, + "time_per_iteration": 2.805798053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151839, + "balance_loss_mlp": 1.0753541, + "epoch": 0.6367833782223933, + "flos": 532287372288.0, + "grad_norm": 0.031184654205402413, + "language_loss": 0.87230557, + "learning_rate": 0.0003078670392964298, + "loss": 0.88382399, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.76367188, + "step": 3310, + "time_per_iteration": 2.637089729309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114879, + "balance_loss_mlp": 1.07211447, + "epoch": 0.6369757599076568, + "flos": 570587876352.0, + "grad_norm": 0.03249753882493018, + "language_loss": 0.8737638, + "learning_rate": 0.00030757945375508406, + "loss": 0.88525176, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.765625, + "step": 3311, + "time_per_iteration": 2.6652672290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157923, + "balance_loss_mlp": 1.08139026, + "epoch": 0.6371681415929203, + "flos": 541053447168.0, + "grad_norm": 0.03561310839394214, + "language_loss": 0.86446404, + "learning_rate": 0.00030729194291801944, + "loss": 0.8760432, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.76416016, + "step": 3312, + "time_per_iteration": 2.685426712036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152462, + "balance_loss_mlp": 1.07588232, + "epoch": 0.6373605232781839, + "flos": 484530582528.0, + "grad_norm": 0.03615999538834489, + "language_loss": 0.82315236, + "learning_rate": 0.00030700450689685787, + "loss": 0.83467698, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.76464844, + "step": 3313, + "time_per_iteration": 2.5285892486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115278, + "balance_loss_mlp": 1.07629561, + "epoch": 0.6375529049634475, + "flos": 579816577536.0, + "grad_norm": 0.031570559387627636, + "language_loss": 0.90687287, + "learning_rate": 0.00030671714580319186, + "loss": 0.91840065, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.76367188, + "step": 3314, + "time_per_iteration": 2.7918403148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149531, + "balance_loss_mlp": 1.07290328, + "epoch": 0.637745286648711, + "flos": 683479154688.0, + "grad_norm": 0.03649458581150707, + "language_loss": 0.8839801, + "learning_rate": 0.0003064298597485846, + "loss": 0.89547539, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.76513672, + "step": 3315, + "time_per_iteration": 2.8336853981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157103, + "balance_loss_mlp": 1.08066618, + "epoch": 0.6379376683339746, + "flos": 505648771584.0, + "grad_norm": 0.03434060192765891, + "language_loss": 0.89178324, + "learning_rate": 0.00030614264884457054, + "loss": 0.90335435, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.76318359, + "step": 3316, + "time_per_iteration": 2.610029697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156868, + "balance_loss_mlp": 1.08038342, + "epoch": 0.6381300500192382, + "flos": 503024188416.0, + "grad_norm": 0.037738287263273475, + "language_loss": 0.83208811, + "learning_rate": 0.000305855513202655, + "loss": 0.8436569, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.76367188, + "step": 3317, + "time_per_iteration": 2.56390118598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115293, + "balance_loss_mlp": 1.07663572, + "epoch": 0.6383224317045018, + "flos": 401367115776.0, + "grad_norm": 0.03934464683594442, + "language_loss": 0.83537889, + "learning_rate": 0.0003055684529343138, + "loss": 0.84690815, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.76171875, + "step": 3318, + "time_per_iteration": 2.4260315895080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011523, + "balance_loss_mlp": 1.07600558, + "epoch": 0.6385148133897653, + "flos": 500362675200.0, + "grad_norm": 0.03558980854731561, + "language_loss": 0.8376438, + "learning_rate": 0.00030528146815099374, + "loss": 0.84916675, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.76171875, + "step": 3319, + "time_per_iteration": 2.6329188346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151029, + "balance_loss_mlp": 1.07468724, + "epoch": 0.6387071950750288, + "flos": 528694602240.0, + "grad_norm": 0.0315122399919932, + "language_loss": 0.76854849, + "learning_rate": 0.00030499455896411203, + "loss": 0.78005874, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.76220703, + "step": 3320, + "time_per_iteration": 2.6750285625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156959, + "balance_loss_mlp": 1.0823822, + "epoch": 0.6388995767602924, + "flos": 1459104153600.0, + "grad_norm": 0.009844305017815533, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77457774, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.74609375, + "step": 3321, + "time_per_iteration": 4.953099489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151078, + "balance_loss_mlp": 1.07459378, + "epoch": 0.639091958445556, + "flos": 605170083840.0, + "grad_norm": 0.03456514545296231, + "language_loss": 0.8206768, + "learning_rate": 0.0003044209678251865, + "loss": 0.83218759, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.76367188, + "step": 3322, + "time_per_iteration": 2.8895435333251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149613, + "balance_loss_mlp": 1.07312858, + "epoch": 0.6392843401308196, + "flos": 585664630272.0, + "grad_norm": 0.030325412861609304, + "language_loss": 0.89598596, + "learning_rate": 0.0003041342860958306, + "loss": 0.90748215, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.76367188, + "step": 3323, + "time_per_iteration": 2.8267457485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115059, + "balance_loss_mlp": 1.07401037, + "epoch": 0.6394767218160831, + "flos": 515728138752.0, + "grad_norm": 0.035461056589808096, + "language_loss": 0.97089493, + "learning_rate": 0.00030384768040828857, + "loss": 0.98240083, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.76464844, + "step": 3324, + "time_per_iteration": 2.6604483127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147614, + "balance_loss_mlp": 1.07127237, + "epoch": 0.6396691035013466, + "flos": 542776972800.0, + "grad_norm": 0.029879293671496117, + "language_loss": 0.90136957, + "learning_rate": 0.00030356115087383094, + "loss": 0.91284573, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.76220703, + "step": 3325, + "time_per_iteration": 2.61624813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.07561517, + "epoch": 0.6398614851866102, + "flos": 526554112512.0, + "grad_norm": 0.03633717350328365, + "language_loss": 0.8974539, + "learning_rate": 0.00030327469760369803, + "loss": 0.90897352, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.76220703, + "step": 3326, + "time_per_iteration": 2.5705959796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.0753485, + "epoch": 0.6400538668718738, + "flos": 624134321664.0, + "grad_norm": 0.04101147906430089, + "language_loss": 0.90274537, + "learning_rate": 0.0003029883207091009, + "loss": 0.91426039, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.76025391, + "step": 3327, + "time_per_iteration": 2.710705280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153226, + "balance_loss_mlp": 1.07712281, + "epoch": 0.6402462485571374, + "flos": 504455001600.0, + "grad_norm": 0.03565756181750687, + "language_loss": 0.8369143, + "learning_rate": 0.00030270202030122095, + "loss": 0.84844655, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.75976562, + "step": 3328, + "time_per_iteration": 2.6669437885284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.07706201, + "epoch": 0.6404386302424009, + "flos": 820662426624.0, + "grad_norm": 0.035758844093176624, + "language_loss": 0.90348649, + "learning_rate": 0.00030241579649121, + "loss": 0.91501862, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.76025391, + "step": 3329, + "time_per_iteration": 2.9946744441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153577, + "balance_loss_mlp": 1.07747424, + "epoch": 0.6406310119276645, + "flos": 472792817664.0, + "grad_norm": 0.031682669944134774, + "language_loss": 0.84166616, + "learning_rate": 0.00030212964939018994, + "loss": 0.85320187, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.75976562, + "step": 3330, + "time_per_iteration": 2.529780864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153483, + "balance_loss_mlp": 1.07738006, + "epoch": 0.6408233936129281, + "flos": 426488308224.0, + "grad_norm": 0.0317787576762172, + "language_loss": 0.90697497, + "learning_rate": 0.0003018435791092527, + "loss": 0.91850984, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.75976562, + "step": 3331, + "time_per_iteration": 2.482226848602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154163, + "balance_loss_mlp": 1.07810771, + "epoch": 0.6410157752981916, + "flos": 550837373952.0, + "grad_norm": 0.03245017993162029, + "language_loss": 0.86073428, + "learning_rate": 0.00030155758575946083, + "loss": 0.87227595, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.75927734, + "step": 3332, + "time_per_iteration": 2.7268691062927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154009, + "balance_loss_mlp": 1.07785761, + "epoch": 0.6412081569834551, + "flos": 476860948992.0, + "grad_norm": 0.03331397331841687, + "language_loss": 0.88895929, + "learning_rate": 0.0003012716694518467, + "loss": 0.9004994, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.76025391, + "step": 3333, + "time_per_iteration": 2.5955138206481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154559, + "balance_loss_mlp": 1.07845628, + "epoch": 0.6414005386687187, + "flos": 542030366208.0, + "grad_norm": 0.03145594160852774, + "language_loss": 0.89824158, + "learning_rate": 0.000300985830297413, + "loss": 0.90978718, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.75976562, + "step": 3334, + "time_per_iteration": 2.675809144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151014, + "balance_loss_mlp": 1.07476771, + "epoch": 0.6415929203539823, + "flos": 1042956272640.0, + "grad_norm": 0.03442120912103133, + "language_loss": 0.92276573, + "learning_rate": 0.00030070006840713205, + "loss": 0.93427593, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.76123047, + "step": 3335, + "time_per_iteration": 3.3598873615264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.07696736, + "epoch": 0.6417853020392459, + "flos": 649579152384.0, + "grad_norm": 0.03234716357342597, + "language_loss": 0.78466761, + "learning_rate": 0.000300414383891947, + "loss": 0.79620028, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.76171875, + "step": 3336, + "time_per_iteration": 2.8177781105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153044, + "balance_loss_mlp": 1.07679784, + "epoch": 0.6419776837245095, + "flos": 501943209984.0, + "grad_norm": 0.029578655992370296, + "language_loss": 0.93100476, + "learning_rate": 0.00030012877686276973, + "loss": 0.94253522, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.76123047, + "step": 3337, + "time_per_iteration": 2.6656994819641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153274, + "balance_loss_mlp": 1.07688439, + "epoch": 0.642170065409773, + "flos": 621778982400.0, + "grad_norm": 0.030467733780945628, + "language_loss": 0.91408634, + "learning_rate": 0.0002998432474304832, + "loss": 0.92561901, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.76269531, + "step": 3338, + "time_per_iteration": 2.7804837226867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156387, + "balance_loss_mlp": 1.08161926, + "epoch": 0.6423624470950365, + "flos": 1426638967296.0, + "grad_norm": 0.010632522477168303, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80393732, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.74804688, + "step": 3339, + "time_per_iteration": 4.905744791030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151035, + "balance_loss_mlp": 1.07493174, + "epoch": 0.6425548287803001, + "flos": 563439266304.0, + "grad_norm": 0.028877045256785867, + "language_loss": 0.92764187, + "learning_rate": 0.00029927242179996107, + "loss": 0.93915224, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.75976562, + "step": 3340, + "time_per_iteration": 2.6661758422851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145869, + "balance_loss_mlp": 1.0697186, + "epoch": 0.6427472104655637, + "flos": 586613351424.0, + "grad_norm": 0.0300822513158231, + "language_loss": 0.88234377, + "learning_rate": 0.0002989871258233398, + "loss": 0.8938024, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.76025391, + "step": 3341, + "time_per_iteration": 2.7374660968780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144033, + "balance_loss_mlp": 1.06773865, + "epoch": 0.6429395921508272, + "flos": 405146537472.0, + "grad_norm": 0.038389287644004705, + "language_loss": 0.88664877, + "learning_rate": 0.0002987019078868373, + "loss": 0.89808905, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.76171875, + "step": 3342, + "time_per_iteration": 2.4243760108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140022, + "balance_loss_mlp": 1.06377542, + "epoch": 0.6431319738360908, + "flos": 549832257024.0, + "grad_norm": 0.03024016811094423, + "language_loss": 0.8722378, + "learning_rate": 0.00029841676810118484, + "loss": 0.88363802, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.76123047, + "step": 3343, + "time_per_iteration": 2.6617236137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147432, + "balance_loss_mlp": 1.07118535, + "epoch": 0.6433243555213544, + "flos": 794705304576.0, + "grad_norm": 0.037506118612829445, + "language_loss": 0.92627275, + "learning_rate": 0.0002981317065770839, + "loss": 0.93774706, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.76123047, + "step": 3344, + "time_per_iteration": 3.082211494445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149276, + "balance_loss_mlp": 1.07288682, + "epoch": 0.643516737206618, + "flos": 584112293376.0, + "grad_norm": 0.03767314060719249, + "language_loss": 0.87199879, + "learning_rate": 0.00029784672342520493, + "loss": 0.88349158, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.76269531, + "step": 3345, + "time_per_iteration": 2.7258007526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114915, + "balance_loss_mlp": 1.07276022, + "epoch": 0.6437091188918815, + "flos": 519750607872.0, + "grad_norm": 0.03533085288020931, + "language_loss": 0.88640958, + "learning_rate": 0.00029756181875618834, + "loss": 0.89790106, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.76269531, + "step": 3346, + "time_per_iteration": 2.569779634475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144811, + "balance_loss_mlp": 1.06846941, + "epoch": 0.643901500577145, + "flos": 385786802688.0, + "grad_norm": 0.034542585210818905, + "language_loss": 0.89738131, + "learning_rate": 0.0002972769926806439, + "loss": 0.90882939, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.76220703, + "step": 3347, + "time_per_iteration": 2.497853994369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147673, + "balance_loss_mlp": 1.07128322, + "epoch": 0.6440938822624086, + "flos": 484697768448.0, + "grad_norm": 0.03553288196721846, + "language_loss": 0.94382805, + "learning_rate": 0.0002969922453091508, + "loss": 0.95530474, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.76269531, + "step": 3348, + "time_per_iteration": 2.5491795539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147538, + "balance_loss_mlp": 1.07124412, + "epoch": 0.6442862639476722, + "flos": 541637597184.0, + "grad_norm": 0.03037104728594501, + "language_loss": 0.89609063, + "learning_rate": 0.00029670757675225777, + "loss": 0.90756601, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.76171875, + "step": 3349, + "time_per_iteration": 2.721752882003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148396, + "balance_loss_mlp": 1.07234049, + "epoch": 0.6444786456329358, + "flos": 527958729216.0, + "grad_norm": 0.03079951019721412, + "language_loss": 0.85068369, + "learning_rate": 0.0002964229871204831, + "loss": 0.8621676, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.75927734, + "step": 3350, + "time_per_iteration": 2.6219635009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146722, + "balance_loss_mlp": 1.07076228, + "epoch": 0.6446710273181993, + "flos": 699161525760.0, + "grad_norm": 0.03075522523020309, + "language_loss": 0.88979256, + "learning_rate": 0.00029613847652431403, + "loss": 0.90125972, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.75830078, + "step": 3351, + "time_per_iteration": 2.8463754653930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143843, + "balance_loss_mlp": 1.06778741, + "epoch": 0.6448634090034628, + "flos": 626299006464.0, + "grad_norm": 0.030404862420189395, + "language_loss": 0.8409062, + "learning_rate": 0.0002958540450742078, + "loss": 0.85234463, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.75927734, + "step": 3352, + "time_per_iteration": 2.9119668006896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145662, + "balance_loss_mlp": 1.0695591, + "epoch": 0.6450557906887264, + "flos": 602165466624.0, + "grad_norm": 0.030375965559079645, + "language_loss": 0.81268156, + "learning_rate": 0.0002955696928805901, + "loss": 0.82413822, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.75976562, + "step": 3353, + "time_per_iteration": 2.8792967796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146049, + "balance_loss_mlp": 1.06989837, + "epoch": 0.64524817237399, + "flos": 647384268288.0, + "grad_norm": 0.032745807535614124, + "language_loss": 0.90629518, + "learning_rate": 0.0002952854200538563, + "loss": 0.91775572, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.76025391, + "step": 3354, + "time_per_iteration": 2.7729763984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144907, + "balance_loss_mlp": 1.06870866, + "epoch": 0.6454405540592536, + "flos": 474366621696.0, + "grad_norm": 0.04216820116254093, + "language_loss": 0.87584448, + "learning_rate": 0.000295001226704371, + "loss": 0.88729358, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.76074219, + "step": 3355, + "time_per_iteration": 2.5655300617218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146195, + "balance_loss_mlp": 1.06994879, + "epoch": 0.6456329357445171, + "flos": 613019638272.0, + "grad_norm": 0.03469469169647009, + "language_loss": 0.88972664, + "learning_rate": 0.00029471711294246783, + "loss": 0.90118861, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.76123047, + "step": 3356, + "time_per_iteration": 2.7737839221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149475, + "balance_loss_mlp": 1.07322907, + "epoch": 0.6458253174297807, + "flos": 732931272192.0, + "grad_norm": 0.03845226629357448, + "language_loss": 0.87651891, + "learning_rate": 0.0002944330788784494, + "loss": 0.88801372, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.76123047, + "step": 3357, + "time_per_iteration": 2.9011571407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151964, + "balance_loss_mlp": 1.07552743, + "epoch": 0.6460176991150443, + "flos": 571554061824.0, + "grad_norm": 0.03220756952294772, + "language_loss": 0.89507246, + "learning_rate": 0.00029414912462258786, + "loss": 0.90659207, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.76318359, + "step": 3358, + "time_per_iteration": 2.87532901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150464, + "balance_loss_mlp": 1.07397914, + "epoch": 0.6462100808003078, + "flos": 584242549248.0, + "grad_norm": 0.034688747990618336, + "language_loss": 0.87649322, + "learning_rate": 0.00029386525028512366, + "loss": 0.88799781, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.76367188, + "step": 3359, + "time_per_iteration": 2.701509714126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115358, + "balance_loss_mlp": 1.07709527, + "epoch": 0.6464024624855714, + "flos": 485010673152.0, + "grad_norm": 0.035268388031257245, + "language_loss": 0.92228907, + "learning_rate": 0.0002935814559762666, + "loss": 0.9338249, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.76367188, + "step": 3360, + "time_per_iteration": 2.7698283195495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149719, + "balance_loss_mlp": 1.07332945, + "epoch": 0.6465948441708349, + "flos": 528842322432.0, + "grad_norm": 0.029604921797993008, + "language_loss": 0.84675246, + "learning_rate": 0.0002932977418061957, + "loss": 0.85824966, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.76269531, + "step": 3361, + "time_per_iteration": 2.637636661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148892, + "balance_loss_mlp": 1.07245517, + "epoch": 0.6467872258560985, + "flos": 670625482752.0, + "grad_norm": 0.035318648220588056, + "language_loss": 0.86576068, + "learning_rate": 0.00029301410788505833, + "loss": 0.8772496, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.76318359, + "step": 3362, + "time_per_iteration": 2.7763969898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144507, + "balance_loss_mlp": 1.06826067, + "epoch": 0.6469796075413621, + "flos": 433040033280.0, + "grad_norm": 0.03731380273504302, + "language_loss": 0.87366712, + "learning_rate": 0.00029273055432297126, + "loss": 0.88511223, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.76123047, + "step": 3363, + "time_per_iteration": 2.5110268592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.06842613, + "epoch": 0.6471719892266257, + "flos": 805101579264.0, + "grad_norm": 0.03447928292768335, + "language_loss": 0.85973775, + "learning_rate": 0.00029244708123001917, + "loss": 0.87118536, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.76220703, + "step": 3364, + "time_per_iteration": 2.9464926719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145161, + "balance_loss_mlp": 1.06896257, + "epoch": 0.6473643709118891, + "flos": 578348834304.0, + "grad_norm": 0.03376367371908884, + "language_loss": 0.88996613, + "learning_rate": 0.0002921636887162565, + "loss": 0.90141773, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.76074219, + "step": 3365, + "time_per_iteration": 2.7177810668945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144996, + "balance_loss_mlp": 1.06879795, + "epoch": 0.6475567525971527, + "flos": 762787338240.0, + "grad_norm": 0.03409968089483679, + "language_loss": 0.89139444, + "learning_rate": 0.00029188037689170595, + "loss": 0.90284443, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.76074219, + "step": 3366, + "time_per_iteration": 2.94266676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144468, + "balance_loss_mlp": 1.06817389, + "epoch": 0.6477491342824163, + "flos": 844500526080.0, + "grad_norm": 0.03525364957484555, + "language_loss": 0.88880944, + "learning_rate": 0.0002915971458663586, + "loss": 0.90025413, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.76171875, + "step": 3367, + "time_per_iteration": 3.037111282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144688, + "balance_loss_mlp": 1.06844163, + "epoch": 0.6479415159676799, + "flos": 886381065216.0, + "grad_norm": 0.02613941789873103, + "language_loss": 0.85508728, + "learning_rate": 0.00029131399575017494, + "loss": 0.86653411, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.76123047, + "step": 3368, + "time_per_iteration": 3.1630287170410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144843, + "balance_loss_mlp": 1.06859708, + "epoch": 0.6481338976529435, + "flos": 616723198464.0, + "grad_norm": 0.02777106453890135, + "language_loss": 0.9063583, + "learning_rate": 0.0002910309266530836, + "loss": 0.91780674, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.76123047, + "step": 3369, + "time_per_iteration": 2.7928354740142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154935, + "balance_loss_mlp": 1.07859313, + "epoch": 0.648326279338207, + "flos": 511019461632.0, + "grad_norm": 0.03366950054230419, + "language_loss": 0.90075457, + "learning_rate": 0.0002907479386849814, + "loss": 0.91230392, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.76220703, + "step": 3370, + "time_per_iteration": 2.673582077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154894, + "balance_loss_mlp": 1.07869589, + "epoch": 0.6485186610234706, + "flos": 703868201472.0, + "grad_norm": 0.031297921332288904, + "language_loss": 0.8459866, + "learning_rate": 0.0002904650319557339, + "loss": 0.8575356, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.76074219, + "step": 3371, + "time_per_iteration": 2.984816789627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149879, + "balance_loss_mlp": 1.07358491, + "epoch": 0.6487110427087341, + "flos": 561745939968.0, + "grad_norm": 0.03993640989964456, + "language_loss": 0.8677696, + "learning_rate": 0.0002901822065751758, + "loss": 0.87926841, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.76171875, + "step": 3372, + "time_per_iteration": 2.642890691757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149196, + "balance_loss_mlp": 1.0729022, + "epoch": 0.6489034243939977, + "flos": 681301734912.0, + "grad_norm": 0.03031559078625196, + "language_loss": 0.90163612, + "learning_rate": 0.0002898994626531093, + "loss": 0.91312808, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.76171875, + "step": 3373, + "time_per_iteration": 2.838804006576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149133, + "balance_loss_mlp": 1.07303011, + "epoch": 0.6490958060792612, + "flos": 475371738624.0, + "grad_norm": 0.03229066647304318, + "language_loss": 0.92974752, + "learning_rate": 0.00028961680029930526, + "loss": 0.94123888, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.75976562, + "step": 3374, + "time_per_iteration": 2.5095248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149591, + "balance_loss_mlp": 1.07339203, + "epoch": 0.6492881877645248, + "flos": 590002005504.0, + "grad_norm": 0.03422977569034653, + "language_loss": 0.8249414, + "learning_rate": 0.00028933421962350317, + "loss": 0.83643734, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.76074219, + "step": 3375, + "time_per_iteration": 2.733698606491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149303, + "balance_loss_mlp": 1.07310462, + "epoch": 0.6494805694497884, + "flos": 643587382272.0, + "grad_norm": 0.03276895180859608, + "language_loss": 0.88882941, + "learning_rate": 0.0002890517207354104, + "loss": 0.90032244, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.76074219, + "step": 3376, + "time_per_iteration": 2.8495798110961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149673, + "balance_loss_mlp": 1.07347465, + "epoch": 0.649672951135052, + "flos": 532836593664.0, + "grad_norm": 0.031246089180930747, + "language_loss": 0.86472917, + "learning_rate": 0.0002887693037447029, + "loss": 0.87622589, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.76074219, + "step": 3377, + "time_per_iteration": 2.588364601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147339, + "balance_loss_mlp": 1.07109332, + "epoch": 0.6498653328203156, + "flos": 548445104640.0, + "grad_norm": 0.03311172972858422, + "language_loss": 0.87447202, + "learning_rate": 0.00028848696876102443, + "loss": 0.88594544, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.76123047, + "step": 3378, + "time_per_iteration": 2.6357853412628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114593, + "balance_loss_mlp": 1.06977868, + "epoch": 0.650057714505579, + "flos": 463160613888.0, + "grad_norm": 0.0392849096276736, + "language_loss": 0.89328945, + "learning_rate": 0.00028820471589398723, + "loss": 0.90474874, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.76025391, + "step": 3379, + "time_per_iteration": 2.530264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161056, + "balance_loss_mlp": 1.08519137, + "epoch": 0.6502500961908426, + "flos": 511241041920.0, + "grad_norm": 0.03964181246795499, + "language_loss": 0.82806408, + "learning_rate": 0.00028792254525317196, + "loss": 0.83967471, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.75732422, + "step": 3380, + "time_per_iteration": 2.677969217300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158758, + "balance_loss_mlp": 1.08279765, + "epoch": 0.6504424778761062, + "flos": 580910290944.0, + "grad_norm": 0.031350821569318954, + "language_loss": 0.8659088, + "learning_rate": 0.00028764045694812645, + "loss": 0.87749636, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.75830078, + "step": 3381, + "time_per_iteration": 2.7509915828704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.0813086, + "epoch": 0.6506348595613698, + "flos": 520467015168.0, + "grad_norm": 0.04066104102632486, + "language_loss": 0.82166147, + "learning_rate": 0.0002873584510883671, + "loss": 0.83323467, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.75878906, + "step": 3382, + "time_per_iteration": 2.5591564178466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153945, + "balance_loss_mlp": 1.07769895, + "epoch": 0.6508272412466333, + "flos": 511362565632.0, + "grad_norm": 0.02912056326895262, + "language_loss": 0.91856563, + "learning_rate": 0.0002870765277833788, + "loss": 0.93010509, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.76123047, + "step": 3383, + "time_per_iteration": 2.7396798133850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150716, + "balance_loss_mlp": 1.07461333, + "epoch": 0.6510196229318969, + "flos": 626804567040.0, + "grad_norm": 0.032638591105191926, + "language_loss": 0.86156708, + "learning_rate": 0.00028679468714261347, + "loss": 0.87307423, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.75976562, + "step": 3384, + "time_per_iteration": 2.762810230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148501, + "balance_loss_mlp": 1.07239771, + "epoch": 0.6512120046171604, + "flos": 475669180416.0, + "grad_norm": 0.033246821782095315, + "language_loss": 0.80913359, + "learning_rate": 0.0002865129292754918, + "loss": 0.82061851, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.75976562, + "step": 3385, + "time_per_iteration": 2.6017582416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151379, + "balance_loss_mlp": 1.07513273, + "epoch": 0.651404386302424, + "flos": 553030256640.0, + "grad_norm": 0.0304228647826632, + "language_loss": 0.86788058, + "learning_rate": 0.00028623125429140105, + "loss": 0.87939441, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.76123047, + "step": 3386, + "time_per_iteration": 2.8177084922790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114874, + "balance_loss_mlp": 1.07230258, + "epoch": 0.6515967679876876, + "flos": 524374691328.0, + "grad_norm": 0.03154749952631653, + "language_loss": 0.92443657, + "learning_rate": 0.00028594966229969785, + "loss": 0.93592393, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.76318359, + "step": 3387, + "time_per_iteration": 2.654865264892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145456, + "balance_loss_mlp": 1.06925726, + "epoch": 0.6517891496729511, + "flos": 575016576000.0, + "grad_norm": 0.03711897249096357, + "language_loss": 0.87118483, + "learning_rate": 0.00028566815340970577, + "loss": 0.88263941, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.76074219, + "step": 3388, + "time_per_iteration": 2.724337339401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148749, + "balance_loss_mlp": 1.07240736, + "epoch": 0.6519815313582147, + "flos": 556989599232.0, + "grad_norm": 0.03038600941725792, + "language_loss": 0.85638821, + "learning_rate": 0.0002853867277307162, + "loss": 0.8678757, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.76220703, + "step": 3389, + "time_per_iteration": 2.6384835243225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.0695653, + "epoch": 0.6521739130434783, + "flos": 481521962496.0, + "grad_norm": 0.03095245810395829, + "language_loss": 0.87876832, + "learning_rate": 0.00028510538537198824, + "loss": 0.89022881, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.76367188, + "step": 3390, + "time_per_iteration": 2.6401560306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143269, + "balance_loss_mlp": 1.06664157, + "epoch": 0.6523662947287419, + "flos": 667019977728.0, + "grad_norm": 0.029103127011675372, + "language_loss": 0.90833724, + "learning_rate": 0.00028482412644274867, + "loss": 0.91976994, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.76513672, + "step": 3391, + "time_per_iteration": 2.914109945297241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143322, + "balance_loss_mlp": 1.06645572, + "epoch": 0.6525586764140053, + "flos": 549702001152.0, + "grad_norm": 0.036601963047289736, + "language_loss": 0.80285096, + "learning_rate": 0.00028454295105219207, + "loss": 0.81428421, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.76757812, + "step": 3392, + "time_per_iteration": 2.6647682189941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142157, + "balance_loss_mlp": 1.06557703, + "epoch": 0.6527510580992689, + "flos": 804389901312.0, + "grad_norm": 0.025027747425113815, + "language_loss": 0.83011138, + "learning_rate": 0.0002842618593094802, + "loss": 0.84153295, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.76464844, + "step": 3393, + "time_per_iteration": 3.116758108139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144173, + "balance_loss_mlp": 1.06744993, + "epoch": 0.6529434397845325, + "flos": 672375204864.0, + "grad_norm": 0.042372987357860006, + "language_loss": 0.85526049, + "learning_rate": 0.00028398085132374243, + "loss": 0.8667022, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.76611328, + "step": 3394, + "time_per_iteration": 2.7683980464935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142459, + "balance_loss_mlp": 1.06592691, + "epoch": 0.6531358214697961, + "flos": 829875664896.0, + "grad_norm": 0.03113385731669579, + "language_loss": 0.89394134, + "learning_rate": 0.0002836999272040761, + "loss": 0.90536594, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.76416016, + "step": 3395, + "time_per_iteration": 3.102487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140812, + "balance_loss_mlp": 1.06432748, + "epoch": 0.6533282031550597, + "flos": 488392596480.0, + "grad_norm": 0.0404739719167322, + "language_loss": 0.89987487, + "learning_rate": 0.00028341908705954575, + "loss": 0.91128296, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.76367188, + "step": 3396, + "time_per_iteration": 2.692906618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146225, + "balance_loss_mlp": 1.07183838, + "epoch": 0.6535205848403232, + "flos": 1561102328832.0, + "grad_norm": 0.005117457515533169, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82908034, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.74414062, + "step": 3397, + "time_per_iteration": 4.795916557312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144233, + "balance_loss_mlp": 1.06793857, + "epoch": 0.6537129665255867, + "flos": 494703275520.0, + "grad_norm": 0.03597932641299946, + "language_loss": 0.82677722, + "learning_rate": 0.00028285765913198604, + "loss": 0.83821958, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.76171875, + "step": 3398, + "time_per_iteration": 2.5658674240112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114427, + "balance_loss_mlp": 1.06788087, + "epoch": 0.6539053482108503, + "flos": 606142273536.0, + "grad_norm": 0.0350820826110483, + "language_loss": 0.88009775, + "learning_rate": 0.0002825770715669227, + "loss": 0.89154047, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.76269531, + "step": 3399, + "time_per_iteration": 2.7702410221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145417, + "balance_loss_mlp": 1.06902778, + "epoch": 0.6540977298961139, + "flos": 578880591360.0, + "grad_norm": 0.0325786381033819, + "language_loss": 0.8578831, + "learning_rate": 0.00028229656841292634, + "loss": 0.86933732, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.76269531, + "step": 3400, + "time_per_iteration": 2.6832401752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145045, + "balance_loss_mlp": 1.06865597, + "epoch": 0.6542901115813774, + "flos": 512769183744.0, + "grad_norm": 0.039852870614421367, + "language_loss": 0.82027632, + "learning_rate": 0.0002820161497788979, + "loss": 0.83172679, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.76269531, + "step": 3401, + "time_per_iteration": 2.5679121017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149696, + "balance_loss_mlp": 1.07330704, + "epoch": 0.654482493266641, + "flos": 626674311168.0, + "grad_norm": 0.030416914651843395, + "language_loss": 0.91325247, + "learning_rate": 0.00028173581577370545, + "loss": 0.92474937, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.76269531, + "step": 3402, + "time_per_iteration": 2.7601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150076, + "balance_loss_mlp": 1.07368624, + "epoch": 0.6546748749519046, + "flos": 525062900736.0, + "grad_norm": 0.030820927894649717, + "language_loss": 0.83866602, + "learning_rate": 0.0002814555665061844, + "loss": 0.8501668, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.76269531, + "step": 3403, + "time_per_iteration": 2.688485860824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153093, + "balance_loss_mlp": 1.07641792, + "epoch": 0.6548672566371682, + "flos": 480273798144.0, + "grad_norm": 0.03553217015928594, + "language_loss": 0.82424521, + "learning_rate": 0.00028117540208513715, + "loss": 0.83577615, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.765625, + "step": 3404, + "time_per_iteration": 2.6906890869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150931, + "balance_loss_mlp": 1.07425523, + "epoch": 0.6550596383224317, + "flos": 617135433216.0, + "grad_norm": 0.03288416711071717, + "language_loss": 0.89287072, + "learning_rate": 0.00028089532261933313, + "loss": 0.90438002, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.765625, + "step": 3405, + "time_per_iteration": 2.718001127243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147316, + "balance_loss_mlp": 1.07078385, + "epoch": 0.6552520200076952, + "flos": 489807946752.0, + "grad_norm": 0.040144975574141664, + "language_loss": 0.91147745, + "learning_rate": 0.0002806153282175087, + "loss": 0.92295063, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.76416016, + "step": 3406, + "time_per_iteration": 2.5618858337402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114632, + "balance_loss_mlp": 1.06983495, + "epoch": 0.6554444016929588, + "flos": 688858576896.0, + "grad_norm": 0.034942224339764696, + "language_loss": 0.88083732, + "learning_rate": 0.0002803354189883679, + "loss": 0.89230049, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.76367188, + "step": 3407, + "time_per_iteration": 2.893331527709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114697, + "balance_loss_mlp": 1.07039022, + "epoch": 0.6556367833782224, + "flos": 544170855936.0, + "grad_norm": 0.02881485242285111, + "language_loss": 0.89870715, + "learning_rate": 0.00028005559504058053, + "loss": 0.91017687, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.76464844, + "step": 3408, + "time_per_iteration": 2.750748634338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146342, + "balance_loss_mlp": 1.06980956, + "epoch": 0.655829165063486, + "flos": 674730544128.0, + "grad_norm": 0.03409829385099465, + "language_loss": 0.82774001, + "learning_rate": 0.0002797758564827838, + "loss": 0.83920342, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.76416016, + "step": 3409, + "time_per_iteration": 2.7883474826812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114755, + "balance_loss_mlp": 1.07111335, + "epoch": 0.6560215467487496, + "flos": 532836593664.0, + "grad_norm": 0.03847218102070899, + "language_loss": 0.89379394, + "learning_rate": 0.0002794962034235824, + "loss": 0.9052695, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.76318359, + "step": 3410, + "time_per_iteration": 2.6389691829681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147509, + "balance_loss_mlp": 1.07102418, + "epoch": 0.656213928434013, + "flos": 592459402752.0, + "grad_norm": 0.035948217838460056, + "language_loss": 0.79690081, + "learning_rate": 0.00027921663597154695, + "loss": 0.80837584, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.76367188, + "step": 3411, + "time_per_iteration": 2.8345415592193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146787, + "balance_loss_mlp": 1.07030261, + "epoch": 0.6564063101192766, + "flos": 416678184960.0, + "grad_norm": 0.038637742097161205, + "language_loss": 0.87214196, + "learning_rate": 0.00027893715423521525, + "loss": 0.88360977, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.76367188, + "step": 3412, + "time_per_iteration": 2.4819529056549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146018, + "balance_loss_mlp": 1.06953347, + "epoch": 0.6565986918045402, + "flos": 454271013888.0, + "grad_norm": 0.03334091944582967, + "language_loss": 0.89441139, + "learning_rate": 0.00027865775832309163, + "loss": 0.90587157, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.76367188, + "step": 3413, + "time_per_iteration": 2.728583335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145861, + "balance_loss_mlp": 1.06956708, + "epoch": 0.6567910734898038, + "flos": 548798942208.0, + "grad_norm": 0.03367441290021015, + "language_loss": 0.91664404, + "learning_rate": 0.00027837844834364733, + "loss": 0.92810267, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.76171875, + "step": 3414, + "time_per_iteration": 2.6371517181396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145504, + "balance_loss_mlp": 1.06925821, + "epoch": 0.6569834551750673, + "flos": 656764692480.0, + "grad_norm": 0.030804659012074204, + "language_loss": 0.9116472, + "learning_rate": 0.00027809922440532, + "loss": 0.92310226, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.76123047, + "step": 3415, + "time_per_iteration": 2.8265881538391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148906, + "balance_loss_mlp": 1.07265973, + "epoch": 0.6571758368603309, + "flos": 540810399744.0, + "grad_norm": 0.030022936132040084, + "language_loss": 0.8532089, + "learning_rate": 0.00027782008661651406, + "loss": 0.86469799, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.76123047, + "step": 3416, + "time_per_iteration": 2.7672157287597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149293, + "balance_loss_mlp": 1.07314205, + "epoch": 0.6573682185455945, + "flos": 498378637824.0, + "grad_norm": 0.029653574310281386, + "language_loss": 0.91551638, + "learning_rate": 0.00027754103508560013, + "loss": 0.92700928, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.76025391, + "step": 3417, + "time_per_iteration": 2.6405131816864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114713, + "balance_loss_mlp": 1.07088423, + "epoch": 0.657560600230858, + "flos": 448353103872.0, + "grad_norm": 0.03576987566134107, + "language_loss": 0.87917447, + "learning_rate": 0.0002772620699209163, + "loss": 0.89064574, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.76123047, + "step": 3418, + "time_per_iteration": 2.5418612957000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145939, + "balance_loss_mlp": 1.06983602, + "epoch": 0.6577529819161216, + "flos": 482919848448.0, + "grad_norm": 0.03527260419864515, + "language_loss": 0.85359573, + "learning_rate": 0.0002769831912307658, + "loss": 0.86505508, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.75976562, + "step": 3419, + "time_per_iteration": 2.604675054550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147112, + "balance_loss_mlp": 1.07081771, + "epoch": 0.6579453636013851, + "flos": 531859674624.0, + "grad_norm": 0.03824872762512091, + "language_loss": 0.86228991, + "learning_rate": 0.00027670439912341917, + "loss": 0.87376106, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.76171875, + "step": 3420, + "time_per_iteration": 2.6483054161071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_mlp": 1.06975985, + "epoch": 0.6581377452866487, + "flos": 629242498560.0, + "grad_norm": 0.03412485031630486, + "language_loss": 0.89059192, + "learning_rate": 0.0002764256937071129, + "loss": 0.90205252, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.76171875, + "step": 3421, + "time_per_iteration": 2.839137077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146563, + "balance_loss_mlp": 1.07031691, + "epoch": 0.6583301269719123, + "flos": 549673803264.0, + "grad_norm": 0.030144943579318143, + "language_loss": 0.91856694, + "learning_rate": 0.00027614707509005036, + "loss": 0.93003255, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.76123047, + "step": 3422, + "time_per_iteration": 2.680708408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114715, + "balance_loss_mlp": 1.07095134, + "epoch": 0.6585225086571759, + "flos": 428396484096.0, + "grad_norm": 0.04026315039628517, + "language_loss": 0.84251142, + "learning_rate": 0.0002758685433804008, + "loss": 0.85398293, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.76074219, + "step": 3423, + "time_per_iteration": 2.5081021785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146911, + "balance_loss_mlp": 1.07052183, + "epoch": 0.6587148903424394, + "flos": 861049026048.0, + "grad_norm": 0.03441249575164818, + "language_loss": 0.84824026, + "learning_rate": 0.00027559009868630005, + "loss": 0.85970938, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.76269531, + "step": 3424, + "time_per_iteration": 3.1415717601776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114833, + "balance_loss_mlp": 1.07213128, + "epoch": 0.6589072720277029, + "flos": 807035951616.0, + "grad_norm": 0.03717672501292478, + "language_loss": 0.86237669, + "learning_rate": 0.0002753117411158491, + "loss": 0.87386, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.76074219, + "step": 3425, + "time_per_iteration": 3.041346788406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148297, + "balance_loss_mlp": 1.07195568, + "epoch": 0.6590996537129665, + "flos": 549673803264.0, + "grad_norm": 0.03250683157775158, + "language_loss": 0.94800514, + "learning_rate": 0.0002750334707771168, + "loss": 0.95948815, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.76220703, + "step": 3426, + "time_per_iteration": 2.6350677013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149524, + "balance_loss_mlp": 1.07318223, + "epoch": 0.6592920353982301, + "flos": 455108944896.0, + "grad_norm": 0.0355046198758662, + "language_loss": 0.86040199, + "learning_rate": 0.0002747552877781369, + "loss": 0.87189716, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.76220703, + "step": 3427, + "time_per_iteration": 2.5129551887512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114868, + "balance_loss_mlp": 1.07233834, + "epoch": 0.6594844170834937, + "flos": 568260734976.0, + "grad_norm": 0.034595379074033504, + "language_loss": 0.88492763, + "learning_rate": 0.0002744771922269097, + "loss": 0.8964144, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.76220703, + "step": 3428, + "time_per_iteration": 2.694378137588501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147393, + "balance_loss_mlp": 1.07114637, + "epoch": 0.6596767987687572, + "flos": 1189754284032.0, + "grad_norm": 0.030854411324183387, + "language_loss": 0.86799264, + "learning_rate": 0.0002741991842314015, + "loss": 0.87946653, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.76123047, + "step": 3429, + "time_per_iteration": 3.48809552192688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145028, + "balance_loss_mlp": 1.0686388, + "epoch": 0.6598691804540208, + "flos": 504467736576.0, + "grad_norm": 0.03376941001539595, + "language_loss": 0.89963281, + "learning_rate": 0.0002739212638995445, + "loss": 0.9110831, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.76269531, + "step": 3430, + "time_per_iteration": 2.532970428466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114449, + "balance_loss_mlp": 1.06814861, + "epoch": 0.6600615621392844, + "flos": 532398162432.0, + "grad_norm": 0.038613055067671744, + "language_loss": 0.88853264, + "learning_rate": 0.00027364343133923696, + "loss": 0.89997756, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.76220703, + "step": 3431, + "time_per_iteration": 2.6269612312316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144024, + "balance_loss_mlp": 1.06768203, + "epoch": 0.6602539438245479, + "flos": 566556675072.0, + "grad_norm": 0.03520560530434118, + "language_loss": 0.8882376, + "learning_rate": 0.0002733656866583431, + "loss": 0.89967781, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.76220703, + "step": 3432, + "time_per_iteration": 2.682663679122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07995379, + "epoch": 0.6604463255098114, + "flos": 858591628800.0, + "grad_norm": 0.04099855509153074, + "language_loss": 0.88963896, + "learning_rate": 0.0002730880299646927, + "loss": 0.90119904, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.75927734, + "step": 3433, + "time_per_iteration": 3.050039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157407, + "balance_loss_mlp": 1.08149505, + "epoch": 0.660638707195075, + "flos": 675679265280.0, + "grad_norm": 0.03297285173612762, + "language_loss": 0.89854127, + "learning_rate": 0.0002728104613660821, + "loss": 0.91011536, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.7578125, + "step": 3434, + "time_per_iteration": 2.8358242511749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148511, + "balance_loss_mlp": 1.07236028, + "epoch": 0.6608310888803386, + "flos": 890523056640.0, + "grad_norm": 0.03459988631627961, + "language_loss": 0.88072419, + "learning_rate": 0.0002725329809702729, + "loss": 0.89220929, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.76025391, + "step": 3435, + "time_per_iteration": 3.181201457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146577, + "balance_loss_mlp": 1.07033134, + "epoch": 0.6610234705656022, + "flos": 1138107282432.0, + "grad_norm": 0.04279733621824939, + "language_loss": 0.82982898, + "learning_rate": 0.0002722555888849921, + "loss": 0.84129477, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.76123047, + "step": 3436, + "time_per_iteration": 3.423975706100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147052, + "balance_loss_mlp": 1.07099605, + "epoch": 0.6612158522508658, + "flos": 468959001600.0, + "grad_norm": 0.03231258951929261, + "language_loss": 0.84970325, + "learning_rate": 0.00027197828521793334, + "loss": 0.86117375, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.75927734, + "step": 3437, + "time_per_iteration": 2.5456013679504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147897, + "balance_loss_mlp": 1.07179344, + "epoch": 0.6614082339361292, + "flos": 572774028288.0, + "grad_norm": 0.03152032613188321, + "language_loss": 0.8887009, + "learning_rate": 0.0002717010700767552, + "loss": 0.90017986, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.75976562, + "step": 3438, + "time_per_iteration": 2.6809959411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149118, + "balance_loss_mlp": 1.07306218, + "epoch": 0.6616006156213928, + "flos": 499459616256.0, + "grad_norm": 0.039698826906756704, + "language_loss": 0.82129598, + "learning_rate": 0.00027142394356908226, + "loss": 0.8327871, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.75927734, + "step": 3439, + "time_per_iteration": 2.5949456691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148646, + "balance_loss_mlp": 1.07254267, + "epoch": 0.6617929973066564, + "flos": 603609014784.0, + "grad_norm": 0.030441774907891187, + "language_loss": 0.8967098, + "learning_rate": 0.00027114690580250456, + "loss": 0.90819627, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.75976562, + "step": 3440, + "time_per_iteration": 2.749826431274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147854, + "balance_loss_mlp": 1.07175064, + "epoch": 0.66198537899192, + "flos": 523994657280.0, + "grad_norm": 0.033263511323201614, + "language_loss": 0.91719675, + "learning_rate": 0.0002708699568845776, + "loss": 0.92867529, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.75976562, + "step": 3441, + "time_per_iteration": 2.65191912651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162537, + "balance_loss_mlp": 1.08815002, + "epoch": 0.6621777606771835, + "flos": 1569609893376.0, + "grad_norm": 0.01497403906155291, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.8045032, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.74414062, + "step": 3442, + "time_per_iteration": 4.957901239395142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154184, + "balance_loss_mlp": 1.07817662, + "epoch": 0.6623701423624471, + "flos": 527689485312.0, + "grad_norm": 0.03191394261297454, + "language_loss": 0.8795507, + "learning_rate": 0.0002703163260247261, + "loss": 0.89109254, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.75878906, + "step": 3443, + "time_per_iteration": 2.6025161743164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151931, + "balance_loss_mlp": 1.07601833, + "epoch": 0.6625625240477107, + "flos": 529215625728.0, + "grad_norm": 0.035865829187726836, + "language_loss": 0.87189507, + "learning_rate": 0.0002700396442977399, + "loss": 0.88341439, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.7578125, + "step": 3444, + "time_per_iteration": 2.624119758605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152482, + "balance_loss_mlp": 1.07652199, + "epoch": 0.6627549057329742, + "flos": 474195432960.0, + "grad_norm": 0.03160775147122319, + "language_loss": 0.890499, + "learning_rate": 0.0002697630518492817, + "loss": 0.90202379, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.75830078, + "step": 3445, + "time_per_iteration": 2.7382802963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151051, + "balance_loss_mlp": 1.07494795, + "epoch": 0.6629472874182378, + "flos": 529011509760.0, + "grad_norm": 0.03595555935138165, + "language_loss": 0.89779699, + "learning_rate": 0.0002694865487867343, + "loss": 0.90930748, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.75976562, + "step": 3446, + "time_per_iteration": 2.704895257949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150785, + "balance_loss_mlp": 1.0749681, + "epoch": 0.6631396691035013, + "flos": 614378592768.0, + "grad_norm": 0.031003429121565652, + "language_loss": 0.8906312, + "learning_rate": 0.0002692101352174453, + "loss": 0.90213907, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.75683594, + "step": 3447, + "time_per_iteration": 2.8165597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.07207584, + "epoch": 0.6633320507887649, + "flos": 610433986560.0, + "grad_norm": 0.03537124525005162, + "language_loss": 0.89763427, + "learning_rate": 0.00026893381124872787, + "loss": 0.90911466, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.75830078, + "step": 3448, + "time_per_iteration": 2.698657512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146339, + "balance_loss_mlp": 1.07033098, + "epoch": 0.6635244324740285, + "flos": 751140897792.0, + "grad_norm": 0.037519042250439116, + "language_loss": 0.85281086, + "learning_rate": 0.00026865757698786097, + "loss": 0.86427426, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.75878906, + "step": 3449, + "time_per_iteration": 3.055635452270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145472, + "balance_loss_mlp": 1.06932163, + "epoch": 0.6637168141592921, + "flos": 665747618304.0, + "grad_norm": 0.03493094826481752, + "language_loss": 0.85618043, + "learning_rate": 0.000268381432542088, + "loss": 0.86763519, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.76025391, + "step": 3450, + "time_per_iteration": 2.8057384490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145353, + "balance_loss_mlp": 1.06934512, + "epoch": 0.6639091958445555, + "flos": 607920193536.0, + "grad_norm": 0.03317215274134995, + "language_loss": 0.85111237, + "learning_rate": 0.00026810537801861807, + "loss": 0.86256593, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.75878906, + "step": 3451, + "time_per_iteration": 2.7435052394866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149606, + "balance_loss_mlp": 1.0735507, + "epoch": 0.6641015775298191, + "flos": 477679414272.0, + "grad_norm": 0.03227894360580252, + "language_loss": 0.85315323, + "learning_rate": 0.0002678294135246243, + "loss": 0.8646493, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.75927734, + "step": 3452, + "time_per_iteration": 2.7193186283111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147818, + "balance_loss_mlp": 1.07171512, + "epoch": 0.6642939592150827, + "flos": 905595081216.0, + "grad_norm": 0.03357369585289791, + "language_loss": 0.91588908, + "learning_rate": 0.0002675535391672463, + "loss": 0.92736733, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.75976562, + "step": 3453, + "time_per_iteration": 3.0945043563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07236886, + "epoch": 0.6644863409003463, + "flos": 582937989120.0, + "grad_norm": 0.030535675570776123, + "language_loss": 0.90264779, + "learning_rate": 0.0002672777550535877, + "loss": 0.91413254, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.75976562, + "step": 3454, + "time_per_iteration": 2.7741284370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150279, + "balance_loss_mlp": 1.07398534, + "epoch": 0.6646787225856099, + "flos": 479969625600.0, + "grad_norm": 0.03106835211233169, + "language_loss": 0.89111888, + "learning_rate": 0.00026700206129071747, + "loss": 0.90262163, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.76171875, + "step": 3455, + "time_per_iteration": 2.5455679893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149274, + "balance_loss_mlp": 1.07302773, + "epoch": 0.6648711042708734, + "flos": 450827965440.0, + "grad_norm": 0.034343549963822835, + "language_loss": 0.92980659, + "learning_rate": 0.00026672645798566925, + "loss": 0.94129932, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.76123047, + "step": 3456, + "time_per_iteration": 2.5500409603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149485, + "balance_loss_mlp": 1.07319152, + "epoch": 0.665063485956137, + "flos": 860595858432.0, + "grad_norm": 0.03429824706439816, + "language_loss": 0.85038483, + "learning_rate": 0.00026645094524544225, + "loss": 0.86187971, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.76171875, + "step": 3457, + "time_per_iteration": 3.2861030101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149344, + "balance_loss_mlp": 1.07290661, + "epoch": 0.6652558676414005, + "flos": 605471528448.0, + "grad_norm": 0.02726612159362192, + "language_loss": 0.79581773, + "learning_rate": 0.00026617552317699945, + "loss": 0.80731118, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.76318359, + "step": 3458, + "time_per_iteration": 2.8133809566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149341, + "balance_loss_mlp": 1.07299888, + "epoch": 0.6654482493266641, + "flos": 511410229248.0, + "grad_norm": 0.030741900207522484, + "language_loss": 0.92019296, + "learning_rate": 0.0002659001918872693, + "loss": 0.9316864, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.76220703, + "step": 3459, + "time_per_iteration": 2.719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.07302606, + "epoch": 0.6656406310119277, + "flos": 566660734464.0, + "grad_norm": 0.03268721915470487, + "language_loss": 0.8501879, + "learning_rate": 0.0002656249514831449, + "loss": 0.86168158, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.76220703, + "step": 3460, + "time_per_iteration": 2.7105963230133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150225, + "balance_loss_mlp": 1.07383597, + "epoch": 0.6658330126971912, + "flos": 1026058664448.0, + "grad_norm": 0.029696729072264432, + "language_loss": 0.91355968, + "learning_rate": 0.00026534980207148416, + "loss": 0.92506194, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.76269531, + "step": 3461, + "time_per_iteration": 3.3982574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145433, + "balance_loss_mlp": 1.06894886, + "epoch": 0.6660253943824548, + "flos": 818233227264.0, + "grad_norm": 0.03528061567962845, + "language_loss": 0.78412712, + "learning_rate": 0.0002650747437591097, + "loss": 0.79558146, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.76367188, + "step": 3462, + "time_per_iteration": 2.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149719, + "balance_loss_mlp": 1.07533264, + "epoch": 0.6662177760677184, + "flos": 1499530411008.0, + "grad_norm": 0.00830594189347842, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.83029294, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.74414062, + "step": 3463, + "time_per_iteration": 6.524547815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145921, + "balance_loss_mlp": 1.06953192, + "epoch": 0.666410157752982, + "flos": 501107280384.0, + "grad_norm": 0.03076087992809579, + "language_loss": 0.91384947, + "learning_rate": 0.00026452490085933155, + "loss": 0.9253087, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.76269531, + "step": 3464, + "time_per_iteration": 2.598808765411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145896, + "balance_loss_mlp": 1.06955457, + "epoch": 0.6666025394382454, + "flos": 482138313216.0, + "grad_norm": 0.03618588438682257, + "language_loss": 0.95199478, + "learning_rate": 0.00026425011648539614, + "loss": 0.96345377, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.76220703, + "step": 3465, + "time_per_iteration": 2.5265092849731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145501, + "balance_loss_mlp": 1.06906354, + "epoch": 0.666794921123509, + "flos": 547691767296.0, + "grad_norm": 0.03394030373238319, + "language_loss": 0.87548077, + "learning_rate": 0.00026397542363768267, + "loss": 0.88693571, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.76318359, + "step": 3466, + "time_per_iteration": 2.645876407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145746, + "balance_loss_mlp": 1.06935704, + "epoch": 0.6669873028087726, + "flos": 472942539264.0, + "grad_norm": 0.0340202515012301, + "language_loss": 0.87299979, + "learning_rate": 0.0002637008224228362, + "loss": 0.88445723, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.76269531, + "step": 3467, + "time_per_iteration": 2.5271472930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147048, + "balance_loss_mlp": 1.07070661, + "epoch": 0.6671796844940362, + "flos": 548499499008.0, + "grad_norm": 0.029468894408270302, + "language_loss": 0.89176929, + "learning_rate": 0.00026342631294746653, + "loss": 0.90323979, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.76220703, + "step": 3468, + "time_per_iteration": 2.694568395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146376, + "balance_loss_mlp": 1.07008207, + "epoch": 0.6673720661792998, + "flos": 1072122127872.0, + "grad_norm": 0.03284045124327485, + "language_loss": 0.85731959, + "learning_rate": 0.0002631518953181476, + "loss": 0.86878335, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.76171875, + "step": 3469, + "time_per_iteration": 3.4704368114471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148972, + "balance_loss_mlp": 1.07458496, + "epoch": 0.6675644478645633, + "flos": 1527111002112.0, + "grad_norm": 0.004792795584487496, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.7747426, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.74414062, + "step": 3470, + "time_per_iteration": 4.929240465164185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146143, + "balance_loss_mlp": 1.06989694, + "epoch": 0.6677568295498268, + "flos": 580843161600.0, + "grad_norm": 0.032107654736022645, + "language_loss": 0.84914112, + "learning_rate": 0.00026260333602377985, + "loss": 0.86060262, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.76123047, + "step": 3471, + "time_per_iteration": 2.740605592727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146874, + "balance_loss_mlp": 1.07072294, + "epoch": 0.6679492112350904, + "flos": 384790417920.0, + "grad_norm": 0.036226919771653675, + "language_loss": 0.91317421, + "learning_rate": 0.0002623291945717007, + "loss": 0.92464286, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.76025391, + "step": 3472, + "time_per_iteration": 2.4707448482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146308, + "balance_loss_mlp": 1.07015693, + "epoch": 0.668141592920354, + "flos": 1152615349248.0, + "grad_norm": 0.02851459994850691, + "language_loss": 0.88269627, + "learning_rate": 0.00026205514539161175, + "loss": 0.89415932, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.76025391, + "step": 3473, + "time_per_iteration": 3.5094759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146143, + "balance_loss_mlp": 1.07008779, + "epoch": 0.6683339746056175, + "flos": 562291158528.0, + "grad_norm": 0.030234261038109174, + "language_loss": 0.88653791, + "learning_rate": 0.00026178118858990773, + "loss": 0.89799941, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.75927734, + "step": 3474, + "time_per_iteration": 2.8636863231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.07096648, + "epoch": 0.6685263562908811, + "flos": 515328638976.0, + "grad_norm": 0.030631239249789746, + "language_loss": 0.89337111, + "learning_rate": 0.0002615073242729483, + "loss": 0.9048413, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.75927734, + "step": 3475, + "time_per_iteration": 2.6223714351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148783, + "balance_loss_mlp": 1.07267952, + "epoch": 0.6687187379761447, + "flos": 631000952832.0, + "grad_norm": 0.03058857090132586, + "language_loss": 0.88941103, + "learning_rate": 0.0002612335525470573, + "loss": 0.90089881, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.75976562, + "step": 3476, + "time_per_iteration": 2.8004729747772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148335, + "balance_loss_mlp": 1.07242274, + "epoch": 0.6689111196614083, + "flos": 536687874048.0, + "grad_norm": 0.03636459478392294, + "language_loss": 0.82775843, + "learning_rate": 0.0002609598735185221, + "loss": 0.8392418, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.7578125, + "step": 3477, + "time_per_iteration": 2.668614149093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148386, + "balance_loss_mlp": 1.0723784, + "epoch": 0.6691035013466718, + "flos": 604160237568.0, + "grad_norm": 0.03359617144199284, + "language_loss": 0.87902224, + "learning_rate": 0.00026068628729359445, + "loss": 0.89050609, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.75878906, + "step": 3478, + "time_per_iteration": 2.7584378719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147866, + "balance_loss_mlp": 1.07185841, + "epoch": 0.6692958830319353, + "flos": 634127093760.0, + "grad_norm": 0.030871112113608438, + "language_loss": 0.80438709, + "learning_rate": 0.00026041279397848996, + "loss": 0.81586581, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.75878906, + "step": 3479, + "time_per_iteration": 2.8838839530944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011474, + "balance_loss_mlp": 1.07143939, + "epoch": 0.6694882647171989, + "flos": 646748451840.0, + "grad_norm": 0.03180979016390224, + "language_loss": 0.87201416, + "learning_rate": 0.00026013939367938797, + "loss": 0.88348818, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.75830078, + "step": 3480, + "time_per_iteration": 2.908734083175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148147, + "balance_loss_mlp": 1.07213914, + "epoch": 0.6696806464024625, + "flos": 570761793024.0, + "grad_norm": 0.030473361279484277, + "language_loss": 0.85594642, + "learning_rate": 0.00025986608650243204, + "loss": 0.86742783, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.75878906, + "step": 3481, + "time_per_iteration": 2.85624098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147434, + "balance_loss_mlp": 1.07137847, + "epoch": 0.6698730280877261, + "flos": 623963132928.0, + "grad_norm": 0.033030030502012045, + "language_loss": 0.84301388, + "learning_rate": 0.0002595928725537293, + "loss": 0.85448819, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.75927734, + "step": 3482, + "time_per_iteration": 2.9488890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147834, + "balance_loss_mlp": 1.07177854, + "epoch": 0.6700654097729896, + "flos": 503508281856.0, + "grad_norm": 0.03256709943741325, + "language_loss": 0.93030363, + "learning_rate": 0.0002593197519393509, + "loss": 0.941782, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.75927734, + "step": 3483, + "time_per_iteration": 2.6505393981933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146862, + "balance_loss_mlp": 1.07085407, + "epoch": 0.6702577914582531, + "flos": 625117971456.0, + "grad_norm": 0.031176357525406213, + "language_loss": 0.83921826, + "learning_rate": 0.00025904672476533165, + "loss": 0.85068691, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.75878906, + "step": 3484, + "time_per_iteration": 2.859121084213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147102, + "balance_loss_mlp": 1.07109404, + "epoch": 0.6704501731435167, + "flos": 457212504576.0, + "grad_norm": 0.03137206075835519, + "language_loss": 0.87799835, + "learning_rate": 0.0002587737911376704, + "loss": 0.88946939, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.75878906, + "step": 3485, + "time_per_iteration": 2.599365711212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147337, + "balance_loss_mlp": 1.07137716, + "epoch": 0.6706425548287803, + "flos": 544257451008.0, + "grad_norm": 0.033540892991266884, + "language_loss": 0.88788569, + "learning_rate": 0.00025850095116232885, + "loss": 0.89935905, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.75830078, + "step": 3486, + "time_per_iteration": 2.6457767486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143978, + "balance_loss_mlp": 1.06787491, + "epoch": 0.6708349365140439, + "flos": 635179874304.0, + "grad_norm": 0.030051375529732832, + "language_loss": 0.82181835, + "learning_rate": 0.000258228204945233, + "loss": 0.83325815, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.75976562, + "step": 3487, + "time_per_iteration": 2.8957583904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147088, + "balance_loss_mlp": 1.07117581, + "epoch": 0.6710273181993074, + "flos": 641902788096.0, + "grad_norm": 0.03500138254568088, + "language_loss": 0.89155853, + "learning_rate": 0.00025795555259227254, + "loss": 0.90302938, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.7578125, + "step": 3488, + "time_per_iteration": 2.814859628677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147178, + "balance_loss_mlp": 1.0712657, + "epoch": 0.671219699884571, + "flos": 555025027584.0, + "grad_norm": 0.029480168700917284, + "language_loss": 0.88153946, + "learning_rate": 0.00025768299420930046, + "loss": 0.89301121, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.7578125, + "step": 3489, + "time_per_iteration": 2.723747491836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146316, + "balance_loss_mlp": 1.07045078, + "epoch": 0.6714120815698346, + "flos": 732781550592.0, + "grad_norm": 0.031857153656531974, + "language_loss": 0.87735152, + "learning_rate": 0.0002574105299021332, + "loss": 0.88881469, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.75732422, + "step": 3490, + "time_per_iteration": 2.8996829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145484, + "balance_loss_mlp": 1.06957209, + "epoch": 0.6716044632550981, + "flos": 689946286080.0, + "grad_norm": 0.030584806240151117, + "language_loss": 0.88189107, + "learning_rate": 0.00025713815977655084, + "loss": 0.89334595, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.7578125, + "step": 3491, + "time_per_iteration": 2.8675849437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161081, + "balance_loss_mlp": 1.08545506, + "epoch": 0.6717968449403616, + "flos": 461586809856.0, + "grad_norm": 0.035565643494579496, + "language_loss": 0.89158142, + "learning_rate": 0.0002568658839382969, + "loss": 0.90319222, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.75488281, + "step": 3492, + "time_per_iteration": 2.542618989944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161108, + "balance_loss_mlp": 1.08538604, + "epoch": 0.6719892266256252, + "flos": 502596490752.0, + "grad_norm": 0.03871127770917694, + "language_loss": 0.90369606, + "learning_rate": 0.00025659370249307814, + "loss": 0.91530716, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.75585938, + "step": 3493, + "time_per_iteration": 2.617976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155204, + "balance_loss_mlp": 1.07938695, + "epoch": 0.6721816083108888, + "flos": 684736051200.0, + "grad_norm": 0.030709352042026482, + "language_loss": 0.89865196, + "learning_rate": 0.00025632161554656473, + "loss": 0.91020399, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.75683594, + "step": 3494, + "time_per_iteration": 2.9416136741638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153333, + "balance_loss_mlp": 1.07742059, + "epoch": 0.6723739899961524, + "flos": 586895330304.0, + "grad_norm": 0.035401445630926676, + "language_loss": 0.86814046, + "learning_rate": 0.00025604962320439017, + "loss": 0.87967384, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.7578125, + "step": 3495, + "time_per_iteration": 2.709865093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152259, + "balance_loss_mlp": 1.07639432, + "epoch": 0.672566371681416, + "flos": 507739596288.0, + "grad_norm": 0.03037394710394358, + "language_loss": 0.86663043, + "learning_rate": 0.0002557777255721516, + "loss": 0.87815297, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.75732422, + "step": 3496, + "time_per_iteration": 2.7064080238342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144717, + "balance_loss_mlp": 1.06870878, + "epoch": 0.6727587533666795, + "flos": 536735537664.0, + "grad_norm": 0.03895269185794194, + "language_loss": 0.8665306, + "learning_rate": 0.0002555059227554087, + "loss": 0.87797779, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.75878906, + "step": 3497, + "time_per_iteration": 2.725748062133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144891, + "balance_loss_mlp": 1.06897879, + "epoch": 0.672951135051943, + "flos": 604036712448.0, + "grad_norm": 0.03298671193976436, + "language_loss": 0.82722509, + "learning_rate": 0.00025523421485968453, + "loss": 0.83867407, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.7578125, + "step": 3498, + "time_per_iteration": 2.7769460678100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143713, + "balance_loss_mlp": 1.06780005, + "epoch": 0.6731435167372066, + "flos": 812677886976.0, + "grad_norm": 0.03548022480956623, + "language_loss": 0.90755463, + "learning_rate": 0.00025496260199046585, + "loss": 0.91899168, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.7578125, + "step": 3499, + "time_per_iteration": 2.952929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143579, + "balance_loss_mlp": 1.06766629, + "epoch": 0.6733358984224702, + "flos": 612750394368.0, + "grad_norm": 0.030145588081223078, + "language_loss": 0.89167559, + "learning_rate": 0.000254691084253202, + "loss": 0.90311134, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.7578125, + "step": 3500, + "time_per_iteration": 2.798442840576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144185, + "balance_loss_mlp": 1.06827235, + "epoch": 0.6735282801077337, + "flos": 559968019968.0, + "grad_norm": 0.034844314373587704, + "language_loss": 0.83049423, + "learning_rate": 0.00025441966175330567, + "loss": 0.84193599, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.7578125, + "step": 3501, + "time_per_iteration": 2.712158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143612, + "balance_loss_mlp": 1.06769979, + "epoch": 0.6737206617929973, + "flos": 673632101376.0, + "grad_norm": 0.033990412363220264, + "language_loss": 0.84750879, + "learning_rate": 0.00025414833459615183, + "loss": 0.85894495, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.7578125, + "step": 3502, + "time_per_iteration": 2.801419973373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143927, + "balance_loss_mlp": 1.06801498, + "epoch": 0.6739130434782609, + "flos": 634641386496.0, + "grad_norm": 0.0329145119302939, + "language_loss": 0.85179496, + "learning_rate": 0.0002538771028870796, + "loss": 0.86323422, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.7578125, + "step": 3503, + "time_per_iteration": 2.775928497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143743, + "balance_loss_mlp": 1.06783044, + "epoch": 0.6741054251635245, + "flos": 532545882624.0, + "grad_norm": 0.03235573519036691, + "language_loss": 0.85924655, + "learning_rate": 0.0002536059667313903, + "loss": 0.87068391, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.7578125, + "step": 3504, + "time_per_iteration": 2.7243404388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142972, + "balance_loss_mlp": 1.06705964, + "epoch": 0.674297806848788, + "flos": 543651833856.0, + "grad_norm": 0.0371245910075902, + "language_loss": 0.94068909, + "learning_rate": 0.0002533349262343483, + "loss": 0.95211881, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.7578125, + "step": 3505, + "time_per_iteration": 2.672279119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144049, + "balance_loss_mlp": 1.06818378, + "epoch": 0.6744901885340515, + "flos": 464454440448.0, + "grad_norm": 0.03655603062575672, + "language_loss": 0.87737519, + "learning_rate": 0.0002530639815011807, + "loss": 0.88881564, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.75732422, + "step": 3506, + "time_per_iteration": 2.4994444847106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147432, + "balance_loss_mlp": 1.07156682, + "epoch": 0.6746825702193151, + "flos": 633021920256.0, + "grad_norm": 0.03414682593561894, + "language_loss": 0.89147329, + "learning_rate": 0.0002527931326370781, + "loss": 0.90294766, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.75732422, + "step": 3507, + "time_per_iteration": 2.8101861476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147739, + "balance_loss_mlp": 1.07201719, + "epoch": 0.6748749519045787, + "flos": 672392669184.0, + "grad_norm": 0.03604109956687097, + "language_loss": 0.87794244, + "learning_rate": 0.00025252237974719276, + "loss": 0.88941985, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.75585938, + "step": 3508, + "time_per_iteration": 2.8684208393096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147125, + "balance_loss_mlp": 1.07140362, + "epoch": 0.6750673335898423, + "flos": 768492400128.0, + "grad_norm": 0.03252394082616114, + "language_loss": 0.85605073, + "learning_rate": 0.00025225172293664056, + "loss": 0.867522, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.75585938, + "step": 3509, + "time_per_iteration": 2.979069232940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161552, + "balance_loss_mlp": 1.08716583, + "epoch": 0.6752597152751059, + "flos": 1515904994304.0, + "grad_norm": 0.012789123044337823, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.78094685, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.74414062, + "step": 3510, + "time_per_iteration": 4.922729015350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115617, + "balance_loss_mlp": 1.0805434, + "epoch": 0.6754520969603693, + "flos": 688532937216.0, + "grad_norm": 0.03719909461445286, + "language_loss": 0.8963424, + "learning_rate": 0.00025171069797381106, + "loss": 0.90790415, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.75488281, + "step": 3511, + "time_per_iteration": 2.8566861152648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151796, + "balance_loss_mlp": 1.07621729, + "epoch": 0.6756444786456329, + "flos": 501617570304.0, + "grad_norm": 0.03363675466936639, + "language_loss": 0.85946679, + "learning_rate": 0.00025144033003157864, + "loss": 0.87098479, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.75439453, + "step": 3512, + "time_per_iteration": 2.579599142074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152227, + "balance_loss_mlp": 1.07650506, + "epoch": 0.6758368603308965, + "flos": 493659227136.0, + "grad_norm": 0.044346995690068114, + "language_loss": 0.8418451, + "learning_rate": 0.00025117005858876806, + "loss": 0.85336733, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.75585938, + "step": 3513, + "time_per_iteration": 2.694627285003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115062, + "balance_loss_mlp": 1.07485056, + "epoch": 0.6760292420161601, + "flos": 557043993600.0, + "grad_norm": 0.034337257206957794, + "language_loss": 0.90733004, + "learning_rate": 0.000250899883750308, + "loss": 0.91883624, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.75634766, + "step": 3514, + "time_per_iteration": 2.6701719760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150513, + "balance_loss_mlp": 1.07474315, + "epoch": 0.6762216237014236, + "flos": 608721194496.0, + "grad_norm": 0.03416515328617874, + "language_loss": 0.87787104, + "learning_rate": 0.00025062980562109006, + "loss": 0.8893761, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.75634766, + "step": 3515, + "time_per_iteration": 2.7225759029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150722, + "balance_loss_mlp": 1.07499993, + "epoch": 0.6764140053866872, + "flos": 534927418368.0, + "grad_norm": 0.03854621654418095, + "language_loss": 0.89246118, + "learning_rate": 0.0002503598243059677, + "loss": 0.90396839, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.75585938, + "step": 3516, + "time_per_iteration": 2.808784008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143883, + "balance_loss_mlp": 1.06797004, + "epoch": 0.6766063870719508, + "flos": 505861619712.0, + "grad_norm": 0.034298651238093614, + "language_loss": 0.84964311, + "learning_rate": 0.0002500899399097568, + "loss": 0.86108196, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.7578125, + "step": 3517, + "time_per_iteration": 2.713134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142698, + "balance_loss_mlp": 1.0667851, + "epoch": 0.6767987687572143, + "flos": 514193266176.0, + "grad_norm": 0.03865641767048317, + "language_loss": 0.91341412, + "learning_rate": 0.0002498201525372359, + "loss": 0.92484111, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.7578125, + "step": 3518, + "time_per_iteration": 2.5997681617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141854, + "balance_loss_mlp": 1.0659889, + "epoch": 0.6769911504424779, + "flos": 526078751232.0, + "grad_norm": 0.04161600440053586, + "language_loss": 0.877231, + "learning_rate": 0.00024955046229314584, + "loss": 0.88864952, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.75732422, + "step": 3519, + "time_per_iteration": 2.6678366661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114153, + "balance_loss_mlp": 1.06576014, + "epoch": 0.6771835321277414, + "flos": 450836697600.0, + "grad_norm": 0.03317329770903154, + "language_loss": 0.91456813, + "learning_rate": 0.00024928086928218947, + "loss": 0.92598343, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.75634766, + "step": 3520, + "time_per_iteration": 2.599364995956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142519, + "balance_loss_mlp": 1.06689274, + "epoch": 0.677375913813005, + "flos": 710673707520.0, + "grad_norm": 0.03540178465545925, + "language_loss": 0.81423402, + "learning_rate": 0.00024901137360903216, + "loss": 0.82565916, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.75488281, + "step": 3521, + "time_per_iteration": 2.9810547828674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114229, + "balance_loss_mlp": 1.06671166, + "epoch": 0.6775682954982686, + "flos": 429345205248.0, + "grad_norm": 0.03804572823020318, + "language_loss": 0.86387855, + "learning_rate": 0.00024874197537830115, + "loss": 0.87530142, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.75439453, + "step": 3522, + "time_per_iteration": 2.5273780822753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148684, + "balance_loss_mlp": 1.07281935, + "epoch": 0.6777606771835322, + "flos": 438820956672.0, + "grad_norm": 0.03795067145757124, + "language_loss": 0.88304371, + "learning_rate": 0.00024847267469458684, + "loss": 0.89453053, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.75732422, + "step": 3523, + "time_per_iteration": 2.5473203659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151175, + "balance_loss_mlp": 1.07516694, + "epoch": 0.6779530588687956, + "flos": 776787116544.0, + "grad_norm": 0.03277402838986502, + "language_loss": 0.82546473, + "learning_rate": 0.00024820347166244034, + "loss": 0.83697653, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.75878906, + "step": 3524, + "time_per_iteration": 3.006762742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151614, + "balance_loss_mlp": 1.07551062, + "epoch": 0.6781454405540592, + "flos": 572904284160.0, + "grad_norm": 0.03398425592449901, + "language_loss": 0.89193916, + "learning_rate": 0.0002479343663863755, + "loss": 0.90345526, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.75976562, + "step": 3525, + "time_per_iteration": 2.7708120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149362, + "balance_loss_mlp": 1.07325864, + "epoch": 0.6783378222393228, + "flos": 485982862848.0, + "grad_norm": 0.03421790564553063, + "language_loss": 0.81340361, + "learning_rate": 0.00024766535897086876, + "loss": 0.82489729, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.75976562, + "step": 3526, + "time_per_iteration": 2.5445010662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149936, + "balance_loss_mlp": 1.07383275, + "epoch": 0.6785302039245864, + "flos": 483831639552.0, + "grad_norm": 0.03533862611113949, + "language_loss": 0.84491217, + "learning_rate": 0.0002473964495203578, + "loss": 0.85641158, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.75976562, + "step": 3527, + "time_per_iteration": 2.6606431007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151402, + "balance_loss_mlp": 1.07525146, + "epoch": 0.67872258560985, + "flos": 525861900288.0, + "grad_norm": 0.03371892559640898, + "language_loss": 0.90057969, + "learning_rate": 0.0002471276381392425, + "loss": 0.9120937, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.76025391, + "step": 3528, + "time_per_iteration": 2.782986640930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156944, + "balance_loss_mlp": 1.08255768, + "epoch": 0.6789149672951135, + "flos": 1555892093952.0, + "grad_norm": 0.008577357919530966, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79345584, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.74414062, + "step": 3529, + "time_per_iteration": 4.9733335971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.07594013, + "epoch": 0.6791073489803771, + "flos": 742684999680.0, + "grad_norm": 0.033404033149465266, + "language_loss": 0.89312834, + "learning_rate": 0.00024659031000260826, + "loss": 0.90464872, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.75976562, + "step": 3530, + "time_per_iteration": 2.901157855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145514, + "balance_loss_mlp": 1.06936264, + "epoch": 0.6792997306656406, + "flos": 577447776768.0, + "grad_norm": 0.04256917362285044, + "language_loss": 0.86884272, + "learning_rate": 0.0002463217934556985, + "loss": 0.8802979, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.76025391, + "step": 3531, + "time_per_iteration": 2.6534667015075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153702, + "balance_loss_mlp": 1.07931519, + "epoch": 0.6794921123509042, + "flos": 1506544035840.0, + "grad_norm": 0.006337226155731696, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77685791, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.74414062, + "step": 3532, + "time_per_iteration": 4.827699899673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.07089639, + "epoch": 0.6796844940361677, + "flos": 700140446208.0, + "grad_norm": 0.038428315777117805, + "language_loss": 0.89542228, + "learning_rate": 0.0002457850559259306, + "loss": 0.90689325, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.76074219, + "step": 3533, + "time_per_iteration": 2.827556610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147708, + "balance_loss_mlp": 1.07160449, + "epoch": 0.6798768757214313, + "flos": 553815794688.0, + "grad_norm": 0.03257941751207101, + "language_loss": 0.86952329, + "learning_rate": 0.00024551683515145275, + "loss": 0.88100034, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.75976562, + "step": 3534, + "time_per_iteration": 2.664051055908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146574, + "balance_loss_mlp": 1.07051849, + "epoch": 0.6800692574066949, + "flos": 523975191552.0, + "grad_norm": 0.03399690480422162, + "language_loss": 0.91393268, + "learning_rate": 0.0002452487131761014, + "loss": 0.92539847, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.75927734, + "step": 3535, + "time_per_iteration": 2.733736276626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146041, + "balance_loss_mlp": 1.06993783, + "epoch": 0.6802616390919585, + "flos": 575129367552.0, + "grad_norm": 0.03256850712762242, + "language_loss": 0.84912848, + "learning_rate": 0.00024498069010397093, + "loss": 0.86058891, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.75976562, + "step": 3536, + "time_per_iteration": 2.687980890274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144058, + "balance_loss_mlp": 1.06805015, + "epoch": 0.6804540207772221, + "flos": 489128469504.0, + "grad_norm": 0.03259916802392139, + "language_loss": 0.89844334, + "learning_rate": 0.00024471276603911697, + "loss": 0.90988398, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.75878906, + "step": 3537, + "time_per_iteration": 2.5977725982666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144351, + "balance_loss_mlp": 1.06834352, + "epoch": 0.6806464024624855, + "flos": 579744718848.0, + "grad_norm": 0.031208373438408543, + "language_loss": 0.83636969, + "learning_rate": 0.0002444449410855572, + "loss": 0.84781325, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.75878906, + "step": 3538, + "time_per_iteration": 2.806182384490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151082, + "balance_loss_mlp": 1.0752176, + "epoch": 0.6808387841477491, + "flos": 554792713728.0, + "grad_norm": 0.02619955396666995, + "language_loss": 0.88271046, + "learning_rate": 0.00024417721534727033, + "loss": 0.89422125, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.75732422, + "step": 3539, + "time_per_iteration": 2.6672027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153254, + "balance_loss_mlp": 1.07753205, + "epoch": 0.6810311658330127, + "flos": 427753936896.0, + "grad_norm": 0.03954259059998535, + "language_loss": 0.8817929, + "learning_rate": 0.00024390958892819687, + "loss": 0.89332551, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.75585938, + "step": 3540, + "time_per_iteration": 2.4914028644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152621, + "balance_loss_mlp": 1.07685137, + "epoch": 0.6812235475182763, + "flos": 573460236288.0, + "grad_norm": 0.03041439482605579, + "language_loss": 0.85729158, + "learning_rate": 0.0002436420619322381, + "loss": 0.86881781, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.75634766, + "step": 3541, + "time_per_iteration": 2.8284380435943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152525, + "balance_loss_mlp": 1.07675517, + "epoch": 0.6814159292035398, + "flos": 502993989120.0, + "grad_norm": 0.031050490172735493, + "language_loss": 0.87018108, + "learning_rate": 0.0002433746344632577, + "loss": 0.88170624, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.75634766, + "step": 3542, + "time_per_iteration": 2.6791961193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155107, + "balance_loss_mlp": 1.07919419, + "epoch": 0.6816083108888034, + "flos": 766955526144.0, + "grad_norm": 0.032327379337262395, + "language_loss": 0.85101521, + "learning_rate": 0.00024310730662508006, + "loss": 0.86256623, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.7578125, + "step": 3543, + "time_per_iteration": 3.091520309448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154554, + "balance_loss_mlp": 1.07854629, + "epoch": 0.681800692574067, + "flos": 480479915520.0, + "grad_norm": 0.03033872617251452, + "language_loss": 0.91889656, + "learning_rate": 0.0002428400785214911, + "loss": 0.93044209, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.75878906, + "step": 3544, + "time_per_iteration": 2.6075758934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148266, + "balance_loss_mlp": 1.07216299, + "epoch": 0.6819930742593305, + "flos": 692833382400.0, + "grad_norm": 0.035894178949101116, + "language_loss": 0.8798629, + "learning_rate": 0.00024257295025623794, + "loss": 0.89134556, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.75976562, + "step": 3545, + "time_per_iteration": 2.835088014602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148628, + "balance_loss_mlp": 1.07257295, + "epoch": 0.6821854559445941, + "flos": 679354627584.0, + "grad_norm": 0.03140204473065851, + "language_loss": 0.85909534, + "learning_rate": 0.00024230592193302892, + "loss": 0.87058157, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.75927734, + "step": 3546, + "time_per_iteration": 2.8806655406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115113, + "balance_loss_mlp": 1.07517004, + "epoch": 0.6823778376298576, + "flos": 463132416000.0, + "grad_norm": 0.035932436170819634, + "language_loss": 0.89696717, + "learning_rate": 0.00024203899365553372, + "loss": 0.9084785, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.75830078, + "step": 3547, + "time_per_iteration": 2.538266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147926, + "balance_loss_mlp": 1.07411194, + "epoch": 0.6825702193151212, + "flos": 1478174452224.0, + "grad_norm": 0.007345057771589815, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77882284, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.73828125, + "step": 3548, + "time_per_iteration": 4.545760154724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143597, + "balance_loss_mlp": 1.06768405, + "epoch": 0.6827626010003848, + "flos": 724412974080.0, + "grad_norm": 0.035220397583358556, + "language_loss": 0.88068932, + "learning_rate": 0.00024150543765216848, + "loss": 0.89212525, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.7578125, + "step": 3549, + "time_per_iteration": 2.9486939907073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.06718683, + "epoch": 0.6829549826856484, + "flos": 559939822080.0, + "grad_norm": 0.03492974535391861, + "language_loss": 0.89375067, + "learning_rate": 0.00024123881013344352, + "loss": 0.90518171, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.7578125, + "step": 3550, + "time_per_iteration": 2.651604413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150936, + "balance_loss_mlp": 1.07502353, + "epoch": 0.6831473643709118, + "flos": 626133821952.0, + "grad_norm": 0.03217647010825034, + "language_loss": 0.83963066, + "learning_rate": 0.00024097228307472202, + "loss": 0.85114002, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.7578125, + "step": 3551, + "time_per_iteration": 2.7857072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011508, + "balance_loss_mlp": 1.07479274, + "epoch": 0.6833397460561754, + "flos": 715097677824.0, + "grad_norm": 0.03621401947072565, + "language_loss": 0.87106031, + "learning_rate": 0.00024070585657947846, + "loss": 0.88256836, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.75878906, + "step": 3552, + "time_per_iteration": 2.8683760166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114886, + "balance_loss_mlp": 1.07299471, + "epoch": 0.683532127741439, + "flos": 465726799872.0, + "grad_norm": 0.03128688144219445, + "language_loss": 0.89219671, + "learning_rate": 0.00024043953075114934, + "loss": 0.90368527, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.75732422, + "step": 3553, + "time_per_iteration": 2.704216241836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114847, + "balance_loss_mlp": 1.07251036, + "epoch": 0.6837245094267026, + "flos": 583339490304.0, + "grad_norm": 0.0349442822995555, + "language_loss": 0.93869305, + "learning_rate": 0.00024017330569313128, + "loss": 0.95017779, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.75830078, + "step": 3554, + "time_per_iteration": 2.691981554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148981, + "balance_loss_mlp": 1.07287753, + "epoch": 0.6839168911119662, + "flos": 795523769856.0, + "grad_norm": 0.0402217191104916, + "language_loss": 0.80629432, + "learning_rate": 0.0002399071815087821, + "loss": 0.81778413, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.75976562, + "step": 3555, + "time_per_iteration": 2.984731912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148889, + "balance_loss_mlp": 1.07302415, + "epoch": 0.6841092727972297, + "flos": 581114406912.0, + "grad_norm": 0.035602777463953614, + "language_loss": 0.89145899, + "learning_rate": 0.00023964115830142025, + "loss": 0.9029479, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.75732422, + "step": 3556, + "time_per_iteration": 2.7377610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148814, + "balance_loss_mlp": 1.07294965, + "epoch": 0.6843016544824932, + "flos": 384595034112.0, + "grad_norm": 0.03918339808288278, + "language_loss": 0.92691845, + "learning_rate": 0.00023937523617432522, + "loss": 0.93840659, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.75732422, + "step": 3557, + "time_per_iteration": 2.571953535079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148872, + "balance_loss_mlp": 1.07305455, + "epoch": 0.6844940361677568, + "flos": 1441287845376.0, + "grad_norm": 0.033291217727089636, + "language_loss": 0.91850209, + "learning_rate": 0.00023910941523073705, + "loss": 0.92999083, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.75683594, + "step": 3558, + "time_per_iteration": 3.910876512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148946, + "balance_loss_mlp": 1.07317698, + "epoch": 0.6846864178530204, + "flos": 521899829760.0, + "grad_norm": 0.03402610589420279, + "language_loss": 0.9203999, + "learning_rate": 0.0002388436955738566, + "loss": 0.93188941, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.75634766, + "step": 3559, + "time_per_iteration": 2.6723177433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148516, + "balance_loss_mlp": 1.07279444, + "epoch": 0.6848787995382839, + "flos": 719228935680.0, + "grad_norm": 0.031030975541128533, + "language_loss": 0.86168528, + "learning_rate": 0.00023857807730684523, + "loss": 0.87317038, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.75585938, + "step": 3560, + "time_per_iteration": 2.90830135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114827, + "balance_loss_mlp": 1.07254827, + "epoch": 0.6850711812235475, + "flos": 512161565184.0, + "grad_norm": 0.040096201780059196, + "language_loss": 0.88262463, + "learning_rate": 0.00023831256053282547, + "loss": 0.89410734, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.75585938, + "step": 3561, + "time_per_iteration": 2.671116352081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148991, + "balance_loss_mlp": 1.07336485, + "epoch": 0.6852635629088111, + "flos": 669431712768.0, + "grad_norm": 0.03641568128756266, + "language_loss": 0.83697838, + "learning_rate": 0.00023804714535488003, + "loss": 0.8484683, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.75488281, + "step": 3562, + "time_per_iteration": 2.861722946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149231, + "balance_loss_mlp": 1.0756073, + "epoch": 0.6854559445940747, + "flos": 1526364395520.0, + "grad_norm": 0.005446048976110769, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80958861, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.73632812, + "step": 3563, + "time_per_iteration": 5.001219272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145764, + "balance_loss_mlp": 1.07037604, + "epoch": 0.6856483262793382, + "flos": 455137142784.0, + "grad_norm": 0.035220734339555373, + "language_loss": 0.86132681, + "learning_rate": 0.00023751662019934488, + "loss": 0.8727845, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.75244141, + "step": 3564, + "time_per_iteration": 2.4870924949645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146111, + "balance_loss_mlp": 1.07077074, + "epoch": 0.6858407079646017, + "flos": 616688269824.0, + "grad_norm": 0.032854756712223265, + "language_loss": 0.84736019, + "learning_rate": 0.00023725151042772364, + "loss": 0.85882127, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.75195312, + "step": 3565, + "time_per_iteration": 2.7391157150268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146, + "balance_loss_mlp": 1.07056403, + "epoch": 0.6860330896498653, + "flos": 467094486528.0, + "grad_norm": 0.03197662147757374, + "language_loss": 0.88051426, + "learning_rate": 0.00023698650266411276, + "loss": 0.89197421, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.75292969, + "step": 3566, + "time_per_iteration": 2.6070899963378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114589, + "balance_loss_mlp": 1.07054949, + "epoch": 0.6862254713351289, + "flos": 865838294016.0, + "grad_norm": 0.03137777844297811, + "language_loss": 0.88001108, + "learning_rate": 0.00023672159701139755, + "loss": 0.89146996, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.75195312, + "step": 3567, + "time_per_iteration": 3.252197504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145133, + "balance_loss_mlp": 1.06979275, + "epoch": 0.6864178530203925, + "flos": 448090590720.0, + "grad_norm": 0.03718741839919542, + "language_loss": 0.90576816, + "learning_rate": 0.00023645679357242296, + "loss": 0.91721952, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.75195312, + "step": 3568, + "time_per_iteration": 2.551252841949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146099, + "balance_loss_mlp": 1.07052052, + "epoch": 0.6866102347056561, + "flos": 425211945984.0, + "grad_norm": 0.041154591725143186, + "language_loss": 0.89051086, + "learning_rate": 0.00023619209244999534, + "loss": 0.90197182, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.75439453, + "step": 3569, + "time_per_iteration": 2.5833351612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148567, + "balance_loss_mlp": 1.07289267, + "epoch": 0.6868026163909196, + "flos": 473333306880.0, + "grad_norm": 0.045387721995194655, + "language_loss": 0.91211587, + "learning_rate": 0.0002359274937468806, + "loss": 0.92360151, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.75537109, + "step": 3570, + "time_per_iteration": 2.5472187995910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07258165, + "epoch": 0.6869949980761831, + "flos": 465205776384.0, + "grad_norm": 0.03150793163610154, + "language_loss": 0.82095093, + "learning_rate": 0.00023566299756580512, + "loss": 0.83243394, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.75585938, + "step": 3571, + "time_per_iteration": 2.65720534324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149363, + "balance_loss_mlp": 1.07364154, + "epoch": 0.6871873797614467, + "flos": 427130855424.0, + "grad_norm": 0.03812414034627887, + "language_loss": 0.83773518, + "learning_rate": 0.0002353986040094551, + "loss": 0.84922886, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.75585938, + "step": 3572, + "time_per_iteration": 2.5081918239593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150153, + "balance_loss_mlp": 1.07443094, + "epoch": 0.6873797614467103, + "flos": 444554216448.0, + "grad_norm": 0.03780966347325107, + "language_loss": 0.84840351, + "learning_rate": 0.00023513431318047796, + "loss": 0.859905, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.75585938, + "step": 3573, + "time_per_iteration": 2.5093369483947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151367, + "balance_loss_mlp": 1.07564497, + "epoch": 0.6875721431319738, + "flos": 993914388480.0, + "grad_norm": 0.03609225050037203, + "language_loss": 0.82789201, + "learning_rate": 0.00023487012518147977, + "loss": 0.83940566, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.75585938, + "step": 3574, + "time_per_iteration": 3.209183692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147663, + "balance_loss_mlp": 1.07194114, + "epoch": 0.6877645248172374, + "flos": 1287447284736.0, + "grad_norm": 0.03474054925627609, + "language_loss": 0.8951385, + "learning_rate": 0.00023460604011502772, + "loss": 0.90661514, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.75585938, + "step": 3575, + "time_per_iteration": 3.6102471351623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147349, + "balance_loss_mlp": 1.07162762, + "epoch": 0.687956906502501, + "flos": 878229339648.0, + "grad_norm": 0.03667268861696713, + "language_loss": 0.90602195, + "learning_rate": 0.00023434205808364845, + "loss": 0.91749543, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.75585938, + "step": 3576, + "time_per_iteration": 3.1072838306427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145679, + "balance_loss_mlp": 1.07014775, + "epoch": 0.6881492881877646, + "flos": 564470579712.0, + "grad_norm": 0.03470071742143998, + "language_loss": 0.90143359, + "learning_rate": 0.00023407817918982932, + "loss": 0.91289037, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.75390625, + "step": 3577, + "time_per_iteration": 2.7108538150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144131, + "balance_loss_mlp": 1.06869566, + "epoch": 0.6883416698730281, + "flos": 796509421056.0, + "grad_norm": 0.03216167904462723, + "language_loss": 0.83329225, + "learning_rate": 0.00023381440353601718, + "loss": 0.84473354, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.75292969, + "step": 3578, + "time_per_iteration": 3.00079345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144719, + "balance_loss_mlp": 1.06933129, + "epoch": 0.6885340515582916, + "flos": 724879603200.0, + "grad_norm": 0.03602954458915834, + "language_loss": 0.91766059, + "learning_rate": 0.00023355073122461822, + "loss": 0.92910779, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.75244141, + "step": 3579, + "time_per_iteration": 2.8793976306915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144346, + "balance_loss_mlp": 1.06891012, + "epoch": 0.6887264332435552, + "flos": 1012520785920.0, + "grad_norm": 0.032157968991135766, + "language_loss": 0.87754709, + "learning_rate": 0.00023328716235799973, + "loss": 0.88899052, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.75292969, + "step": 3580, + "time_per_iteration": 3.262232780456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145628, + "balance_loss_mlp": 1.07028747, + "epoch": 0.6889188149288188, + "flos": 586346108928.0, + "grad_norm": 0.030956213624598772, + "language_loss": 0.88613558, + "learning_rate": 0.00023302369703848803, + "loss": 0.89759183, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.75195312, + "step": 3581, + "time_per_iteration": 2.6781458854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155772, + "balance_loss_mlp": 1.08043158, + "epoch": 0.6891111966140824, + "flos": 637276703232.0, + "grad_norm": 0.03960885447101306, + "language_loss": 0.85706222, + "learning_rate": 0.00023276033536836937, + "loss": 0.86861998, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.75195312, + "step": 3582, + "time_per_iteration": 2.8019070625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155352, + "balance_loss_mlp": 1.08005941, + "epoch": 0.6893035782993459, + "flos": 496312008192.0, + "grad_norm": 0.03332092041619006, + "language_loss": 0.89310157, + "learning_rate": 0.00023249707744988984, + "loss": 0.9046551, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.75146484, + "step": 3583, + "time_per_iteration": 2.6462185382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.07421863, + "epoch": 0.6894959599846094, + "flos": 459148878336.0, + "grad_norm": 0.037983425016063846, + "language_loss": 0.88022619, + "learning_rate": 0.00023223392338525529, + "loss": 0.89172179, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.75195312, + "step": 3584, + "time_per_iteration": 2.493164539337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149866, + "balance_loss_mlp": 1.07457304, + "epoch": 0.689688341669873, + "flos": 506057003520.0, + "grad_norm": 0.03394886477629218, + "language_loss": 0.83439797, + "learning_rate": 0.00023197087327663107, + "loss": 0.84589666, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.75146484, + "step": 3585, + "time_per_iteration": 2.6373069286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149128, + "balance_loss_mlp": 1.0738833, + "epoch": 0.6898807233551366, + "flos": 765218539008.0, + "grad_norm": 0.04715187460336584, + "language_loss": 0.87040132, + "learning_rate": 0.00023170792722614243, + "loss": 0.88189256, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.75097656, + "step": 3586, + "time_per_iteration": 2.9102606773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147386, + "balance_loss_mlp": 1.07218862, + "epoch": 0.6900731050404002, + "flos": 584572918272.0, + "grad_norm": 0.029046800456262803, + "language_loss": 0.87808621, + "learning_rate": 0.00023144508533587377, + "loss": 0.88955998, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.75048828, + "step": 3587, + "time_per_iteration": 2.8061466217041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146464, + "balance_loss_mlp": 1.07112408, + "epoch": 0.6902654867256637, + "flos": 713204964864.0, + "grad_norm": 0.038780286956444227, + "language_loss": 0.83763909, + "learning_rate": 0.0002311823477078698, + "loss": 0.84910375, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.75195312, + "step": 3588, + "time_per_iteration": 2.943735122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145799, + "balance_loss_mlp": 1.0705539, + "epoch": 0.6904578684109273, + "flos": 598303452672.0, + "grad_norm": 0.03424930843273271, + "language_loss": 0.89383221, + "learning_rate": 0.00023091971444413428, + "loss": 0.90529013, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.75097656, + "step": 3589, + "time_per_iteration": 2.8112401962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144927, + "balance_loss_mlp": 1.06958711, + "epoch": 0.6906502500961909, + "flos": 586176921600.0, + "grad_norm": 0.03337983464568353, + "language_loss": 0.87353265, + "learning_rate": 0.00023065718564663012, + "loss": 0.88498187, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.75195312, + "step": 3590, + "time_per_iteration": 2.712702512741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148972, + "balance_loss_mlp": 1.0753479, + "epoch": 0.6908426317814544, + "flos": 1591140317184.0, + "grad_norm": 0.007217245787203084, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.75060558, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.73632812, + "step": 3591, + "time_per_iteration": 4.975476980209351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011435, + "balance_loss_mlp": 1.06830287, + "epoch": 0.6910350134667179, + "flos": 501804221952.0, + "grad_norm": 0.03486357436652247, + "language_loss": 0.85128838, + "learning_rate": 0.0002301324418579666, + "loss": 0.86272335, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.75048828, + "step": 3592, + "time_per_iteration": 2.6776154041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144028, + "balance_loss_mlp": 1.07040405, + "epoch": 0.6912273951519815, + "flos": 1412132901888.0, + "grad_norm": 0.003146877221363815, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.798325, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.73632812, + "step": 3593, + "time_per_iteration": 4.794835567474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143307, + "balance_loss_mlp": 1.06806242, + "epoch": 0.6914197768372451, + "flos": 636556293120.0, + "grad_norm": 0.03715032708342992, + "language_loss": 0.8555156, + "learning_rate": 0.00022960811715677415, + "loss": 0.86694872, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.75097656, + "step": 3594, + "time_per_iteration": 2.8951711654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147289, + "balance_loss_mlp": 1.07213938, + "epoch": 0.6916121585225087, + "flos": 559201947648.0, + "grad_norm": 0.03507172785049161, + "language_loss": 0.86282074, + "learning_rate": 0.00022934611221845608, + "loss": 0.87429363, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.75, + "step": 3595, + "time_per_iteration": 2.8272645473480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145219, + "balance_loss_mlp": 1.0699265, + "epoch": 0.6918045402077723, + "flos": 530292601344.0, + "grad_norm": 0.04349078621871699, + "language_loss": 0.82568008, + "learning_rate": 0.00022908421235729609, + "loss": 0.83713228, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.75146484, + "step": 3596, + "time_per_iteration": 2.7838826179504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146035, + "balance_loss_mlp": 1.07074213, + "epoch": 0.6919969218930357, + "flos": 571425807360.0, + "grad_norm": 0.03178884209281711, + "language_loss": 0.89899623, + "learning_rate": 0.0002288224176749728, + "loss": 0.9104566, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.75146484, + "step": 3597, + "time_per_iteration": 2.6271378993988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114544, + "balance_loss_mlp": 1.07009995, + "epoch": 0.6921893035782993, + "flos": 684503737344.0, + "grad_norm": 0.040516365330590415, + "language_loss": 0.84238005, + "learning_rate": 0.00022856072827312385, + "loss": 0.85383451, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.75195312, + "step": 3598, + "time_per_iteration": 2.8102614879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145578, + "balance_loss_mlp": 1.07028556, + "epoch": 0.6923816852635629, + "flos": 547793825280.0, + "grad_norm": 0.038084466235788844, + "language_loss": 0.82715267, + "learning_rate": 0.00022829914425334598, + "loss": 0.83860844, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.75146484, + "step": 3599, + "time_per_iteration": 2.6669743061065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.06852686, + "epoch": 0.6925740669488265, + "flos": 511056391680.0, + "grad_norm": 0.034117111871926384, + "language_loss": 0.85557401, + "learning_rate": 0.0002280376657171956, + "loss": 0.86701274, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.75195312, + "step": 3600, + "time_per_iteration": 2.655038356781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144019, + "balance_loss_mlp": 1.0685358, + "epoch": 0.69276644863409, + "flos": 870913543680.0, + "grad_norm": 0.03423377398605859, + "language_loss": 0.81733924, + "learning_rate": 0.00022777629276618706, + "loss": 0.82877946, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.75341797, + "step": 3601, + "time_per_iteration": 3.1143221855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114446, + "balance_loss_mlp": 1.06897676, + "epoch": 0.6929588303193536, + "flos": 626917358592.0, + "grad_norm": 0.03471097371374876, + "language_loss": 0.82267404, + "learning_rate": 0.0002275150255017947, + "loss": 0.8341186, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.75341797, + "step": 3602, + "time_per_iteration": 2.7638230323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149361, + "balance_loss_mlp": 1.07592773, + "epoch": 0.6931512120046172, + "flos": 1548804609024.0, + "grad_norm": 0.009029231118545568, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76882035, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.734375, + "step": 3603, + "time_per_iteration": 5.028877019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.09183502, + "epoch": 0.6933435936898807, + "flos": 1451323729920.0, + "grad_norm": 0.01657275533774484, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76292562, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.73632812, + "step": 3604, + "time_per_iteration": 4.7287609577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157905, + "balance_loss_mlp": 1.08204055, + "epoch": 0.6935359753751443, + "flos": 541930309632.0, + "grad_norm": 0.03919534439322985, + "language_loss": 0.90026039, + "learning_rate": 0.0002267318588424379, + "loss": 0.91183943, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.75732422, + "step": 3605, + "time_per_iteration": 2.6615920066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150618, + "balance_loss_mlp": 1.07484841, + "epoch": 0.6937283570604078, + "flos": 720689948160.0, + "grad_norm": 0.03558950704948247, + "language_loss": 0.91988891, + "learning_rate": 0.00022647101533842845, + "loss": 0.93139505, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.75634766, + "step": 3606, + "time_per_iteration": 2.875670909881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152588, + "balance_loss_mlp": 1.07658041, + "epoch": 0.6939207387456714, + "flos": 523193656320.0, + "grad_norm": 0.041224980702036104, + "language_loss": 0.83253193, + "learning_rate": 0.00022621027802778872, + "loss": 0.84405786, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.75878906, + "step": 3607, + "time_per_iteration": 2.6125805377960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151349, + "balance_loss_mlp": 1.07519805, + "epoch": 0.694113120430935, + "flos": 536401165824.0, + "grad_norm": 0.03463828866617186, + "language_loss": 0.85144913, + "learning_rate": 0.00022594964701174586, + "loss": 0.86296266, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.76025391, + "step": 3608, + "time_per_iteration": 2.6021461486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150974, + "balance_loss_mlp": 1.07496643, + "epoch": 0.6943055021161986, + "flos": 524394157056.0, + "grad_norm": 0.03515633419070769, + "language_loss": 0.89070058, + "learning_rate": 0.00022568912239148586, + "loss": 0.9022103, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.75878906, + "step": 3609, + "time_per_iteration": 2.636577844619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.06904817, + "epoch": 0.694497883801462, + "flos": 485970127872.0, + "grad_norm": 0.037176872987451946, + "language_loss": 0.86671317, + "learning_rate": 0.00022542870426815344, + "loss": 0.87816465, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.75976562, + "step": 3610, + "time_per_iteration": 2.6800506114959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114419, + "balance_loss_mlp": 1.06818187, + "epoch": 0.6946902654867256, + "flos": 462424740864.0, + "grad_norm": 0.03708376402785258, + "language_loss": 0.9062373, + "learning_rate": 0.00022516839274285173, + "loss": 0.91767919, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.75878906, + "step": 3611, + "time_per_iteration": 2.516231060028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144878, + "balance_loss_mlp": 1.06906128, + "epoch": 0.6948826471719892, + "flos": 513867626496.0, + "grad_norm": 0.032040517416043905, + "language_loss": 0.80424583, + "learning_rate": 0.00022490818791664265, + "loss": 0.81569457, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.75683594, + "step": 3612, + "time_per_iteration": 2.5825564861297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.07768571, + "epoch": 0.6950750288572528, + "flos": 558255227904.0, + "grad_norm": 0.03220148028893399, + "language_loss": 0.90256339, + "learning_rate": 0.00022464808989054676, + "loss": 0.91409791, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.75634766, + "step": 3613, + "time_per_iteration": 2.673570394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.07763827, + "epoch": 0.6952674105425164, + "flos": 543521577984.0, + "grad_norm": 0.03708971382778387, + "language_loss": 0.80475914, + "learning_rate": 0.00022438809876554284, + "loss": 0.81629372, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.75683594, + "step": 3614, + "time_per_iteration": 2.6276586055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114766, + "balance_loss_mlp": 1.07179534, + "epoch": 0.6954597922277799, + "flos": 547856951808.0, + "grad_norm": 0.035809532178513556, + "language_loss": 0.85295904, + "learning_rate": 0.00022412821464256873, + "loss": 0.86443567, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.75732422, + "step": 3615, + "time_per_iteration": 2.675262689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144404, + "balance_loss_mlp": 1.06887305, + "epoch": 0.6956521739130435, + "flos": 520540875264.0, + "grad_norm": 0.03660154684653836, + "language_loss": 0.87111717, + "learning_rate": 0.00022386843762252023, + "loss": 0.88256121, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.75390625, + "step": 3616, + "time_per_iteration": 2.601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145995, + "balance_loss_mlp": 1.07055974, + "epoch": 0.695844555598307, + "flos": 467263673856.0, + "grad_norm": 0.03600236468041408, + "language_loss": 0.85243946, + "learning_rate": 0.00022360876780625193, + "loss": 0.86389947, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.75292969, + "step": 3617, + "time_per_iteration": 2.6009066104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146046, + "balance_loss_mlp": 1.0705148, + "epoch": 0.6960369372835706, + "flos": 601931151360.0, + "grad_norm": 0.03135963801145649, + "language_loss": 0.84376919, + "learning_rate": 0.00022334920529457604, + "loss": 0.85522962, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.75390625, + "step": 3618, + "time_per_iteration": 2.919830322265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152209, + "balance_loss_mlp": 1.07662988, + "epoch": 0.6962293189688342, + "flos": 645465358848.0, + "grad_norm": 0.03118514394285757, + "language_loss": 0.91862655, + "learning_rate": 0.00022308975018826423, + "loss": 0.9301486, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.75439453, + "step": 3619, + "time_per_iteration": 2.8989925384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152559, + "balance_loss_mlp": 1.07688463, + "epoch": 0.6964217006540977, + "flos": 639957682176.0, + "grad_norm": 0.03812258215137557, + "language_loss": 0.9018597, + "learning_rate": 0.00022283040258804564, + "loss": 0.91338527, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.75537109, + "step": 3620, + "time_per_iteration": 2.74235200881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115248, + "balance_loss_mlp": 1.07680559, + "epoch": 0.6966140823393613, + "flos": 653386771968.0, + "grad_norm": 0.03521446946003712, + "language_loss": 0.88482189, + "learning_rate": 0.00022257116259460802, + "loss": 0.89634669, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.75537109, + "step": 3621, + "time_per_iteration": 2.819164991378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152109, + "balance_loss_mlp": 1.07657778, + "epoch": 0.6968064640246249, + "flos": 705824040960.0, + "grad_norm": 0.033483575769838334, + "language_loss": 0.86131644, + "learning_rate": 0.00022231203030859725, + "loss": 0.87283748, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.75390625, + "step": 3622, + "time_per_iteration": 2.9764678478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.07596815, + "epoch": 0.6969988457098885, + "flos": 493530972672.0, + "grad_norm": 0.03689827849321225, + "language_loss": 0.88673711, + "learning_rate": 0.00022205300583061737, + "loss": 0.89825207, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.75390625, + "step": 3623, + "time_per_iteration": 2.56077241897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160957, + "balance_loss_mlp": 1.08676147, + "epoch": 0.6971912273951519, + "flos": 1355612765184.0, + "grad_norm": 0.01051210233646139, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83999157, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.7421875, + "step": 3624, + "time_per_iteration": 4.901975393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_mlp": 1.07529247, + "epoch": 0.6973836090804155, + "flos": 603574086144.0, + "grad_norm": 0.03562483559578549, + "language_loss": 0.82784301, + "learning_rate": 0.00022153528070095735, + "loss": 0.83934939, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.75195312, + "step": 3625, + "time_per_iteration": 2.6827454566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147557, + "balance_loss_mlp": 1.07226419, + "epoch": 0.6975759907656791, + "flos": 525110564352.0, + "grad_norm": 0.03740891525888632, + "language_loss": 0.94177675, + "learning_rate": 0.00022127658025027568, + "loss": 0.95325232, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.75146484, + "step": 3626, + "time_per_iteration": 2.6243293285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.07014167, + "epoch": 0.6977683724509427, + "flos": 481877801472.0, + "grad_norm": 0.03606674013608827, + "language_loss": 0.91052938, + "learning_rate": 0.00022101798800962258, + "loss": 0.92198616, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.75390625, + "step": 3627, + "time_per_iteration": 2.585353374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145537, + "balance_loss_mlp": 1.07005322, + "epoch": 0.6979607541362063, + "flos": 523640819712.0, + "grad_norm": 0.043695073898502274, + "language_loss": 0.852063, + "learning_rate": 0.00022075950407939227, + "loss": 0.86351836, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.75341797, + "step": 3628, + "time_per_iteration": 2.6018002033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145624, + "balance_loss_mlp": 1.07023609, + "epoch": 0.6981531358214698, + "flos": 549115849728.0, + "grad_norm": 0.039500919644618576, + "language_loss": 0.87787813, + "learning_rate": 0.0002205011285599367, + "loss": 0.88933432, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.75244141, + "step": 3629, + "time_per_iteration": 2.6909217834472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114526, + "balance_loss_mlp": 1.06991994, + "epoch": 0.6983455175067333, + "flos": 701275819008.0, + "grad_norm": 0.03293425746388738, + "language_loss": 0.8505758, + "learning_rate": 0.00022024286155156658, + "loss": 0.86202836, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.75195312, + "step": 3630, + "time_per_iteration": 2.8668339252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145235, + "balance_loss_mlp": 1.07008553, + "epoch": 0.6985378991919969, + "flos": 486119849472.0, + "grad_norm": 0.03293145354984791, + "language_loss": 0.9093079, + "learning_rate": 0.00021998470315454994, + "loss": 0.92076027, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.75, + "step": 3631, + "time_per_iteration": 2.6536853313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145252, + "balance_loss_mlp": 1.07010257, + "epoch": 0.6987302808772605, + "flos": 559892158464.0, + "grad_norm": 0.03487739632649299, + "language_loss": 0.90976024, + "learning_rate": 0.00021972665346911275, + "loss": 0.92121279, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.75, + "step": 3632, + "time_per_iteration": 2.705947160720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145801, + "balance_loss_mlp": 1.07046092, + "epoch": 0.698922662562524, + "flos": 484567512576.0, + "grad_norm": 0.03530100295621196, + "language_loss": 0.84786582, + "learning_rate": 0.00021946871259543877, + "loss": 0.85932386, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.75195312, + "step": 3633, + "time_per_iteration": 2.585474729537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146213, + "balance_loss_mlp": 1.07106328, + "epoch": 0.6991150442477876, + "flos": 720205854720.0, + "grad_norm": 0.031838987726816204, + "language_loss": 0.87710065, + "learning_rate": 0.00021921088063366957, + "loss": 0.88856274, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.75, + "step": 3634, + "time_per_iteration": 2.9367825984954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0748167, + "epoch": 0.6993074259330512, + "flos": 490159782912.0, + "grad_norm": 0.031688179497796835, + "language_loss": 0.86258936, + "learning_rate": 0.00021895315768390435, + "loss": 0.87408948, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.75048828, + "step": 3635, + "time_per_iteration": 2.6028146743774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150052, + "balance_loss_mlp": 1.07490218, + "epoch": 0.6994998076183148, + "flos": 719467980288.0, + "grad_norm": 0.03153013749596923, + "language_loss": 0.92548811, + "learning_rate": 0.00021869554384619999, + "loss": 0.93698871, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.75, + "step": 3636, + "time_per_iteration": 2.998966932296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146553, + "balance_loss_mlp": 1.07126021, + "epoch": 0.6996921893035783, + "flos": 580163684352.0, + "grad_norm": 0.03271766083883028, + "language_loss": 0.86055148, + "learning_rate": 0.00021843803922057115, + "loss": 0.87201703, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.75146484, + "step": 3637, + "time_per_iteration": 2.745859384536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145131, + "balance_loss_mlp": 1.06983805, + "epoch": 0.6998845709888418, + "flos": 519674746368.0, + "grad_norm": 0.033737468180216806, + "language_loss": 0.86839747, + "learning_rate": 0.00021818064390698977, + "loss": 0.87984878, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.75146484, + "step": 3638, + "time_per_iteration": 2.632795810699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146648, + "balance_loss_mlp": 1.07130754, + "epoch": 0.7000769526741054, + "flos": 622095889920.0, + "grad_norm": 0.03373596031982573, + "language_loss": 0.91870159, + "learning_rate": 0.0002179233580053861, + "loss": 0.93016809, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.75195312, + "step": 3639, + "time_per_iteration": 2.753880023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115047, + "balance_loss_mlp": 1.07512987, + "epoch": 0.700269334359369, + "flos": 561055729152.0, + "grad_norm": 0.03325206970104953, + "language_loss": 0.90108448, + "learning_rate": 0.00021766618161564688, + "loss": 0.91258919, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.75195312, + "step": 3640, + "time_per_iteration": 2.724479913711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114817, + "balance_loss_mlp": 1.07273436, + "epoch": 0.7004617160446326, + "flos": 484361395200.0, + "grad_norm": 0.03152672477913245, + "language_loss": 0.91440845, + "learning_rate": 0.00021740911483761677, + "loss": 0.92589015, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.75292969, + "step": 3641, + "time_per_iteration": 2.5502066612243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146714, + "balance_loss_mlp": 1.07137418, + "epoch": 0.7006540977298961, + "flos": 698321593344.0, + "grad_norm": 0.030766047541437955, + "language_loss": 0.95812565, + "learning_rate": 0.00021715215777109837, + "loss": 0.96959281, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.75195312, + "step": 3642, + "time_per_iteration": 2.9363698959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150063, + "balance_loss_mlp": 1.07477081, + "epoch": 0.7008464794151597, + "flos": 505770295296.0, + "grad_norm": 0.03557511475331178, + "language_loss": 0.88907003, + "learning_rate": 0.00021689531051585103, + "loss": 0.90057063, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.75146484, + "step": 3643, + "time_per_iteration": 2.6452667713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150173, + "balance_loss_mlp": 1.07483232, + "epoch": 0.7010388611004232, + "flos": 538272411648.0, + "grad_norm": 0.036527368416016295, + "language_loss": 0.85649168, + "learning_rate": 0.00021663857317159196, + "loss": 0.86799347, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.75195312, + "step": 3644, + "time_per_iteration": 2.661463499069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149991, + "balance_loss_mlp": 1.07465088, + "epoch": 0.7012312427856868, + "flos": 548314848768.0, + "grad_norm": 0.031074257387366924, + "language_loss": 0.86441541, + "learning_rate": 0.00021638194583799487, + "loss": 0.87591535, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.75195312, + "step": 3645, + "time_per_iteration": 2.6630945205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114701, + "balance_loss_mlp": 1.07166946, + "epoch": 0.7014236244709504, + "flos": 942973060608.0, + "grad_norm": 0.03710031332944713, + "language_loss": 0.87637782, + "learning_rate": 0.00021612542861469176, + "loss": 0.8878479, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.75195312, + "step": 3646, + "time_per_iteration": 3.1664998531341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146595, + "balance_loss_mlp": 1.07120693, + "epoch": 0.7016160061562139, + "flos": 526209007104.0, + "grad_norm": 0.036568631884181475, + "language_loss": 0.87361133, + "learning_rate": 0.00021586902160127135, + "loss": 0.88507724, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.75244141, + "step": 3647, + "time_per_iteration": 2.588329792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145686, + "balance_loss_mlp": 1.07029808, + "epoch": 0.7018083878414775, + "flos": 374244421632.0, + "grad_norm": 0.046770994216465425, + "language_loss": 0.81241143, + "learning_rate": 0.00021561272489727974, + "loss": 0.82386827, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.75244141, + "step": 3648, + "time_per_iteration": 2.4180006980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145708, + "balance_loss_mlp": 1.07036817, + "epoch": 0.7020007695267411, + "flos": 528833590272.0, + "grad_norm": 0.03433939193961528, + "language_loss": 0.86265445, + "learning_rate": 0.0002153565386022199, + "loss": 0.87411153, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.75195312, + "step": 3649, + "time_per_iteration": 2.6287925243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146273, + "balance_loss_mlp": 1.07093239, + "epoch": 0.7021931512120047, + "flos": 691372369920.0, + "grad_norm": 0.0338942783378883, + "language_loss": 0.87374359, + "learning_rate": 0.00021510046281555262, + "loss": 0.88520634, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.75195312, + "step": 3650, + "time_per_iteration": 2.8249292373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.0704776, + "epoch": 0.7023855328972681, + "flos": 640925869056.0, + "grad_norm": 0.04142301274986203, + "language_loss": 0.87215114, + "learning_rate": 0.0002148444976366949, + "loss": 0.88360929, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.75195312, + "step": 3651, + "time_per_iteration": 2.7713325023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148886, + "balance_loss_mlp": 1.07368851, + "epoch": 0.7025779145825317, + "flos": 562006451712.0, + "grad_norm": 0.03240472166532918, + "language_loss": 0.87441784, + "learning_rate": 0.00021458864316502136, + "loss": 0.8859067, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.75048828, + "step": 3652, + "time_per_iteration": 2.729938268661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147969, + "balance_loss_mlp": 1.07267606, + "epoch": 0.7027702962677953, + "flos": 448370568192.0, + "grad_norm": 0.03662771353243768, + "language_loss": 0.92350411, + "learning_rate": 0.0002143328994998634, + "loss": 0.93498379, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.75146484, + "step": 3653, + "time_per_iteration": 2.4846644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147539, + "balance_loss_mlp": 1.07210338, + "epoch": 0.7029626779530589, + "flos": 623713354752.0, + "grad_norm": 0.03664764199554111, + "language_loss": 0.83479095, + "learning_rate": 0.00021407726674050982, + "loss": 0.84626639, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.75292969, + "step": 3654, + "time_per_iteration": 2.850576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145188, + "balance_loss_mlp": 1.07003856, + "epoch": 0.7031550596383225, + "flos": 630733710336.0, + "grad_norm": 0.030002783226809063, + "language_loss": 0.91781414, + "learning_rate": 0.0002138217449862061, + "loss": 0.92926598, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.75, + "step": 3655, + "time_per_iteration": 2.7412569522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145204, + "balance_loss_mlp": 1.07000697, + "epoch": 0.703347441323586, + "flos": 531859674624.0, + "grad_norm": 0.03278089952227313, + "language_loss": 0.82951868, + "learning_rate": 0.00021356633433615403, + "loss": 0.84097064, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.75048828, + "step": 3656, + "time_per_iteration": 2.6387276649475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144911, + "balance_loss_mlp": 1.06971395, + "epoch": 0.7035398230088495, + "flos": 694915474944.0, + "grad_norm": 0.029068288031651398, + "language_loss": 0.87720138, + "learning_rate": 0.0002133110348895133, + "loss": 0.88865048, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.75048828, + "step": 3657, + "time_per_iteration": 2.993046998977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146816, + "balance_loss_mlp": 1.07152295, + "epoch": 0.7037322046941131, + "flos": 969666055680.0, + "grad_norm": 0.030671197457474774, + "language_loss": 0.89195395, + "learning_rate": 0.0002130558467453999, + "loss": 0.90342212, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.75146484, + "step": 3658, + "time_per_iteration": 3.3705010414123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146747, + "balance_loss_mlp": 1.07131183, + "epoch": 0.7039245863793767, + "flos": 503925245952.0, + "grad_norm": 0.03300080382210099, + "language_loss": 0.88645768, + "learning_rate": 0.0002128007700028865, + "loss": 0.89792514, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.75292969, + "step": 3659, + "time_per_iteration": 2.734318256378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148886, + "balance_loss_mlp": 1.07368839, + "epoch": 0.7041169680646402, + "flos": 466938034176.0, + "grad_norm": 0.036833825821468186, + "language_loss": 0.89132273, + "learning_rate": 0.00021254580476100276, + "loss": 0.90281165, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.75048828, + "step": 3660, + "time_per_iteration": 2.5174009799957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149342, + "balance_loss_mlp": 1.07409692, + "epoch": 0.7043093497499038, + "flos": 633321363456.0, + "grad_norm": 0.04007789586728335, + "language_loss": 0.83207953, + "learning_rate": 0.00021229095111873497, + "loss": 0.84357297, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.75097656, + "step": 3661, + "time_per_iteration": 2.739220142364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.07466638, + "epoch": 0.7045017314351674, + "flos": 544094994432.0, + "grad_norm": 0.03298817995700549, + "language_loss": 0.90804625, + "learning_rate": 0.0002120362091750261, + "loss": 0.91954637, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.75195312, + "step": 3662, + "time_per_iteration": 2.7960565090179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146981, + "balance_loss_mlp": 1.07149768, + "epoch": 0.704694113120431, + "flos": 429141089280.0, + "grad_norm": 0.039212871672660514, + "language_loss": 0.92362261, + "learning_rate": 0.00021178157902877566, + "loss": 0.93509239, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.75341797, + "step": 3663, + "time_per_iteration": 2.4680960178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147972, + "balance_loss_mlp": 1.07263219, + "epoch": 0.7048864948056945, + "flos": 651712911360.0, + "grad_norm": 0.034682408130930084, + "language_loss": 0.9230448, + "learning_rate": 0.0002115270607788397, + "loss": 0.93452454, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.75195312, + "step": 3664, + "time_per_iteration": 2.775634288787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149534, + "balance_loss_mlp": 1.07414639, + "epoch": 0.705078876490958, + "flos": 413493646848.0, + "grad_norm": 0.03365445853786745, + "language_loss": 0.90348285, + "learning_rate": 0.00021127265452403133, + "loss": 0.91497815, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.75244141, + "step": 3665, + "time_per_iteration": 2.4944612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07958984, + "epoch": 0.7052712581762216, + "flos": 1423148255232.0, + "grad_norm": 0.008450912797082885, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85245037, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.73828125, + "step": 3666, + "time_per_iteration": 4.8742945194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147318, + "balance_loss_mlp": 1.07188284, + "epoch": 0.7054636398614852, + "flos": 494069460480.0, + "grad_norm": 0.03621564888049926, + "language_loss": 0.8791604, + "learning_rate": 0.00021076417839483065, + "loss": 0.89063358, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.75292969, + "step": 3667, + "time_per_iteration": 2.8080356121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145091, + "balance_loss_mlp": 1.06965578, + "epoch": 0.7056560215467488, + "flos": 451377186816.0, + "grad_norm": 0.031611332246536214, + "language_loss": 0.89408493, + "learning_rate": 0.00021051010871784589, + "loss": 0.90553588, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.75292969, + "step": 3668, + "time_per_iteration": 2.57733154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145739, + "balance_loss_mlp": 1.07039869, + "epoch": 0.7058484032320124, + "flos": 566817186816.0, + "grad_norm": 0.030127652842763482, + "language_loss": 0.83471566, + "learning_rate": 0.0002102561514308045, + "loss": 0.84617305, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.75195312, + "step": 3669, + "time_per_iteration": 2.742791175842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144151, + "balance_loss_mlp": 1.06881058, + "epoch": 0.7060407849172758, + "flos": 568102281216.0, + "grad_norm": 0.033895396428982545, + "language_loss": 0.87930894, + "learning_rate": 0.00021000230663230135, + "loss": 0.89075041, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.75195312, + "step": 3670, + "time_per_iteration": 2.667344331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143185, + "balance_loss_mlp": 1.06779695, + "epoch": 0.7062331666025394, + "flos": 469712338944.0, + "grad_norm": 0.03501215574939966, + "language_loss": 0.88139564, + "learning_rate": 0.00020974857442088762, + "loss": 0.89282751, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.75244141, + "step": 3671, + "time_per_iteration": 2.6410346031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143999, + "balance_loss_mlp": 1.06861079, + "epoch": 0.706425548287803, + "flos": 596416743936.0, + "grad_norm": 0.033800210787899305, + "language_loss": 0.93517375, + "learning_rate": 0.00020949495489507104, + "loss": 0.94661367, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.75244141, + "step": 3672, + "time_per_iteration": 2.750444173812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143883, + "balance_loss_mlp": 1.0685432, + "epoch": 0.7066179299730666, + "flos": 476813285376.0, + "grad_norm": 0.035802140613359776, + "language_loss": 0.90171611, + "learning_rate": 0.00020924144815331525, + "loss": 0.91315496, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.75195312, + "step": 3673, + "time_per_iteration": 2.553835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.0689013, + "epoch": 0.7068103116583301, + "flos": 507435423744.0, + "grad_norm": 0.037241628897294654, + "language_loss": 0.87898988, + "learning_rate": 0.00020898805429404044, + "loss": 0.8904314, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.75097656, + "step": 3674, + "time_per_iteration": 2.586620330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114411, + "balance_loss_mlp": 1.06905568, + "epoch": 0.7070026933435937, + "flos": 680574594048.0, + "grad_norm": 0.03737000823174173, + "language_loss": 0.83904374, + "learning_rate": 0.0002087347734156228, + "loss": 0.85048485, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.74902344, + "step": 3675, + "time_per_iteration": 2.882800579071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144169, + "balance_loss_mlp": 1.06906736, + "epoch": 0.7071950750288573, + "flos": 473166120960.0, + "grad_norm": 0.03475094948464188, + "language_loss": 0.84385908, + "learning_rate": 0.00020848160561639452, + "loss": 0.85530072, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.74951172, + "step": 3676, + "time_per_iteration": 2.6969666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149349, + "balance_loss_mlp": 1.07429469, + "epoch": 0.7073874567141208, + "flos": 474683529216.0, + "grad_norm": 0.03052777669540167, + "language_loss": 0.90233761, + "learning_rate": 0.0002082285509946445, + "loss": 0.91383111, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.74902344, + "step": 3677, + "time_per_iteration": 2.546494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152207, + "balance_loss_mlp": 1.07710516, + "epoch": 0.7075798383993844, + "flos": 547036485120.0, + "grad_norm": 0.03113462016358252, + "language_loss": 0.87627769, + "learning_rate": 0.00020797560964861683, + "loss": 0.88779974, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.74951172, + "step": 3678, + "time_per_iteration": 2.745973587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150766, + "balance_loss_mlp": 1.07585537, + "epoch": 0.7077722200846479, + "flos": 663390277632.0, + "grad_norm": 0.06964386826372344, + "language_loss": 0.85110044, + "learning_rate": 0.0002077227816765122, + "loss": 0.86260808, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.74755859, + "step": 3679, + "time_per_iteration": 2.982367753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115432, + "balance_loss_mlp": 1.08107758, + "epoch": 0.7079646017699115, + "flos": 1533300157440.0, + "grad_norm": 0.007004763795919161, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77602041, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.73242188, + "step": 3680, + "time_per_iteration": 4.8018670082092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147209, + "balance_loss_mlp": 1.07224989, + "epoch": 0.7081569834551751, + "flos": 622645111296.0, + "grad_norm": 0.030610109660701587, + "language_loss": 0.83047998, + "learning_rate": 0.00020721746624665383, + "loss": 0.84195209, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.74804688, + "step": 3681, + "time_per_iteration": 2.782902717590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147174, + "balance_loss_mlp": 1.07207251, + "epoch": 0.7083493651404387, + "flos": 796034059776.0, + "grad_norm": 0.03164783844829979, + "language_loss": 0.84436798, + "learning_rate": 0.00020696497898508114, + "loss": 0.85583979, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.74951172, + "step": 3682, + "time_per_iteration": 3.0583677291870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143785, + "balance_loss_mlp": 1.06882644, + "epoch": 0.7085417468257021, + "flos": 815161480704.0, + "grad_norm": 0.03682994028404894, + "language_loss": 0.82170761, + "learning_rate": 0.00020671260548979316, + "loss": 0.83314544, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.74804688, + "step": 3683, + "time_per_iteration": 2.987361192703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144169, + "balance_loss_mlp": 1.06911492, + "epoch": 0.7087341285109657, + "flos": 701796842496.0, + "grad_norm": 0.03866478361298153, + "language_loss": 0.90972751, + "learning_rate": 0.00020646034585876982, + "loss": 0.92116916, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.74902344, + "step": 3684, + "time_per_iteration": 2.810547351837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144506, + "balance_loss_mlp": 1.06954765, + "epoch": 0.7089265101962293, + "flos": 597734765568.0, + "grad_norm": 0.031076054714904006, + "language_loss": 0.88290167, + "learning_rate": 0.00020620820018994718, + "loss": 0.89434671, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.74804688, + "step": 3685, + "time_per_iteration": 2.822174310684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147475, + "balance_loss_mlp": 1.07246852, + "epoch": 0.7091188918814929, + "flos": 488167013376.0, + "grad_norm": 0.047855359590775554, + "language_loss": 0.88914609, + "learning_rate": 0.00020595616858121675, + "loss": 0.90062082, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.74853516, + "step": 3686, + "time_per_iteration": 2.7043378353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149664, + "balance_loss_mlp": 1.07470512, + "epoch": 0.7093112735667565, + "flos": 601255676928.0, + "grad_norm": 0.0443498852923524, + "language_loss": 0.85199845, + "learning_rate": 0.00020570425113042586, + "loss": 0.86349511, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.74804688, + "step": 3687, + "time_per_iteration": 2.702566623687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152357, + "balance_loss_mlp": 1.07754159, + "epoch": 0.70950365525202, + "flos": 506849272320.0, + "grad_norm": 0.040092967224601664, + "language_loss": 0.90721941, + "learning_rate": 0.0002054524479353776, + "loss": 0.91874295, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.74707031, + "step": 3688, + "time_per_iteration": 2.667358636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147763, + "balance_loss_mlp": 1.07294738, + "epoch": 0.7096960369372836, + "flos": 733424097792.0, + "grad_norm": 0.04032937797632071, + "language_loss": 0.86300701, + "learning_rate": 0.00020520075909383063, + "loss": 0.87448466, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.74707031, + "step": 3689, + "time_per_iteration": 2.829561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145291, + "balance_loss_mlp": 1.07033193, + "epoch": 0.7098884186225471, + "flos": 973651594752.0, + "grad_norm": 0.03422835744235037, + "language_loss": 0.85456049, + "learning_rate": 0.00020494918470349916, + "loss": 0.86601341, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.74804688, + "step": 3690, + "time_per_iteration": 3.2887604236602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147533, + "balance_loss_mlp": 1.0725745, + "epoch": 0.7100808003078107, + "flos": 505258003968.0, + "grad_norm": 0.040153245329332135, + "language_loss": 0.91447139, + "learning_rate": 0.00020469772486205297, + "loss": 0.92594671, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.74804688, + "step": 3691, + "time_per_iteration": 2.7245473861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148215, + "balance_loss_mlp": 1.07344735, + "epoch": 0.7102731819930742, + "flos": 541389820416.0, + "grad_norm": 0.03217926950478085, + "language_loss": 0.86047411, + "learning_rate": 0.0002044463796671177, + "loss": 0.87195623, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.74609375, + "step": 3692, + "time_per_iteration": 2.651794910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148314, + "balance_loss_mlp": 1.07330716, + "epoch": 0.7104655636783378, + "flos": 621627259392.0, + "grad_norm": 0.03360219211678542, + "language_loss": 0.85673523, + "learning_rate": 0.00020419514921627408, + "loss": 0.86821842, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.74853516, + "step": 3693, + "time_per_iteration": 2.933528184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147632, + "balance_loss_mlp": 1.07267368, + "epoch": 0.7106579453636014, + "flos": 558376751616.0, + "grad_norm": 0.03878231917046877, + "language_loss": 0.82689238, + "learning_rate": 0.00020394403360705855, + "loss": 0.83836865, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.74804688, + "step": 3694, + "time_per_iteration": 2.717163324356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114284, + "balance_loss_mlp": 1.06788099, + "epoch": 0.710850327048865, + "flos": 514063010304.0, + "grad_norm": 0.03670457803793717, + "language_loss": 0.93433875, + "learning_rate": 0.00020369303293696228, + "loss": 0.9457671, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.74804688, + "step": 3695, + "time_per_iteration": 2.591191053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144917, + "balance_loss_mlp": 1.06995821, + "epoch": 0.7110427087341286, + "flos": 424506272256.0, + "grad_norm": 0.04020330353774376, + "language_loss": 0.83559984, + "learning_rate": 0.00020344214730343304, + "loss": 0.847049, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.74804688, + "step": 3696, + "time_per_iteration": 2.591609001159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145099, + "balance_loss_mlp": 1.07014048, + "epoch": 0.711235090419392, + "flos": 578653006848.0, + "grad_norm": 0.02808433050647353, + "language_loss": 0.83313894, + "learning_rate": 0.00020319137680387296, + "loss": 0.84458989, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.74804688, + "step": 3697, + "time_per_iteration": 2.950737953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.07063317, + "epoch": 0.7114274721046556, + "flos": 448984917504.0, + "grad_norm": 0.03843897473466325, + "language_loss": 0.86332655, + "learning_rate": 0.0002029407215356398, + "loss": 0.8747834, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.74902344, + "step": 3698, + "time_per_iteration": 2.578458309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145595, + "balance_loss_mlp": 1.07063591, + "epoch": 0.7116198537899192, + "flos": 623092274688.0, + "grad_norm": 0.03606756354447633, + "language_loss": 0.88161683, + "learning_rate": 0.00020269018159604663, + "loss": 0.89307278, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.74804688, + "step": 3699, + "time_per_iteration": 2.7380590438842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145077, + "balance_loss_mlp": 1.07007015, + "epoch": 0.7118122354751828, + "flos": 499720128000.0, + "grad_norm": 0.030764308679153148, + "language_loss": 0.86152577, + "learning_rate": 0.00020243975708236162, + "loss": 0.87297654, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.74853516, + "step": 3700, + "time_per_iteration": 2.5728888511657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146347, + "balance_loss_mlp": 1.07134008, + "epoch": 0.7120046171604463, + "flos": 573844273152.0, + "grad_norm": 0.03285972243825597, + "language_loss": 0.90220731, + "learning_rate": 0.00020218944809180818, + "loss": 0.91367078, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.74853516, + "step": 3701, + "time_per_iteration": 2.684532880783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146223, + "balance_loss_mlp": 1.07116926, + "epoch": 0.7121969988457099, + "flos": 573770413056.0, + "grad_norm": 0.03115747571146437, + "language_loss": 0.89376664, + "learning_rate": 0.00020193925472156493, + "loss": 0.90522885, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.74902344, + "step": 3702, + "time_per_iteration": 2.6705996990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152153, + "balance_loss_mlp": 1.07910156, + "epoch": 0.7123893805309734, + "flos": 1526820291072.0, + "grad_norm": 0.004701938060017763, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75441325, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.73046875, + "step": 3703, + "time_per_iteration": 4.916099309921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154733, + "balance_loss_mlp": 1.07958353, + "epoch": 0.712581762216237, + "flos": 616413021696.0, + "grad_norm": 0.031775345220902064, + "language_loss": 0.87929761, + "learning_rate": 0.00020143921523049863, + "loss": 0.89084488, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.75, + "step": 3704, + "time_per_iteration": 2.913417339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115426, + "balance_loss_mlp": 1.07915783, + "epoch": 0.7127741439015006, + "flos": 598874141184.0, + "grad_norm": 0.035207007977916, + "language_loss": 0.88667476, + "learning_rate": 0.00020118936930380837, + "loss": 0.89821732, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.74951172, + "step": 3705, + "time_per_iteration": 2.7526493072509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144386, + "balance_loss_mlp": 1.06928408, + "epoch": 0.7129665255867641, + "flos": 538439597568.0, + "grad_norm": 0.036308279292938186, + "language_loss": 0.86138499, + "learning_rate": 0.0002009396393856932, + "loss": 0.87282884, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.74951172, + "step": 3706, + "time_per_iteration": 2.6750972270965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147628, + "balance_loss_mlp": 1.07243121, + "epoch": 0.7131589072720277, + "flos": 527520297984.0, + "grad_norm": 0.03563284623765711, + "language_loss": 0.87550783, + "learning_rate": 0.00020069002557310673, + "loss": 0.88698411, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.75048828, + "step": 3707, + "time_per_iteration": 2.6487066745758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149229, + "balance_loss_mlp": 1.0741272, + "epoch": 0.7133512889572913, + "flos": 532096717824.0, + "grad_norm": 0.031192275434881008, + "language_loss": 0.81347728, + "learning_rate": 0.00020044052796295807, + "loss": 0.82496965, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.74951172, + "step": 3708, + "time_per_iteration": 2.7782645225524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148063, + "balance_loss_mlp": 1.0728184, + "epoch": 0.7135436706425549, + "flos": 504550328832.0, + "grad_norm": 0.03157354031682846, + "language_loss": 0.86940277, + "learning_rate": 0.00020019114665211063, + "loss": 0.8808834, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.75097656, + "step": 3709, + "time_per_iteration": 2.6009671688079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147572, + "balance_loss_mlp": 1.07242227, + "epoch": 0.7137360523278183, + "flos": 516967570944.0, + "grad_norm": 0.03487007754085134, + "language_loss": 0.85992116, + "learning_rate": 0.00019994188173738276, + "loss": 0.8713969, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.75, + "step": 3710, + "time_per_iteration": 2.5438315868377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142507, + "balance_loss_mlp": 1.0673095, + "epoch": 0.7139284340130819, + "flos": 511536482304.0, + "grad_norm": 0.03607772040837418, + "language_loss": 0.85274506, + "learning_rate": 0.0001996927333155477, + "loss": 0.86417007, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.75048828, + "step": 3711, + "time_per_iteration": 2.7427854537963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139044, + "balance_loss_mlp": 1.06389427, + "epoch": 0.7141208156983455, + "flos": 891799418880.0, + "grad_norm": 0.0340111276626949, + "language_loss": 0.9025712, + "learning_rate": 0.00019944370148333346, + "loss": 0.91396165, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.75, + "step": 3712, + "time_per_iteration": 3.1386330127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113928, + "balance_loss_mlp": 1.0641309, + "epoch": 0.7143131973836091, + "flos": 536883257856.0, + "grad_norm": 0.03639718620252856, + "language_loss": 0.8407408, + "learning_rate": 0.00019919478633742278, + "loss": 0.85213363, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.75, + "step": 3713, + "time_per_iteration": 2.6460351943969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139962, + "balance_loss_mlp": 1.06486058, + "epoch": 0.7145055790688727, + "flos": 474627133440.0, + "grad_norm": 0.03673935987195594, + "language_loss": 0.91008997, + "learning_rate": 0.00019894598797445302, + "loss": 0.9214896, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.74951172, + "step": 3714, + "time_per_iteration": 2.5253968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139941, + "balance_loss_mlp": 1.06498206, + "epoch": 0.7146979607541362, + "flos": 571701782016.0, + "grad_norm": 0.032359519554933665, + "language_loss": 0.85796106, + "learning_rate": 0.00019869730649101615, + "loss": 0.86936045, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.74804688, + "step": 3715, + "time_per_iteration": 2.765871047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139754, + "balance_loss_mlp": 1.06489098, + "epoch": 0.7148903424393998, + "flos": 841138068480.0, + "grad_norm": 0.0393709778481749, + "language_loss": 0.77344263, + "learning_rate": 0.00019844874198365943, + "loss": 0.78484023, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.74707031, + "step": 3716, + "time_per_iteration": 3.0865817070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140443, + "balance_loss_mlp": 1.06562734, + "epoch": 0.7150827241246633, + "flos": 542879030784.0, + "grad_norm": 0.03442327137938287, + "language_loss": 0.88300014, + "learning_rate": 0.00019820029454888362, + "loss": 0.89440459, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.74658203, + "step": 3717, + "time_per_iteration": 2.7028956413269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145981, + "balance_loss_mlp": 1.07312012, + "epoch": 0.7152751058099269, + "flos": 1587187705344.0, + "grad_norm": 0.009338560105867444, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.7566725, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.72851562, + "step": 3718, + "time_per_iteration": 5.078125715255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142063, + "balance_loss_mlp": 1.06729496, + "epoch": 0.7154674874951905, + "flos": 518428583424.0, + "grad_norm": 0.038346473430325045, + "language_loss": 0.86008942, + "learning_rate": 0.0001977037512828529, + "loss": 0.87151003, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.74609375, + "step": 3719, + "time_per_iteration": 2.6236274242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141984, + "balance_loss_mlp": 1.0672158, + "epoch": 0.715659869180454, + "flos": 603639214080.0, + "grad_norm": 0.03183829156169413, + "language_loss": 0.90619719, + "learning_rate": 0.0001974556556443734, + "loss": 0.91761708, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.74609375, + "step": 3720, + "time_per_iteration": 2.7261006832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143131, + "balance_loss_mlp": 1.06836271, + "epoch": 0.7158522508657176, + "flos": 532769464320.0, + "grad_norm": 0.029220712652752532, + "language_loss": 0.93066287, + "learning_rate": 0.00019720767746402547, + "loss": 0.94209415, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.74609375, + "step": 3721, + "time_per_iteration": 2.730018377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144996, + "balance_loss_mlp": 1.06989455, + "epoch": 0.7160446325509812, + "flos": 558645995520.0, + "grad_norm": 0.03469516261194285, + "language_loss": 0.85035664, + "learning_rate": 0.00019695981683808222, + "loss": 0.86180663, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.74951172, + "step": 3722, + "time_per_iteration": 2.7371633052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152889, + "balance_loss_mlp": 1.07792997, + "epoch": 0.7162370142362448, + "flos": 692282159616.0, + "grad_norm": 0.032260484298275306, + "language_loss": 0.89382893, + "learning_rate": 0.00019671207386277225, + "loss": 0.90535784, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.74804688, + "step": 3723, + "time_per_iteration": 2.9425265789031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114829, + "balance_loss_mlp": 1.07333136, + "epoch": 0.7164293959215082, + "flos": 795458641920.0, + "grad_norm": 0.035931768652590186, + "language_loss": 0.83636975, + "learning_rate": 0.0001964644486342777, + "loss": 0.84785259, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.74804688, + "step": 3724, + "time_per_iteration": 2.9537875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147875, + "balance_loss_mlp": 1.07291591, + "epoch": 0.7166217776067718, + "flos": 495204833280.0, + "grad_norm": 0.03617438678608554, + "language_loss": 0.91026467, + "learning_rate": 0.00019621694124873524, + "loss": 0.92174339, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.74804688, + "step": 3725, + "time_per_iteration": 2.6945693492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146446, + "balance_loss_mlp": 1.07339478, + "epoch": 0.7168141592920354, + "flos": 1403961710592.0, + "grad_norm": 0.00968138139852001, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77686524, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.73046875, + "step": 3726, + "time_per_iteration": 4.849448919296265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142124, + "balance_loss_mlp": 1.06716549, + "epoch": 0.717006540977299, + "flos": 794599243776.0, + "grad_norm": 0.04056704618834382, + "language_loss": 0.81872368, + "learning_rate": 0.00019572228039082428, + "loss": 0.83014494, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.74804688, + "step": 3727, + "time_per_iteration": 3.045783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146498, + "balance_loss_mlp": 1.07153964, + "epoch": 0.7171989226625626, + "flos": 555963015168.0, + "grad_norm": 0.02715897729892971, + "language_loss": 0.87954736, + "learning_rate": 0.0001954751271105002, + "loss": 0.89101231, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.74804688, + "step": 3728, + "time_per_iteration": 2.7890095710754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145282, + "balance_loss_mlp": 1.07027578, + "epoch": 0.717391304347826, + "flos": 557061457920.0, + "grad_norm": 0.03346658539414039, + "language_loss": 0.86323428, + "learning_rate": 0.00019522809205721687, + "loss": 0.87468708, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.74853516, + "step": 3729, + "time_per_iteration": 2.7522380352020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140607, + "balance_loss_mlp": 1.06579113, + "epoch": 0.7175836860330896, + "flos": 539955004416.0, + "grad_norm": 0.0354578224226226, + "language_loss": 0.87126923, + "learning_rate": 0.0001949811753268816, + "loss": 0.88267529, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.74658203, + "step": 3730, + "time_per_iteration": 2.707690477371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141683, + "balance_loss_mlp": 1.06686759, + "epoch": 0.7177760677183532, + "flos": 516650663424.0, + "grad_norm": 0.04023163535665124, + "language_loss": 0.88339722, + "learning_rate": 0.00019473437701535634, + "loss": 0.89481401, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.74658203, + "step": 3731, + "time_per_iteration": 2.570448637008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114196, + "balance_loss_mlp": 1.06714427, + "epoch": 0.7179684494036168, + "flos": 675939777024.0, + "grad_norm": 0.03444896194332825, + "language_loss": 0.95062304, + "learning_rate": 0.00019448769721845677, + "loss": 0.96204257, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.74658203, + "step": 3732, + "time_per_iteration": 2.838884115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141126, + "balance_loss_mlp": 1.06635737, + "epoch": 0.7181608310888803, + "flos": 470875909632.0, + "grad_norm": 0.032659655773852006, + "language_loss": 0.9114489, + "learning_rate": 0.00019424113603195203, + "loss": 0.92286015, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.74609375, + "step": 3733, + "time_per_iteration": 2.540231704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142755, + "balance_loss_mlp": 1.06803441, + "epoch": 0.7183532127741439, + "flos": 595184042496.0, + "grad_norm": 0.0393108175728225, + "language_loss": 0.85483897, + "learning_rate": 0.0001939946935515657, + "loss": 0.86626649, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.74560547, + "step": 3734, + "time_per_iteration": 2.867018461227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.06774652, + "epoch": 0.7185455944594075, + "flos": 499915511808.0, + "grad_norm": 0.04034729202871447, + "language_loss": 0.85582328, + "learning_rate": 0.0001937483698729755, + "loss": 0.86724842, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.74609375, + "step": 3735, + "time_per_iteration": 2.5829944610595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142145, + "balance_loss_mlp": 1.06737685, + "epoch": 0.718737976144671, + "flos": 816307587072.0, + "grad_norm": 0.03271819913976636, + "language_loss": 0.86010873, + "learning_rate": 0.0001935021650918128, + "loss": 0.87153018, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.74609375, + "step": 3736, + "time_per_iteration": 3.0105531215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_mlp": 1.06795025, + "epoch": 0.7189303578299346, + "flos": 439239922176.0, + "grad_norm": 0.03678550720791007, + "language_loss": 0.92134023, + "learning_rate": 0.0001932560793036625, + "loss": 0.93276739, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.74609375, + "step": 3737, + "time_per_iteration": 2.4854748249053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142524, + "balance_loss_mlp": 1.06775641, + "epoch": 0.7191227395151981, + "flos": 550446606336.0, + "grad_norm": 0.04145641408022902, + "language_loss": 0.92745817, + "learning_rate": 0.00019301011260406382, + "loss": 0.93888342, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.74609375, + "step": 3738, + "time_per_iteration": 2.6645443439483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114754, + "balance_loss_mlp": 1.07258117, + "epoch": 0.7193151212004617, + "flos": 628080929280.0, + "grad_norm": 0.039328087285967164, + "language_loss": 0.84679413, + "learning_rate": 0.00019276426508850936, + "loss": 0.85826951, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.74804688, + "step": 3739, + "time_per_iteration": 2.7071337699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148446, + "balance_loss_mlp": 1.07343948, + "epoch": 0.7195075028857253, + "flos": 742439950848.0, + "grad_norm": 0.030419377075742837, + "language_loss": 0.84898889, + "learning_rate": 0.00019251853685244564, + "loss": 0.86047333, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.74853516, + "step": 3740, + "time_per_iteration": 3.0168538093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114834, + "balance_loss_mlp": 1.07328558, + "epoch": 0.7196998845709889, + "flos": 804289844736.0, + "grad_norm": 0.05763766751245881, + "language_loss": 0.86089444, + "learning_rate": 0.00019227292799127283, + "loss": 0.87237775, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.74902344, + "step": 3741, + "time_per_iteration": 3.0083675384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144489, + "balance_loss_mlp": 1.06957746, + "epoch": 0.7198922662562524, + "flos": 926776396800.0, + "grad_norm": 0.03639396960725551, + "language_loss": 0.83974087, + "learning_rate": 0.00019202743860034454, + "loss": 0.8511858, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.74755859, + "step": 3742, + "time_per_iteration": 3.2506234645843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144029, + "balance_loss_mlp": 1.06907046, + "epoch": 0.7200846479415159, + "flos": 581207732736.0, + "grad_norm": 0.03405610584059509, + "language_loss": 0.88730514, + "learning_rate": 0.00019178206877496873, + "loss": 0.89874554, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.74804688, + "step": 3743, + "time_per_iteration": 2.6837918758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144783, + "balance_loss_mlp": 1.0700146, + "epoch": 0.7202770296267795, + "flos": 558839377920.0, + "grad_norm": 0.02830338825493349, + "language_loss": 0.89031184, + "learning_rate": 0.0001915368186104059, + "loss": 0.90175974, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.74609375, + "step": 3744, + "time_per_iteration": 2.7329940795898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143898, + "balance_loss_mlp": 1.06912982, + "epoch": 0.7204694113120431, + "flos": 673771089408.0, + "grad_norm": 0.03331544271841085, + "language_loss": 0.85722578, + "learning_rate": 0.0001912916882018706, + "loss": 0.86866474, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.74609375, + "step": 3745, + "time_per_iteration": 2.7906653881073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145353, + "balance_loss_mlp": 1.0706327, + "epoch": 0.7206617929973067, + "flos": 800595016704.0, + "grad_norm": 0.03936960108018568, + "language_loss": 0.85040343, + "learning_rate": 0.00019104667764453125, + "loss": 0.861857, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.74560547, + "step": 3746, + "time_per_iteration": 3.025996685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149239, + "balance_loss_mlp": 1.07437599, + "epoch": 0.7208541746825702, + "flos": 532938651648.0, + "grad_norm": 0.0387374733160612, + "language_loss": 0.85314423, + "learning_rate": 0.00019080178703350926, + "loss": 0.86463666, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.74707031, + "step": 3747, + "time_per_iteration": 2.640810251235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149934, + "balance_loss_mlp": 1.07502282, + "epoch": 0.7210465563678338, + "flos": 536168851968.0, + "grad_norm": 0.035199314592541234, + "language_loss": 0.8746413, + "learning_rate": 0.00019055701646387952, + "loss": 0.88614064, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.74755859, + "step": 3748, + "time_per_iteration": 2.6518776416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155716, + "balance_loss_mlp": 1.08266449, + "epoch": 0.7212389380530974, + "flos": 1537246765056.0, + "grad_norm": 0.009534270530490536, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81628406, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.73046875, + "step": 3749, + "time_per_iteration": 4.76072096824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.07664847, + "epoch": 0.7214313197383609, + "flos": 462452938752.0, + "grad_norm": 0.03323767151214544, + "language_loss": 0.92055959, + "learning_rate": 0.00019006783582886368, + "loss": 0.93207377, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.74609375, + "step": 3750, + "time_per_iteration": 2.536107301712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147507, + "balance_loss_mlp": 1.0724529, + "epoch": 0.7216237014236244, + "flos": 1038912336384.0, + "grad_norm": 0.03471978227212596, + "language_loss": 0.8780399, + "learning_rate": 0.00018982342595339437, + "loss": 0.88951492, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.74902344, + "step": 3751, + "time_per_iteration": 3.496842622756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146824, + "balance_loss_mlp": 1.07181787, + "epoch": 0.721816083108888, + "flos": 897450086400.0, + "grad_norm": 0.03786430970431107, + "language_loss": 0.87491071, + "learning_rate": 0.00018957913649915076, + "loss": 0.88637894, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.74853516, + "step": 3752, + "time_per_iteration": 3.1817660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145034, + "balance_loss_mlp": 1.07002771, + "epoch": 0.7220084647941516, + "flos": 524311564800.0, + "grad_norm": 0.03715970514443419, + "language_loss": 0.85220444, + "learning_rate": 0.00018933496756097428, + "loss": 0.86365485, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.74853516, + "step": 3753, + "time_per_iteration": 2.6647567749023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147456, + "balance_loss_mlp": 1.07244956, + "epoch": 0.7222008464794152, + "flos": 817471157760.0, + "grad_norm": 0.038995714903637436, + "language_loss": 0.86141288, + "learning_rate": 0.0001890909192336603, + "loss": 0.87288737, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.74853516, + "step": 3754, + "time_per_iteration": 3.0344350337982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146781, + "balance_loss_mlp": 1.07172728, + "epoch": 0.7223932281646788, + "flos": 750372097536.0, + "grad_norm": 0.03457656786821505, + "language_loss": 0.74980754, + "learning_rate": 0.00018884699161195623, + "loss": 0.76127535, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.74902344, + "step": 3755, + "time_per_iteration": 2.9410288333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146383, + "balance_loss_mlp": 1.07137632, + "epoch": 0.7225856098499422, + "flos": 746988172800.0, + "grad_norm": 0.03312890727657128, + "language_loss": 0.82509679, + "learning_rate": 0.00018860318479056327, + "loss": 0.83656067, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.74853516, + "step": 3756, + "time_per_iteration": 3.1337335109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144277, + "balance_loss_mlp": 1.0693661, + "epoch": 0.7227779915352058, + "flos": 548434371072.0, + "grad_norm": 0.030530532653655316, + "language_loss": 0.88339114, + "learning_rate": 0.00018835949886413555, + "loss": 0.89483386, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.74755859, + "step": 3757, + "time_per_iteration": 2.6933181285858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146399, + "balance_loss_mlp": 1.07158351, + "epoch": 0.7229703732204694, + "flos": 531505837056.0, + "grad_norm": 0.03838754790834608, + "language_loss": 0.84470987, + "learning_rate": 0.0001881159339272806, + "loss": 0.85617381, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.74658203, + "step": 3758, + "time_per_iteration": 2.6401891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147602, + "balance_loss_mlp": 1.07273877, + "epoch": 0.723162754905733, + "flos": 529365347328.0, + "grad_norm": 0.035007648752716856, + "language_loss": 0.83889484, + "learning_rate": 0.00018787249007455858, + "loss": 0.85037082, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.74707031, + "step": 3759, + "time_per_iteration": 2.605527400970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147364, + "balance_loss_mlp": 1.07250082, + "epoch": 0.7233551365909965, + "flos": 656059018752.0, + "grad_norm": 0.034978512511305425, + "language_loss": 0.76976448, + "learning_rate": 0.00018762916740048302, + "loss": 0.78123814, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.74707031, + "step": 3760, + "time_per_iteration": 2.8233485221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.081882, + "epoch": 0.7235475182762601, + "flos": 523443434496.0, + "grad_norm": 0.03185291769452338, + "language_loss": 0.9024173, + "learning_rate": 0.0001873859659995195, + "loss": 0.91398567, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.74804688, + "step": 3761, + "time_per_iteration": 2.7312240600585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159221, + "balance_loss_mlp": 1.08440578, + "epoch": 0.7237398999615237, + "flos": 610321195008.0, + "grad_norm": 0.03629534298697415, + "language_loss": 0.88241446, + "learning_rate": 0.0001871428859660878, + "loss": 0.89400673, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.74658203, + "step": 3762, + "time_per_iteration": 2.7550981044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158067, + "balance_loss_mlp": 1.08329916, + "epoch": 0.7239322816467872, + "flos": 660281601024.0, + "grad_norm": 0.02929996085025788, + "language_loss": 0.86564827, + "learning_rate": 0.00018689992739455975, + "loss": 0.87722898, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.74609375, + "step": 3763, + "time_per_iteration": 2.925534963607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152585, + "balance_loss_mlp": 1.07767427, + "epoch": 0.7241246633320508, + "flos": 970940416512.0, + "grad_norm": 0.028975317515326986, + "language_loss": 0.89523166, + "learning_rate": 0.00018665709037926027, + "loss": 0.90675747, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.74755859, + "step": 3764, + "time_per_iteration": 3.3454575538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149589, + "balance_loss_mlp": 1.0751071, + "epoch": 0.7243170450173143, + "flos": 515999384064.0, + "grad_norm": 0.03578449562727673, + "language_loss": 0.88854849, + "learning_rate": 0.00018641437501446694, + "loss": 0.90004438, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.74414062, + "step": 3765, + "time_per_iteration": 2.5862903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149356, + "balance_loss_mlp": 1.07463598, + "epoch": 0.7245094267025779, + "flos": 560805950976.0, + "grad_norm": 0.04055976430378051, + "language_loss": 0.87262148, + "learning_rate": 0.0001861717813944104, + "loss": 0.88411504, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.74560547, + "step": 3766, + "time_per_iteration": 2.6999149322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145931, + "balance_loss_mlp": 1.07111502, + "epoch": 0.7247018083878415, + "flos": 613774977024.0, + "grad_norm": 0.03434162187139979, + "language_loss": 0.84787124, + "learning_rate": 0.00018592930961327365, + "loss": 0.85933053, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.74658203, + "step": 3767, + "time_per_iteration": 2.7380406856536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145503, + "balance_loss_mlp": 1.07068777, + "epoch": 0.7248941900731051, + "flos": 635870085120.0, + "grad_norm": 0.03338829446413619, + "language_loss": 0.92739952, + "learning_rate": 0.00018568695976519273, + "loss": 0.93885458, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.74658203, + "step": 3768, + "time_per_iteration": 2.7908759117126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145327, + "balance_loss_mlp": 1.07036865, + "epoch": 0.7250865717583687, + "flos": 425837028864.0, + "grad_norm": 0.039339840772426415, + "language_loss": 0.85823148, + "learning_rate": 0.00018544473194425593, + "loss": 0.86968476, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.74804688, + "step": 3769, + "time_per_iteration": 2.493539810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114564, + "balance_loss_mlp": 1.0706811, + "epoch": 0.7252789534436321, + "flos": 636397839360.0, + "grad_norm": 0.0351272666064589, + "language_loss": 0.83947301, + "learning_rate": 0.00018520262624450485, + "loss": 0.85092938, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.74804688, + "step": 3770, + "time_per_iteration": 2.8556978702545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145039, + "balance_loss_mlp": 1.07017529, + "epoch": 0.7254713351288957, + "flos": 618353398272.0, + "grad_norm": 0.031209053717976155, + "language_loss": 0.91200709, + "learning_rate": 0.00018496064275993324, + "loss": 0.9234575, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.74707031, + "step": 3771, + "time_per_iteration": 2.7326061725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114546, + "balance_loss_mlp": 1.07050157, + "epoch": 0.7256637168141593, + "flos": 768290285568.0, + "grad_norm": 0.04607963634377255, + "language_loss": 0.87999386, + "learning_rate": 0.00018471878158448686, + "loss": 0.89144844, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.74804688, + "step": 3772, + "time_per_iteration": 2.945519208908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011453, + "balance_loss_mlp": 1.07038903, + "epoch": 0.7258560984994229, + "flos": 496726970880.0, + "grad_norm": 0.029552123260588873, + "language_loss": 0.88148075, + "learning_rate": 0.00018447704281206512, + "loss": 0.89293379, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.74755859, + "step": 3773, + "time_per_iteration": 2.8680005073547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114455, + "balance_loss_mlp": 1.06963933, + "epoch": 0.7260484801846864, + "flos": 531141265920.0, + "grad_norm": 0.03674222243829071, + "language_loss": 0.87786865, + "learning_rate": 0.0001842354265365191, + "loss": 0.88931417, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.74755859, + "step": 3774, + "time_per_iteration": 2.724771499633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114502, + "balance_loss_mlp": 1.0701561, + "epoch": 0.72624086186995, + "flos": 626107625472.0, + "grad_norm": 0.03805272317803873, + "language_loss": 0.85790277, + "learning_rate": 0.0001839939328516526, + "loss": 0.869353, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.74707031, + "step": 3775, + "time_per_iteration": 2.7149298191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114524, + "balance_loss_mlp": 1.07037675, + "epoch": 0.7264332435552135, + "flos": 717804853248.0, + "grad_norm": 0.035296918768569004, + "language_loss": 0.86455274, + "learning_rate": 0.0001837525618512218, + "loss": 0.87600511, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.74707031, + "step": 3776, + "time_per_iteration": 2.8749477863311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145925, + "balance_loss_mlp": 1.07129955, + "epoch": 0.7266256252404771, + "flos": 682241723904.0, + "grad_norm": 0.03797985367726647, + "language_loss": 0.88141412, + "learning_rate": 0.00018351131362893519, + "loss": 0.89287341, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.74462891, + "step": 3777, + "time_per_iteration": 2.7961273193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146331, + "balance_loss_mlp": 1.07156312, + "epoch": 0.7268180069257407, + "flos": 519917793792.0, + "grad_norm": 0.04046507418804878, + "language_loss": 0.86727178, + "learning_rate": 0.00018327018827845364, + "loss": 0.87873513, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.74609375, + "step": 3778, + "time_per_iteration": 2.6734490394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147275, + "balance_loss_mlp": 1.07265031, + "epoch": 0.7270103886110042, + "flos": 513672242688.0, + "grad_norm": 0.03480448253150256, + "language_loss": 0.91087776, + "learning_rate": 0.00018302918589339036, + "loss": 0.92235053, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.74462891, + "step": 3779, + "time_per_iteration": 2.693053722381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144842, + "balance_loss_mlp": 1.07012212, + "epoch": 0.7272027702962678, + "flos": 547691767296.0, + "grad_norm": 0.037628889327950436, + "language_loss": 0.94755363, + "learning_rate": 0.00018278830656731054, + "loss": 0.95900208, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.74560547, + "step": 3780, + "time_per_iteration": 2.7247214317321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143177, + "balance_loss_mlp": 1.06831324, + "epoch": 0.7273951519815314, + "flos": 594154730496.0, + "grad_norm": 0.032307622186086855, + "language_loss": 0.90543699, + "learning_rate": 0.00018254755039373222, + "loss": 0.91686875, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.74707031, + "step": 3781, + "time_per_iteration": 2.7543249130249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139617, + "balance_loss_mlp": 1.06480122, + "epoch": 0.727587533666795, + "flos": 607138658304.0, + "grad_norm": 0.037695022521252085, + "language_loss": 0.89343524, + "learning_rate": 0.0001823069174661252, + "loss": 0.90483147, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.74658203, + "step": 3782, + "time_per_iteration": 2.7875726222991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140015, + "balance_loss_mlp": 1.06524646, + "epoch": 0.7277799153520584, + "flos": 514026080256.0, + "grad_norm": 0.034513244238831585, + "language_loss": 0.83396327, + "learning_rate": 0.00018206640787791112, + "loss": 0.84536338, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.74609375, + "step": 3783, + "time_per_iteration": 2.672685146331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142732, + "balance_loss_mlp": 1.06782138, + "epoch": 0.727972297037322, + "flos": 538793435136.0, + "grad_norm": 0.03888167743908025, + "language_loss": 0.90142006, + "learning_rate": 0.00018182602172246416, + "loss": 0.9128474, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.74755859, + "step": 3784, + "time_per_iteration": 2.637195110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142638, + "balance_loss_mlp": 1.06767881, + "epoch": 0.7281646787225856, + "flos": 536075526144.0, + "grad_norm": 0.03379285978086118, + "language_loss": 0.81641448, + "learning_rate": 0.00018158575909311075, + "loss": 0.82784092, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.74804688, + "step": 3785, + "time_per_iteration": 2.6302285194396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143773, + "balance_loss_mlp": 1.0688144, + "epoch": 0.7283570604078492, + "flos": 626209683456.0, + "grad_norm": 0.034294613815109176, + "language_loss": 0.84919262, + "learning_rate": 0.000181345620083129, + "loss": 0.86063033, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.74804688, + "step": 3786, + "time_per_iteration": 2.826655626296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143839, + "balance_loss_mlp": 1.06887996, + "epoch": 0.7285494420931128, + "flos": 535255059456.0, + "grad_norm": 0.03289848846312583, + "language_loss": 0.91744298, + "learning_rate": 0.00018110560478574927, + "loss": 0.92888141, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.74804688, + "step": 3787, + "time_per_iteration": 2.6760616302490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011439, + "balance_loss_mlp": 1.06889331, + "epoch": 0.7287418237783763, + "flos": 667740387840.0, + "grad_norm": 0.04379753934602124, + "language_loss": 0.86934447, + "learning_rate": 0.0001808657132941533, + "loss": 0.88078344, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.74853516, + "step": 3788, + "time_per_iteration": 2.8172109127044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143441, + "balance_loss_mlp": 1.0684824, + "epoch": 0.7289342054636399, + "flos": 551638374912.0, + "grad_norm": 0.03930499856080985, + "language_loss": 0.87319398, + "learning_rate": 0.00018062594570147572, + "loss": 0.88462842, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.74804688, + "step": 3789, + "time_per_iteration": 2.6159238815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146043, + "balance_loss_mlp": 1.07103622, + "epoch": 0.7291265871489034, + "flos": 689138554368.0, + "grad_norm": 0.030589467753511134, + "language_loss": 0.89662123, + "learning_rate": 0.00018038630210080243, + "loss": 0.90808165, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.74853516, + "step": 3790, + "time_per_iteration": 2.8022711277008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147306, + "balance_loss_mlp": 1.07234764, + "epoch": 0.729318968834167, + "flos": 573770413056.0, + "grad_norm": 0.03374595172498584, + "language_loss": 0.89270401, + "learning_rate": 0.0001801467825851712, + "loss": 0.90417707, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.74804688, + "step": 3791, + "time_per_iteration": 2.724628210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147876, + "balance_loss_mlp": 1.07310832, + "epoch": 0.7295113505194305, + "flos": 587164574208.0, + "grad_norm": 0.035766234040923994, + "language_loss": 0.83940732, + "learning_rate": 0.00017990738724757172, + "loss": 0.85088611, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.74609375, + "step": 3792, + "time_per_iteration": 2.842078924179077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161985, + "balance_loss_mlp": 1.08716917, + "epoch": 0.7297037322046941, + "flos": 708441893376.0, + "grad_norm": 0.03365089778951548, + "language_loss": 0.86588967, + "learning_rate": 0.00017966811618094598, + "loss": 0.87750953, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.74658203, + "step": 3793, + "time_per_iteration": 2.9457900524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151295, + "balance_loss_mlp": 1.07643151, + "epoch": 0.7298961138899577, + "flos": 488308002816.0, + "grad_norm": 0.03933165170986372, + "language_loss": 0.90208626, + "learning_rate": 0.00017942896947818664, + "loss": 0.91359925, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.74707031, + "step": 3794, + "time_per_iteration": 2.5673389434814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155838, + "balance_loss_mlp": 1.08297729, + "epoch": 0.7300884955752213, + "flos": 1368622162944.0, + "grad_norm": 0.012202680830239692, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.7598089, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.72851562, + "step": 3795, + "time_per_iteration": 4.860522985458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150098, + "balance_loss_mlp": 1.07523441, + "epoch": 0.7302808772604849, + "flos": 532836593664.0, + "grad_norm": 0.03730166344512247, + "language_loss": 0.91110396, + "learning_rate": 0.00017895104953559947, + "loss": 0.92260492, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.74707031, + "step": 3796, + "time_per_iteration": 2.58555269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148597, + "balance_loss_mlp": 1.07378125, + "epoch": 0.7304732589457483, + "flos": 437062502400.0, + "grad_norm": 0.03959489131470051, + "language_loss": 0.95557475, + "learning_rate": 0.00017871227648131672, + "loss": 0.96706069, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.74658203, + "step": 3797, + "time_per_iteration": 2.464853048324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148137, + "balance_loss_mlp": 1.07332122, + "epoch": 0.7306656406310119, + "flos": 452603884032.0, + "grad_norm": 0.03192912066727366, + "language_loss": 0.87151992, + "learning_rate": 0.0001784736281619907, + "loss": 0.88300121, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.74658203, + "step": 3798, + "time_per_iteration": 2.582390785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146414, + "balance_loss_mlp": 1.07155061, + "epoch": 0.7308580223162755, + "flos": 513029695488.0, + "grad_norm": 0.051326436791091785, + "language_loss": 0.79766852, + "learning_rate": 0.00017823510467027232, + "loss": 0.80913264, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.74707031, + "step": 3799, + "time_per_iteration": 2.75164794921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114555, + "balance_loss_mlp": 1.07078159, + "epoch": 0.7310504040015391, + "flos": 376282853376.0, + "grad_norm": 0.04144001955179666, + "language_loss": 0.8475759, + "learning_rate": 0.00017799670609876516, + "loss": 0.85903138, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.74609375, + "step": 3800, + "time_per_iteration": 2.5519416332244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114588, + "balance_loss_mlp": 1.07106447, + "epoch": 0.7312427856868026, + "flos": 550381478400.0, + "grad_norm": 0.03386508062276854, + "language_loss": 0.93402916, + "learning_rate": 0.00017775843254002366, + "loss": 0.94548798, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.74658203, + "step": 3801, + "time_per_iteration": 4.189229965209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144132, + "balance_loss_mlp": 1.06917357, + "epoch": 0.7314351673720662, + "flos": 768677050368.0, + "grad_norm": 0.03513626967715429, + "language_loss": 0.89011091, + "learning_rate": 0.00017752028408655367, + "loss": 0.9015522, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.74804688, + "step": 3802, + "time_per_iteration": 3.0296835899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114212, + "balance_loss_mlp": 1.06716144, + "epoch": 0.7316275490573297, + "flos": 487704387072.0, + "grad_norm": 0.036348088487259234, + "language_loss": 0.90090084, + "learning_rate": 0.00017728226083081272, + "loss": 0.91232204, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.74804688, + "step": 3803, + "time_per_iteration": 2.5504109859466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142386, + "balance_loss_mlp": 1.06742704, + "epoch": 0.7318199307425933, + "flos": 474412283904.0, + "grad_norm": 0.03547640994648555, + "language_loss": 0.86963499, + "learning_rate": 0.00017704436286520965, + "loss": 0.88105881, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.74804688, + "step": 3804, + "time_per_iteration": 2.5794951915740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141426, + "balance_loss_mlp": 1.06665754, + "epoch": 0.7320123124278569, + "flos": 550511734272.0, + "grad_norm": 0.04039315575901835, + "language_loss": 0.89054638, + "learning_rate": 0.0001768065902821046, + "loss": 0.90196061, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.74609375, + "step": 3805, + "time_per_iteration": 2.684680700302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141527, + "balance_loss_mlp": 1.06675947, + "epoch": 0.7322046941131204, + "flos": 571899167232.0, + "grad_norm": 0.036858739394668875, + "language_loss": 0.87521064, + "learning_rate": 0.00017656894317380907, + "loss": 0.88662589, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.74609375, + "step": 3806, + "time_per_iteration": 2.7203333377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147461, + "balance_loss_mlp": 1.07460022, + "epoch": 0.732397075798384, + "flos": 1472501042688.0, + "grad_norm": 0.00876082834102495, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77178729, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.72851562, + "step": 3807, + "time_per_iteration": 4.985222816467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143568, + "balance_loss_mlp": 1.06884801, + "epoch": 0.7325894574836476, + "flos": 465830859264.0, + "grad_norm": 0.03431257016679264, + "language_loss": 0.883228, + "learning_rate": 0.00017609402575064875, + "loss": 0.89466369, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.74560547, + "step": 3808, + "time_per_iteration": 2.5505616664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150323, + "balance_loss_mlp": 1.07560253, + "epoch": 0.7327818391689112, + "flos": 496481195520.0, + "grad_norm": 0.036747437689303115, + "language_loss": 0.86707413, + "learning_rate": 0.00017585675562016367, + "loss": 0.87857741, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.74560547, + "step": 3809, + "time_per_iteration": 2.566805362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148865, + "balance_loss_mlp": 1.07600403, + "epoch": 0.7329742208541746, + "flos": 1436679403008.0, + "grad_norm": 0.008652563544013954, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78361714, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.72851562, + "step": 3810, + "time_per_iteration": 4.843864440917969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143684, + "balance_loss_mlp": 1.06910706, + "epoch": 0.7331666025394382, + "flos": 497868347904.0, + "grad_norm": 0.0400416063155724, + "language_loss": 0.90367377, + "learning_rate": 0.00017538259298196474, + "loss": 0.91511071, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.74414062, + "step": 3811, + "time_per_iteration": 2.573604106903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146365, + "balance_loss_mlp": 1.07174027, + "epoch": 0.7333589842247018, + "flos": 539638096896.0, + "grad_norm": 0.03197642151293291, + "language_loss": 0.86813134, + "learning_rate": 0.00017514570065833745, + "loss": 0.87959504, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.74462891, + "step": 3812, + "time_per_iteration": 2.6921682357788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146575, + "balance_loss_mlp": 1.0719502, + "epoch": 0.7335513659099654, + "flos": 492041762304.0, + "grad_norm": 0.0378422764823117, + "language_loss": 0.86487865, + "learning_rate": 0.00017490893445433426, + "loss": 0.87634438, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.74462891, + "step": 3813, + "time_per_iteration": 2.634765148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146185, + "balance_loss_mlp": 1.07160771, + "epoch": 0.733743747595229, + "flos": 563252614656.0, + "grad_norm": 0.03359115001415202, + "language_loss": 0.86180258, + "learning_rate": 0.00017467229446187587, + "loss": 0.87326443, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.74414062, + "step": 3814, + "time_per_iteration": 2.6770167350769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146242, + "balance_loss_mlp": 1.07166481, + "epoch": 0.7339361292804925, + "flos": 539648830464.0, + "grad_norm": 0.03482367170061421, + "language_loss": 0.86801744, + "learning_rate": 0.00017443578077283424, + "loss": 0.87947989, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.74414062, + "step": 3815, + "time_per_iteration": 2.6352267265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144002, + "balance_loss_mlp": 1.06937671, + "epoch": 0.734128510965756, + "flos": 549561011712.0, + "grad_norm": 0.030322366631391387, + "language_loss": 0.89759493, + "learning_rate": 0.0001741993934790319, + "loss": 0.90903497, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.74462891, + "step": 3816, + "time_per_iteration": 2.793721914291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142717, + "balance_loss_mlp": 1.06799662, + "epoch": 0.7343208926510196, + "flos": 541201167360.0, + "grad_norm": 0.038181865946918005, + "language_loss": 0.887739, + "learning_rate": 0.00017396313267224273, + "loss": 0.89916623, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.74560547, + "step": 3817, + "time_per_iteration": 2.773219347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145721, + "balance_loss_mlp": 1.07090569, + "epoch": 0.7345132743362832, + "flos": 572170412544.0, + "grad_norm": 0.036498541155499, + "language_loss": 0.93785435, + "learning_rate": 0.0001737269984441912, + "loss": 0.94931155, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.74658203, + "step": 3818, + "time_per_iteration": 2.6538641452789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.06592357, + "epoch": 0.7347056560215467, + "flos": 546480532992.0, + "grad_norm": 0.03219237397324587, + "language_loss": 0.8964963, + "learning_rate": 0.00017349099088655263, + "loss": 0.90790182, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.74462891, + "step": 3819, + "time_per_iteration": 2.7040135860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140645, + "balance_loss_mlp": 1.06606805, + "epoch": 0.7348980377068103, + "flos": 597076755456.0, + "grad_norm": 0.033091718107472336, + "language_loss": 0.85581368, + "learning_rate": 0.00017325511009095375, + "loss": 0.86722016, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.74414062, + "step": 3820, + "time_per_iteration": 4.160353183746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142615, + "balance_loss_mlp": 1.06798947, + "epoch": 0.7350904193920739, + "flos": 539611900416.0, + "grad_norm": 0.031456925706235525, + "language_loss": 0.88030791, + "learning_rate": 0.00017301935614897113, + "loss": 0.89173406, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.74462891, + "step": 3821, + "time_per_iteration": 2.6948046684265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.06789804, + "epoch": 0.7352828010773375, + "flos": 514061008896.0, + "grad_norm": 0.030574399918046426, + "language_loss": 0.85837513, + "learning_rate": 0.00017278372915213274, + "loss": 0.86979991, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.74414062, + "step": 3822, + "time_per_iteration": 2.6384036540985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146408, + "balance_loss_mlp": 1.07354736, + "epoch": 0.735475182762601, + "flos": 1557255777792.0, + "grad_norm": 0.0051515936537080845, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.81040251, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.72851562, + "step": 3823, + "time_per_iteration": 6.475368976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140808, + "balance_loss_mlp": 1.06618333, + "epoch": 0.7356675644478645, + "flos": 682611024384.0, + "grad_norm": 0.03514206822018316, + "language_loss": 0.85822678, + "learning_rate": 0.00017231285635975314, + "loss": 0.86963487, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.74462891, + "step": 3824, + "time_per_iteration": 2.881985664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140396, + "balance_loss_mlp": 1.0657233, + "epoch": 0.7358599461331281, + "flos": 516231697920.0, + "grad_norm": 0.03601426366769367, + "language_loss": 0.88078141, + "learning_rate": 0.00017207761074702115, + "loss": 0.89218545, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.74511719, + "step": 3825, + "time_per_iteration": 2.588801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142954, + "balance_loss_mlp": 1.06818557, + "epoch": 0.7360523278183917, + "flos": 444916786176.0, + "grad_norm": 0.029137218094429037, + "language_loss": 0.87851697, + "learning_rate": 0.0001718424924450514, + "loss": 0.88994652, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.74609375, + "step": 3826, + "time_per_iteration": 2.596510410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145079, + "balance_loss_mlp": 1.07050133, + "epoch": 0.7362447095036553, + "flos": 604551005184.0, + "grad_norm": 0.02824128078517694, + "language_loss": 0.89933646, + "learning_rate": 0.00017160750154512482, + "loss": 0.91078722, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.74414062, + "step": 3827, + "time_per_iteration": 2.737093687057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139545, + "balance_loss_mlp": 1.06496727, + "epoch": 0.7364370911889189, + "flos": 554250223104.0, + "grad_norm": 0.030336693640123275, + "language_loss": 0.87611473, + "learning_rate": 0.0001713726381384731, + "loss": 0.88751018, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.74414062, + "step": 3828, + "time_per_iteration": 2.7642135620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.06553614, + "epoch": 0.7366294728741823, + "flos": 449990034432.0, + "grad_norm": 0.03985156313807423, + "language_loss": 0.86582565, + "learning_rate": 0.00017113790231627812, + "loss": 0.87722576, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.74365234, + "step": 3829, + "time_per_iteration": 2.471085786819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144356, + "balance_loss_mlp": 1.07168579, + "epoch": 0.7368218545594459, + "flos": 1538703048192.0, + "grad_norm": 0.005233117744578673, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80402577, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.7265625, + "step": 3830, + "time_per_iteration": 4.7661731243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146943, + "balance_loss_mlp": 1.072366, + "epoch": 0.7370142362447095, + "flos": 516472743936.0, + "grad_norm": 0.03645785594600137, + "language_loss": 0.87339807, + "learning_rate": 0.00017066881378973936, + "loss": 0.88486743, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.74414062, + "step": 3831, + "time_per_iteration": 2.6248505115509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146898, + "balance_loss_mlp": 1.0723207, + "epoch": 0.7372066179299731, + "flos": 501904278528.0, + "grad_norm": 0.03165196577405493, + "language_loss": 0.87413478, + "learning_rate": 0.00017043446126751189, + "loss": 0.88560379, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.74414062, + "step": 3832, + "time_per_iteration": 2.6783525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144804, + "balance_loss_mlp": 1.07022643, + "epoch": 0.7373989996152366, + "flos": 559167019008.0, + "grad_norm": 0.037114015277278894, + "language_loss": 0.82006979, + "learning_rate": 0.00017020023669397376, + "loss": 0.83151782, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.74414062, + "step": 3833, + "time_per_iteration": 2.6736700534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142842, + "balance_loss_mlp": 1.06816959, + "epoch": 0.7375913813005002, + "flos": 507780529152.0, + "grad_norm": 0.035309103887572656, + "language_loss": 0.88040781, + "learning_rate": 0.0001699661401600589, + "loss": 0.89183623, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.74511719, + "step": 3834, + "time_per_iteration": 2.566554069519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114318, + "balance_loss_mlp": 1.06860292, + "epoch": 0.7377837629857638, + "flos": 487155165696.0, + "grad_norm": 0.03517908569874834, + "language_loss": 0.83206999, + "learning_rate": 0.00016973217175665205, + "loss": 0.84350181, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.74414062, + "step": 3835, + "time_per_iteration": 2.5718719959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144836, + "balance_loss_mlp": 1.07197571, + "epoch": 0.7379761446710273, + "flos": 1417877621760.0, + "grad_norm": 0.005454955067060188, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82310998, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.72851562, + "step": 3836, + "time_per_iteration": 4.927332401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113978, + "balance_loss_mlp": 1.065346, + "epoch": 0.7381685263562909, + "flos": 630909628416.0, + "grad_norm": 0.03248613748529956, + "language_loss": 0.88913381, + "learning_rate": 0.00016926461970465047, + "loss": 0.90053165, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.74316406, + "step": 3837, + "time_per_iteration": 2.775867462158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140207, + "balance_loss_mlp": 1.06591558, + "epoch": 0.7383609080415544, + "flos": 740651297280.0, + "grad_norm": 0.029601422195490622, + "language_loss": 0.88803387, + "learning_rate": 0.00016903103623757516, + "loss": 0.89943594, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.7421875, + "step": 3838, + "time_per_iteration": 3.0490381717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114028, + "balance_loss_mlp": 1.0659889, + "epoch": 0.738553289726818, + "flos": 551256339456.0, + "grad_norm": 0.036589238474362976, + "language_loss": 0.84502995, + "learning_rate": 0.00016879758126404738, + "loss": 0.85643274, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.7421875, + "step": 3839, + "time_per_iteration": 2.7638185024261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140469, + "balance_loss_mlp": 1.06598663, + "epoch": 0.7387456714120816, + "flos": 911775504384.0, + "grad_norm": 0.03874838451291343, + "language_loss": 0.85589796, + "learning_rate": 0.00016856425487470216, + "loss": 0.86730266, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.74316406, + "step": 3840, + "time_per_iteration": 3.1033904552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139827, + "balance_loss_mlp": 1.06548798, + "epoch": 0.7389380530973452, + "flos": 854195856384.0, + "grad_norm": 0.035495854767005654, + "language_loss": 0.84398341, + "learning_rate": 0.00016833105716012486, + "loss": 0.85538161, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.7421875, + "step": 3841, + "time_per_iteration": 3.1338374614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011399, + "balance_loss_mlp": 1.06551313, + "epoch": 0.7391304347826086, + "flos": 818419878912.0, + "grad_norm": 0.034862132205022836, + "language_loss": 0.89572388, + "learning_rate": 0.00016809798821085088, + "loss": 0.90712291, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.74267578, + "step": 3842, + "time_per_iteration": 2.980786085128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140329, + "balance_loss_mlp": 1.06622851, + "epoch": 0.7393228164678722, + "flos": 573937598976.0, + "grad_norm": 0.03111800184883808, + "language_loss": 0.93200815, + "learning_rate": 0.00016786504811736565, + "loss": 0.94341135, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.74072266, + "step": 3843, + "time_per_iteration": 2.669473171234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140191, + "balance_loss_mlp": 1.06618571, + "epoch": 0.7395151981531358, + "flos": 686575096320.0, + "grad_norm": 0.030093907505068344, + "language_loss": 0.86420381, + "learning_rate": 0.00016763223697010442, + "loss": 0.8756057, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.74023438, + "step": 3844, + "time_per_iteration": 2.99284291267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140327, + "balance_loss_mlp": 1.06632161, + "epoch": 0.7397075798383994, + "flos": 557454226944.0, + "grad_norm": 0.030952263508457714, + "language_loss": 0.88928902, + "learning_rate": 0.00016739955485945256, + "loss": 0.90069234, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.74023438, + "step": 3845, + "time_per_iteration": 2.7834365367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143729, + "balance_loss_mlp": 1.06972384, + "epoch": 0.739899961523663, + "flos": 547822023168.0, + "grad_norm": 0.0384067269834895, + "language_loss": 0.91738451, + "learning_rate": 0.00016716700187574513, + "loss": 0.9288218, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.74023438, + "step": 3846, + "time_per_iteration": 2.686281681060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142346, + "balance_loss_mlp": 1.06824505, + "epoch": 0.7400923432089265, + "flos": 610303730688.0, + "grad_norm": 0.03341447658559241, + "language_loss": 0.87943906, + "learning_rate": 0.0001669345781092675, + "loss": 0.89086246, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.74072266, + "step": 3847, + "time_per_iteration": 2.7001636028289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146926, + "balance_loss_mlp": 1.07258725, + "epoch": 0.7402847248941901, + "flos": 592179425280.0, + "grad_norm": 0.03705340018944972, + "language_loss": 0.92317855, + "learning_rate": 0.0001667022836502546, + "loss": 0.9346478, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.74169922, + "step": 3848, + "time_per_iteration": 2.7301111221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.07263577, + "epoch": 0.7404771065794536, + "flos": 478304497152.0, + "grad_norm": 0.03758678291398601, + "language_loss": 0.88680065, + "learning_rate": 0.00016647011858889077, + "loss": 0.89827085, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.7421875, + "step": 3849, + "time_per_iteration": 2.5619609355926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145959, + "balance_loss_mlp": 1.07152426, + "epoch": 0.7406694882647172, + "flos": 497466846720.0, + "grad_norm": 0.035398733472562116, + "language_loss": 0.90902388, + "learning_rate": 0.00016623808301531056, + "loss": 0.92048347, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.74267578, + "step": 3850, + "time_per_iteration": 2.6344494819641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.07042766, + "epoch": 0.7408618699499807, + "flos": 563326474752.0, + "grad_norm": 0.04248736642040007, + "language_loss": 0.8449176, + "learning_rate": 0.00016600617701959842, + "loss": 0.85636574, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.7421875, + "step": 3851, + "time_per_iteration": 2.764845609664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152382, + "balance_loss_mlp": 1.07971191, + "epoch": 0.7410542516352443, + "flos": 1391469333504.0, + "grad_norm": 0.006017952028820176, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79996192, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.7265625, + "step": 3852, + "time_per_iteration": 4.992438316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143972, + "balance_loss_mlp": 1.06968081, + "epoch": 0.7412466333205079, + "flos": 671211634176.0, + "grad_norm": 0.03177898311172259, + "language_loss": 0.86077726, + "learning_rate": 0.00016554275412186315, + "loss": 0.872217, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.74169922, + "step": 3853, + "time_per_iteration": 2.809633731842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143877, + "balance_loss_mlp": 1.0695858, + "epoch": 0.7414390150057715, + "flos": 490318236672.0, + "grad_norm": 0.037394191958696615, + "language_loss": 0.85646808, + "learning_rate": 0.0001653112373997568, + "loss": 0.86790681, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.74169922, + "step": 3854, + "time_per_iteration": 2.6653616428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144328, + "balance_loss_mlp": 1.07013178, + "epoch": 0.7416313966910351, + "flos": 600493607424.0, + "grad_norm": 0.037760188692200464, + "language_loss": 0.80141521, + "learning_rate": 0.0001650798506153517, + "loss": 0.81285852, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.74072266, + "step": 3855, + "time_per_iteration": 2.6987767219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143504, + "balance_loss_mlp": 1.06921279, + "epoch": 0.7418237783762985, + "flos": 543586705920.0, + "grad_norm": 0.04363259370366351, + "language_loss": 0.89603698, + "learning_rate": 0.00016484859385848023, + "loss": 0.90747201, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.74121094, + "step": 3856, + "time_per_iteration": 2.6623427867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143237, + "balance_loss_mlp": 1.06889808, + "epoch": 0.7420161600615621, + "flos": 545223636480.0, + "grad_norm": 0.03643329679811027, + "language_loss": 0.82348394, + "learning_rate": 0.0001646174672189243, + "loss": 0.83491635, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.74169922, + "step": 3857, + "time_per_iteration": 2.663518190383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143563, + "balance_loss_mlp": 1.0692718, + "epoch": 0.7422085417468257, + "flos": 528210508800.0, + "grad_norm": 0.03811276290038686, + "language_loss": 0.85172391, + "learning_rate": 0.00016438647078641488, + "loss": 0.86315954, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.74121094, + "step": 3858, + "time_per_iteration": 2.5988457202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145341, + "balance_loss_mlp": 1.07133579, + "epoch": 0.7424009234320893, + "flos": 509760563712.0, + "grad_norm": 0.034205456810992727, + "language_loss": 0.87813514, + "learning_rate": 0.00016415560465063344, + "loss": 0.88958859, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.73925781, + "step": 3859, + "time_per_iteration": 2.7205588817596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145531, + "balance_loss_mlp": 1.07138264, + "epoch": 0.7425933051173528, + "flos": 513607114752.0, + "grad_norm": 0.03574871107412609, + "language_loss": 0.83894295, + "learning_rate": 0.0001639248689012095, + "loss": 0.85039824, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.74023438, + "step": 3860, + "time_per_iteration": 2.604342460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145572, + "balance_loss_mlp": 1.07142365, + "epoch": 0.7427856868026164, + "flos": 459377189376.0, + "grad_norm": 0.03221086554930489, + "language_loss": 0.91824234, + "learning_rate": 0.00016369426362772271, + "loss": 0.92969811, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.74023438, + "step": 3861, + "time_per_iteration": 2.787710189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140907, + "balance_loss_mlp": 1.06666386, + "epoch": 0.74297806848788, + "flos": 606187935744.0, + "grad_norm": 0.034095856542736835, + "language_loss": 0.84967786, + "learning_rate": 0.00016346378891970233, + "loss": 0.86108696, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.74072266, + "step": 3862, + "time_per_iteration": 2.791630744934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140095, + "balance_loss_mlp": 1.06594658, + "epoch": 0.7431704501731435, + "flos": 893069776896.0, + "grad_norm": 0.035970776867332244, + "language_loss": 0.86936057, + "learning_rate": 0.00016323344486662633, + "loss": 0.8807615, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.74023438, + "step": 3863, + "time_per_iteration": 3.3644163608551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140007, + "balance_loss_mlp": 1.06562018, + "epoch": 0.7433628318584071, + "flos": 593351728128.0, + "grad_norm": 0.03309073679941976, + "language_loss": 0.8318609, + "learning_rate": 0.00016300323155792247, + "loss": 0.84326088, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.7421875, + "step": 3864, + "time_per_iteration": 2.9201974868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140802, + "balance_loss_mlp": 1.06655836, + "epoch": 0.7435552135436706, + "flos": 478189704192.0, + "grad_norm": 0.032691738541971056, + "language_loss": 0.93297988, + "learning_rate": 0.00016277314908296687, + "loss": 0.94438791, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.74072266, + "step": 3865, + "time_per_iteration": 2.662276268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140447, + "balance_loss_mlp": 1.06606066, + "epoch": 0.7437475952289342, + "flos": 674431100928.0, + "grad_norm": 0.04227589537607751, + "language_loss": 0.82037443, + "learning_rate": 0.00016254319753108604, + "loss": 0.83177888, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.7421875, + "step": 3866, + "time_per_iteration": 2.818756341934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140124, + "balance_loss_mlp": 1.06573772, + "epoch": 0.7439399769141978, + "flos": 771770264064.0, + "grad_norm": 0.04121075784978914, + "language_loss": 0.82100695, + "learning_rate": 0.00016231337699155492, + "loss": 0.83240819, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.7421875, + "step": 3867, + "time_per_iteration": 2.9714555740356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139588, + "balance_loss_mlp": 1.06539237, + "epoch": 0.7441323585994614, + "flos": 649038663168.0, + "grad_norm": 0.03532933640628425, + "language_loss": 0.82657182, + "learning_rate": 0.0001620836875535977, + "loss": 0.83796769, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.74023438, + "step": 3868, + "time_per_iteration": 2.849938154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139487, + "balance_loss_mlp": 1.06548178, + "epoch": 0.7443247402847248, + "flos": 566500279296.0, + "grad_norm": 0.031528263247616775, + "language_loss": 0.85388362, + "learning_rate": 0.00016185412930638766, + "loss": 0.86527848, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.73925781, + "step": 3869, + "time_per_iteration": 2.7786920070648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139674, + "balance_loss_mlp": 1.06547797, + "epoch": 0.7445171219699884, + "flos": 579679590912.0, + "grad_norm": 0.0366739337080916, + "language_loss": 0.87914336, + "learning_rate": 0.00016162470233904765, + "loss": 0.89054006, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.74023438, + "step": 3870, + "time_per_iteration": 2.705364465713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147326, + "balance_loss_mlp": 1.07351112, + "epoch": 0.744709503655252, + "flos": 620029260288.0, + "grad_norm": 0.03364023309307919, + "language_loss": 0.86704087, + "learning_rate": 0.00016139540674064856, + "loss": 0.87851417, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.73828125, + "step": 3871, + "time_per_iteration": 2.727344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147794, + "balance_loss_mlp": 1.07388413, + "epoch": 0.7449018853405156, + "flos": 529680253440.0, + "grad_norm": 0.03265362950694584, + "language_loss": 0.82158148, + "learning_rate": 0.00016116624260021113, + "loss": 0.83305943, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.73876953, + "step": 3872, + "time_per_iteration": 2.733447551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147438, + "balance_loss_mlp": 1.0736239, + "epoch": 0.7450942670257792, + "flos": 434223069696.0, + "grad_norm": 0.03568420204032938, + "language_loss": 0.89293343, + "learning_rate": 0.0001609372100067046, + "loss": 0.90440786, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.73828125, + "step": 3873, + "time_per_iteration": 2.5226526260375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141021, + "balance_loss_mlp": 1.06682503, + "epoch": 0.7452866487110427, + "flos": 698165140992.0, + "grad_norm": 0.04021816698405521, + "language_loss": 0.90011704, + "learning_rate": 0.0001607083090490475, + "loss": 0.91152722, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.74023438, + "step": 3874, + "time_per_iteration": 2.897472381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138845, + "balance_loss_mlp": 1.06464863, + "epoch": 0.7454790303963063, + "flos": 513279473664.0, + "grad_norm": 0.03827241503421356, + "language_loss": 0.86578858, + "learning_rate": 0.00016047953981610714, + "loss": 0.877177, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.74023438, + "step": 3875, + "time_per_iteration": 2.7049574851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.08171082, + "epoch": 0.7456714120815698, + "flos": 1328874107904.0, + "grad_norm": 0.014146468768439814, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.8088364, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.72460938, + "step": 3876, + "time_per_iteration": 4.997116804122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147349, + "balance_loss_mlp": 1.0731051, + "epoch": 0.7458637937668334, + "flos": 722971427328.0, + "grad_norm": 0.03963419785288614, + "language_loss": 0.8521378, + "learning_rate": 0.0001600223968795889, + "loss": 0.86361128, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.74072266, + "step": 3877, + "time_per_iteration": 2.8971540927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147774, + "balance_loss_mlp": 1.07548523, + "epoch": 0.746056175452097, + "flos": 1504866172416.0, + "grad_norm": 0.01288298570823651, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76843846, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.72460938, + "step": 3878, + "time_per_iteration": 4.937422275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144499, + "balance_loss_mlp": 1.07025564, + "epoch": 0.7462485571373605, + "flos": 521294212608.0, + "grad_norm": 0.03493161366736204, + "language_loss": 0.85764599, + "learning_rate": 0.00015956578190706483, + "loss": 0.86909091, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.74072266, + "step": 3879, + "time_per_iteration": 2.68503737449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144, + "balance_loss_mlp": 1.06980455, + "epoch": 0.7464409388226241, + "flos": 482166511104.0, + "grad_norm": 0.03362253888482968, + "language_loss": 0.79837132, + "learning_rate": 0.00015933767262892468, + "loss": 0.80981129, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.74072266, + "step": 3880, + "time_per_iteration": 2.693495988845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144861, + "balance_loss_mlp": 1.07071245, + "epoch": 0.7466333205078877, + "flos": 487741317120.0, + "grad_norm": 0.04222777509687144, + "language_loss": 0.88058239, + "learning_rate": 0.00015910969560762927, + "loss": 0.89203095, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.74023438, + "step": 3881, + "time_per_iteration": 2.562688112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.07416224, + "epoch": 0.7468257021931513, + "flos": 612407290368.0, + "grad_norm": 0.034328627776477647, + "language_loss": 0.8732987, + "learning_rate": 0.00015888185093168727, + "loss": 0.88478327, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.74121094, + "step": 3882, + "time_per_iteration": 2.718461036682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.072142, + "epoch": 0.7470180838784147, + "flos": 534484257792.0, + "grad_norm": 0.03431059853024658, + "language_loss": 0.85983026, + "learning_rate": 0.00015865413868955581, + "loss": 0.87129557, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.7421875, + "step": 3883, + "time_per_iteration": 2.6472575664520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146306, + "balance_loss_mlp": 1.07225311, + "epoch": 0.7472104655636783, + "flos": 740672764416.0, + "grad_norm": 0.030267060700337457, + "language_loss": 0.87475348, + "learning_rate": 0.00015842655896964054, + "loss": 0.88621652, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.73974609, + "step": 3884, + "time_per_iteration": 3.015573024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145315, + "balance_loss_mlp": 1.07107127, + "epoch": 0.7474028472489419, + "flos": 641501286912.0, + "grad_norm": 0.03713221878515122, + "language_loss": 0.79442894, + "learning_rate": 0.00015819911186029567, + "loss": 0.8058821, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.74121094, + "step": 3885, + "time_per_iteration": 2.7972114086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145173, + "balance_loss_mlp": 1.07078624, + "epoch": 0.7475952289342055, + "flos": 591326031360.0, + "grad_norm": 0.035996478944381224, + "language_loss": 0.90933514, + "learning_rate": 0.00015797179744982443, + "loss": 0.92078686, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.7421875, + "step": 3886, + "time_per_iteration": 2.699364185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145253, + "balance_loss_mlp": 1.07100964, + "epoch": 0.7477876106194691, + "flos": 489219793920.0, + "grad_norm": 0.03742232117847866, + "language_loss": 0.83403462, + "learning_rate": 0.00015774461582647765, + "loss": 0.84548712, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.74121094, + "step": 3887, + "time_per_iteration": 2.6602365970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146333, + "balance_loss_mlp": 1.07199454, + "epoch": 0.7479799923047326, + "flos": 555789098496.0, + "grad_norm": 0.03709849655597122, + "language_loss": 0.85774076, + "learning_rate": 0.00015751756707845505, + "loss": 0.86920416, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.74169922, + "step": 3888, + "time_per_iteration": 2.6497113704681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145173, + "balance_loss_mlp": 1.07097745, + "epoch": 0.7481723739899961, + "flos": 768789841920.0, + "grad_norm": 0.0326002931336663, + "language_loss": 0.92530739, + "learning_rate": 0.00015729065129390502, + "loss": 0.93675911, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.74121094, + "step": 3889, + "time_per_iteration": 3.0129857063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145589, + "balance_loss_mlp": 1.07129776, + "epoch": 0.7483647556752597, + "flos": 497160672768.0, + "grad_norm": 0.03921764888683204, + "language_loss": 0.87742007, + "learning_rate": 0.0001570638685609241, + "loss": 0.88887596, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.74169922, + "step": 3890, + "time_per_iteration": 2.6674981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145557, + "balance_loss_mlp": 1.07126558, + "epoch": 0.7485571373605233, + "flos": 473826132480.0, + "grad_norm": 0.036715319135455414, + "language_loss": 0.85719097, + "learning_rate": 0.00015683721896755693, + "loss": 0.8686465, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.74169922, + "step": 3891, + "time_per_iteration": 2.524322271347046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153778, + "balance_loss_mlp": 1.0816803, + "epoch": 0.7487495190457868, + "flos": 1557898324992.0, + "grad_norm": 0.009583293732515121, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83364266, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.72265625, + "step": 3892, + "time_per_iteration": 4.967085361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114376, + "balance_loss_mlp": 1.06980217, + "epoch": 0.7489419007310504, + "flos": 582966187008.0, + "grad_norm": 0.03314224500682494, + "language_loss": 0.89740062, + "learning_rate": 0.00015638431955158528, + "loss": 0.90883827, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.73974609, + "step": 3893, + "time_per_iteration": 2.7170591354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143436, + "balance_loss_mlp": 1.06952667, + "epoch": 0.749134282416314, + "flos": 568697164800.0, + "grad_norm": 0.032778698573620556, + "language_loss": 0.85919845, + "learning_rate": 0.00015615806990481186, + "loss": 0.87063277, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.73925781, + "step": 3894, + "time_per_iteration": 2.6996026039123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143061, + "balance_loss_mlp": 1.06915176, + "epoch": 0.7493266641015776, + "flos": 534165348864.0, + "grad_norm": 0.030394188724740954, + "language_loss": 0.88159597, + "learning_rate": 0.00015593195374931452, + "loss": 0.89302653, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.73876953, + "step": 3895, + "time_per_iteration": 2.7341361045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146619, + "balance_loss_mlp": 1.0727098, + "epoch": 0.7495190457868411, + "flos": 524717795328.0, + "grad_norm": 0.03863238275082747, + "language_loss": 0.84834325, + "learning_rate": 0.00015570597117287922, + "loss": 0.8598094, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.73925781, + "step": 3896, + "time_per_iteration": 2.659959077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144958, + "balance_loss_mlp": 1.07123923, + "epoch": 0.7497114274721046, + "flos": 515189650944.0, + "grad_norm": 0.036153955885896226, + "language_loss": 0.83024484, + "learning_rate": 0.0001554801222632406, + "loss": 0.84169447, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.73730469, + "step": 3897, + "time_per_iteration": 2.5906412601470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145811, + "balance_loss_mlp": 1.07199693, + "epoch": 0.7499038091573682, + "flos": 495997102080.0, + "grad_norm": 0.03335147628193477, + "language_loss": 0.89782715, + "learning_rate": 0.00015525440710808052, + "loss": 0.90928525, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.73828125, + "step": 3898, + "time_per_iteration": 2.615407705307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145951, + "balance_loss_mlp": 1.07199407, + "epoch": 0.7500961908426318, + "flos": 738988170240.0, + "grad_norm": 0.03474247339269188, + "language_loss": 0.84343684, + "learning_rate": 0.00015502882579502953, + "loss": 0.85489637, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.73925781, + "step": 3899, + "time_per_iteration": 3.010974645614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114743, + "balance_loss_mlp": 1.07361519, + "epoch": 0.7502885725278954, + "flos": 534536650752.0, + "grad_norm": 0.03268230414324022, + "language_loss": 0.88787687, + "learning_rate": 0.00015480337841166592, + "loss": 0.89935118, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.73828125, + "step": 3900, + "time_per_iteration": 2.7430782318115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147759, + "balance_loss_mlp": 1.07399249, + "epoch": 0.7504809542131589, + "flos": 590557957632.0, + "grad_norm": 0.04375512425984308, + "language_loss": 0.87710261, + "learning_rate": 0.00015457806504551647, + "loss": 0.8885802, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.73779297, + "step": 3901, + "time_per_iteration": 2.8651504516601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.0741967, + "epoch": 0.7506733358984224, + "flos": 512582532096.0, + "grad_norm": 0.0332649439615325, + "language_loss": 0.82646012, + "learning_rate": 0.0001543528857840554, + "loss": 0.83794028, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.73828125, + "step": 3902, + "time_per_iteration": 2.6909492015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144161, + "balance_loss_mlp": 1.07025158, + "epoch": 0.750865717583686, + "flos": 540382702080.0, + "grad_norm": 0.03600709682352738, + "language_loss": 0.85171556, + "learning_rate": 0.000154127840714705, + "loss": 0.86315715, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.73925781, + "step": 3903, + "time_per_iteration": 2.7624754905700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144936, + "balance_loss_mlp": 1.0707401, + "epoch": 0.7510580992689496, + "flos": 477540426240.0, + "grad_norm": 0.045315321448851864, + "language_loss": 0.87899154, + "learning_rate": 0.00015390292992483557, + "loss": 0.89044094, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.74072266, + "step": 3904, + "time_per_iteration": 2.512664794921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141177, + "balance_loss_mlp": 1.06707633, + "epoch": 0.7512504809542132, + "flos": 580200614400.0, + "grad_norm": 0.0336140335329932, + "language_loss": 0.89387548, + "learning_rate": 0.00015367815350176523, + "loss": 0.90528727, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.74072266, + "step": 3905, + "time_per_iteration": 2.743971824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139798, + "balance_loss_mlp": 1.06550705, + "epoch": 0.7514428626394767, + "flos": 419563279872.0, + "grad_norm": 0.033015406559801515, + "language_loss": 0.88140541, + "learning_rate": 0.00015345351153275987, + "loss": 0.89280337, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.74169922, + "step": 3906, + "time_per_iteration": 2.5664329528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137169, + "balance_loss_mlp": 1.06335413, + "epoch": 0.7516352443247403, + "flos": 642254624256.0, + "grad_norm": 0.03633245053817903, + "language_loss": 0.85467315, + "learning_rate": 0.00015322900410503332, + "loss": 0.86604482, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.73828125, + "step": 3907, + "time_per_iteration": 2.797030210494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139178, + "balance_loss_mlp": 1.0650295, + "epoch": 0.7518276260100039, + "flos": 582191382528.0, + "grad_norm": 0.03436736061108426, + "language_loss": 0.8251732, + "learning_rate": 0.00015300463130574703, + "loss": 0.83656502, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.74023438, + "step": 3908, + "time_per_iteration": 2.8524422645568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139345, + "balance_loss_mlp": 1.06524479, + "epoch": 0.7520200076952674, + "flos": 688615529472.0, + "grad_norm": 0.03139939166900202, + "language_loss": 0.85847479, + "learning_rate": 0.00015278039322201033, + "loss": 0.86986822, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.73974609, + "step": 3909, + "time_per_iteration": 2.9437077045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113959, + "balance_loss_mlp": 1.0656805, + "epoch": 0.7522123893805309, + "flos": 487415677440.0, + "grad_norm": 0.04345489019259924, + "language_loss": 0.85063672, + "learning_rate": 0.00015255628994088004, + "loss": 0.86203265, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.73876953, + "step": 3910, + "time_per_iteration": 2.5493288040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139511, + "balance_loss_mlp": 1.0655055, + "epoch": 0.7524047710657945, + "flos": 820591294464.0, + "grad_norm": 0.035053470769469915, + "language_loss": 0.79975402, + "learning_rate": 0.00015233232154936082, + "loss": 0.81114912, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.73925781, + "step": 3911, + "time_per_iteration": 3.2801201343536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136453, + "balance_loss_mlp": 1.06259108, + "epoch": 0.7525971527510581, + "flos": 700780992000.0, + "grad_norm": 0.03701963339686214, + "language_loss": 0.80987895, + "learning_rate": 0.0001521084881344048, + "loss": 0.82124352, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.73876953, + "step": 3912, + "time_per_iteration": 2.864623785018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136423, + "balance_loss_mlp": 1.06260836, + "epoch": 0.7527895344363217, + "flos": 634949561856.0, + "grad_norm": 0.03193238845442204, + "language_loss": 0.90964454, + "learning_rate": 0.00015188478978291208, + "loss": 0.92100877, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.73828125, + "step": 3913, + "time_per_iteration": 2.817735433578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_mlp": 1.06423438, + "epoch": 0.7529819161215853, + "flos": 563932091904.0, + "grad_norm": 0.03160281710037872, + "language_loss": 0.90830052, + "learning_rate": 0.00015166122658173014, + "loss": 0.91968054, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.73779297, + "step": 3914, + "time_per_iteration": 2.769164562225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143642, + "balance_loss_mlp": 1.06992257, + "epoch": 0.7531742978068487, + "flos": 691956519936.0, + "grad_norm": 0.03347021027562271, + "language_loss": 0.9305917, + "learning_rate": 0.00015143779861765332, + "loss": 0.94202816, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.73730469, + "step": 3915, + "time_per_iteration": 2.8637077808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143643, + "balance_loss_mlp": 1.07001936, + "epoch": 0.7533666794921123, + "flos": 682306851840.0, + "grad_norm": 0.03059680855463854, + "language_loss": 0.85590506, + "learning_rate": 0.00015121450597742458, + "loss": 0.86734146, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.73632812, + "step": 3916, + "time_per_iteration": 2.822169065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143917, + "balance_loss_mlp": 1.0701977, + "epoch": 0.7535590611773759, + "flos": 624813798912.0, + "grad_norm": 0.03788604820756776, + "language_loss": 0.84024751, + "learning_rate": 0.00015099134874773369, + "loss": 0.85168672, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.73730469, + "step": 3917, + "time_per_iteration": 2.739708185195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143474, + "balance_loss_mlp": 1.06975508, + "epoch": 0.7537514428626395, + "flos": 520493211648.0, + "grad_norm": 0.03128503546806215, + "language_loss": 0.84470636, + "learning_rate": 0.00015076832701521793, + "loss": 0.85614109, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.73730469, + "step": 3918, + "time_per_iteration": 2.7321834564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143927, + "balance_loss_mlp": 1.07016027, + "epoch": 0.753943824547903, + "flos": 725034054144.0, + "grad_norm": 0.04314682819864583, + "language_loss": 0.87482226, + "learning_rate": 0.000150545440866462, + "loss": 0.88626158, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.73779297, + "step": 3919, + "time_per_iteration": 2.9775331020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138634, + "balance_loss_mlp": 1.06486762, + "epoch": 0.7541362062331666, + "flos": 438467119104.0, + "grad_norm": 0.052938940004614674, + "language_loss": 0.83896869, + "learning_rate": 0.000150322690387998, + "loss": 0.85035503, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.73779297, + "step": 3920, + "time_per_iteration": 2.49090576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137452, + "balance_loss_mlp": 1.06363773, + "epoch": 0.7543285879184302, + "flos": 566343826944.0, + "grad_norm": 0.033797104064901606, + "language_loss": 0.79905725, + "learning_rate": 0.00015010007566630535, + "loss": 0.81043172, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.73828125, + "step": 3921, + "time_per_iteration": 2.731271266937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136388, + "balance_loss_mlp": 1.06257319, + "epoch": 0.7545209696036937, + "flos": 522058283520.0, + "grad_norm": 0.038458937044939336, + "language_loss": 0.86757135, + "learning_rate": 0.00014987759678781077, + "loss": 0.87893528, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.73828125, + "step": 3922, + "time_per_iteration": 2.6090140342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137282, + "balance_loss_mlp": 1.06356251, + "epoch": 0.7547133512889573, + "flos": 617209293312.0, + "grad_norm": 0.03880443282291728, + "language_loss": 0.87359434, + "learning_rate": 0.00014965525383888795, + "loss": 0.88496715, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.73730469, + "step": 3923, + "time_per_iteration": 2.7862982749938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142684, + "balance_loss_mlp": 1.06867838, + "epoch": 0.7549057329742208, + "flos": 752141285376.0, + "grad_norm": 0.034394345643830246, + "language_loss": 0.76875985, + "learning_rate": 0.00014943304690585851, + "loss": 0.78018677, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.73876953, + "step": 3924, + "time_per_iteration": 2.910545825958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143742, + "balance_loss_mlp": 1.06964111, + "epoch": 0.7550981146594844, + "flos": 515450162688.0, + "grad_norm": 0.03861308320303695, + "language_loss": 0.84874004, + "learning_rate": 0.0001492109760749908, + "loss": 0.8601774, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.73925781, + "step": 3925, + "time_per_iteration": 2.6297590732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114885, + "balance_loss_mlp": 1.07503557, + "epoch": 0.755290496344748, + "flos": 523026470400.0, + "grad_norm": 0.03619284623478051, + "language_loss": 0.84284902, + "learning_rate": 0.00014898904143250002, + "loss": 0.85433757, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.73828125, + "step": 3926, + "time_per_iteration": 2.6899092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155189, + "balance_loss_mlp": 1.082901, + "epoch": 0.7554828780300116, + "flos": 1417703705088.0, + "grad_norm": 0.01325688578051584, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76910388, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.72460938, + "step": 3927, + "time_per_iteration": 4.904372692108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141123, + "balance_loss_mlp": 1.06683159, + "epoch": 0.7556752597152752, + "flos": 557985984000.0, + "grad_norm": 0.031943357844755736, + "language_loss": 0.84718072, + "learning_rate": 0.0001485455810572474, + "loss": 0.85859191, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.74121094, + "step": 3928, + "time_per_iteration": 2.6653287410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139674, + "balance_loss_mlp": 1.06519186, + "epoch": 0.7558676414005386, + "flos": 564741825024.0, + "grad_norm": 0.03222629584019241, + "language_loss": 0.88709021, + "learning_rate": 0.00014832405549665236, + "loss": 0.89848697, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.74316406, + "step": 3929, + "time_per_iteration": 2.69524884223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114176, + "balance_loss_mlp": 1.0672785, + "epoch": 0.7560600230858022, + "flos": 562534205952.0, + "grad_norm": 0.03584285097744866, + "language_loss": 0.82973742, + "learning_rate": 0.00014810266646876746, + "loss": 0.84115505, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.74316406, + "step": 3930, + "time_per_iteration": 2.781097888946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141215, + "balance_loss_mlp": 1.06663764, + "epoch": 0.7562524047710658, + "flos": 720957190656.0, + "grad_norm": 0.038983110262219116, + "language_loss": 0.82315147, + "learning_rate": 0.00014788141405954364, + "loss": 0.83456367, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.74414062, + "step": 3931, + "time_per_iteration": 2.9991354942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140296, + "balance_loss_mlp": 1.06571853, + "epoch": 0.7564447864563294, + "flos": 544396439040.0, + "grad_norm": 0.037101319530533854, + "language_loss": 0.90224212, + "learning_rate": 0.00014766029835487865, + "loss": 0.91364509, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.74414062, + "step": 3932, + "time_per_iteration": 2.692891836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144662, + "balance_loss_mlp": 1.07008481, + "epoch": 0.7566371681415929, + "flos": 727093953024.0, + "grad_norm": 0.03778072998608002, + "language_loss": 0.86007833, + "learning_rate": 0.0001474393194406173, + "loss": 0.87152493, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.74414062, + "step": 3933, + "time_per_iteration": 2.891930341720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146005, + "balance_loss_mlp": 1.07142723, + "epoch": 0.7568295498268565, + "flos": 577806343680.0, + "grad_norm": 0.03260015867991467, + "language_loss": 0.84333152, + "learning_rate": 0.00014721847740255112, + "loss": 0.85479152, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.74414062, + "step": 3934, + "time_per_iteration": 2.799757242202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151863, + "balance_loss_mlp": 1.07919312, + "epoch": 0.75702193151212, + "flos": 1523216060928.0, + "grad_norm": 0.00897818069303787, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.75063783, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.7265625, + "step": 3935, + "time_per_iteration": 4.575445175170898 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146571, + "balance_loss_mlp": 1.07199419, + "epoch": 0.7572143131973836, + "flos": 526488984576.0, + "grad_norm": 0.039044960519486104, + "language_loss": 0.83207357, + "learning_rate": 0.00014677720429790526, + "loss": 0.8435393, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.74414062, + "step": 3936, + "time_per_iteration": 2.6141350269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143608, + "balance_loss_mlp": 1.06917346, + "epoch": 0.7574066948826472, + "flos": 551823025152.0, + "grad_norm": 0.030693904946920876, + "language_loss": 0.88398033, + "learning_rate": 0.0001465567734026429, + "loss": 0.89541638, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.74267578, + "step": 3937, + "time_per_iteration": 2.738377571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136961, + "balance_loss_mlp": 1.06219339, + "epoch": 0.7575990765679107, + "flos": 396769228800.0, + "grad_norm": 0.04103098357371863, + "language_loss": 0.88068545, + "learning_rate": 0.00014633647972621034, + "loss": 0.89205503, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.74609375, + "step": 3938, + "time_per_iteration": 2.4616434574127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138132, + "balance_loss_mlp": 1.06336367, + "epoch": 0.7577914582531743, + "flos": 586185653760.0, + "grad_norm": 0.030008665391221847, + "language_loss": 0.90353823, + "learning_rate": 0.00014611632335413354, + "loss": 0.91491956, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.74609375, + "step": 3939, + "time_per_iteration": 2.775031805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113606, + "balance_loss_mlp": 1.06143546, + "epoch": 0.7579838399384379, + "flos": 822484007424.0, + "grad_norm": 0.031088983596600554, + "language_loss": 0.87266111, + "learning_rate": 0.00014589630437188456, + "loss": 0.8840217, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.74462891, + "step": 3940, + "time_per_iteration": 3.1587963104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.06187153, + "epoch": 0.7581762216237015, + "flos": 444805996032.0, + "grad_norm": 0.04449780821151478, + "language_loss": 0.84434611, + "learning_rate": 0.00014567642286488253, + "loss": 0.85571206, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.74560547, + "step": 3941, + "time_per_iteration": 2.541396141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146151, + "balance_loss_mlp": 1.07143092, + "epoch": 0.7583686033089649, + "flos": 541939041792.0, + "grad_norm": 0.045311193933261745, + "language_loss": 0.84473586, + "learning_rate": 0.00014545667891849258, + "loss": 0.85619736, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.74560547, + "step": 3942, + "time_per_iteration": 2.653228998184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146078, + "balance_loss_mlp": 1.07150042, + "epoch": 0.7585609849942285, + "flos": 523612621824.0, + "grad_norm": 0.032810068859795746, + "language_loss": 0.87606031, + "learning_rate": 0.00014523707261802733, + "loss": 0.88752109, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.74414062, + "step": 3943, + "time_per_iteration": 2.6271109580993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145321, + "balance_loss_mlp": 1.07064807, + "epoch": 0.7587533666794921, + "flos": 542907228672.0, + "grad_norm": 0.03968141925916535, + "language_loss": 0.87281996, + "learning_rate": 0.00014501760404874527, + "loss": 0.88427311, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.74511719, + "step": 3944, + "time_per_iteration": 2.696624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143644, + "balance_loss_mlp": 1.06921005, + "epoch": 0.7589457483647557, + "flos": 607520693760.0, + "grad_norm": 0.03527343203685723, + "language_loss": 0.909307, + "learning_rate": 0.00014479827329585176, + "loss": 0.92074347, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.74267578, + "step": 3945, + "time_per_iteration": 2.7308402061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141632, + "balance_loss_mlp": 1.06724524, + "epoch": 0.7591381300500193, + "flos": 556251724800.0, + "grad_norm": 0.03227407382042984, + "language_loss": 0.88668191, + "learning_rate": 0.00014457908044449846, + "loss": 0.89809817, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.7421875, + "step": 3946, + "time_per_iteration": 2.723604917526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145154, + "balance_loss_mlp": 1.07076728, + "epoch": 0.7593305117352828, + "flos": 530813624832.0, + "grad_norm": 0.032659275008273744, + "language_loss": 0.87264967, + "learning_rate": 0.00014436002557978371, + "loss": 0.88410115, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.7421875, + "step": 3947, + "time_per_iteration": 2.7849090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151436, + "balance_loss_mlp": 1.07876587, + "epoch": 0.7595228934205464, + "flos": 1505922955776.0, + "grad_norm": 0.01242422674418897, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77794582, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.7265625, + "step": 3948, + "time_per_iteration": 4.869319200515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141564, + "balance_loss_mlp": 1.06717777, + "epoch": 0.7597152751058099, + "flos": 456467899392.0, + "grad_norm": 0.03330137470124234, + "language_loss": 0.84041482, + "learning_rate": 0.0001439223301503945, + "loss": 0.85183042, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.7421875, + "step": 3949, + "time_per_iteration": 2.511057138442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141649, + "balance_loss_mlp": 1.06721532, + "epoch": 0.7599076567910735, + "flos": 686798678016.0, + "grad_norm": 0.040114283676211684, + "language_loss": 0.80981869, + "learning_rate": 0.00014370368975564834, + "loss": 0.82123518, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.74267578, + "step": 3950, + "time_per_iteration": 3.0096349716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144078, + "balance_loss_mlp": 1.06973898, + "epoch": 0.760100038476337, + "flos": 533494603776.0, + "grad_norm": 0.03798147365213374, + "language_loss": 0.88830221, + "learning_rate": 0.00014348518768739766, + "loss": 0.89974296, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.74169922, + "step": 3951, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146828, + "balance_loss_mlp": 1.07415771, + "epoch": 0.7602924201616006, + "flos": 1474916780544.0, + "grad_norm": 0.005782127135677509, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77874869, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.7265625, + "step": 3952, + "time_per_iteration": 4.8369224071502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142903, + "balance_loss_mlp": 1.06875467, + "epoch": 0.7604848018468642, + "flos": 776040509952.0, + "grad_norm": 0.03364559855712782, + "language_loss": 0.90537649, + "learning_rate": 0.00014304859886964867, + "loss": 0.91680551, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.74072266, + "step": 3953, + "time_per_iteration": 2.9843015670776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_mlp": 1.06871259, + "epoch": 0.7606771835321278, + "flos": 559260344832.0, + "grad_norm": 0.034495919290042885, + "language_loss": 0.88372874, + "learning_rate": 0.00014283051228964878, + "loss": 0.89515591, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.74023438, + "step": 3954, + "time_per_iteration": 2.6971194744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.06912816, + "epoch": 0.7608695652173914, + "flos": 526432588800.0, + "grad_norm": 0.03600141615552244, + "language_loss": 0.87487853, + "learning_rate": 0.00014261256437514197, + "loss": 0.88630933, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.73974609, + "step": 3955, + "time_per_iteration": 2.641023635864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143325, + "balance_loss_mlp": 1.06932008, + "epoch": 0.7610619469026548, + "flos": 616167246336.0, + "grad_norm": 0.03384728426849952, + "language_loss": 0.87191808, + "learning_rate": 0.0001423947552107428, + "loss": 0.88335133, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.73974609, + "step": 3956, + "time_per_iteration": 2.7422232627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143723, + "balance_loss_mlp": 1.06981361, + "epoch": 0.7612543285879184, + "flos": 864817714176.0, + "grad_norm": 0.03496249839254083, + "language_loss": 0.82073259, + "learning_rate": 0.00014217708488101243, + "loss": 0.83216989, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.73925781, + "step": 3957, + "time_per_iteration": 3.1032650470733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142422, + "balance_loss_mlp": 1.06822646, + "epoch": 0.761446710273182, + "flos": 554727585792.0, + "grad_norm": 0.03657356062959036, + "language_loss": 0.82088828, + "learning_rate": 0.0001419595534704579, + "loss": 0.83231246, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.74121094, + "step": 3958, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145496, + "balance_loss_mlp": 1.07149136, + "epoch": 0.7616390919584456, + "flos": 468325186560.0, + "grad_norm": 0.0357245127474846, + "language_loss": 0.85904223, + "learning_rate": 0.00014174216106353237, + "loss": 0.87049717, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.73974609, + "step": 3959, + "time_per_iteration": 2.595851421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143762, + "balance_loss_mlp": 1.06966209, + "epoch": 0.7618314736437091, + "flos": 499431418368.0, + "grad_norm": 0.03393548471878093, + "language_loss": 0.81279588, + "learning_rate": 0.00014152490774463512, + "loss": 0.82423347, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.73974609, + "step": 3960, + "time_per_iteration": 2.589545488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143507, + "balance_loss_mlp": 1.06931114, + "epoch": 0.7620238553289727, + "flos": 435451768320.0, + "grad_norm": 0.03935121424248522, + "language_loss": 0.92124438, + "learning_rate": 0.00014130779359811135, + "loss": 0.93267947, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.74072266, + "step": 3961, + "time_per_iteration": 2.455334424972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114569, + "balance_loss_mlp": 1.07144618, + "epoch": 0.7622162370142362, + "flos": 665541500928.0, + "grad_norm": 0.033439971209903066, + "language_loss": 0.90740561, + "learning_rate": 0.0001410908187082521, + "loss": 0.91886252, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.74072266, + "step": 3962, + "time_per_iteration": 2.849613904953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145492, + "balance_loss_mlp": 1.07105827, + "epoch": 0.7624086186994998, + "flos": 559028030976.0, + "grad_norm": 0.03941593540167477, + "language_loss": 0.90269017, + "learning_rate": 0.0001408739831592949, + "loss": 0.91414511, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.74267578, + "step": 3963, + "time_per_iteration": 2.638357639312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114573, + "balance_loss_mlp": 1.07134342, + "epoch": 0.7626010003847634, + "flos": 630286546944.0, + "grad_norm": 0.03652031952844941, + "language_loss": 0.82416636, + "learning_rate": 0.0001406572870354224, + "loss": 0.83562368, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.7421875, + "step": 3964, + "time_per_iteration": 2.8123042583465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145859, + "balance_loss_mlp": 1.07142508, + "epoch": 0.7627933820700269, + "flos": 438849154560.0, + "grad_norm": 0.03432760394377559, + "language_loss": 0.91489524, + "learning_rate": 0.00014044073042076337, + "loss": 0.92635381, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.74267578, + "step": 3965, + "time_per_iteration": 2.536203145980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146519, + "balance_loss_mlp": 1.0722276, + "epoch": 0.7629857637552905, + "flos": 533794046976.0, + "grad_norm": 0.02784014268631594, + "language_loss": 0.9243055, + "learning_rate": 0.00014022431339939302, + "loss": 0.93577063, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.74121094, + "step": 3966, + "time_per_iteration": 2.6469874382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145692, + "balance_loss_mlp": 1.07135272, + "epoch": 0.7631781454405541, + "flos": 681236606976.0, + "grad_norm": 0.04013351668688065, + "language_loss": 0.82884651, + "learning_rate": 0.00014000803605533163, + "loss": 0.84030342, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.74169922, + "step": 3967, + "time_per_iteration": 2.802208185195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145081, + "balance_loss_mlp": 1.07074177, + "epoch": 0.7633705271258177, + "flos": 508488204288.0, + "grad_norm": 0.04349575646472503, + "language_loss": 0.88445222, + "learning_rate": 0.00013979189847254553, + "loss": 0.89590299, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.74169922, + "step": 3968, + "time_per_iteration": 2.5820798873901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145241, + "balance_loss_mlp": 1.07085466, + "epoch": 0.7635629088110811, + "flos": 620037992448.0, + "grad_norm": 0.0345033477005795, + "language_loss": 0.85449362, + "learning_rate": 0.00013957590073494674, + "loss": 0.86594605, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.7421875, + "step": 3969, + "time_per_iteration": 2.7904934883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139507, + "balance_loss_mlp": 1.0648824, + "epoch": 0.7637552904963447, + "flos": 639566914560.0, + "grad_norm": 0.03972116820389674, + "language_loss": 0.84200621, + "learning_rate": 0.0001393600429263931, + "loss": 0.8534013, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.74462891, + "step": 3970, + "time_per_iteration": 2.7333059310913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145393, + "balance_loss_mlp": 1.07272339, + "epoch": 0.7639476721816083, + "flos": 1566683865600.0, + "grad_norm": 0.008603454608039083, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75890285, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.7265625, + "step": 3971, + "time_per_iteration": 4.924766302108765 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139229, + "balance_loss_mlp": 1.06484199, + "epoch": 0.7641400538668719, + "flos": 497019683328.0, + "grad_norm": 0.0358458499629568, + "language_loss": 0.86623794, + "learning_rate": 0.0001389287474315804, + "loss": 0.87763023, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.7421875, + "step": 3972, + "time_per_iteration": 2.6104958057403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139206, + "balance_loss_mlp": 1.06481898, + "epoch": 0.7643324355521355, + "flos": 579514406400.0, + "grad_norm": 0.02970253105840928, + "language_loss": 0.84359801, + "learning_rate": 0.00013871330991276505, + "loss": 0.85499001, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.7421875, + "step": 3973, + "time_per_iteration": 2.7183613777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145413, + "balance_loss_mlp": 1.07102644, + "epoch": 0.764524817237399, + "flos": 786232668672.0, + "grad_norm": 0.038742643805220495, + "language_loss": 0.85575706, + "learning_rate": 0.00013849801265788247, + "loss": 0.86721122, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.7421875, + "step": 3974, + "time_per_iteration": 3.0245180130004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145329, + "balance_loss_mlp": 1.07094204, + "epoch": 0.7647171989226625, + "flos": 527298717696.0, + "grad_norm": 0.0343294309098999, + "language_loss": 0.88214505, + "learning_rate": 0.00013828285575051818, + "loss": 0.89359832, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.7421875, + "step": 3975, + "time_per_iteration": 2.6501829624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143749, + "balance_loss_mlp": 1.06964874, + "epoch": 0.7649095806079261, + "flos": 556028143104.0, + "grad_norm": 0.034577120087892245, + "language_loss": 0.88279045, + "learning_rate": 0.0001380678392742035, + "loss": 0.89422792, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.74072266, + "step": 3976, + "time_per_iteration": 2.717852830886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143601, + "balance_loss_mlp": 1.06921458, + "epoch": 0.7651019622931897, + "flos": 650388885504.0, + "grad_norm": 0.0329487622471132, + "language_loss": 0.89186555, + "learning_rate": 0.00013785296331241526, + "loss": 0.90330154, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.7421875, + "step": 3977, + "time_per_iteration": 2.877988576889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113775, + "balance_loss_mlp": 1.06336296, + "epoch": 0.7652943439784533, + "flos": 1048112113152.0, + "grad_norm": 0.034644421756337376, + "language_loss": 0.92511564, + "learning_rate": 0.00013763822794857583, + "loss": 0.9364931, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.7421875, + "step": 3978, + "time_per_iteration": 3.3197543621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113835, + "balance_loss_mlp": 1.06386817, + "epoch": 0.7654867256637168, + "flos": 505414456320.0, + "grad_norm": 0.032056341535250436, + "language_loss": 0.94870603, + "learning_rate": 0.00013742363326605278, + "loss": 0.96008945, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.74316406, + "step": 3979, + "time_per_iteration": 2.714352607727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.06330967, + "epoch": 0.7656791073489804, + "flos": 575863239168.0, + "grad_norm": 0.03156054452878063, + "language_loss": 0.82591552, + "learning_rate": 0.00013720917934815935, + "loss": 0.83729297, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.74267578, + "step": 3980, + "time_per_iteration": 2.717848300933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.06394827, + "epoch": 0.765871489034244, + "flos": 493791484416.0, + "grad_norm": 0.0408766328487834, + "language_loss": 0.88351345, + "learning_rate": 0.00013699486627815344, + "loss": 0.89489782, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.74316406, + "step": 3981, + "time_per_iteration": 2.570958137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114649, + "balance_loss_mlp": 1.07215071, + "epoch": 0.7660638707195075, + "flos": 487051106304.0, + "grad_norm": 0.03334801499225344, + "language_loss": 0.87230325, + "learning_rate": 0.00013678069413923928, + "loss": 0.8837682, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.74169922, + "step": 3982, + "time_per_iteration": 2.59192156791687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.07168806, + "epoch": 0.766256252404771, + "flos": 445242425856.0, + "grad_norm": 0.033038982399311745, + "language_loss": 0.86065191, + "learning_rate": 0.00013656666301456555, + "loss": 0.8721112, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.74121094, + "step": 3983, + "time_per_iteration": 2.5096640586853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139926, + "balance_loss_mlp": 1.06568277, + "epoch": 0.7664486340900346, + "flos": 486213175296.0, + "grad_norm": 0.0343473148612919, + "language_loss": 0.88720405, + "learning_rate": 0.0001363527729872267, + "loss": 0.89860332, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.74072266, + "step": 3984, + "time_per_iteration": 2.652386426925659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138175, + "balance_loss_mlp": 1.06359744, + "epoch": 0.7666410157752982, + "flos": 647384268288.0, + "grad_norm": 0.033932927272579565, + "language_loss": 0.81177199, + "learning_rate": 0.00013613902414026207, + "loss": 0.82315373, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.74414062, + "step": 3985, + "time_per_iteration": 2.785083055496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138176, + "balance_loss_mlp": 1.06359911, + "epoch": 0.7668333974605618, + "flos": 775660475904.0, + "grad_norm": 0.03599596212719163, + "language_loss": 0.86968917, + "learning_rate": 0.00013592541655665642, + "loss": 0.88107091, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.74414062, + "step": 3986, + "time_per_iteration": 3.013932704925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144262, + "balance_loss_mlp": 1.06987572, + "epoch": 0.7670257791458254, + "flos": 614512851456.0, + "grad_norm": 0.036460289004419034, + "language_loss": 0.90080905, + "learning_rate": 0.00013571195031933947, + "loss": 0.91225165, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.7421875, + "step": 3987, + "time_per_iteration": 2.6782960891723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114978, + "balance_loss_mlp": 1.0776825, + "epoch": 0.7672181608310888, + "flos": 1488362608128.0, + "grad_norm": 0.008503355118198302, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81631124, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.72265625, + "step": 3988, + "time_per_iteration": 4.697616338729858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135719, + "balance_loss_mlp": 1.06128454, + "epoch": 0.7674105425163524, + "flos": 611866801152.0, + "grad_norm": 0.03376269838630617, + "language_loss": 0.9032138, + "learning_rate": 0.00013528544221501655, + "loss": 0.91457105, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.74267578, + "step": 3989, + "time_per_iteration": 2.731600284576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135719, + "balance_loss_mlp": 1.06118917, + "epoch": 0.767602924201616, + "flos": 846604085760.0, + "grad_norm": 0.0353786451651817, + "language_loss": 0.86480021, + "learning_rate": 0.00013507240051359586, + "loss": 0.8761574, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.74365234, + "step": 3990, + "time_per_iteration": 3.0497024059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135952, + "balance_loss_mlp": 1.06156516, + "epoch": 0.7677953058868796, + "flos": 528145380864.0, + "grad_norm": 0.040368948500693246, + "language_loss": 0.91154569, + "learning_rate": 0.00013485950048963425, + "loss": 0.92290527, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.7421875, + "step": 3991, + "time_per_iteration": 2.596708059310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135888, + "balance_loss_mlp": 1.06145394, + "epoch": 0.7679876875721431, + "flos": 925111268352.0, + "grad_norm": 0.05870608675269832, + "language_loss": 0.88347316, + "learning_rate": 0.00013464674222578643, + "loss": 0.89483202, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.74267578, + "step": 3992, + "time_per_iteration": 3.1901588439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114079, + "balance_loss_mlp": 1.06640303, + "epoch": 0.7681800692574067, + "flos": 459018622464.0, + "grad_norm": 0.03723022902665057, + "language_loss": 0.87956703, + "learning_rate": 0.00013443412580465292, + "loss": 0.89097494, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.7421875, + "step": 3993, + "time_per_iteration": 2.603252649307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141634, + "balance_loss_mlp": 1.06724763, + "epoch": 0.7683724509426703, + "flos": 659732379648.0, + "grad_norm": 0.0341053080993109, + "language_loss": 0.8901087, + "learning_rate": 0.00013422165130877857, + "loss": 0.90152502, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.7421875, + "step": 3994, + "time_per_iteration": 2.911731004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142658, + "balance_loss_mlp": 1.06827152, + "epoch": 0.7685648326279338, + "flos": 556338319872.0, + "grad_norm": 0.037345354137488074, + "language_loss": 0.84750074, + "learning_rate": 0.00013400931882065327, + "loss": 0.85892731, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.7421875, + "step": 3995, + "time_per_iteration": 2.6689093112945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142291, + "balance_loss_mlp": 1.06790483, + "epoch": 0.7687572143131974, + "flos": 688743783936.0, + "grad_norm": 0.03341807173983279, + "language_loss": 0.85686117, + "learning_rate": 0.0001337971284227118, + "loss": 0.86828411, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.7421875, + "step": 3996, + "time_per_iteration": 3.0353329181671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.07644653, + "epoch": 0.7689495959984609, + "flos": 1492665781248.0, + "grad_norm": 0.006288320283860005, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77266961, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.72265625, + "step": 3997, + "time_per_iteration": 4.911880731582642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144884, + "balance_loss_mlp": 1.07049692, + "epoch": 0.7691419776837245, + "flos": 571499667456.0, + "grad_norm": 0.031757425540639796, + "language_loss": 0.84642863, + "learning_rate": 0.0001333731742268438, + "loss": 0.85787749, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.7421875, + "step": 3998, + "time_per_iteration": 2.6962177753448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145361, + "balance_loss_mlp": 1.07097435, + "epoch": 0.7693343593689881, + "flos": 521190153216.0, + "grad_norm": 0.03369214696754818, + "language_loss": 0.89708233, + "learning_rate": 0.0001331614105935109, + "loss": 0.9085359, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.7421875, + "step": 3999, + "time_per_iteration": 2.6809701919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114508, + "balance_loss_mlp": 1.07074106, + "epoch": 0.7695267410542517, + "flos": 661551232512.0, + "grad_norm": 0.03371243854874441, + "language_loss": 0.88376063, + "learning_rate": 0.00013294978937954883, + "loss": 0.8952114, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.74169922, + "step": 4000, + "time_per_iteration": 2.867079973220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114193, + "balance_loss_mlp": 1.06754363, + "epoch": 0.7697191227395151, + "flos": 547858953216.0, + "grad_norm": 0.037308762350110276, + "language_loss": 0.89336216, + "learning_rate": 0.00013273831066711655, + "loss": 0.90478146, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.7421875, + "step": 4001, + "time_per_iteration": 2.5953049659729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141038, + "balance_loss_mlp": 1.06684196, + "epoch": 0.7699115044247787, + "flos": 541695994368.0, + "grad_norm": 0.03259494083798661, + "language_loss": 0.84480441, + "learning_rate": 0.00013252697453831747, + "loss": 0.85621476, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.74121094, + "step": 4002, + "time_per_iteration": 2.685664653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140964, + "balance_loss_mlp": 1.06686342, + "epoch": 0.7701038861100423, + "flos": 564142938624.0, + "grad_norm": 0.03879527633270508, + "language_loss": 0.87191802, + "learning_rate": 0.00013231578107519916, + "loss": 0.8833276, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.74072266, + "step": 4003, + "time_per_iteration": 2.8707611560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142721, + "balance_loss_mlp": 1.06843019, + "epoch": 0.7702962677953059, + "flos": 482733196800.0, + "grad_norm": 0.03964954780213044, + "language_loss": 0.87790287, + "learning_rate": 0.00013210473035975422, + "loss": 0.88933003, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.74169922, + "step": 4004, + "time_per_iteration": 2.577669143676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137199, + "balance_loss_mlp": 1.06266928, + "epoch": 0.7704886494805695, + "flos": 771805192704.0, + "grad_norm": 0.03541890764411222, + "language_loss": 0.90018678, + "learning_rate": 0.0001318938224739201, + "loss": 0.91155875, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.74365234, + "step": 4005, + "time_per_iteration": 3.054161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138331, + "balance_loss_mlp": 1.06384909, + "epoch": 0.770681031165833, + "flos": 602317189632.0, + "grad_norm": 0.032853196947195275, + "language_loss": 0.87994003, + "learning_rate": 0.00013168305749957843, + "loss": 0.89132333, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.74316406, + "step": 4006, + "time_per_iteration": 2.742284059524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139142, + "balance_loss_mlp": 1.06461227, + "epoch": 0.7708734128510966, + "flos": 497095544832.0, + "grad_norm": 0.034737097331234285, + "language_loss": 0.87459195, + "learning_rate": 0.00013147243551855532, + "loss": 0.88598335, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.74365234, + "step": 4007, + "time_per_iteration": 2.565561532974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138441, + "balance_loss_mlp": 1.06400645, + "epoch": 0.7710657945363601, + "flos": 568454117376.0, + "grad_norm": 0.028865688800901353, + "language_loss": 0.84292293, + "learning_rate": 0.00013126195661262148, + "loss": 0.85430735, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.74267578, + "step": 4008, + "time_per_iteration": 2.76387357711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143893, + "balance_loss_mlp": 1.06969726, + "epoch": 0.7712581762216237, + "flos": 605749504512.0, + "grad_norm": 0.03137791389810697, + "language_loss": 0.90203846, + "learning_rate": 0.00013105162086349216, + "loss": 0.91347742, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.74121094, + "step": 4009, + "time_per_iteration": 2.8172740936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144917, + "balance_loss_mlp": 1.07057822, + "epoch": 0.7714505579068872, + "flos": 531996661248.0, + "grad_norm": 0.03056437231076115, + "language_loss": 0.89419609, + "learning_rate": 0.00013084142835282687, + "loss": 0.90564525, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.74169922, + "step": 4010, + "time_per_iteration": 2.7165045738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150368, + "balance_loss_mlp": 1.07769775, + "epoch": 0.7716429395921508, + "flos": 1425380069376.0, + "grad_norm": 0.007418114590999428, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.81034732, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.7265625, + "step": 4011, + "time_per_iteration": 4.772608757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143697, + "balance_loss_mlp": 1.06978679, + "epoch": 0.7718353212774144, + "flos": 579586265088.0, + "grad_norm": 0.032910193378974356, + "language_loss": 0.94427228, + "learning_rate": 0.0001304214733732485, + "loss": 0.95570928, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.73925781, + "step": 4012, + "time_per_iteration": 2.789973258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143696, + "balance_loss_mlp": 1.06969118, + "epoch": 0.772027702962678, + "flos": 511772798976.0, + "grad_norm": 0.03524437980359451, + "language_loss": 0.87796986, + "learning_rate": 0.00013021171106737672, + "loss": 0.8894068, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.74023438, + "step": 4013, + "time_per_iteration": 2.71975040435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113924, + "balance_loss_mlp": 1.06499684, + "epoch": 0.7722200846479416, + "flos": 526747494912.0, + "grad_norm": 0.030121234112763372, + "language_loss": 0.84496903, + "learning_rate": 0.00013000209232605071, + "loss": 0.85636145, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.74121094, + "step": 4014, + "time_per_iteration": 2.6892056465148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139469, + "balance_loss_mlp": 1.06508267, + "epoch": 0.772412466333205, + "flos": 480601439232.0, + "grad_norm": 0.03460224041299985, + "language_loss": 0.83357382, + "learning_rate": 0.0001297926172306519, + "loss": 0.84496856, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.7421875, + "step": 4015, + "time_per_iteration": 2.6161460876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138641, + "balance_loss_mlp": 1.06449294, + "epoch": 0.7726048480184686, + "flos": 907312602624.0, + "grad_norm": 0.03829273799260643, + "language_loss": 0.83440059, + "learning_rate": 0.0001295832858625055, + "loss": 0.84578699, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.74023438, + "step": 4016, + "time_per_iteration": 3.286180019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137589, + "balance_loss_mlp": 1.06329787, + "epoch": 0.7727972297037322, + "flos": 632566024704.0, + "grad_norm": 0.037636726324715264, + "language_loss": 0.7551474, + "learning_rate": 0.00012937409830288154, + "loss": 0.7665233, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.74121094, + "step": 4017, + "time_per_iteration": 2.8370349407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142791, + "balance_loss_mlp": 1.0688808, + "epoch": 0.7729896113889958, + "flos": 415673068032.0, + "grad_norm": 0.038209347580389144, + "language_loss": 0.9001559, + "learning_rate": 0.00012916505463299362, + "loss": 0.91158378, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.73925781, + "step": 4018, + "time_per_iteration": 2.519319772720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141641, + "balance_loss_mlp": 1.06754065, + "epoch": 0.7731819930742593, + "flos": 670104459264.0, + "grad_norm": 0.03754903876157777, + "language_loss": 0.83159339, + "learning_rate": 0.00012895615493399972, + "loss": 0.84300983, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.74072266, + "step": 4019, + "time_per_iteration": 2.8084754943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136472, + "balance_loss_mlp": 1.06203771, + "epoch": 0.7733743747595229, + "flos": 490858725888.0, + "grad_norm": 0.052975326566308774, + "language_loss": 0.88814008, + "learning_rate": 0.00012874739928700192, + "loss": 0.89950484, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.74267578, + "step": 4020, + "time_per_iteration": 2.6240487098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113737, + "balance_loss_mlp": 1.06307888, + "epoch": 0.7735667564447865, + "flos": 660887218176.0, + "grad_norm": 0.04201046633060088, + "language_loss": 0.84696388, + "learning_rate": 0.00012853878777304624, + "loss": 0.85833752, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.74121094, + "step": 4021, + "time_per_iteration": 2.873288154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135595, + "balance_loss_mlp": 1.06120825, + "epoch": 0.77375913813005, + "flos": 534490988544.0, + "grad_norm": 0.02933243833596509, + "language_loss": 0.88221383, + "learning_rate": 0.000128330320473123, + "loss": 0.89356983, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.7421875, + "step": 4022, + "time_per_iteration": 2.6959497928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138573, + "balance_loss_mlp": 1.06590271, + "epoch": 0.7739515198153136, + "flos": 1523379244032.0, + "grad_norm": 0.005476553783658496, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79470468, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.7265625, + "step": 4023, + "time_per_iteration": 4.908393621444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136949, + "balance_loss_mlp": 1.06256282, + "epoch": 0.7741439015005771, + "flos": 641251508736.0, + "grad_norm": 0.0388161486580036, + "language_loss": 0.86722291, + "learning_rate": 0.0001279138188390543, + "loss": 0.87859237, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.7421875, + "step": 4024, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142263, + "balance_loss_mlp": 1.06835282, + "epoch": 0.7743362831858407, + "flos": 667023980544.0, + "grad_norm": 0.03451580070650428, + "language_loss": 0.90432525, + "learning_rate": 0.00012770578466660915, + "loss": 0.91574788, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.73925781, + "step": 4025, + "time_per_iteration": 2.862123489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142172, + "balance_loss_mlp": 1.06807196, + "epoch": 0.7745286648711043, + "flos": 563993217024.0, + "grad_norm": 0.03283033762939225, + "language_loss": 0.85806942, + "learning_rate": 0.0001274978950315968, + "loss": 0.86949122, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.74072266, + "step": 4026, + "time_per_iteration": 2.802757501602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137274, + "balance_loss_mlp": 1.06288695, + "epoch": 0.7747210465563679, + "flos": 517961954304.0, + "grad_norm": 0.042128094380904035, + "language_loss": 0.87673521, + "learning_rate": 0.00012729015001472716, + "loss": 0.88810796, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.7421875, + "step": 4027, + "time_per_iteration": 2.6692821979522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137346, + "balance_loss_mlp": 1.06295931, + "epoch": 0.7749134282416313, + "flos": 635368527360.0, + "grad_norm": 0.03931555017475162, + "language_loss": 0.86517704, + "learning_rate": 0.00012708254969665418, + "loss": 0.87655056, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.7421875, + "step": 4028, + "time_per_iteration": 2.7921457290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138509, + "balance_loss_mlp": 1.0641222, + "epoch": 0.7751058099268949, + "flos": 496350939648.0, + "grad_norm": 0.04579390573234304, + "language_loss": 0.889467, + "learning_rate": 0.00012687509415797526, + "loss": 0.90085208, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.7421875, + "step": 4029, + "time_per_iteration": 2.5587246417999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137293, + "balance_loss_mlp": 1.06304908, + "epoch": 0.7752981916121585, + "flos": 511362565632.0, + "grad_norm": 0.03685004486441248, + "language_loss": 0.85761744, + "learning_rate": 0.00012666778347923208, + "loss": 0.86899036, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.74072266, + "step": 4030, + "time_per_iteration": 2.6332554817199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143646, + "balance_loss_mlp": 1.06978357, + "epoch": 0.7754905732974221, + "flos": 498565289472.0, + "grad_norm": 0.03255854062300405, + "language_loss": 0.87846529, + "learning_rate": 0.0001264606177409092, + "loss": 0.88990176, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.73876953, + "step": 4031, + "time_per_iteration": 2.6323087215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139185, + "balance_loss_mlp": 1.06498873, + "epoch": 0.7756829549826857, + "flos": 481782474240.0, + "grad_norm": 0.03677638670321597, + "language_loss": 0.90051126, + "learning_rate": 0.00012625359702343609, + "loss": 0.91190314, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.74023438, + "step": 4032, + "time_per_iteration": 2.764946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136066, + "balance_loss_mlp": 1.06186974, + "epoch": 0.7758753366679492, + "flos": 553685538816.0, + "grad_norm": 0.03552074396287166, + "language_loss": 0.89551866, + "learning_rate": 0.00012604672140718504, + "loss": 0.90687937, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.74072266, + "step": 4033, + "time_per_iteration": 2.616276741027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136731, + "balance_loss_mlp": 1.06243956, + "epoch": 0.7760677183532128, + "flos": 705063972864.0, + "grad_norm": 0.03368756555440988, + "language_loss": 0.82777321, + "learning_rate": 0.00012583999097247233, + "loss": 0.83914053, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.74121094, + "step": 4034, + "time_per_iteration": 2.8126814365386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136753, + "balance_loss_mlp": 1.06255746, + "epoch": 0.7762601000384763, + "flos": 524478750720.0, + "grad_norm": 0.036921944541312396, + "language_loss": 0.85384995, + "learning_rate": 0.0001256334057995578, + "loss": 0.86521751, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.74072266, + "step": 4035, + "time_per_iteration": 2.6846728324890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138277, + "balance_loss_mlp": 1.0641768, + "epoch": 0.7764524817237399, + "flos": 558617797632.0, + "grad_norm": 0.033254007354158545, + "language_loss": 0.89694679, + "learning_rate": 0.000125426965968645, + "loss": 0.90832961, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.73974609, + "step": 4036, + "time_per_iteration": 2.747835636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144007, + "balance_loss_mlp": 1.07009733, + "epoch": 0.7766448634090035, + "flos": 580816965120.0, + "grad_norm": 0.036524717116784906, + "language_loss": 0.87124515, + "learning_rate": 0.00012522067155988092, + "loss": 0.88268518, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.73925781, + "step": 4037, + "time_per_iteration": 2.7287211418151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011441, + "balance_loss_mlp": 1.07028544, + "epoch": 0.776837245094267, + "flos": 636818806272.0, + "grad_norm": 0.04076227552668926, + "language_loss": 0.80187047, + "learning_rate": 0.00012501452265335617, + "loss": 0.81331146, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.73828125, + "step": 4038, + "time_per_iteration": 2.811866283416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138752, + "balance_loss_mlp": 1.06455588, + "epoch": 0.7770296267795306, + "flos": 615813408768.0, + "grad_norm": 0.0355390445236554, + "language_loss": 0.87746716, + "learning_rate": 0.0001248085193291047, + "loss": 0.88885468, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.74023438, + "step": 4039, + "time_per_iteration": 2.734161853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137901, + "balance_loss_mlp": 1.06380022, + "epoch": 0.7772220084647942, + "flos": 880295969280.0, + "grad_norm": 0.030150697576870535, + "language_loss": 0.86369264, + "learning_rate": 0.00012460266166710443, + "loss": 0.87507164, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.73974609, + "step": 4040, + "time_per_iteration": 3.137223243713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146543, + "balance_loss_mlp": 1.07215619, + "epoch": 0.7774143901500578, + "flos": 841038011904.0, + "grad_norm": 0.03809465045400834, + "language_loss": 0.82413107, + "learning_rate": 0.00012439694974727633, + "loss": 0.8355965, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.7421875, + "step": 4041, + "time_per_iteration": 3.0596840381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146741, + "balance_loss_mlp": 1.07225895, + "epoch": 0.7776067718353212, + "flos": 569228921856.0, + "grad_norm": 0.03500635055952716, + "language_loss": 0.84672141, + "learning_rate": 0.00012419138364948458, + "loss": 0.85818887, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.74316406, + "step": 4042, + "time_per_iteration": 2.697154998779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153394, + "balance_loss_mlp": 1.07919836, + "epoch": 0.7777991535205848, + "flos": 747209026560.0, + "grad_norm": 0.038117976475530245, + "language_loss": 0.87011731, + "learning_rate": 0.00012398596345353702, + "loss": 0.88165122, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.74072266, + "step": 4043, + "time_per_iteration": 2.903593063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145251, + "balance_loss_mlp": 1.07086432, + "epoch": 0.7779915352058484, + "flos": 539182201344.0, + "grad_norm": 0.034270473867383876, + "language_loss": 0.87845659, + "learning_rate": 0.0001237806892391851, + "loss": 0.88990903, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.7421875, + "step": 4044, + "time_per_iteration": 2.713480234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145012, + "balance_loss_mlp": 1.0706259, + "epoch": 0.778183916891112, + "flos": 635954678784.0, + "grad_norm": 0.03512178084580865, + "language_loss": 0.85495478, + "learning_rate": 0.0001235755610861233, + "loss": 0.86640489, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.7421875, + "step": 4045, + "time_per_iteration": 2.732534170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141546, + "balance_loss_mlp": 1.06711173, + "epoch": 0.7783762985763756, + "flos": 589789157376.0, + "grad_norm": 0.036702613640591464, + "language_loss": 0.89351201, + "learning_rate": 0.0001233705790739893, + "loss": 0.90492749, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.74267578, + "step": 4046, + "time_per_iteration": 2.7078564167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139744, + "balance_loss_mlp": 1.06535733, + "epoch": 0.7785686802616391, + "flos": 932240412672.0, + "grad_norm": 0.03647485158303252, + "language_loss": 0.79245514, + "learning_rate": 0.0001231657432823643, + "loss": 0.80385262, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.7421875, + "step": 4047, + "time_per_iteration": 3.204200029373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114146, + "balance_loss_mlp": 1.06707299, + "epoch": 0.7787610619469026, + "flos": 498956057088.0, + "grad_norm": 0.04086385671919431, + "language_loss": 0.84949565, + "learning_rate": 0.0001229610537907725, + "loss": 0.86091024, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.7421875, + "step": 4048, + "time_per_iteration": 2.587411403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139179, + "balance_loss_mlp": 1.06483984, + "epoch": 0.7789534436321662, + "flos": 516650663424.0, + "grad_norm": 0.0370984959952915, + "language_loss": 0.95913208, + "learning_rate": 0.00012275651067868143, + "loss": 0.97052377, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.74169922, + "step": 4049, + "time_per_iteration": 2.6297829151153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145054, + "balance_loss_mlp": 1.07095397, + "epoch": 0.7791458253174298, + "flos": 990061106688.0, + "grad_norm": 0.049766868205719794, + "language_loss": 0.84448528, + "learning_rate": 0.00012255211402550182, + "loss": 0.85593581, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.74072266, + "step": 4050, + "time_per_iteration": 3.2185845375061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138393, + "balance_loss_mlp": 1.06400621, + "epoch": 0.7793382070026933, + "flos": 630184488960.0, + "grad_norm": 0.041629514228615855, + "language_loss": 0.82138163, + "learning_rate": 0.00012234786391058727, + "loss": 0.83276558, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.7421875, + "step": 4051, + "time_per_iteration": 2.7984745502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114015, + "balance_loss_mlp": 1.06590664, + "epoch": 0.7795305886879569, + "flos": 532762733568.0, + "grad_norm": 0.042901247751836985, + "language_loss": 0.90027404, + "learning_rate": 0.0001221437604132352, + "loss": 0.91167557, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.74072266, + "step": 4052, + "time_per_iteration": 2.6062204837799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139339, + "balance_loss_mlp": 1.06490481, + "epoch": 0.7797229703732205, + "flos": 613141161984.0, + "grad_norm": 0.0426206226565264, + "language_loss": 0.86529624, + "learning_rate": 0.0001219398036126852, + "loss": 0.87668967, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.74267578, + "step": 4053, + "time_per_iteration": 2.7453675270080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137791, + "balance_loss_mlp": 1.06340408, + "epoch": 0.7799153520584841, + "flos": 873794635776.0, + "grad_norm": 0.03320369943222444, + "language_loss": 0.82415718, + "learning_rate": 0.00012173599358812027, + "loss": 0.83553505, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.7421875, + "step": 4054, + "time_per_iteration": 3.2739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137317, + "balance_loss_mlp": 1.06293011, + "epoch": 0.7801077337437476, + "flos": 584744107008.0, + "grad_norm": 0.03804124847596099, + "language_loss": 0.87714571, + "learning_rate": 0.0001215323304186668, + "loss": 0.88851887, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.7421875, + "step": 4055, + "time_per_iteration": 2.7659378051757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_mlp": 1.06319451, + "epoch": 0.7803001154290111, + "flos": 602280259584.0, + "grad_norm": 0.03158827116137511, + "language_loss": 0.91988087, + "learning_rate": 0.00012132881418339364, + "loss": 0.93125427, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.74072266, + "step": 4056, + "time_per_iteration": 2.7168469429016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114492, + "balance_loss_mlp": 1.07263184, + "epoch": 0.7804924971142747, + "flos": 1482925515264.0, + "grad_norm": 0.005095674237873183, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78662485, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.72460938, + "step": 4057, + "time_per_iteration": 4.8585734367370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113894, + "balance_loss_mlp": 1.06460154, + "epoch": 0.7806848787995383, + "flos": 631515245568.0, + "grad_norm": 0.03359665860494396, + "language_loss": 0.81806797, + "learning_rate": 0.00012092222283137944, + "loss": 0.8294574, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.74169922, + "step": 4058, + "time_per_iteration": 2.757882595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115152, + "balance_loss_mlp": 1.079422, + "epoch": 0.7808772604848019, + "flos": 1420745252352.0, + "grad_norm": 0.008112478231263178, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.8005783, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.72265625, + "step": 4059, + "time_per_iteration": 4.779797315597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011384, + "balance_loss_mlp": 1.06406116, + "epoch": 0.7810696421700654, + "flos": 733103187456.0, + "grad_norm": 0.03176373649090862, + "language_loss": 0.88107026, + "learning_rate": 0.00012051622016348856, + "loss": 0.89245427, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.74169922, + "step": 4060, + "time_per_iteration": 3.0269150733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138414, + "balance_loss_mlp": 1.06412303, + "epoch": 0.781262023855329, + "flos": 425837028864.0, + "grad_norm": 0.038145388321841694, + "language_loss": 0.90811419, + "learning_rate": 0.00012031343978315539, + "loss": 0.91949832, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.74121094, + "step": 4061, + "time_per_iteration": 2.459432363510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136363, + "balance_loss_mlp": 1.06197631, + "epoch": 0.7814544055405925, + "flos": 502073465856.0, + "grad_norm": 0.03753829813607959, + "language_loss": 0.87161046, + "learning_rate": 0.00012011080681021774, + "loss": 0.88297415, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.7421875, + "step": 4062, + "time_per_iteration": 2.691654920578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136384, + "balance_loss_mlp": 1.06204486, + "epoch": 0.7816467872258561, + "flos": 463392927744.0, + "grad_norm": 0.03545714253981061, + "language_loss": 0.90689021, + "learning_rate": 0.00011990832132334512, + "loss": 0.91825402, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.74169922, + "step": 4063, + "time_per_iteration": 2.501356363296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011365, + "balance_loss_mlp": 1.06211364, + "epoch": 0.7818391689111197, + "flos": 742107580416.0, + "grad_norm": 0.03646375779692072, + "language_loss": 0.8761006, + "learning_rate": 0.00011970598340114897, + "loss": 0.8874656, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.7421875, + "step": 4064, + "time_per_iteration": 2.9211695194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138234, + "balance_loss_mlp": 1.06389523, + "epoch": 0.7820315505963832, + "flos": 548805672960.0, + "grad_norm": 0.037373767627345386, + "language_loss": 0.88286138, + "learning_rate": 0.00011950379312218396, + "loss": 0.89424372, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.74169922, + "step": 4065, + "time_per_iteration": 2.7662761211395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139451, + "balance_loss_mlp": 1.06511247, + "epoch": 0.7822239322816468, + "flos": 730259025408.0, + "grad_norm": 0.031688812892368586, + "language_loss": 0.90089023, + "learning_rate": 0.00011930175056494719, + "loss": 0.91228467, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.74169922, + "step": 4066, + "time_per_iteration": 2.8510522842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145428, + "balance_loss_mlp": 1.07137561, + "epoch": 0.7824163139669104, + "flos": 452985919488.0, + "grad_norm": 0.030648314991386538, + "language_loss": 0.79762566, + "learning_rate": 0.00011909985580787885, + "loss": 0.80907995, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.73974609, + "step": 4067, + "time_per_iteration": 2.6272332668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144706, + "balance_loss_mlp": 1.07074893, + "epoch": 0.782608695652174, + "flos": 541620132864.0, + "grad_norm": 0.030654260562385374, + "language_loss": 0.85639668, + "learning_rate": 0.00011889810892936137, + "loss": 0.86784375, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.73974609, + "step": 4068, + "time_per_iteration": 2.7750964164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114329, + "balance_loss_mlp": 1.06899869, + "epoch": 0.7828010773374374, + "flos": 501428917248.0, + "grad_norm": 0.03582388212815207, + "language_loss": 0.82907784, + "learning_rate": 0.00011869651000771959, + "loss": 0.84051073, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.74169922, + "step": 4069, + "time_per_iteration": 2.8643925189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138233, + "balance_loss_mlp": 1.06389439, + "epoch": 0.782993459022701, + "flos": 601917689856.0, + "grad_norm": 0.03429166344261292, + "language_loss": 0.87759733, + "learning_rate": 0.00011849505912122117, + "loss": 0.88897967, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.74169922, + "step": 4070, + "time_per_iteration": 2.6959619522094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138061, + "balance_loss_mlp": 1.06377029, + "epoch": 0.7831858407079646, + "flos": 811475384832.0, + "grad_norm": 0.039746496548432604, + "language_loss": 0.82642615, + "learning_rate": 0.00011829375634807654, + "loss": 0.8378067, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.74121094, + "step": 4071, + "time_per_iteration": 3.0114569664001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136715, + "balance_loss_mlp": 1.06247175, + "epoch": 0.7833782223932282, + "flos": 808012870656.0, + "grad_norm": 0.03273964905208881, + "language_loss": 0.857427, + "learning_rate": 0.00011809260176643821, + "loss": 0.86879414, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.74121094, + "step": 4072, + "time_per_iteration": 3.0994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06206274, + "epoch": 0.7835706040784918, + "flos": 521899829760.0, + "grad_norm": 0.04024817722432492, + "language_loss": 0.88959461, + "learning_rate": 0.00011789159545440131, + "loss": 0.9009558, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.74023438, + "step": 4073, + "time_per_iteration": 2.644077777862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138181, + "balance_loss_mlp": 1.06398499, + "epoch": 0.7837629857637552, + "flos": 506743211520.0, + "grad_norm": 0.03009333087268268, + "language_loss": 0.86380607, + "learning_rate": 0.00011769073749000348, + "loss": 0.87518787, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.74023438, + "step": 4074, + "time_per_iteration": 2.7675211429595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138086, + "balance_loss_mlp": 1.06384242, + "epoch": 0.7839553674490188, + "flos": 517134756864.0, + "grad_norm": 0.03603773685865746, + "language_loss": 0.81149113, + "learning_rate": 0.0001174900279512246, + "loss": 0.82287204, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.74072266, + "step": 4075, + "time_per_iteration": 2.559067964553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138281, + "balance_loss_mlp": 1.06418085, + "epoch": 0.7841477491342824, + "flos": 507650273280.0, + "grad_norm": 0.04900023922641464, + "language_loss": 0.86111671, + "learning_rate": 0.00011728946691598707, + "loss": 0.87249947, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.73974609, + "step": 4076, + "time_per_iteration": 2.601316213607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139089, + "balance_loss_mlp": 1.06498837, + "epoch": 0.784340130819546, + "flos": 720904797696.0, + "grad_norm": 0.037946042945582265, + "language_loss": 0.81358349, + "learning_rate": 0.00011708905446215561, + "loss": 0.82497436, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.73974609, + "step": 4077, + "time_per_iteration": 2.8491528034210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138777, + "balance_loss_mlp": 1.06477141, + "epoch": 0.7845325125048095, + "flos": 515513289216.0, + "grad_norm": 0.03152801605769719, + "language_loss": 0.84297472, + "learning_rate": 0.00011688879066753711, + "loss": 0.85436249, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.73925781, + "step": 4078, + "time_per_iteration": 2.649890184402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139298, + "balance_loss_mlp": 1.06529319, + "epoch": 0.7847248941900731, + "flos": 467050825728.0, + "grad_norm": 0.04544253460314975, + "language_loss": 0.92901659, + "learning_rate": 0.00011668867560988122, + "loss": 0.9404096, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.73925781, + "step": 4079, + "time_per_iteration": 2.583395004272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137742, + "balance_loss_mlp": 1.06383276, + "epoch": 0.7849172758753367, + "flos": 504083699712.0, + "grad_norm": 0.03256844135977144, + "language_loss": 0.89159727, + "learning_rate": 0.00011648870936687916, + "loss": 0.90297467, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.73876953, + "step": 4080, + "time_per_iteration": 2.729670524597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137567, + "balance_loss_mlp": 1.06375289, + "epoch": 0.7851096575606002, + "flos": 533031977472.0, + "grad_norm": 0.038157171447079044, + "language_loss": 0.83702409, + "learning_rate": 0.00011628889201616461, + "loss": 0.84839982, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.73828125, + "step": 4081, + "time_per_iteration": 2.6109676361083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139939, + "balance_loss_mlp": 1.06602943, + "epoch": 0.7853020392458638, + "flos": 571043771904.0, + "grad_norm": 0.03751217922846888, + "language_loss": 0.86986727, + "learning_rate": 0.00011608922363531393, + "loss": 0.88126665, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.73876953, + "step": 4082, + "time_per_iteration": 2.6544032096862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.06686151, + "epoch": 0.7854944209311273, + "flos": 833991459840.0, + "grad_norm": 0.051644606704595315, + "language_loss": 0.88386512, + "learning_rate": 0.00011588970430184504, + "loss": 0.8952738, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.73925781, + "step": 4083, + "time_per_iteration": 3.0330986976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137232, + "balance_loss_mlp": 1.06332254, + "epoch": 0.7856868026163909, + "flos": 561010066944.0, + "grad_norm": 0.028770858152958077, + "language_loss": 0.85727829, + "learning_rate": 0.00011569033409321822, + "loss": 0.86865062, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.73876953, + "step": 4084, + "time_per_iteration": 2.678072452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.0635246, + "epoch": 0.7858791843016545, + "flos": 546267684864.0, + "grad_norm": 0.036494926225622726, + "language_loss": 0.77694023, + "learning_rate": 0.00011549111308683591, + "loss": 0.78831363, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.73828125, + "step": 4085, + "time_per_iteration": 2.67767596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137399, + "balance_loss_mlp": 1.06339443, + "epoch": 0.7860715659869181, + "flos": 381840195072.0, + "grad_norm": 0.03798884187272388, + "language_loss": 0.86288953, + "learning_rate": 0.00011529204136004251, + "loss": 0.87426353, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.73925781, + "step": 4086, + "time_per_iteration": 2.533773422241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143071, + "balance_loss_mlp": 1.068923, + "epoch": 0.7862639476721817, + "flos": 568512514560.0, + "grad_norm": 0.030679232207270264, + "language_loss": 0.87964737, + "learning_rate": 0.00011509311899012459, + "loss": 0.89107811, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.73974609, + "step": 4087, + "time_per_iteration": 2.76526141166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143134, + "balance_loss_mlp": 1.06903315, + "epoch": 0.7864563293574451, + "flos": 546322079232.0, + "grad_norm": 0.04187466244210811, + "language_loss": 0.83333945, + "learning_rate": 0.00011489434605431053, + "loss": 0.84477079, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.73925781, + "step": 4088, + "time_per_iteration": 2.6215317249298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113809, + "balance_loss_mlp": 1.06408453, + "epoch": 0.7866487110427087, + "flos": 564648499200.0, + "grad_norm": 0.03663955414764931, + "language_loss": 0.861283, + "learning_rate": 0.0001146957226297708, + "loss": 0.87266392, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.73925781, + "step": 4089, + "time_per_iteration": 2.673021078109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137912, + "balance_loss_mlp": 1.06381154, + "epoch": 0.7868410927279723, + "flos": 729558081024.0, + "grad_norm": 0.03607616248061006, + "language_loss": 0.80388957, + "learning_rate": 0.00011449724879361827, + "loss": 0.8152687, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.73974609, + "step": 4090, + "time_per_iteration": 2.9554953575134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138064, + "balance_loss_mlp": 1.06410635, + "epoch": 0.7870334744132359, + "flos": 522447049728.0, + "grad_norm": 0.04384771027998422, + "language_loss": 0.79606628, + "learning_rate": 0.00011429892462290687, + "loss": 0.80744684, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.73925781, + "step": 4091, + "time_per_iteration": 2.663344383239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137849, + "balance_loss_mlp": 1.06360543, + "epoch": 0.7872258560984994, + "flos": 452362838016.0, + "grad_norm": 0.03444063676499776, + "language_loss": 0.88160485, + "learning_rate": 0.00011410075019463295, + "loss": 0.89298332, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.74072266, + "step": 4092, + "time_per_iteration": 2.6327311992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137383, + "balance_loss_mlp": 1.06323516, + "epoch": 0.787418237783763, + "flos": 516249162240.0, + "grad_norm": 0.03476027857253962, + "language_loss": 0.84398365, + "learning_rate": 0.00011390272558573461, + "loss": 0.85535741, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.74023438, + "step": 4093, + "time_per_iteration": 2.675528049468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137221, + "balance_loss_mlp": 1.06316793, + "epoch": 0.7876106194690266, + "flos": 486056722944.0, + "grad_norm": 0.030632947109506273, + "language_loss": 0.84047627, + "learning_rate": 0.00011370485087309202, + "loss": 0.85184848, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.73974609, + "step": 4094, + "time_per_iteration": 2.6260645389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138901, + "balance_loss_mlp": 1.06465769, + "epoch": 0.7878030011542901, + "flos": 543929809920.0, + "grad_norm": 0.0372748045886788, + "language_loss": 0.83189571, + "learning_rate": 0.00011350712613352688, + "loss": 0.84328461, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.74072266, + "step": 4095, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138645, + "balance_loss_mlp": 1.06440127, + "epoch": 0.7879953828395537, + "flos": 517749106176.0, + "grad_norm": 0.04715116302825024, + "language_loss": 0.85976934, + "learning_rate": 0.00011330955144380283, + "loss": 0.87115586, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.74072266, + "step": 4096, + "time_per_iteration": 2.599391222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138884, + "balance_loss_mlp": 1.06464028, + "epoch": 0.7881877645248172, + "flos": 583376420352.0, + "grad_norm": 0.03608757830250762, + "language_loss": 0.90583527, + "learning_rate": 0.00011311212688062483, + "loss": 0.91722411, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.74072266, + "step": 4097, + "time_per_iteration": 2.7737503051757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141606, + "balance_loss_mlp": 1.06741059, + "epoch": 0.7883801462100808, + "flos": 590327645184.0, + "grad_norm": 0.09861102268280594, + "language_loss": 0.83454096, + "learning_rate": 0.0001129148525206402, + "loss": 0.84595704, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.74023438, + "step": 4098, + "time_per_iteration": 2.8053319454193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114196, + "balance_loss_mlp": 1.06766832, + "epoch": 0.7885725278953444, + "flos": 482741928960.0, + "grad_norm": 0.039263204911434944, + "language_loss": 0.9157722, + "learning_rate": 0.00011271772844043759, + "loss": 0.92719185, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.74121094, + "step": 4099, + "time_per_iteration": 2.6722400188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.06440175, + "epoch": 0.788764909580608, + "flos": 758098126848.0, + "grad_norm": 0.0423984319236596, + "language_loss": 0.81897676, + "learning_rate": 0.00011252075471654727, + "loss": 0.83036232, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.74023438, + "step": 4100, + "time_per_iteration": 2.941443920135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138656, + "balance_loss_mlp": 1.0645076, + "epoch": 0.7889572912658714, + "flos": 703878935040.0, + "grad_norm": 0.03307179261397765, + "language_loss": 0.82702905, + "learning_rate": 0.00011232393142544133, + "loss": 0.83841556, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.74023438, + "step": 4101, + "time_per_iteration": 2.9557137489318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138555, + "balance_loss_mlp": 1.06435919, + "epoch": 0.789149672951135, + "flos": 737840062464.0, + "grad_norm": 0.034454067220804824, + "language_loss": 0.87124509, + "learning_rate": 0.00011212725864353323, + "loss": 0.88263059, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.74023438, + "step": 4102, + "time_per_iteration": 3.0640292167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145164, + "balance_loss_mlp": 1.07287598, + "epoch": 0.7893420546363986, + "flos": 1484487859200.0, + "grad_norm": 0.005768368046383886, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77481097, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.72460938, + "step": 4103, + "time_per_iteration": 4.858243227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140406, + "balance_loss_mlp": 1.06620967, + "epoch": 0.7895344363216622, + "flos": 510079472640.0, + "grad_norm": 0.047695984740599745, + "language_loss": 0.81464952, + "learning_rate": 0.00011173436491267291, + "loss": 0.82605356, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.74023438, + "step": 4104, + "time_per_iteration": 2.6253249645233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137981, + "balance_loss_mlp": 1.06378555, + "epoch": 0.7897268180069258, + "flos": 543037484544.0, + "grad_norm": 0.03504267179198509, + "language_loss": 0.86698043, + "learning_rate": 0.0001115381441162554, + "loss": 0.87836027, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.74023438, + "step": 4105, + "time_per_iteration": 2.644268274307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143089, + "balance_loss_mlp": 1.07080078, + "epoch": 0.7899191996921893, + "flos": 1415749867008.0, + "grad_norm": 0.006312961233255799, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.7472682, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.72460938, + "step": 4106, + "time_per_iteration": 4.874951601028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_mlp": 1.06486893, + "epoch": 0.7901115813774529, + "flos": 624021530112.0, + "grad_norm": 0.035685278807963586, + "language_loss": 0.89252567, + "learning_rate": 0.00011114615504234465, + "loss": 0.90391827, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.7421875, + "step": 4107, + "time_per_iteration": 2.759730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139038, + "balance_loss_mlp": 1.06488955, + "epoch": 0.7903039630627164, + "flos": 646804847616.0, + "grad_norm": 0.03564605308593673, + "language_loss": 0.86189628, + "learning_rate": 0.00011095038691703468, + "loss": 0.87328672, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.74023438, + "step": 4108, + "time_per_iteration": 2.8478689193725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141249, + "balance_loss_mlp": 1.0670532, + "epoch": 0.79049634474798, + "flos": 595611740160.0, + "grad_norm": 0.03583745426638565, + "language_loss": 0.86790907, + "learning_rate": 0.00011075476983417998, + "loss": 0.87932158, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.74072266, + "step": 4109, + "time_per_iteration": 2.8335795402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139823, + "balance_loss_mlp": 1.0655793, + "epoch": 0.7906887264332435, + "flos": 717331493376.0, + "grad_norm": 0.038905447121572734, + "language_loss": 0.82716894, + "learning_rate": 0.00011055930386972579, + "loss": 0.83856714, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.74072266, + "step": 4110, + "time_per_iteration": 2.871617555618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06271601, + "epoch": 0.7908811081185071, + "flos": 791260254720.0, + "grad_norm": 0.03420948770513602, + "language_loss": 0.82615238, + "learning_rate": 0.00011036398909955863, + "loss": 0.8375206, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.74023438, + "step": 4111, + "time_per_iteration": 3.035374402999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137149, + "balance_loss_mlp": 1.06304824, + "epoch": 0.7910734898037707, + "flos": 643075090944.0, + "grad_norm": 0.03464769838403225, + "language_loss": 0.85694349, + "learning_rate": 0.00011016882559950648, + "loss": 0.86831492, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.73974609, + "step": 4112, + "time_per_iteration": 2.809424877166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136751, + "balance_loss_mlp": 1.06284177, + "epoch": 0.7912658714890343, + "flos": 670560354816.0, + "grad_norm": 0.03852457437308278, + "language_loss": 0.85799241, + "learning_rate": 0.00010997381344533853, + "loss": 0.86935997, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.73876953, + "step": 4113, + "time_per_iteration": 2.7723140716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139368, + "balance_loss_mlp": 1.06512499, + "epoch": 0.7914582531742979, + "flos": 558887041536.0, + "grad_norm": 0.03351504494890856, + "language_loss": 0.84678841, + "learning_rate": 0.00010977895271276517, + "loss": 0.85818207, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.74072266, + "step": 4114, + "time_per_iteration": 2.6767303943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138954, + "balance_loss_mlp": 1.06494868, + "epoch": 0.7916506348595613, + "flos": 571191492096.0, + "grad_norm": 0.04313250317632895, + "language_loss": 0.84584868, + "learning_rate": 0.00010958424347743807, + "loss": 0.85723823, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.73925781, + "step": 4115, + "time_per_iteration": 2.7286806106567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136476, + "balance_loss_mlp": 1.06266189, + "epoch": 0.7918430165448249, + "flos": 719645899776.0, + "grad_norm": 0.03512595532684894, + "language_loss": 0.8494817, + "learning_rate": 0.00010938968581494991, + "loss": 0.8608464, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.73828125, + "step": 4116, + "time_per_iteration": 2.9482476711273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.06277156, + "epoch": 0.7920353982300885, + "flos": 554736317952.0, + "grad_norm": 0.04228851157339113, + "language_loss": 0.83485335, + "learning_rate": 0.000109195279800835, + "loss": 0.84622014, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.73876953, + "step": 4117, + "time_per_iteration": 2.69572114944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139513, + "balance_loss_mlp": 1.06555605, + "epoch": 0.7922277799153521, + "flos": 811540512768.0, + "grad_norm": 0.03903964409517225, + "language_loss": 0.81738925, + "learning_rate": 0.00010900102551056834, + "loss": 0.82878435, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.73876953, + "step": 4118, + "time_per_iteration": 3.021683692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139717, + "balance_loss_mlp": 1.06580722, + "epoch": 0.7924201616006156, + "flos": 422244258816.0, + "grad_norm": 0.03704274036887823, + "language_loss": 0.89204621, + "learning_rate": 0.00010880692301956601, + "loss": 0.90344346, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.73876953, + "step": 4119, + "time_per_iteration": 2.509284019470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.06238043, + "epoch": 0.7926125432858792, + "flos": 619104734208.0, + "grad_norm": 0.032195482380303, + "language_loss": 0.90015543, + "learning_rate": 0.00010861297240318518, + "loss": 0.91151732, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.73828125, + "step": 4120, + "time_per_iteration": 2.835418939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136735, + "balance_loss_mlp": 1.0630163, + "epoch": 0.7928049249711427, + "flos": 603611016192.0, + "grad_norm": 0.031028055346739136, + "language_loss": 0.90660435, + "learning_rate": 0.00010841917373672444, + "loss": 0.91797173, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.73730469, + "step": 4121, + "time_per_iteration": 2.7115211486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136879, + "balance_loss_mlp": 1.06306481, + "epoch": 0.7929973066564063, + "flos": 657231321600.0, + "grad_norm": 0.03886819591939463, + "language_loss": 0.83054501, + "learning_rate": 0.00010822552709542293, + "loss": 0.84191382, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.73828125, + "step": 4122, + "time_per_iteration": 2.811147928237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137962, + "balance_loss_mlp": 1.0642904, + "epoch": 0.7931896883416699, + "flos": 537434480640.0, + "grad_norm": 0.03139044095393014, + "language_loss": 0.90324616, + "learning_rate": 0.0001080320325544612, + "loss": 0.91462576, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.73681641, + "step": 4123, + "time_per_iteration": 2.6880621910095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.06381249, + "epoch": 0.7933820700269334, + "flos": 499068848640.0, + "grad_norm": 0.03512735769346207, + "language_loss": 0.87548339, + "learning_rate": 0.00010783869018895997, + "loss": 0.8868587, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.73730469, + "step": 4124, + "time_per_iteration": 2.6342406272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138023, + "balance_loss_mlp": 1.06425595, + "epoch": 0.793574451712197, + "flos": 538495993344.0, + "grad_norm": 0.03751622303181437, + "language_loss": 0.88749498, + "learning_rate": 0.00010764550007398189, + "loss": 0.89887518, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.73779297, + "step": 4125, + "time_per_iteration": 2.6272289752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.0640254, + "epoch": 0.7937668333974606, + "flos": 489258725376.0, + "grad_norm": 0.034933857523794375, + "language_loss": 0.85822791, + "learning_rate": 0.00010745246228452982, + "loss": 0.86960542, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.73730469, + "step": 4126, + "time_per_iteration": 2.5639169216156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.06358075, + "epoch": 0.7939592150827242, + "flos": 528479752704.0, + "grad_norm": 0.034679171376522114, + "language_loss": 0.86079615, + "learning_rate": 0.00010725957689554771, + "loss": 0.87216961, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.73779297, + "step": 4127, + "time_per_iteration": 2.7611310482025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137731, + "balance_loss_mlp": 1.06391644, + "epoch": 0.7941515967679876, + "flos": 542803169280.0, + "grad_norm": 0.03824880137917062, + "language_loss": 0.88766754, + "learning_rate": 0.00010706684398192013, + "loss": 0.89904475, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.73828125, + "step": 4128, + "time_per_iteration": 2.7266509532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138133, + "balance_loss_mlp": 1.06436622, + "epoch": 0.7943439784532512, + "flos": 519523023360.0, + "grad_norm": 0.040169030809423835, + "language_loss": 0.87296367, + "learning_rate": 0.00010687426361847313, + "loss": 0.88434494, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.73779297, + "step": 4129, + "time_per_iteration": 2.7299461364746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137822, + "balance_loss_mlp": 1.06405497, + "epoch": 0.7945363601385148, + "flos": 510060006912.0, + "grad_norm": 0.03365010231466857, + "language_loss": 0.9038803, + "learning_rate": 0.00010668183587997254, + "loss": 0.91525853, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.73779297, + "step": 4130, + "time_per_iteration": 2.5838053226470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137059, + "balance_loss_mlp": 1.06343496, + "epoch": 0.7947287418237784, + "flos": 652401120768.0, + "grad_norm": 0.02856230138733652, + "language_loss": 0.8155334, + "learning_rate": 0.0001064895608411256, + "loss": 0.826904, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.73632812, + "step": 4131, + "time_per_iteration": 2.855571746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140645, + "balance_loss_mlp": 1.0668304, + "epoch": 0.794921123509042, + "flos": 697372872192.0, + "grad_norm": 0.03566888341568189, + "language_loss": 0.84410554, + "learning_rate": 0.00010629743857657998, + "loss": 0.85551202, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.73828125, + "step": 4132, + "time_per_iteration": 2.8950796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149963, + "balance_loss_mlp": 1.07805634, + "epoch": 0.7951135051943055, + "flos": 1406076730368.0, + "grad_norm": 0.009945360443955307, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71748632, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.72070312, + "step": 4133, + "time_per_iteration": 4.6428234577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137714, + "balance_loss_mlp": 1.06399536, + "epoch": 0.795305886879569, + "flos": 811449188352.0, + "grad_norm": 0.03756536523282242, + "language_loss": 0.86775541, + "learning_rate": 0.00010591365266868802, + "loss": 0.87913251, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.73730469, + "step": 4134, + "time_per_iteration": 2.9570915699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143425, + "balance_loss_mlp": 1.07132721, + "epoch": 0.7954982685648326, + "flos": 1429213885440.0, + "grad_norm": 0.0062941693525409926, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76655209, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.72265625, + "step": 4135, + "time_per_iteration": 4.914888143539429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_mlp": 1.06404912, + "epoch": 0.7956906502500962, + "flos": 390747259392.0, + "grad_norm": 0.0392560850681974, + "language_loss": 0.85252422, + "learning_rate": 0.00010553047875229166, + "loss": 0.86390382, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.73876953, + "step": 4136, + "time_per_iteration": 2.5757832527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137394, + "balance_loss_mlp": 1.06362712, + "epoch": 0.7958830319353598, + "flos": 516585535488.0, + "grad_norm": 0.03073809129555248, + "language_loss": 0.8796097, + "learning_rate": 0.00010533912147689328, + "loss": 0.89098364, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.73779297, + "step": 4137, + "time_per_iteration": 2.6300714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137078, + "balance_loss_mlp": 1.06335866, + "epoch": 0.7960754136206233, + "flos": 494926857216.0, + "grad_norm": 0.033442699276882225, + "language_loss": 0.87293124, + "learning_rate": 0.00010514791742243656, + "loss": 0.88430202, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.73730469, + "step": 4138, + "time_per_iteration": 2.5906717777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136999, + "balance_loss_mlp": 1.06323278, + "epoch": 0.7962677953058869, + "flos": 657005738496.0, + "grad_norm": 0.03903943901806541, + "language_loss": 0.87440938, + "learning_rate": 0.00010495686666315341, + "loss": 0.88577938, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.73779297, + "step": 4139, + "time_per_iteration": 2.909572124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113797, + "balance_loss_mlp": 1.06401289, + "epoch": 0.7964601769911505, + "flos": 543419520000.0, + "grad_norm": 0.08585465629101555, + "language_loss": 0.81986225, + "learning_rate": 0.00010476596927321635, + "loss": 0.83124197, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.73876953, + "step": 4140, + "time_per_iteration": 2.5994365215301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137712, + "balance_loss_mlp": 1.06389797, + "epoch": 0.796652558676414, + "flos": 538826362368.0, + "grad_norm": 0.03248172590146644, + "language_loss": 0.84015322, + "learning_rate": 0.00010457522532673835, + "loss": 0.85153031, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.73828125, + "step": 4141, + "time_per_iteration": 2.851498603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137565, + "balance_loss_mlp": 1.06375015, + "epoch": 0.7968449403616775, + "flos": 476051215872.0, + "grad_norm": 0.03503840732668985, + "language_loss": 0.8857249, + "learning_rate": 0.00010438463489777272, + "loss": 0.89710057, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.73828125, + "step": 4142, + "time_per_iteration": 2.56007981300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137015, + "balance_loss_mlp": 1.06320024, + "epoch": 0.7970373220469411, + "flos": 568725362688.0, + "grad_norm": 0.0411728476443369, + "language_loss": 0.82051033, + "learning_rate": 0.00010419419806031316, + "loss": 0.83188045, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.73828125, + "step": 4143, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.0646373, + "epoch": 0.7972297037322047, + "flos": 557350167552.0, + "grad_norm": 0.048021721616636356, + "language_loss": 0.88371974, + "learning_rate": 0.00010400391488829403, + "loss": 0.89510334, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.73730469, + "step": 4144, + "time_per_iteration": 2.764263153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137941, + "balance_loss_mlp": 1.06412661, + "epoch": 0.7974220854174683, + "flos": 577306787328.0, + "grad_norm": 0.030349731756734208, + "language_loss": 0.90217054, + "learning_rate": 0.00010381378545558984, + "loss": 0.9135499, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.73828125, + "step": 4145, + "time_per_iteration": 2.694387197494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139239, + "balance_loss_mlp": 1.06552041, + "epoch": 0.7976144671027319, + "flos": 484055221248.0, + "grad_norm": 0.04602586335086132, + "language_loss": 0.89352703, + "learning_rate": 0.00010362380983601505, + "loss": 0.90491945, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.73730469, + "step": 4146, + "time_per_iteration": 2.5373778343200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.06528461, + "epoch": 0.7978068487879953, + "flos": 1079652773376.0, + "grad_norm": 0.026886472634432064, + "language_loss": 0.83036357, + "learning_rate": 0.00010343398810332477, + "loss": 0.84175408, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.73779297, + "step": 4147, + "time_per_iteration": 3.465343952178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137553, + "balance_loss_mlp": 1.06383419, + "epoch": 0.7979992304732589, + "flos": 735015366144.0, + "grad_norm": 0.0386131750052721, + "language_loss": 0.89394611, + "learning_rate": 0.00010324432033121467, + "loss": 0.9053216, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.73730469, + "step": 4148, + "time_per_iteration": 2.95272159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137647, + "balance_loss_mlp": 1.06397593, + "epoch": 0.7981916121585225, + "flos": 416750043648.0, + "grad_norm": 0.03182767294568272, + "language_loss": 0.87920535, + "learning_rate": 0.00010305480659332005, + "loss": 0.89058185, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.73681641, + "step": 4149, + "time_per_iteration": 2.6444265842437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113765, + "balance_loss_mlp": 1.0638833, + "epoch": 0.7983839938437861, + "flos": 466212894720.0, + "grad_norm": 0.047857965738547205, + "language_loss": 0.88751274, + "learning_rate": 0.00010286544696321682, + "loss": 0.89888918, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.73779297, + "step": 4150, + "time_per_iteration": 2.5789239406585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138352, + "balance_loss_mlp": 1.06472826, + "epoch": 0.7985763755290496, + "flos": 511623077376.0, + "grad_norm": 0.03835001072611694, + "language_loss": 0.83638573, + "learning_rate": 0.00010267624151442073, + "loss": 0.84776926, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.73632812, + "step": 4151, + "time_per_iteration": 2.670612096786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137657, + "balance_loss_mlp": 1.06408083, + "epoch": 0.7987687572143132, + "flos": 1012277738496.0, + "grad_norm": 0.03249576548614517, + "language_loss": 0.85286856, + "learning_rate": 0.000102487190320388, + "loss": 0.86424506, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.73583984, + "step": 4152, + "time_per_iteration": 3.3122832775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138879, + "balance_loss_mlp": 1.06520724, + "epoch": 0.7989611388995768, + "flos": 1022747873280.0, + "grad_norm": 0.03976712139414911, + "language_loss": 0.85336626, + "learning_rate": 0.00010229829345451475, + "loss": 0.86475503, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.73681641, + "step": 4153, + "time_per_iteration": 3.3512771129608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138777, + "balance_loss_mlp": 1.0651536, + "epoch": 0.7991535205848403, + "flos": 1103036978688.0, + "grad_norm": 0.04036200779620281, + "language_loss": 0.83784497, + "learning_rate": 0.00010210955099013724, + "loss": 0.84923279, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.73632812, + "step": 4154, + "time_per_iteration": 3.352534532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138505, + "balance_loss_mlp": 1.06492949, + "epoch": 0.7993459022701039, + "flos": 836279669760.0, + "grad_norm": 0.04342364986110735, + "language_loss": 0.81863582, + "learning_rate": 0.00010192096300053167, + "loss": 0.83002084, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.73583984, + "step": 4155, + "time_per_iteration": 3.055297374725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140351, + "balance_loss_mlp": 1.06672716, + "epoch": 0.7995382839553674, + "flos": 523769074176.0, + "grad_norm": 0.02922915705008151, + "language_loss": 0.89245528, + "learning_rate": 0.00010173252955891477, + "loss": 0.90385878, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.73632812, + "step": 4156, + "time_per_iteration": 2.741558790206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141174, + "balance_loss_mlp": 1.0675503, + "epoch": 0.799730665640631, + "flos": 538858563072.0, + "grad_norm": 0.03668807577756746, + "language_loss": 0.78405279, + "learning_rate": 0.00010154425073844253, + "loss": 0.79546452, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.73632812, + "step": 4157, + "time_per_iteration": 2.6747748851776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141717, + "balance_loss_mlp": 1.0680933, + "epoch": 0.7999230473258946, + "flos": 506067737088.0, + "grad_norm": 0.03089804381419182, + "language_loss": 0.86340404, + "learning_rate": 0.00010135612661221138, + "loss": 0.87482131, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.73632812, + "step": 4158, + "time_per_iteration": 2.565213680267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144064, + "balance_loss_mlp": 1.07034528, + "epoch": 0.8001154290111582, + "flos": 1028975960064.0, + "grad_norm": 0.0395229836188532, + "language_loss": 0.87076604, + "learning_rate": 0.00010116815725325751, + "loss": 0.88220668, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.73681641, + "step": 4159, + "time_per_iteration": 3.3038952350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142065, + "balance_loss_mlp": 1.06834638, + "epoch": 0.8003078106964217, + "flos": 752269539840.0, + "grad_norm": 0.03606815133795925, + "language_loss": 0.85251313, + "learning_rate": 0.00010098034273455725, + "loss": 0.8639338, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.73681641, + "step": 4160, + "time_per_iteration": 2.9671449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141282, + "balance_loss_mlp": 1.0676111, + "epoch": 0.8005001923816852, + "flos": 489525967872.0, + "grad_norm": 0.034755861099366334, + "language_loss": 0.84454644, + "learning_rate": 0.00010079268312902662, + "loss": 0.8559593, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.73632812, + "step": 4161, + "time_per_iteration": 2.6727142333984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140957, + "balance_loss_mlp": 1.06714249, + "epoch": 0.8006925740669488, + "flos": 514312788480.0, + "grad_norm": 0.03457602588260787, + "language_loss": 0.86664772, + "learning_rate": 0.0001006051785095215, + "loss": 0.8780573, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.73730469, + "step": 4162, + "time_per_iteration": 2.6881067752838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140369, + "balance_loss_mlp": 1.06674516, + "epoch": 0.8008849557522124, + "flos": 579679590912.0, + "grad_norm": 0.039589703999255765, + "language_loss": 0.84823501, + "learning_rate": 0.0001004178289488376, + "loss": 0.85963869, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.73632812, + "step": 4163, + "time_per_iteration": 2.7627196311950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140575, + "balance_loss_mlp": 1.06676042, + "epoch": 0.801077337437476, + "flos": 479680915968.0, + "grad_norm": 0.03562538391210133, + "language_loss": 0.88413119, + "learning_rate": 0.0001002306345197106, + "loss": 0.89553696, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.73730469, + "step": 4164, + "time_per_iteration": 2.6279873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140166, + "balance_loss_mlp": 1.06635118, + "epoch": 0.8012697191227395, + "flos": 677967475200.0, + "grad_norm": 0.04047488864482016, + "language_loss": 0.85436863, + "learning_rate": 0.00010004359529481571, + "loss": 0.86577028, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.73730469, + "step": 4165, + "time_per_iteration": 2.995342493057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114203, + "balance_loss_mlp": 1.06802452, + "epoch": 0.8014621008080031, + "flos": 1297170812928.0, + "grad_norm": 0.037617272041868384, + "language_loss": 0.87359077, + "learning_rate": 9.985671134676804e-05, + "loss": 0.88501108, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.73828125, + "step": 4166, + "time_per_iteration": 3.725456476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143421, + "balance_loss_mlp": 1.06941605, + "epoch": 0.8016544824932667, + "flos": 512825579520.0, + "grad_norm": 0.041033167099134404, + "language_loss": 0.89462924, + "learning_rate": 9.966998274812234e-05, + "loss": 0.90606344, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.73828125, + "step": 4167, + "time_per_iteration": 2.587735176086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.06961536, + "epoch": 0.8018468641785302, + "flos": 536718073344.0, + "grad_norm": 0.04253470612408202, + "language_loss": 0.87705988, + "learning_rate": 9.948340957137308e-05, + "loss": 0.88849604, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.73828125, + "step": 4168, + "time_per_iteration": 2.645045042037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143563, + "balance_loss_mlp": 1.06950998, + "epoch": 0.8020392458637937, + "flos": 1025057550336.0, + "grad_norm": 0.04189552781046156, + "language_loss": 0.84953403, + "learning_rate": 9.929699188895447e-05, + "loss": 0.86096966, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.73876953, + "step": 4169, + "time_per_iteration": 3.2518906593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145859, + "balance_loss_mlp": 1.07376099, + "epoch": 0.8022316275490573, + "flos": 1565070403584.0, + "grad_norm": 0.005699099945185395, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79200262, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.72265625, + "step": 4170, + "time_per_iteration": 4.9828410148620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140499, + "balance_loss_mlp": 1.06644583, + "epoch": 0.8024240092343209, + "flos": 421601711616.0, + "grad_norm": 0.040177155372648383, + "language_loss": 0.88612646, + "learning_rate": 9.89246232965435e-05, + "loss": 0.89753145, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.73876953, + "step": 4171, + "time_per_iteration": 2.67098331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141029, + "balance_loss_mlp": 1.06702411, + "epoch": 0.8026163909195845, + "flos": 765162143232.0, + "grad_norm": 0.038738782156352326, + "language_loss": 0.84076917, + "learning_rate": 9.873867253111762e-05, + "loss": 0.85217947, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.73828125, + "step": 4172, + "time_per_iteration": 2.9889214038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141708, + "balance_loss_mlp": 1.06941986, + "epoch": 0.8028087726048481, + "flos": 1522141813248.0, + "grad_norm": 0.007464951030714858, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81406271, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.72460938, + "step": 4173, + "time_per_iteration": 5.007925987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136925, + "balance_loss_mlp": 1.06277657, + "epoch": 0.8030011542901115, + "flos": 518830084608.0, + "grad_norm": 0.0383067219529844, + "language_loss": 0.93575275, + "learning_rate": 9.836723842278733e-05, + "loss": 0.9471221, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.73974609, + "step": 4174, + "time_per_iteration": 2.5880677700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137605, + "balance_loss_mlp": 1.06355226, + "epoch": 0.8031935359753751, + "flos": 546658452480.0, + "grad_norm": 0.035609660945247874, + "language_loss": 0.82692063, + "learning_rate": 9.818175522408646e-05, + "loss": 0.83829665, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.73876953, + "step": 4175, + "time_per_iteration": 2.6955156326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141225, + "balance_loss_mlp": 1.06717181, + "epoch": 0.8033859176606387, + "flos": 604735655424.0, + "grad_norm": 0.04032435514134155, + "language_loss": 0.8889333, + "learning_rate": 9.79964280250632e-05, + "loss": 0.90034556, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.73876953, + "step": 4176, + "time_per_iteration": 2.853034734725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137722, + "balance_loss_mlp": 1.06362164, + "epoch": 0.8035782993459023, + "flos": 566984372736.0, + "grad_norm": 0.03679613531109102, + "language_loss": 0.86388361, + "learning_rate": 9.781125689766795e-05, + "loss": 0.87526083, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.73925781, + "step": 4177, + "time_per_iteration": 2.7487175464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.06372213, + "epoch": 0.8037706810311658, + "flos": 539472912384.0, + "grad_norm": 0.05184044937246734, + "language_loss": 0.90083796, + "learning_rate": 9.762624191379054e-05, + "loss": 0.91221571, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.73876953, + "step": 4178, + "time_per_iteration": 2.6330466270446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138006, + "balance_loss_mlp": 1.06390512, + "epoch": 0.8039630627164294, + "flos": 516194767872.0, + "grad_norm": 0.03661326628709558, + "language_loss": 0.84443927, + "learning_rate": 9.744138314526014e-05, + "loss": 0.85581934, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.73925781, + "step": 4179, + "time_per_iteration": 2.6247572898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141739, + "balance_loss_mlp": 1.06964111, + "epoch": 0.804155444401693, + "flos": 1481937136128.0, + "grad_norm": 0.005376898019679374, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.758753, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.72265625, + "step": 4180, + "time_per_iteration": 4.874308824539185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113728, + "balance_loss_mlp": 1.06308401, + "epoch": 0.8043478260869565, + "flos": 522188539392.0, + "grad_norm": 0.04021078617434091, + "language_loss": 0.81771445, + "learning_rate": 9.707213454125396e-05, + "loss": 0.82908726, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.74023438, + "step": 4181, + "time_per_iteration": 2.693844795227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137199, + "balance_loss_mlp": 1.0630033, + "epoch": 0.8045402077722201, + "flos": 546563125248.0, + "grad_norm": 0.03164680023603822, + "language_loss": 0.85049474, + "learning_rate": 9.688774484913298e-05, + "loss": 0.86186671, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.74023438, + "step": 4182, + "time_per_iteration": 2.7522850036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136736, + "balance_loss_mlp": 1.06254017, + "epoch": 0.8047325894574836, + "flos": 679706463744.0, + "grad_norm": 0.03486353569754657, + "language_loss": 0.79253167, + "learning_rate": 9.670351165906921e-05, + "loss": 0.80389905, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.74023438, + "step": 4183, + "time_per_iteration": 2.911919116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137093, + "balance_loss_mlp": 1.06289673, + "epoch": 0.8049249711427472, + "flos": 588328144896.0, + "grad_norm": 0.03566696314646497, + "language_loss": 0.8362298, + "learning_rate": 9.65194350425882e-05, + "loss": 0.8476007, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.74023438, + "step": 4184, + "time_per_iteration": 2.7444334030151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113629, + "balance_loss_mlp": 1.06204677, + "epoch": 0.8051173528280108, + "flos": 815680502784.0, + "grad_norm": 0.03248361844772192, + "language_loss": 0.82128632, + "learning_rate": 9.633551507115452e-05, + "loss": 0.83264923, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.74072266, + "step": 4185, + "time_per_iteration": 3.1254687309265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136424, + "balance_loss_mlp": 1.06222832, + "epoch": 0.8053097345132744, + "flos": 726954964992.0, + "grad_norm": 0.030976719489159976, + "language_loss": 0.81902802, + "learning_rate": 9.615175181617259e-05, + "loss": 0.83039224, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.74023438, + "step": 4186, + "time_per_iteration": 2.9419145584106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136622, + "balance_loss_mlp": 1.06242585, + "epoch": 0.805502116198538, + "flos": 749430107136.0, + "grad_norm": 0.03914823623045536, + "language_loss": 0.85688961, + "learning_rate": 9.596814534898552e-05, + "loss": 0.86825585, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.74023438, + "step": 4187, + "time_per_iteration": 3.0158443450927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135947, + "balance_loss_mlp": 1.06184673, + "epoch": 0.8056944978838014, + "flos": 641481821184.0, + "grad_norm": 0.03272363751287634, + "language_loss": 0.91907942, + "learning_rate": 9.578469574087561e-05, + "loss": 0.93043882, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.73974609, + "step": 4188, + "time_per_iteration": 2.857875347137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136336, + "balance_loss_mlp": 1.06218791, + "epoch": 0.805886879569065, + "flos": 645784267776.0, + "grad_norm": 0.037643576136900954, + "language_loss": 0.82672054, + "learning_rate": 9.560140306306436e-05, + "loss": 0.83808386, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.73974609, + "step": 4189, + "time_per_iteration": 2.7978317737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135607, + "balance_loss_mlp": 1.06160235, + "epoch": 0.8060792612543286, + "flos": 662443557888.0, + "grad_norm": 0.03459706232601391, + "language_loss": 0.86474156, + "learning_rate": 9.541826738671233e-05, + "loss": 0.87609762, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.73876953, + "step": 4190, + "time_per_iteration": 2.808532476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135589, + "balance_loss_mlp": 1.06153619, + "epoch": 0.8062716429395922, + "flos": 456012003840.0, + "grad_norm": 0.03810258680601671, + "language_loss": 0.87435436, + "learning_rate": 9.523528878291904e-05, + "loss": 0.88571024, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.73925781, + "step": 4191, + "time_per_iteration": 2.5479166507720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011352, + "balance_loss_mlp": 1.06114757, + "epoch": 0.8064640246248557, + "flos": 527428973568.0, + "grad_norm": 0.03760103878345668, + "language_loss": 0.90479159, + "learning_rate": 9.50524673227231e-05, + "loss": 0.9161436, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.73925781, + "step": 4192, + "time_per_iteration": 2.595338821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135682, + "balance_loss_mlp": 1.0616293, + "epoch": 0.8066564063101193, + "flos": 866676225024.0, + "grad_norm": 0.03134383848670985, + "language_loss": 0.86391032, + "learning_rate": 9.486980307710208e-05, + "loss": 0.87526715, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.73925781, + "step": 4193, + "time_per_iteration": 3.1573548316955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136404, + "balance_loss_mlp": 1.06254196, + "epoch": 0.8068487879953828, + "flos": 531642823680.0, + "grad_norm": 0.03189422174274218, + "language_loss": 0.8618921, + "learning_rate": 9.468729611697246e-05, + "loss": 0.87325615, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.73779297, + "step": 4194, + "time_per_iteration": 2.6939430236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135828, + "balance_loss_mlp": 1.06191802, + "epoch": 0.8070411696806464, + "flos": 567246885888.0, + "grad_norm": 0.031528158130144396, + "language_loss": 0.86619771, + "learning_rate": 9.450494651319003e-05, + "loss": 0.87755609, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.73828125, + "step": 4195, + "time_per_iteration": 2.6411421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135645, + "balance_loss_mlp": 1.0615921, + "epoch": 0.80723355136591, + "flos": 988252987392.0, + "grad_norm": 0.028641893528927848, + "language_loss": 0.83544791, + "learning_rate": 9.432275433654885e-05, + "loss": 0.84680438, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.73925781, + "step": 4196, + "time_per_iteration": 3.284620761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136199, + "balance_loss_mlp": 1.06214666, + "epoch": 0.8074259330511735, + "flos": 568082815488.0, + "grad_norm": 0.03274043714207543, + "language_loss": 0.87193251, + "learning_rate": 9.414071965778221e-05, + "loss": 0.88329452, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.73876953, + "step": 4197, + "time_per_iteration": 2.8321473598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134907, + "balance_loss_mlp": 1.06075931, + "epoch": 0.8076183147364371, + "flos": 495752053248.0, + "grad_norm": 0.03175873877301644, + "language_loss": 0.83771801, + "learning_rate": 9.395884254756242e-05, + "loss": 0.84906709, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.73974609, + "step": 4198, + "time_per_iteration": 2.7369918823242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.06098342, + "epoch": 0.8078106964217007, + "flos": 420867840000.0, + "grad_norm": 0.03527202560929497, + "language_loss": 0.84655821, + "learning_rate": 9.377712307650044e-05, + "loss": 0.85790622, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.73779297, + "step": 4199, + "time_per_iteration": 2.523756504058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134397, + "balance_loss_mlp": 1.06029618, + "epoch": 0.8080030781069643, + "flos": 528564346368.0, + "grad_norm": 0.03723834939135813, + "language_loss": 0.88157082, + "learning_rate": 9.359556131514602e-05, + "loss": 0.89291477, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.73974609, + "step": 4200, + "time_per_iteration": 2.6045093536376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134559, + "balance_loss_mlp": 1.06036282, + "epoch": 0.8081954597922277, + "flos": 545151777792.0, + "grad_norm": 0.03389487766318828, + "language_loss": 0.86047804, + "learning_rate": 9.341415733398733e-05, + "loss": 0.87182367, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.74023438, + "step": 4201, + "time_per_iteration": 2.6960625648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134481, + "balance_loss_mlp": 1.06038058, + "epoch": 0.8083878414774913, + "flos": 642133100544.0, + "grad_norm": 0.03528539994977632, + "language_loss": 0.79933041, + "learning_rate": 9.323291120345207e-05, + "loss": 0.81067526, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.73974609, + "step": 4202, + "time_per_iteration": 2.841066837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135094, + "balance_loss_mlp": 1.06099403, + "epoch": 0.8085802231627549, + "flos": 706905019392.0, + "grad_norm": 0.03577618457162915, + "language_loss": 0.77572632, + "learning_rate": 9.305182299390614e-05, + "loss": 0.78707725, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.73974609, + "step": 4203, + "time_per_iteration": 2.881850004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134762, + "balance_loss_mlp": 1.0607096, + "epoch": 0.8087726048480185, + "flos": 420661722624.0, + "grad_norm": 0.03818278195025951, + "language_loss": 0.93325853, + "learning_rate": 9.287089277565409e-05, + "loss": 0.94460618, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.73925781, + "step": 4204, + "time_per_iteration": 2.5712902545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134942, + "balance_loss_mlp": 1.06093681, + "epoch": 0.8089649865332821, + "flos": 509862621696.0, + "grad_norm": 0.028510707328060825, + "language_loss": 0.90784013, + "learning_rate": 9.269012061893922e-05, + "loss": 0.91918957, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.73925781, + "step": 4205, + "time_per_iteration": 2.774871587753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134999, + "balance_loss_mlp": 1.0608989, + "epoch": 0.8091573682185456, + "flos": 458261282304.0, + "grad_norm": 0.03265489614473136, + "language_loss": 0.88958049, + "learning_rate": 9.250950659394386e-05, + "loss": 0.90093046, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.73974609, + "step": 4206, + "time_per_iteration": 2.7118797302246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113461, + "balance_loss_mlp": 1.06079543, + "epoch": 0.8093497499038091, + "flos": 526374191616.0, + "grad_norm": 0.03169326833456576, + "language_loss": 0.8122524, + "learning_rate": 9.232905077078824e-05, + "loss": 0.82359844, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.73779297, + "step": 4207, + "time_per_iteration": 2.72802734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.06091356, + "epoch": 0.8095421315890727, + "flos": 490580749824.0, + "grad_norm": 0.036826369012514064, + "language_loss": 0.81312108, + "learning_rate": 9.214875321953164e-05, + "loss": 0.8244698, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.73876953, + "step": 4208, + "time_per_iteration": 2.605091094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113518, + "balance_loss_mlp": 1.06117523, + "epoch": 0.8097345132743363, + "flos": 626283543552.0, + "grad_norm": 0.03355343413507775, + "language_loss": 0.85747409, + "learning_rate": 9.196861401017164e-05, + "loss": 0.86882585, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.73876953, + "step": 4209, + "time_per_iteration": 2.776834726333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135546, + "balance_loss_mlp": 1.06149364, + "epoch": 0.8099268949595998, + "flos": 616872920064.0, + "grad_norm": 0.03618347801617859, + "language_loss": 0.8405565, + "learning_rate": 9.178863321264475e-05, + "loss": 0.85191202, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.73876953, + "step": 4210, + "time_per_iteration": 2.829793930053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136593, + "balance_loss_mlp": 1.06258821, + "epoch": 0.8101192766448634, + "flos": 480684031488.0, + "grad_norm": 0.03384381910797024, + "language_loss": 0.84874779, + "learning_rate": 9.160881089682566e-05, + "loss": 0.86011374, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.73828125, + "step": 4211, + "time_per_iteration": 2.6381702423095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136603, + "balance_loss_mlp": 1.06269372, + "epoch": 0.810311658330127, + "flos": 518326525440.0, + "grad_norm": 0.03431479693344864, + "language_loss": 0.91464251, + "learning_rate": 9.142914713252725e-05, + "loss": 0.92600852, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.73779297, + "step": 4212, + "time_per_iteration": 2.6007797718048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137699, + "balance_loss_mlp": 1.0639801, + "epoch": 0.8105040400153906, + "flos": 576987878400.0, + "grad_norm": 0.02918606823415051, + "language_loss": 0.87603903, + "learning_rate": 9.124964198950159e-05, + "loss": 0.88741606, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.73681641, + "step": 4213, + "time_per_iteration": 2.8085403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137019, + "balance_loss_mlp": 1.06315684, + "epoch": 0.8106964217006541, + "flos": 640187994624.0, + "grad_norm": 0.033620937872648055, + "language_loss": 0.89619857, + "learning_rate": 9.107029553743862e-05, + "loss": 0.90756875, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.73730469, + "step": 4214, + "time_per_iteration": 2.884916305541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136885, + "balance_loss_mlp": 1.06297493, + "epoch": 0.8108888033859176, + "flos": 580584651264.0, + "grad_norm": 0.03884853564505628, + "language_loss": 0.866575, + "learning_rate": 9.089110784596672e-05, + "loss": 0.87794381, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.73779297, + "step": 4215, + "time_per_iteration": 2.6847498416900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136591, + "balance_loss_mlp": 1.06258559, + "epoch": 0.8110811850711812, + "flos": 561090657792.0, + "grad_norm": 0.03395287421728693, + "language_loss": 0.88044077, + "learning_rate": 9.071207898465284e-05, + "loss": 0.89180672, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.73828125, + "step": 4216, + "time_per_iteration": 2.7887377738952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145004, + "balance_loss_mlp": 1.07290649, + "epoch": 0.8112735667564448, + "flos": 1521066839040.0, + "grad_norm": 0.008024079584686653, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78405422, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.72265625, + "step": 4217, + "time_per_iteration": 4.71375584602356 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.06402028, + "epoch": 0.8114659484417084, + "flos": 617515467264.0, + "grad_norm": 0.0391225260866388, + "language_loss": 0.90230364, + "learning_rate": 9.035449803045792e-05, + "loss": 0.91368294, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.73779297, + "step": 4218, + "time_per_iteration": 2.8041131496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136237, + "balance_loss_mlp": 1.06242275, + "epoch": 0.8116583301269719, + "flos": 651261745152.0, + "grad_norm": 0.030797335982040666, + "language_loss": 0.83055115, + "learning_rate": 9.017594607640211e-05, + "loss": 0.84191352, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.73730469, + "step": 4219, + "time_per_iteration": 2.9443857669830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136868, + "balance_loss_mlp": 1.06295788, + "epoch": 0.8118507118122354, + "flos": 554195828736.0, + "grad_norm": 0.03810511170832895, + "language_loss": 0.85147524, + "learning_rate": 8.999755323015463e-05, + "loss": 0.86284399, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.73779297, + "step": 4220, + "time_per_iteration": 2.680670738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136982, + "balance_loss_mlp": 1.06326246, + "epoch": 0.812043093497499, + "flos": 545177974272.0, + "grad_norm": 0.03408780635951255, + "language_loss": 0.91583371, + "learning_rate": 8.981931956097384e-05, + "loss": 0.92720354, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.73681641, + "step": 4221, + "time_per_iteration": 2.642547369003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136863, + "balance_loss_mlp": 1.06295288, + "epoch": 0.8122354751827626, + "flos": 584574919680.0, + "grad_norm": 0.03129027929290594, + "language_loss": 0.87976468, + "learning_rate": 8.964124513805628e-05, + "loss": 0.89113331, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.73779297, + "step": 4222, + "time_per_iteration": 2.7617506980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142586, + "balance_loss_mlp": 1.07067871, + "epoch": 0.8124278568680262, + "flos": 1533860112384.0, + "grad_norm": 0.005717741019292163, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.7939266, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.72070312, + "step": 4223, + "time_per_iteration": 4.967041492462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135426, + "balance_loss_mlp": 1.06142044, + "epoch": 0.8126202385532897, + "flos": 433767174144.0, + "grad_norm": 0.038884513065240225, + "language_loss": 0.84713882, + "learning_rate": 8.928557430748668e-05, + "loss": 0.85849309, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.73876953, + "step": 4224, + "time_per_iteration": 2.5755624771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_mlp": 1.06830597, + "epoch": 0.8128126202385533, + "flos": 1551146486784.0, + "grad_norm": 0.0052150499454202155, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77636218, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.72460938, + "step": 4225, + "time_per_iteration": 4.887953281402588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136452, + "balance_loss_mlp": 1.06273341, + "epoch": 0.8130050019238169, + "flos": 529337149440.0, + "grad_norm": 0.038030015177674494, + "language_loss": 0.93251669, + "learning_rate": 8.893054129078077e-05, + "loss": 0.94388121, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.73681641, + "step": 4226, + "time_per_iteration": 2.6120243072509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135963, + "balance_loss_mlp": 1.06224418, + "epoch": 0.8131973836090804, + "flos": 544227251712.0, + "grad_norm": 0.04131080667228598, + "language_loss": 0.8568573, + "learning_rate": 8.875326413496037e-05, + "loss": 0.86821687, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.73681641, + "step": 4227, + "time_per_iteration": 2.7287051677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135709, + "balance_loss_mlp": 1.0617516, + "epoch": 0.8133897652943439, + "flos": 577578032640.0, + "grad_norm": 0.03865852336010986, + "language_loss": 0.86959839, + "learning_rate": 8.857614663928249e-05, + "loss": 0.88095552, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.73828125, + "step": 4228, + "time_per_iteration": 2.6870715618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.06219947, + "epoch": 0.8135821469796075, + "flos": 580350336000.0, + "grad_norm": 0.0387504778946499, + "language_loss": 0.84373677, + "learning_rate": 8.839918887251025e-05, + "loss": 0.85509503, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.73632812, + "step": 4229, + "time_per_iteration": 2.745539426803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135168, + "balance_loss_mlp": 1.06140161, + "epoch": 0.8137745286648711, + "flos": 651643780608.0, + "grad_norm": 0.037162762850376806, + "language_loss": 0.8921082, + "learning_rate": 8.822239090334472e-05, + "loss": 0.90345985, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.73730469, + "step": 4230, + "time_per_iteration": 2.971499443054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134218, + "balance_loss_mlp": 1.06035542, + "epoch": 0.8139669103501347, + "flos": 703127599104.0, + "grad_norm": 0.036809374739783886, + "language_loss": 0.81143808, + "learning_rate": 8.804575280042493e-05, + "loss": 0.82278025, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.73828125, + "step": 4231, + "time_per_iteration": 2.89591121673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134335, + "balance_loss_mlp": 1.06056821, + "epoch": 0.8141592920353983, + "flos": 651387271680.0, + "grad_norm": 0.04068280906456379, + "language_loss": 0.88771474, + "learning_rate": 8.786927463232774e-05, + "loss": 0.8990581, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.73730469, + "step": 4232, + "time_per_iteration": 2.777247905731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113425, + "balance_loss_mlp": 1.06029224, + "epoch": 0.8143516737206618, + "flos": 537844713984.0, + "grad_norm": 0.04131834896262191, + "language_loss": 0.85812843, + "learning_rate": 8.769295646756853e-05, + "loss": 0.86947101, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.73876953, + "step": 4233, + "time_per_iteration": 2.6038644313812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134212, + "balance_loss_mlp": 1.0605886, + "epoch": 0.8145440554059253, + "flos": 509363065344.0, + "grad_norm": 0.03311543445898947, + "language_loss": 0.86719936, + "learning_rate": 8.751679837459963e-05, + "loss": 0.87854147, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.73632812, + "step": 4234, + "time_per_iteration": 2.5994458198547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133864, + "balance_loss_mlp": 1.06024003, + "epoch": 0.8147364370911889, + "flos": 636287049216.0, + "grad_norm": 0.02964347408998998, + "language_loss": 0.90857178, + "learning_rate": 8.734080042181181e-05, + "loss": 0.91991043, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.73632812, + "step": 4235, + "time_per_iteration": 2.831850051879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133974, + "balance_loss_mlp": 1.0603503, + "epoch": 0.8149288187764525, + "flos": 423705271296.0, + "grad_norm": 0.03639444166963084, + "language_loss": 0.83094406, + "learning_rate": 8.716496267753343e-05, + "loss": 0.84228379, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.73632812, + "step": 4236, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.06227982, + "epoch": 0.8151212004617161, + "flos": 598620360192.0, + "grad_norm": 0.03190443114038452, + "language_loss": 0.85766506, + "learning_rate": 8.698928521003097e-05, + "loss": 0.8690241, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.73632812, + "step": 4237, + "time_per_iteration": 2.7593436241149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141319, + "balance_loss_mlp": 1.06941223, + "epoch": 0.8153135821469796, + "flos": 1482412497408.0, + "grad_norm": 0.006034012067476844, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78994167, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.72070312, + "step": 4238, + "time_per_iteration": 5.0358593463897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135115, + "balance_loss_mlp": 1.06149137, + "epoch": 0.8155059638322432, + "flos": 438011223552.0, + "grad_norm": 0.03574751342036468, + "language_loss": 0.86546302, + "learning_rate": 8.663841137810741e-05, + "loss": 0.87681419, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.73632812, + "step": 4239, + "time_per_iteration": 2.5296990871429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134629, + "balance_loss_mlp": 1.06100523, + "epoch": 0.8156983455175068, + "flos": 795819210240.0, + "grad_norm": 0.036631860682182917, + "language_loss": 0.90299451, + "learning_rate": 8.646321514990763e-05, + "loss": 0.91434073, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.73632812, + "step": 4240, + "time_per_iteration": 3.116800308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.06040537, + "epoch": 0.8158907272027703, + "flos": 687193448448.0, + "grad_norm": 0.03497799399814432, + "language_loss": 0.86212909, + "learning_rate": 8.628817947092616e-05, + "loss": 0.87346935, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.73632812, + "step": 4241, + "time_per_iteration": 2.8215630054473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113408, + "balance_loss_mlp": 1.06040835, + "epoch": 0.8160831088880338, + "flos": 488030026752.0, + "grad_norm": 0.04917888887057411, + "language_loss": 0.90205991, + "learning_rate": 8.611330440911797e-05, + "loss": 0.91340065, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.73681641, + "step": 4242, + "time_per_iteration": 2.5900723934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.06033301, + "epoch": 0.8162754905732974, + "flos": 465822127104.0, + "grad_norm": 0.03688342086176751, + "language_loss": 0.8533777, + "learning_rate": 8.593859003237558e-05, + "loss": 0.86471725, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.73632812, + "step": 4243, + "time_per_iteration": 2.560988664627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138031, + "balance_loss_mlp": 1.06593323, + "epoch": 0.816467872258561, + "flos": 1242143341056.0, + "grad_norm": 0.003656687556676087, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76423156, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.72265625, + "step": 4244, + "time_per_iteration": 4.697356462478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134208, + "balance_loss_mlp": 1.06058443, + "epoch": 0.8166602539438246, + "flos": 688402681344.0, + "grad_norm": 0.0314239637841158, + "language_loss": 0.90210414, + "learning_rate": 8.558964360534615e-05, + "loss": 0.91344625, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.73632812, + "step": 4245, + "time_per_iteration": 2.9143781661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138077, + "balance_loss_mlp": 1.065979, + "epoch": 0.8168526356290882, + "flos": 1493916673536.0, + "grad_norm": 0.0037263758813665952, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.74112821, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.72265625, + "step": 4246, + "time_per_iteration": 4.9454734325408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133915, + "balance_loss_mlp": 1.06029105, + "epoch": 0.8170450173143516, + "flos": 579299556864.0, + "grad_norm": 0.030493016441410038, + "language_loss": 0.89006281, + "learning_rate": 8.524134073172984e-05, + "loss": 0.901402, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.73632812, + "step": 4247, + "time_per_iteration": 2.716303586959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133981, + "balance_loss_mlp": 1.06035721, + "epoch": 0.8172373989996152, + "flos": 572437655040.0, + "grad_norm": 0.032931273654240076, + "language_loss": 0.89490271, + "learning_rate": 8.506743079651974e-05, + "loss": 0.90624249, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.73632812, + "step": 4248, + "time_per_iteration": 2.8293991088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134211, + "balance_loss_mlp": 1.06063545, + "epoch": 0.8174297806848788, + "flos": 529858172928.0, + "grad_norm": 0.037171294021196906, + "language_loss": 0.85910308, + "learning_rate": 8.489368195241948e-05, + "loss": 0.87044525, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.73583984, + "step": 4249, + "time_per_iteration": 2.6829066276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134079, + "balance_loss_mlp": 1.06059849, + "epoch": 0.8176221623701424, + "flos": 570268967424.0, + "grad_norm": 0.034080250978502535, + "language_loss": 0.8438381, + "learning_rate": 8.47200942668846e-05, + "loss": 0.85517883, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.73486328, + "step": 4250, + "time_per_iteration": 2.8265514373779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.06237853, + "epoch": 0.8178145440554059, + "flos": 657706682880.0, + "grad_norm": 0.03911715002347649, + "language_loss": 0.85039294, + "learning_rate": 8.454666780730735e-05, + "loss": 0.8617515, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.73486328, + "step": 4251, + "time_per_iteration": 2.8799848556518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136417, + "balance_loss_mlp": 1.06298411, + "epoch": 0.8180069257406695, + "flos": 547055950848.0, + "grad_norm": 0.03495030858038778, + "language_loss": 0.925497, + "learning_rate": 8.437340264101828e-05, + "loss": 0.93686116, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.734375, + "step": 4252, + "time_per_iteration": 2.741757392883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134952, + "balance_loss_mlp": 1.06156695, + "epoch": 0.818199307425933, + "flos": 620411295744.0, + "grad_norm": 0.03572313096621812, + "language_loss": 0.89690208, + "learning_rate": 8.420029883528474e-05, + "loss": 0.90825158, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.73388672, + "step": 4253, + "time_per_iteration": 2.7292418479919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135002, + "balance_loss_mlp": 1.06152105, + "epoch": 0.8183916891111966, + "flos": 648934603776.0, + "grad_norm": 0.03748901013328147, + "language_loss": 0.82274991, + "learning_rate": 8.402735645731157e-05, + "loss": 0.83409989, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.73486328, + "step": 4254, + "time_per_iteration": 2.910111665725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134818, + "balance_loss_mlp": 1.06152833, + "epoch": 0.8185840707964602, + "flos": 500102163456.0, + "grad_norm": 0.038471995455164235, + "language_loss": 0.82772928, + "learning_rate": 8.385457557424098e-05, + "loss": 0.83907747, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.73291016, + "step": 4255, + "time_per_iteration": 2.5621390342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134815, + "balance_loss_mlp": 1.06142998, + "epoch": 0.8187764524817237, + "flos": 787611088896.0, + "grad_norm": 0.030170748899510557, + "language_loss": 0.84222317, + "learning_rate": 8.368195625315251e-05, + "loss": 0.8535713, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.73388672, + "step": 4256, + "time_per_iteration": 3.078824996948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134992, + "balance_loss_mlp": 1.06170225, + "epoch": 0.8189688341669873, + "flos": 551786095104.0, + "grad_norm": 0.03557729872276572, + "language_loss": 0.84799671, + "learning_rate": 8.350949856106283e-05, + "loss": 0.85934663, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.73291016, + "step": 4257, + "time_per_iteration": 2.7947750091552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137352, + "balance_loss_mlp": 1.06544495, + "epoch": 0.8191612158522509, + "flos": 1354880894976.0, + "grad_norm": 0.0054924176528901095, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72286695, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.72070312, + "step": 4258, + "time_per_iteration": 4.84255051612854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134347, + "balance_loss_mlp": 1.06096172, + "epoch": 0.8193535975375145, + "flos": 545299497984.0, + "grad_norm": 0.03816003226358518, + "language_loss": 0.88573909, + "learning_rate": 8.316506833163318e-05, + "loss": 0.89708257, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.73388672, + "step": 4259, + "time_per_iteration": 2.6227800846099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134587, + "balance_loss_mlp": 1.06110692, + "epoch": 0.8195459792227779, + "flos": 867227447808.0, + "grad_norm": 0.030985411869637765, + "language_loss": 0.89433575, + "learning_rate": 8.299309592801297e-05, + "loss": 0.90568173, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.73486328, + "step": 4260, + "time_per_iteration": 3.120332717895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136288, + "balance_loss_mlp": 1.06299853, + "epoch": 0.8197383609080415, + "flos": 570409956864.0, + "grad_norm": 0.03501003143671651, + "language_loss": 0.85849857, + "learning_rate": 8.282128542083101e-05, + "loss": 0.86986148, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.73291016, + "step": 4261, + "time_per_iteration": 2.7042295932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113623, + "balance_loss_mlp": 1.06284475, + "epoch": 0.8199307425933051, + "flos": 531885871104.0, + "grad_norm": 0.03573115992813463, + "language_loss": 0.89631218, + "learning_rate": 8.264963687678978e-05, + "loss": 0.90767449, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.73388672, + "step": 4262, + "time_per_iteration": 2.698512554168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136104, + "balance_loss_mlp": 1.0625757, + "epoch": 0.8201231242785687, + "flos": 568230535680.0, + "grad_norm": 0.03738858607219498, + "language_loss": 0.8919028, + "learning_rate": 8.247815036252921e-05, + "loss": 0.90326387, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.73535156, + "step": 4263, + "time_per_iteration": 2.7295687198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.0632956, + "epoch": 0.8203155059638323, + "flos": 1232383431168.0, + "grad_norm": 0.035805039372270496, + "language_loss": 0.86680698, + "learning_rate": 8.230682594462652e-05, + "loss": 0.87817383, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.73388672, + "step": 4264, + "time_per_iteration": 3.529435634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137219, + "balance_loss_mlp": 1.0639292, + "epoch": 0.8205078876490958, + "flos": 575279089152.0, + "grad_norm": 0.03283426930312581, + "language_loss": 0.84526485, + "learning_rate": 8.213566368959558e-05, + "loss": 0.856637, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.73291016, + "step": 4265, + "time_per_iteration": 2.6853911876678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136935, + "balance_loss_mlp": 1.06354988, + "epoch": 0.8207002693343594, + "flos": 932985017856.0, + "grad_norm": 0.03554909182622845, + "language_loss": 0.83231854, + "learning_rate": 8.196466366388744e-05, + "loss": 0.84368789, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.73388672, + "step": 4266, + "time_per_iteration": 3.2028071880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.06358302, + "epoch": 0.8208926510196229, + "flos": 550659454464.0, + "grad_norm": 0.030804523886097362, + "language_loss": 0.84640598, + "learning_rate": 8.179382593389029e-05, + "loss": 0.85777473, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.73291016, + "step": 4267, + "time_per_iteration": 2.650616407394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113631, + "balance_loss_mlp": 1.06297278, + "epoch": 0.8210850327048865, + "flos": 649411966464.0, + "grad_norm": 0.034163705244185175, + "language_loss": 0.86939591, + "learning_rate": 8.162315056592918e-05, + "loss": 0.880759, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.73339844, + "step": 4268, + "time_per_iteration": 2.8432037830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135687, + "balance_loss_mlp": 1.06239724, + "epoch": 0.82127741439015, + "flos": 602697223680.0, + "grad_norm": 0.0327614409719618, + "language_loss": 0.85872579, + "learning_rate": 8.145263762626615e-05, + "loss": 0.87008262, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.73291016, + "step": 4269, + "time_per_iteration": 2.794907808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136528, + "balance_loss_mlp": 1.06314278, + "epoch": 0.8214697960754136, + "flos": 475853830656.0, + "grad_norm": 0.03329504882056361, + "language_loss": 0.88679749, + "learning_rate": 8.128228718110015e-05, + "loss": 0.89816278, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.73388672, + "step": 4270, + "time_per_iteration": 2.6682534217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137291, + "balance_loss_mlp": 1.06395364, + "epoch": 0.8216621777606772, + "flos": 905093523456.0, + "grad_norm": 0.04141096199227741, + "language_loss": 0.89987427, + "learning_rate": 8.11120992965671e-05, + "loss": 0.91124725, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.73339844, + "step": 4271, + "time_per_iteration": 3.0566489696502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137158, + "balance_loss_mlp": 1.06372499, + "epoch": 0.8218545594459408, + "flos": 515495824896.0, + "grad_norm": 0.03644141192614607, + "language_loss": 0.88000762, + "learning_rate": 8.094207403873998e-05, + "loss": 0.89137918, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.734375, + "step": 4272, + "time_per_iteration": 2.6066787242889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136048, + "balance_loss_mlp": 1.06261528, + "epoch": 0.8220469411312044, + "flos": 495558670848.0, + "grad_norm": 0.033626065990782314, + "language_loss": 0.90746641, + "learning_rate": 8.077221147362829e-05, + "loss": 0.91882682, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.734375, + "step": 4273, + "time_per_iteration": 2.6172597408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137405, + "balance_loss_mlp": 1.0640676, + "epoch": 0.8222393228164678, + "flos": 387276013056.0, + "grad_norm": 0.041107028258718356, + "language_loss": 0.94696027, + "learning_rate": 8.060251166717835e-05, + "loss": 0.95833433, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.73339844, + "step": 4274, + "time_per_iteration": 2.4571101665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136137, + "balance_loss_mlp": 1.06270397, + "epoch": 0.8224317045017314, + "flos": 537629864448.0, + "grad_norm": 0.036324046899494276, + "language_loss": 0.90921676, + "learning_rate": 8.043297468527383e-05, + "loss": 0.92057812, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.734375, + "step": 4275, + "time_per_iteration": 2.6465563774108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137298, + "balance_loss_mlp": 1.06396043, + "epoch": 0.822624086186995, + "flos": 555947552256.0, + "grad_norm": 0.03930955148337389, + "language_loss": 0.87730598, + "learning_rate": 8.02636005937346e-05, + "loss": 0.88867891, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.73339844, + "step": 4276, + "time_per_iteration": 2.6447408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137809, + "balance_loss_mlp": 1.06451952, + "epoch": 0.8228164678722586, + "flos": 540717073920.0, + "grad_norm": 0.032348524230564446, + "language_loss": 0.8416298, + "learning_rate": 8.009438945831771e-05, + "loss": 0.85300791, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.73291016, + "step": 4277, + "time_per_iteration": 2.725992441177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137287, + "balance_loss_mlp": 1.06404459, + "epoch": 0.8230088495575221, + "flos": 474262562304.0, + "grad_norm": 0.0328588755399637, + "language_loss": 0.84125638, + "learning_rate": 7.992534134471641e-05, + "loss": 0.8526293, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.73242188, + "step": 4278, + "time_per_iteration": 2.722247362136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137164, + "balance_loss_mlp": 1.0638746, + "epoch": 0.8232012312427857, + "flos": 592750113792.0, + "grad_norm": 0.04012924603788627, + "language_loss": 0.88655663, + "learning_rate": 7.975645631856127e-05, + "loss": 0.89792836, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.73291016, + "step": 4279, + "time_per_iteration": 2.67391037940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_mlp": 1.06471694, + "epoch": 0.8233936129280492, + "flos": 573787877376.0, + "grad_norm": 0.031871243045387916, + "language_loss": 0.79251921, + "learning_rate": 7.958773444541916e-05, + "loss": 0.80389881, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.73242188, + "step": 4280, + "time_per_iteration": 2.7263128757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138187, + "balance_loss_mlp": 1.06499279, + "epoch": 0.8235859946133128, + "flos": 732749349888.0, + "grad_norm": 0.030378228316341748, + "language_loss": 0.82564437, + "learning_rate": 7.941917579079383e-05, + "loss": 0.83702624, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.73193359, + "step": 4281, + "time_per_iteration": 3.002906322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138376, + "balance_loss_mlp": 1.06522954, + "epoch": 0.8237783762985764, + "flos": 571397609472.0, + "grad_norm": 0.035495855879207304, + "language_loss": 0.86794972, + "learning_rate": 7.92507804201253e-05, + "loss": 0.8793335, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.73144531, + "step": 4282, + "time_per_iteration": 2.662153720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141701, + "balance_loss_mlp": 1.07017517, + "epoch": 0.8239707579838399, + "flos": 1469424566784.0, + "grad_norm": 0.006000143567348165, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76439381, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.71679688, + "step": 4283, + "time_per_iteration": 4.955921649932861 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134641, + "balance_loss_mlp": 1.0613029, + "epoch": 0.8241631396691035, + "flos": 468296988672.0, + "grad_norm": 0.03760259633973049, + "language_loss": 0.85799181, + "learning_rate": 7.89144797921037e-05, + "loss": 0.86933821, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.73339844, + "step": 4284, + "time_per_iteration": 2.670642614364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137108, + "balance_loss_mlp": 1.06520081, + "epoch": 0.8243555213543671, + "flos": 1542549599232.0, + "grad_norm": 0.0035179548887658537, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.79071379, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.72070312, + "step": 4285, + "time_per_iteration": 4.919512510299683 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135036, + "balance_loss_mlp": 1.06169832, + "epoch": 0.8245479030396307, + "flos": 798862758912.0, + "grad_norm": 0.02838711581178409, + "language_loss": 0.8627755, + "learning_rate": 7.85788330836078e-05, + "loss": 0.87412584, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.73339844, + "step": 4286, + "time_per_iteration": 3.106489419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135135, + "balance_loss_mlp": 1.06170166, + "epoch": 0.8247402847248941, + "flos": 647399731200.0, + "grad_norm": 0.035275587559529614, + "language_loss": 0.81354994, + "learning_rate": 7.841125511210878e-05, + "loss": 0.82490128, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.734375, + "step": 4287, + "time_per_iteration": 2.8796138763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135286, + "balance_loss_mlp": 1.06199634, + "epoch": 0.8249326664101577, + "flos": 605619248640.0, + "grad_norm": 0.03206789384595215, + "language_loss": 0.83634263, + "learning_rate": 7.824384081587637e-05, + "loss": 0.84769547, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.73291016, + "step": 4288, + "time_per_iteration": 2.846707820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134793, + "balance_loss_mlp": 1.06155086, + "epoch": 0.8251250480954213, + "flos": 825826999296.0, + "grad_norm": 0.09140379180840759, + "language_loss": 0.91303772, + "learning_rate": 7.807659025990637e-05, + "loss": 0.92438555, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.73242188, + "step": 4289, + "time_per_iteration": 3.1333796977996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134594, + "balance_loss_mlp": 1.06125653, + "epoch": 0.8253174297806849, + "flos": 758675546112.0, + "grad_norm": 0.03823856900412753, + "language_loss": 0.83296132, + "learning_rate": 7.790950350913112e-05, + "loss": 0.8443073, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.73339844, + "step": 4290, + "time_per_iteration": 2.9032602310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134661, + "balance_loss_mlp": 1.06141841, + "epoch": 0.8255098114659485, + "flos": 795993126912.0, + "grad_norm": 0.03957304400162463, + "language_loss": 0.91916239, + "learning_rate": 7.774258062841971e-05, + "loss": 0.93050897, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.73242188, + "step": 4291, + "time_per_iteration": 3.2001283168792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135868, + "balance_loss_mlp": 1.06272089, + "epoch": 0.825702193151212, + "flos": 711680825856.0, + "grad_norm": 0.035067281879066665, + "language_loss": 0.82225877, + "learning_rate": 7.757582168257731e-05, + "loss": 0.83361745, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.73144531, + "step": 4292, + "time_per_iteration": 2.863765001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137284, + "balance_loss_mlp": 1.06413746, + "epoch": 0.8258945748364755, + "flos": 684668921856.0, + "grad_norm": 0.032242786757735724, + "language_loss": 0.85239249, + "learning_rate": 7.740922673634537e-05, + "loss": 0.8637653, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.73144531, + "step": 4293, + "time_per_iteration": 2.907665729522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136851, + "balance_loss_mlp": 1.06360924, + "epoch": 0.8260869565217391, + "flos": 595680870912.0, + "grad_norm": 0.0674529865816818, + "language_loss": 0.82838464, + "learning_rate": 7.724279585440186e-05, + "loss": 0.83975315, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.73242188, + "step": 4294, + "time_per_iteration": 2.7359163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.06290495, + "epoch": 0.8262793382070027, + "flos": 652652900352.0, + "grad_norm": 0.037208876536065486, + "language_loss": 0.90246564, + "learning_rate": 7.707652910136098e-05, + "loss": 0.91382712, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.73242188, + "step": 4295, + "time_per_iteration": 2.7886202335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135781, + "balance_loss_mlp": 1.0624913, + "epoch": 0.8264717198922663, + "flos": 539957005824.0, + "grad_norm": 0.03534933797875362, + "language_loss": 0.89258248, + "learning_rate": 7.691042654177315e-05, + "loss": 0.90394032, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.73291016, + "step": 4296, + "time_per_iteration": 2.651456594467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135891, + "balance_loss_mlp": 1.0626967, + "epoch": 0.8266641015775298, + "flos": 539993935872.0, + "grad_norm": 0.03536676261879614, + "language_loss": 0.81180108, + "learning_rate": 7.674448824012514e-05, + "loss": 0.82316005, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.73193359, + "step": 4297, + "time_per_iteration": 2.691899061203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136173, + "balance_loss_mlp": 1.06278765, + "epoch": 0.8268564832627934, + "flos": 586502561280.0, + "grad_norm": 0.03294900814096248, + "language_loss": 0.88706392, + "learning_rate": 7.657871426083979e-05, + "loss": 0.89842564, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.73388672, + "step": 4298, + "time_per_iteration": 3.3337292671203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150659, + "balance_loss_mlp": 1.07727432, + "epoch": 0.827048864948057, + "flos": 431570288640.0, + "grad_norm": 0.03920761424756738, + "language_loss": 0.88906097, + "learning_rate": 7.641310466827667e-05, + "loss": 0.90056753, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.73388672, + "step": 4299, + "time_per_iteration": 3.4399309158325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150747, + "balance_loss_mlp": 1.07740986, + "epoch": 0.8272412466333205, + "flos": 1390500241920.0, + "grad_norm": 0.03570603995956023, + "language_loss": 0.89542663, + "learning_rate": 7.624765952673069e-05, + "loss": 0.90693414, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.73339844, + "step": 4300, + "time_per_iteration": 3.9774158000946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150112, + "balance_loss_mlp": 1.07667911, + "epoch": 0.827433628318584, + "flos": 539349387264.0, + "grad_norm": 0.034642967404352416, + "language_loss": 0.87599683, + "learning_rate": 7.608237890043335e-05, + "loss": 0.8874979, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.734375, + "step": 4301, + "time_per_iteration": 2.814303398132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114939, + "balance_loss_mlp": 1.0759089, + "epoch": 0.8276260100038476, + "flos": 732063141888.0, + "grad_norm": 0.044295314753443144, + "language_loss": 0.82156098, + "learning_rate": 7.59172628535526e-05, + "loss": 0.8330549, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.73486328, + "step": 4302, + "time_per_iteration": 3.0075466632843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144301, + "balance_loss_mlp": 1.07086802, + "epoch": 0.8278183916891112, + "flos": 872661264384.0, + "grad_norm": 0.03293198528529039, + "language_loss": 0.86338317, + "learning_rate": 7.575231145019196e-05, + "loss": 0.87482619, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.734375, + "step": 4303, + "time_per_iteration": 3.220668077468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144066, + "balance_loss_mlp": 1.0707283, + "epoch": 0.8280107733743748, + "flos": 595698335232.0, + "grad_norm": 0.03223563949514157, + "language_loss": 0.81716228, + "learning_rate": 7.558752475439134e-05, + "loss": 0.82860291, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.73339844, + "step": 4304, + "time_per_iteration": 2.810628652572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142863, + "balance_loss_mlp": 1.06942999, + "epoch": 0.8282031550596384, + "flos": 770027272704.0, + "grad_norm": 0.03508054216090567, + "language_loss": 0.87922353, + "learning_rate": 7.542290283012653e-05, + "loss": 0.89065218, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.734375, + "step": 4305, + "time_per_iteration": 3.1161751747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142615, + "balance_loss_mlp": 1.06922984, + "epoch": 0.8283955367449019, + "flos": 697446732288.0, + "grad_norm": 0.03898160364369505, + "language_loss": 0.82788968, + "learning_rate": 7.525844574130947e-05, + "loss": 0.83931583, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.73388672, + "step": 4306, + "time_per_iteration": 2.9796903133392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142397, + "balance_loss_mlp": 1.06896424, + "epoch": 0.8285879184301654, + "flos": 661937997312.0, + "grad_norm": 0.035115838558733896, + "language_loss": 0.87112027, + "learning_rate": 7.509415355178806e-05, + "loss": 0.88254428, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.734375, + "step": 4307, + "time_per_iteration": 2.9617509841918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138818, + "balance_loss_mlp": 1.06543314, + "epoch": 0.828780300115429, + "flos": 559772636160.0, + "grad_norm": 0.04100434212152103, + "language_loss": 0.82768691, + "learning_rate": 7.493002632534618e-05, + "loss": 0.83907503, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.73388672, + "step": 4308, + "time_per_iteration": 2.727365016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.06439769, + "epoch": 0.8289726818006926, + "flos": 832371993600.0, + "grad_norm": 0.035278553055239026, + "language_loss": 0.86246669, + "learning_rate": 7.476606412570352e-05, + "loss": 0.87384403, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.73339844, + "step": 4309, + "time_per_iteration": 3.108769416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154857, + "balance_loss_mlp": 1.08161438, + "epoch": 0.8291650634859561, + "flos": 733554353664.0, + "grad_norm": 0.0366695194121263, + "language_loss": 0.85579491, + "learning_rate": 7.460226701651624e-05, + "loss": 0.86734343, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.73242188, + "step": 4310, + "time_per_iteration": 2.954108238220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153477, + "balance_loss_mlp": 1.08013999, + "epoch": 0.8293574451712197, + "flos": 862469105664.0, + "grad_norm": 0.03497290190762598, + "language_loss": 0.85557121, + "learning_rate": 7.443863506137566e-05, + "loss": 0.86710596, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.73339844, + "step": 4311, + "time_per_iteration": 3.2707061767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145208, + "balance_loss_mlp": 1.071823, + "epoch": 0.8295498268564833, + "flos": 496290541056.0, + "grad_norm": 0.030603174986020117, + "language_loss": 0.85576063, + "learning_rate": 7.427516832380948e-05, + "loss": 0.86721271, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.73388672, + "step": 4312, + "time_per_iteration": 2.8450915813446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011424, + "balance_loss_mlp": 1.06896734, + "epoch": 0.8297422085417469, + "flos": 555654839808.0, + "grad_norm": 0.0318834502446829, + "language_loss": 0.82207704, + "learning_rate": 7.4111866867281e-05, + "loss": 0.8335011, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.734375, + "step": 4313, + "time_per_iteration": 2.8910624980926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141956, + "balance_loss_mlp": 1.06852293, + "epoch": 0.8299345902270104, + "flos": 1249487883264.0, + "grad_norm": 0.032916410073977276, + "language_loss": 0.8188554, + "learning_rate": 7.39487307551896e-05, + "loss": 0.83027506, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.734375, + "step": 4314, + "time_per_iteration": 3.6977193355560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.06480479, + "epoch": 0.8301269719122739, + "flos": 586409235456.0, + "grad_norm": 0.03544125426025781, + "language_loss": 0.86962932, + "learning_rate": 7.378576005087034e-05, + "loss": 0.88101172, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.734375, + "step": 4315, + "time_per_iteration": 2.764580011367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137941, + "balance_loss_mlp": 1.06446016, + "epoch": 0.8303193535975375, + "flos": 510776414208.0, + "grad_norm": 0.03851406833152273, + "language_loss": 0.89923644, + "learning_rate": 7.362295481759412e-05, + "loss": 0.91061592, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.73486328, + "step": 4316, + "time_per_iteration": 2.6864657402038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139377, + "balance_loss_mlp": 1.06556237, + "epoch": 0.8305117352828011, + "flos": 581765686272.0, + "grad_norm": 0.03996280155822034, + "language_loss": 0.87696218, + "learning_rate": 7.346031511856722e-05, + "loss": 0.88835597, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.73730469, + "step": 4317, + "time_per_iteration": 2.7490365505218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138995, + "balance_loss_mlp": 1.06508517, + "epoch": 0.8307041169680647, + "flos": 482648603136.0, + "grad_norm": 0.03410540332175001, + "language_loss": 0.83901942, + "learning_rate": 7.329784101693232e-05, + "loss": 0.85040939, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.73779297, + "step": 4318, + "time_per_iteration": 2.633737087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140154, + "balance_loss_mlp": 1.06629157, + "epoch": 0.8308964986533282, + "flos": 625753787904.0, + "grad_norm": 0.039585355181565605, + "language_loss": 0.87891459, + "learning_rate": 7.313553257576727e-05, + "loss": 0.89031613, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.73730469, + "step": 4319, + "time_per_iteration": 2.73393177986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137292, + "balance_loss_mlp": 1.06362104, + "epoch": 0.8310888803385917, + "flos": 828705363456.0, + "grad_norm": 0.038987738379061505, + "language_loss": 0.83643472, + "learning_rate": 7.297338985808589e-05, + "loss": 0.84780765, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.73583984, + "step": 4320, + "time_per_iteration": 3.0508508682250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137036, + "balance_loss_mlp": 1.06350768, + "epoch": 0.8312812620238553, + "flos": 584946221568.0, + "grad_norm": 0.030329036309150237, + "language_loss": 0.85852158, + "learning_rate": 7.281141292683746e-05, + "loss": 0.86989194, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.73486328, + "step": 4321, + "time_per_iteration": 2.864978551864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136398, + "balance_loss_mlp": 1.06277454, + "epoch": 0.8314736437091189, + "flos": 1117369127424.0, + "grad_norm": 0.04535130746874187, + "language_loss": 0.79764462, + "learning_rate": 7.26496018449071e-05, + "loss": 0.8090086, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.73535156, + "step": 4322, + "time_per_iteration": 3.5574073791503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113609, + "balance_loss_mlp": 1.06237078, + "epoch": 0.8316660253943825, + "flos": 518558839296.0, + "grad_norm": 0.03678795377404695, + "language_loss": 0.86844653, + "learning_rate": 7.248795667511543e-05, + "loss": 0.87980741, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.73632812, + "step": 4323, + "time_per_iteration": 2.8555359840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136953, + "balance_loss_mlp": 1.06328201, + "epoch": 0.831858407079646, + "flos": 796696072704.0, + "grad_norm": 0.032683299236101075, + "language_loss": 0.82923019, + "learning_rate": 7.232647748021864e-05, + "loss": 0.84059966, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.73632812, + "step": 4324, + "time_per_iteration": 3.0507915019989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135863, + "balance_loss_mlp": 1.06223941, + "epoch": 0.8320507887649096, + "flos": 551041489920.0, + "grad_norm": 0.03984980567953029, + "language_loss": 0.88372821, + "learning_rate": 7.216516432290843e-05, + "loss": 0.89508682, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.73583984, + "step": 4325, + "time_per_iteration": 2.910611867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135315, + "balance_loss_mlp": 1.06178665, + "epoch": 0.8322431704501732, + "flos": 480351661056.0, + "grad_norm": 0.03873731479113487, + "language_loss": 0.86735284, + "learning_rate": 7.20040172658123e-05, + "loss": 0.87870598, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.73535156, + "step": 4326, + "time_per_iteration": 2.637766122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113728, + "balance_loss_mlp": 1.06375158, + "epoch": 0.8324355521354367, + "flos": 573546831360.0, + "grad_norm": 0.031469774572695536, + "language_loss": 0.89963889, + "learning_rate": 7.184303637149308e-05, + "loss": 0.9110117, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.73535156, + "step": 4327, + "time_per_iteration": 2.7417519092559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136846, + "balance_loss_mlp": 1.06341326, + "epoch": 0.8326279338207002, + "flos": 504439538688.0, + "grad_norm": 0.03407361480864025, + "language_loss": 0.8678869, + "learning_rate": 7.168222170244888e-05, + "loss": 0.87925529, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.734375, + "step": 4328, + "time_per_iteration": 2.7490806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113636, + "balance_loss_mlp": 1.06283176, + "epoch": 0.8328203155059638, + "flos": 606950005248.0, + "grad_norm": 0.0316879397336073, + "language_loss": 0.85139227, + "learning_rate": 7.152157332111364e-05, + "loss": 0.86275589, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.73535156, + "step": 4329, + "time_per_iteration": 3.043998956680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136964, + "balance_loss_mlp": 1.06353128, + "epoch": 0.8330126971912274, + "flos": 699122594304.0, + "grad_norm": 0.03501346929276039, + "language_loss": 0.90436953, + "learning_rate": 7.136109128985663e-05, + "loss": 0.91573918, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.734375, + "step": 4330, + "time_per_iteration": 2.9104068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06338286, + "epoch": 0.833205078876491, + "flos": 495020183040.0, + "grad_norm": 0.039903195298822546, + "language_loss": 0.91142917, + "learning_rate": 7.120077567098249e-05, + "loss": 0.92279732, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.734375, + "step": 4331, + "time_per_iteration": 2.539658784866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136663, + "balance_loss_mlp": 1.06327808, + "epoch": 0.8333974605617546, + "flos": 483794709504.0, + "grad_norm": 0.031623545880620704, + "language_loss": 0.86857003, + "learning_rate": 7.104062652673115e-05, + "loss": 0.87993664, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.73388672, + "step": 4332, + "time_per_iteration": 2.592482566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136652, + "balance_loss_mlp": 1.063362, + "epoch": 0.833589842247018, + "flos": 688040111616.0, + "grad_norm": 0.04080208699909347, + "language_loss": 0.87699354, + "learning_rate": 7.088064391927818e-05, + "loss": 0.88836008, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.73291016, + "step": 4333, + "time_per_iteration": 2.8243579864501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136297, + "balance_loss_mlp": 1.06300712, + "epoch": 0.8337822239322816, + "flos": 883191797760.0, + "grad_norm": 0.034267642896518694, + "language_loss": 0.87079096, + "learning_rate": 7.072082791073419e-05, + "loss": 0.88215387, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.73291016, + "step": 4334, + "time_per_iteration": 3.095567226409912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136658, + "balance_loss_mlp": 1.06341565, + "epoch": 0.8339746056175452, + "flos": 498157057536.0, + "grad_norm": 0.036797660488946164, + "language_loss": 0.87406766, + "learning_rate": 7.056117856314531e-05, + "loss": 0.88543415, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.73242188, + "step": 4335, + "time_per_iteration": 2.6543936729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138151, + "balance_loss_mlp": 1.06490886, + "epoch": 0.8341669873028088, + "flos": 511503555072.0, + "grad_norm": 0.033824511697931096, + "language_loss": 0.91365576, + "learning_rate": 7.040169593849289e-05, + "loss": 0.92503732, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.73242188, + "step": 4336, + "time_per_iteration": 2.6173272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141861, + "balance_loss_mlp": 1.06852305, + "epoch": 0.8343593689880723, + "flos": 693541057536.0, + "grad_norm": 0.036766896527395135, + "language_loss": 0.89182138, + "learning_rate": 7.024238009869366e-05, + "loss": 0.90323997, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.73339844, + "step": 4337, + "time_per_iteration": 2.832035779953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113989, + "balance_loss_mlp": 1.06650496, + "epoch": 0.8345517506733359, + "flos": 553516351488.0, + "grad_norm": 0.03709810498280935, + "language_loss": 0.83323646, + "learning_rate": 7.008323110559956e-05, + "loss": 0.84463537, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.73388672, + "step": 4338, + "time_per_iteration": 2.7567930221557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140629, + "balance_loss_mlp": 1.06743467, + "epoch": 0.8347441323585995, + "flos": 593267134464.0, + "grad_norm": 0.04006529314442172, + "language_loss": 0.80799747, + "learning_rate": 6.992424902099754e-05, + "loss": 0.81940377, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.73193359, + "step": 4339, + "time_per_iteration": 2.7979674339294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140723, + "balance_loss_mlp": 1.06752896, + "epoch": 0.834936514043863, + "flos": 616091384832.0, + "grad_norm": 0.03516018404637607, + "language_loss": 0.89085752, + "learning_rate": 6.976543390660983e-05, + "loss": 0.90226471, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.73193359, + "step": 4340, + "time_per_iteration": 3.017014980316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140539, + "balance_loss_mlp": 1.0673449, + "epoch": 0.8351288957291266, + "flos": 468863674368.0, + "grad_norm": 0.040869831177599326, + "language_loss": 0.83971238, + "learning_rate": 6.960678582409424e-05, + "loss": 0.85111785, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.73193359, + "step": 4341, + "time_per_iteration": 3.5495381355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114012, + "balance_loss_mlp": 1.06697321, + "epoch": 0.8353212774143901, + "flos": 510348716544.0, + "grad_norm": 0.04414728367362659, + "language_loss": 0.83281082, + "learning_rate": 6.944830483504328e-05, + "loss": 0.84421206, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.73144531, + "step": 4342, + "time_per_iteration": 2.8123908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140117, + "balance_loss_mlp": 1.06697071, + "epoch": 0.8355136590996537, + "flos": 689017030656.0, + "grad_norm": 0.03677224015719086, + "language_loss": 0.85329032, + "learning_rate": 6.928999100098483e-05, + "loss": 0.8646915, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.73144531, + "step": 4343, + "time_per_iteration": 2.8525094985961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140369, + "balance_loss_mlp": 1.06712639, + "epoch": 0.8357060407849173, + "flos": 985975511040.0, + "grad_norm": 0.03601056440929186, + "language_loss": 0.88194978, + "learning_rate": 6.913184438338138e-05, + "loss": 0.89335346, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.73242188, + "step": 4344, + "time_per_iteration": 3.206106185913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141569, + "balance_loss_mlp": 1.06842268, + "epoch": 0.8358984224701809, + "flos": 844507256832.0, + "grad_norm": 0.03403059716979156, + "language_loss": 0.8941586, + "learning_rate": 6.89738650436313e-05, + "loss": 0.90557432, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.73144531, + "step": 4345, + "time_per_iteration": 3.211400032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141203, + "balance_loss_mlp": 1.06796038, + "epoch": 0.8360908041554445, + "flos": 627418916352.0, + "grad_norm": 0.033473351355860013, + "language_loss": 0.86278164, + "learning_rate": 6.881605304306748e-05, + "loss": 0.87419367, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.73242188, + "step": 4346, + "time_per_iteration": 2.8406436443328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141884, + "balance_loss_mlp": 1.06878495, + "epoch": 0.8362831858407079, + "flos": 577222193664.0, + "grad_norm": 0.034289712493456775, + "language_loss": 0.89250559, + "learning_rate": 6.865840844295796e-05, + "loss": 0.90392447, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.73095703, + "step": 4347, + "time_per_iteration": 2.8221635818481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114195, + "balance_loss_mlp": 1.06885087, + "epoch": 0.8364755675259715, + "flos": 835183228416.0, + "grad_norm": 0.040230317170211145, + "language_loss": 0.8577764, + "learning_rate": 6.850093130450569e-05, + "loss": 0.86919594, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.73095703, + "step": 4348, + "time_per_iteration": 3.087906837463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142182, + "balance_loss_mlp": 1.0691303, + "epoch": 0.8366679492112351, + "flos": 583563072000.0, + "grad_norm": 0.04163204479707521, + "language_loss": 0.91017622, + "learning_rate": 6.834362168884912e-05, + "loss": 0.92159808, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.73046875, + "step": 4349, + "time_per_iteration": 2.6955840587615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141373, + "balance_loss_mlp": 1.06817806, + "epoch": 0.8368603308964987, + "flos": 612880650240.0, + "grad_norm": 0.03976549497353498, + "language_loss": 0.93744481, + "learning_rate": 6.818647965706076e-05, + "loss": 0.94885856, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.73193359, + "step": 4350, + "time_per_iteration": 2.8501739501953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142129, + "balance_loss_mlp": 1.06902957, + "epoch": 0.8370527125817622, + "flos": 508264622592.0, + "grad_norm": 0.03390143622863109, + "language_loss": 0.8937093, + "learning_rate": 6.802950527014884e-05, + "loss": 0.90513057, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.73095703, + "step": 4351, + "time_per_iteration": 2.7211203575134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140268, + "balance_loss_mlp": 1.06707358, + "epoch": 0.8372450942670258, + "flos": 772282555392.0, + "grad_norm": 0.04155998502814681, + "language_loss": 0.86906236, + "learning_rate": 6.787269858905603e-05, + "loss": 0.88046503, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.73193359, + "step": 4352, + "time_per_iteration": 2.9425594806671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140282, + "balance_loss_mlp": 1.06703997, + "epoch": 0.8374374759522893, + "flos": 580361069568.0, + "grad_norm": 0.036304027113603754, + "language_loss": 0.89294255, + "learning_rate": 6.771605967466033e-05, + "loss": 0.90434539, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.73242188, + "step": 4353, + "time_per_iteration": 2.686323881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139987, + "balance_loss_mlp": 1.06669676, + "epoch": 0.8376298576375529, + "flos": 789527996928.0, + "grad_norm": 0.03911073314318024, + "language_loss": 0.87069052, + "learning_rate": 6.755958858777434e-05, + "loss": 0.88209045, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.73291016, + "step": 4354, + "time_per_iteration": 3.059568166732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140086, + "balance_loss_mlp": 1.06679642, + "epoch": 0.8378222393228165, + "flos": 578722137600.0, + "grad_norm": 0.03555136596776637, + "language_loss": 0.85425603, + "learning_rate": 6.74032853891452e-05, + "loss": 0.86565685, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.73291016, + "step": 4355, + "time_per_iteration": 2.7401504516601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138958, + "balance_loss_mlp": 1.06566799, + "epoch": 0.83801462100808, + "flos": 481858335744.0, + "grad_norm": 0.03498215623204101, + "language_loss": 0.86501992, + "learning_rate": 6.724715013945548e-05, + "loss": 0.87640953, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.73291016, + "step": 4356, + "time_per_iteration": 2.637608528137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139048, + "balance_loss_mlp": 1.06580544, + "epoch": 0.8382070026933436, + "flos": 551996941824.0, + "grad_norm": 0.03258486084339394, + "language_loss": 0.93043453, + "learning_rate": 6.709118289932226e-05, + "loss": 0.94182503, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.73242188, + "step": 4357, + "time_per_iteration": 2.803379535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139227, + "balance_loss_mlp": 1.06584203, + "epoch": 0.8383993843786072, + "flos": 626225146368.0, + "grad_norm": 0.04207482015939984, + "language_loss": 0.87703115, + "learning_rate": 6.693538372929725e-05, + "loss": 0.88842344, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.73388672, + "step": 4358, + "time_per_iteration": 2.893259286880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139769, + "balance_loss_mlp": 1.06652725, + "epoch": 0.8385917660638708, + "flos": 492135088128.0, + "grad_norm": 0.038027162181002674, + "language_loss": 0.91387022, + "learning_rate": 6.677975268986719e-05, + "loss": 0.92526793, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.73242188, + "step": 4359, + "time_per_iteration": 2.580935001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140209, + "balance_loss_mlp": 1.06691909, + "epoch": 0.8387841477491342, + "flos": 467869291008.0, + "grad_norm": 0.03829625401791919, + "language_loss": 0.91665077, + "learning_rate": 6.662428984145336e-05, + "loss": 0.92805284, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.73291016, + "step": 4360, + "time_per_iteration": 2.583767890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144524, + "balance_loss_mlp": 1.07299805, + "epoch": 0.8389765294343978, + "flos": 1567597658112.0, + "grad_norm": 0.007274153524221762, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72924709, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.71679688, + "step": 4361, + "time_per_iteration": 5.073408365249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138412, + "balance_loss_mlp": 1.06521726, + "epoch": 0.8391689111196614, + "flos": 603411629568.0, + "grad_norm": 0.030598309130581258, + "language_loss": 0.86443758, + "learning_rate": 6.631386895903308e-05, + "loss": 0.87582171, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.73193359, + "step": 4362, + "time_per_iteration": 2.8680214881896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138687, + "balance_loss_mlp": 1.06544518, + "epoch": 0.839361292804925, + "flos": 443968065024.0, + "grad_norm": 0.03783251777685458, + "language_loss": 0.84810257, + "learning_rate": 6.615891104554261e-05, + "loss": 0.85948944, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.73242188, + "step": 4363, + "time_per_iteration": 2.5391616821289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138227, + "balance_loss_mlp": 1.06493664, + "epoch": 0.8395536744901886, + "flos": 595298835456.0, + "grad_norm": 0.034478723046930226, + "language_loss": 0.87398577, + "learning_rate": 6.600412156410057e-05, + "loss": 0.88536799, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.73291016, + "step": 4364, + "time_per_iteration": 2.712852716445923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138545, + "balance_loss_mlp": 1.06525552, + "epoch": 0.8397460561754521, + "flos": 891334791168.0, + "grad_norm": 0.03388693894725111, + "language_loss": 0.89365327, + "learning_rate": 6.58495005748016e-05, + "loss": 0.90503871, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.73291016, + "step": 4365, + "time_per_iteration": 3.19172739982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138646, + "balance_loss_mlp": 1.06540406, + "epoch": 0.8399384378607156, + "flos": 554560399872.0, + "grad_norm": 0.034766159346027045, + "language_loss": 0.93272662, + "learning_rate": 6.569504813767463e-05, + "loss": 0.94411302, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.73242188, + "step": 4366, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138425, + "balance_loss_mlp": 1.06523097, + "epoch": 0.8401308195459792, + "flos": 519963456000.0, + "grad_norm": 0.031091903503957602, + "language_loss": 0.87725037, + "learning_rate": 6.554076431268341e-05, + "loss": 0.88863462, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.73193359, + "step": 4367, + "time_per_iteration": 2.6440939903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.06488955, + "epoch": 0.8403232012312428, + "flos": 686295118848.0, + "grad_norm": 0.03330958137241384, + "language_loss": 0.84921622, + "learning_rate": 6.538664915972648e-05, + "loss": 0.86059797, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.73291016, + "step": 4368, + "time_per_iteration": 3.006840944290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136367, + "balance_loss_mlp": 1.06307733, + "epoch": 0.8405155829165063, + "flos": 578669744640.0, + "grad_norm": 0.040494146128891996, + "language_loss": 0.82172203, + "learning_rate": 6.523270273863652e-05, + "loss": 0.83308572, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.73291016, + "step": 4369, + "time_per_iteration": 2.726771354675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136099, + "balance_loss_mlp": 1.06290472, + "epoch": 0.8407079646017699, + "flos": 457566342144.0, + "grad_norm": 0.03926161531299747, + "language_loss": 0.92181575, + "learning_rate": 6.507892510918079e-05, + "loss": 0.93317676, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.73193359, + "step": 4370, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136174, + "balance_loss_mlp": 1.06288445, + "epoch": 0.8409003462870335, + "flos": 535999664640.0, + "grad_norm": 0.03344035414756239, + "language_loss": 0.86222243, + "learning_rate": 6.492531633106114e-05, + "loss": 0.87358415, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.73291016, + "step": 4371, + "time_per_iteration": 2.7723512649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136298, + "balance_loss_mlp": 1.0631038, + "epoch": 0.8410927279722971, + "flos": 557899388928.0, + "grad_norm": 0.03943054767144193, + "language_loss": 0.82708782, + "learning_rate": 6.477187646391374e-05, + "loss": 0.83845079, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.73193359, + "step": 4372, + "time_per_iteration": 2.725720167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141853, + "balance_loss_mlp": 1.07013702, + "epoch": 0.8412851096575606, + "flos": 1552926408192.0, + "grad_norm": 0.004959659749384099, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78820974, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.71875, + "step": 4373, + "time_per_iteration": 4.933819770812988 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136162, + "balance_loss_mlp": 1.06296706, + "epoch": 0.8414774913428241, + "flos": 553108119552.0, + "grad_norm": 0.03645525381144212, + "language_loss": 0.84143221, + "learning_rate": 6.446550370075271e-05, + "loss": 0.85279381, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.73193359, + "step": 4374, + "time_per_iteration": 2.7640419006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140726, + "balance_loss_mlp": 1.06743658, + "epoch": 0.8416698730280877, + "flos": 574069856256.0, + "grad_norm": 0.035030184778751555, + "language_loss": 0.82005304, + "learning_rate": 6.431257092368336e-05, + "loss": 0.83146024, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.73291016, + "step": 4375, + "time_per_iteration": 2.8986310958862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114067, + "balance_loss_mlp": 1.06737995, + "epoch": 0.8418622547133513, + "flos": 760043232768.0, + "grad_norm": 0.04161434529267318, + "language_loss": 0.84811461, + "learning_rate": 6.415980729547543e-05, + "loss": 0.85952127, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.73291016, + "step": 4376, + "time_per_iteration": 2.9330646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140768, + "balance_loss_mlp": 1.06743073, + "epoch": 0.8420546363986149, + "flos": 1075921015296.0, + "grad_norm": 0.04130069201888351, + "language_loss": 0.78135824, + "learning_rate": 6.40072128754366e-05, + "loss": 0.79276592, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.73339844, + "step": 4377, + "time_per_iteration": 3.4237923622131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140552, + "balance_loss_mlp": 1.06735754, + "epoch": 0.8422470180838784, + "flos": 527016738816.0, + "grad_norm": 0.03545536535288648, + "language_loss": 0.87165993, + "learning_rate": 6.385478772280933e-05, + "loss": 0.88306552, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.73193359, + "step": 4378, + "time_per_iteration": 2.753131628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141175, + "balance_loss_mlp": 1.06793308, + "epoch": 0.842439399769142, + "flos": 601963352064.0, + "grad_norm": 0.03434358981966458, + "language_loss": 0.86777276, + "learning_rate": 6.370253189677038e-05, + "loss": 0.87918454, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.73242188, + "step": 4379, + "time_per_iteration": 2.779681921005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114171, + "balance_loss_mlp": 1.06846821, + "epoch": 0.8426317814544055, + "flos": 553375362048.0, + "grad_norm": 0.03541517543705223, + "language_loss": 0.90755582, + "learning_rate": 6.355044545643073e-05, + "loss": 0.91897291, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.73242188, + "step": 4380, + "time_per_iteration": 2.812915802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142077, + "balance_loss_mlp": 1.06878674, + "epoch": 0.8428241631396691, + "flos": 680044838400.0, + "grad_norm": 0.03810176337310906, + "language_loss": 0.82064164, + "learning_rate": 6.33985284608356e-05, + "loss": 0.83206236, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.73291016, + "step": 4381, + "time_per_iteration": 3.037733554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138351, + "balance_loss_mlp": 1.0651089, + "epoch": 0.8430165448249327, + "flos": 755198295552.0, + "grad_norm": 0.028303447358351223, + "language_loss": 0.8332209, + "learning_rate": 6.324678096896435e-05, + "loss": 0.84460437, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.73242188, + "step": 4382, + "time_per_iteration": 3.35500431060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136887, + "balance_loss_mlp": 1.06354892, + "epoch": 0.8432089265101962, + "flos": 700435886592.0, + "grad_norm": 0.03473950502542374, + "language_loss": 0.85785019, + "learning_rate": 6.30952030397306e-05, + "loss": 0.86921906, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.73339844, + "step": 4383, + "time_per_iteration": 2.925360918045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135912, + "balance_loss_mlp": 1.06262255, + "epoch": 0.8434013081954598, + "flos": 486790594560.0, + "grad_norm": 0.03830758033053903, + "language_loss": 0.88952708, + "learning_rate": 6.294379473198208e-05, + "loss": 0.90088624, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.73291016, + "step": 4384, + "time_per_iteration": 2.6873929500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135663, + "balance_loss_mlp": 1.06251621, + "epoch": 0.8435936898807234, + "flos": 521630585856.0, + "grad_norm": 0.03664735464592092, + "language_loss": 0.89606541, + "learning_rate": 6.279255610450068e-05, + "loss": 0.90742207, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.73144531, + "step": 4385, + "time_per_iteration": 2.619441509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136436, + "balance_loss_mlp": 1.06328917, + "epoch": 0.843786071565987, + "flos": 787313647104.0, + "grad_norm": 0.03681711065218231, + "language_loss": 0.85414076, + "learning_rate": 6.264148721600254e-05, + "loss": 0.8655051, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.73144531, + "step": 4386, + "time_per_iteration": 3.0707485675811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140541, + "balance_loss_mlp": 1.06882477, + "epoch": 0.8439784532512504, + "flos": 1449513609216.0, + "grad_norm": 0.00413751236378941, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76977056, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.71875, + "step": 4387, + "time_per_iteration": 5.089155197143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113603, + "balance_loss_mlp": 1.06278777, + "epoch": 0.844170834936514, + "flos": 709968033792.0, + "grad_norm": 0.06407093609242513, + "language_loss": 0.88289285, + "learning_rate": 6.23398588904906e-05, + "loss": 0.89425313, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.73242188, + "step": 4388, + "time_per_iteration": 3.0436534881591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.06205094, + "epoch": 0.8443632166217776, + "flos": 484409058816.0, + "grad_norm": 0.03790339659307899, + "language_loss": 0.8391732, + "learning_rate": 6.218929957057922e-05, + "loss": 0.85052609, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.73242188, + "step": 4389, + "time_per_iteration": 2.7934298515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137913, + "balance_loss_mlp": 1.0647186, + "epoch": 0.8445555983070412, + "flos": 679923314688.0, + "grad_norm": 0.03718559505154548, + "language_loss": 0.8493886, + "learning_rate": 6.2038910223856e-05, + "loss": 0.86076784, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.73193359, + "step": 4390, + "time_per_iteration": 2.9792392253875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137881, + "balance_loss_mlp": 1.06468666, + "epoch": 0.8447479799923048, + "flos": 742858916352.0, + "grad_norm": 0.03376774595397736, + "language_loss": 0.78831851, + "learning_rate": 6.18886909087073e-05, + "loss": 0.79969728, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.73193359, + "step": 4391, + "time_per_iteration": 3.1305229663848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_mlp": 1.06367922, + "epoch": 0.8449403616775683, + "flos": 954949870080.0, + "grad_norm": 0.036571969449469936, + "language_loss": 0.84915316, + "learning_rate": 6.173864168345344e-05, + "loss": 0.86052191, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.73193359, + "step": 4392, + "time_per_iteration": 3.35559344291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137009, + "balance_loss_mlp": 1.06371963, + "epoch": 0.8451327433628318, + "flos": 658607740416.0, + "grad_norm": 0.04080767890774202, + "language_loss": 0.78550094, + "learning_rate": 6.158876260634871e-05, + "loss": 0.79687101, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.73291016, + "step": 4393, + "time_per_iteration": 2.8861243724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136802, + "balance_loss_mlp": 1.06360781, + "epoch": 0.8453251250480954, + "flos": 447048543744.0, + "grad_norm": 0.03643076078950129, + "language_loss": 0.87869531, + "learning_rate": 6.143905373558112e-05, + "loss": 0.89006329, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.73193359, + "step": 4394, + "time_per_iteration": 2.601045846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136522, + "balance_loss_mlp": 1.06332743, + "epoch": 0.845517506733359, + "flos": 543873414144.0, + "grad_norm": 0.04754169737380615, + "language_loss": 0.75916922, + "learning_rate": 6.128951512927305e-05, + "loss": 0.77053452, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.73193359, + "step": 4395, + "time_per_iteration": 2.6586995124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136024, + "balance_loss_mlp": 1.06282973, + "epoch": 0.8457098884186226, + "flos": 503506280448.0, + "grad_norm": 0.034957513190318694, + "language_loss": 0.88970757, + "learning_rate": 6.114014684548046e-05, + "loss": 0.90106773, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.73193359, + "step": 4396, + "time_per_iteration": 2.641904592514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136391, + "balance_loss_mlp": 1.06319618, + "epoch": 0.8459022701038861, + "flos": 449894707200.0, + "grad_norm": 0.03727348899635202, + "language_loss": 0.85077035, + "learning_rate": 6.099094894219326e-05, + "loss": 0.86213428, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.73193359, + "step": 4397, + "time_per_iteration": 2.7485921382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138262, + "balance_loss_mlp": 1.06516242, + "epoch": 0.8460946517891497, + "flos": 744471651840.0, + "grad_norm": 0.03568111304963743, + "language_loss": 0.79751641, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.80889904, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.73095703, + "step": 4398, + "time_per_iteration": 3.0065886974334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137749, + "balance_loss_mlp": 1.06469774, + "epoch": 0.8462870334744133, + "flos": 554326084608.0, + "grad_norm": 0.034126813456360164, + "language_loss": 0.84568942, + "learning_rate": 6.069306450876389e-05, + "loss": 0.85706693, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.73095703, + "step": 4399, + "time_per_iteration": 2.758197069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142326, + "balance_loss_mlp": 1.07080078, + "epoch": 0.8464794151596768, + "flos": 1568268403200.0, + "grad_norm": 0.004082399579893022, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82850897, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.71679688, + "step": 4400, + "time_per_iteration": 5.1885364055633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113736, + "balance_loss_mlp": 1.06416523, + "epoch": 0.8466717968449403, + "flos": 551265071616.0, + "grad_norm": 0.03422118197100462, + "language_loss": 0.84376073, + "learning_rate": 6.039586229158084e-05, + "loss": 0.85513437, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.73193359, + "step": 4401, + "time_per_iteration": 2.866410255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137584, + "balance_loss_mlp": 1.06438947, + "epoch": 0.8468641785302039, + "flos": 553095384576.0, + "grad_norm": 0.04013122246303511, + "language_loss": 0.89010692, + "learning_rate": 6.024751715835314e-05, + "loss": 0.90148282, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.73193359, + "step": 4402, + "time_per_iteration": 2.8533406257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137073, + "balance_loss_mlp": 1.06402123, + "epoch": 0.8470565602154675, + "flos": 573824807424.0, + "grad_norm": 0.04032328985760824, + "language_loss": 0.91560149, + "learning_rate": 6.009934275218049e-05, + "loss": 0.92697221, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.73095703, + "step": 4403, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137261, + "balance_loss_mlp": 1.06406605, + "epoch": 0.8472489419007311, + "flos": 473780470272.0, + "grad_norm": 0.040727002498919716, + "language_loss": 0.89137018, + "learning_rate": 5.995133913058936e-05, + "loss": 0.90274274, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.73193359, + "step": 4404, + "time_per_iteration": 2.5842621326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137106, + "balance_loss_mlp": 1.06405413, + "epoch": 0.8474413235859947, + "flos": 799377051648.0, + "grad_norm": 0.036020961775101966, + "language_loss": 0.84674489, + "learning_rate": 5.980350635103954e-05, + "loss": 0.85811591, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.73095703, + "step": 4405, + "time_per_iteration": 3.0260725021362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138055, + "balance_loss_mlp": 1.06495583, + "epoch": 0.8476337052712581, + "flos": 503378025984.0, + "grad_norm": 0.03673815005033266, + "language_loss": 0.85231286, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.86369342, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.73144531, + "step": 4406, + "time_per_iteration": 2.5721280574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011379, + "balance_loss_mlp": 1.06475341, + "epoch": 0.8478260869565217, + "flos": 933516774912.0, + "grad_norm": 0.029177319887610593, + "language_loss": 0.87274981, + "learning_rate": 5.9508353547573e-05, + "loss": 0.88412881, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.73193359, + "step": 4407, + "time_per_iteration": 3.267518997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138465, + "balance_loss_mlp": 1.0652225, + "epoch": 0.8480184686417853, + "flos": 710052627456.0, + "grad_norm": 0.039132750442480525, + "language_loss": 0.85530651, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.86669123, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.73242188, + "step": 4408, + "time_per_iteration": 2.9040720462799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138377, + "balance_loss_mlp": 1.06513441, + "epoch": 0.8482108503270489, + "flos": 615598559232.0, + "grad_norm": 0.03128645050494452, + "language_loss": 0.8671034, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.87848717, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.73242188, + "step": 4409, + "time_per_iteration": 2.814863443374634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138014, + "balance_loss_mlp": 1.06486738, + "epoch": 0.8484032320123124, + "flos": 532072522752.0, + "grad_norm": 0.0338980139670295, + "language_loss": 0.86382216, + "learning_rate": 5.906690709037194e-05, + "loss": 0.8752023, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.73193359, + "step": 4410, + "time_per_iteration": 2.678199291229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142998, + "balance_loss_mlp": 1.07147217, + "epoch": 0.848595613697576, + "flos": 1546171293696.0, + "grad_norm": 0.005786644875246692, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.7743991, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.71679688, + "step": 4411, + "time_per_iteration": 4.905268669128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113754, + "balance_loss_mlp": 1.06439316, + "epoch": 0.8487879953828396, + "flos": 678618754560.0, + "grad_norm": 0.03786348460058995, + "language_loss": 0.78656065, + "learning_rate": 5.877346528406635e-05, + "loss": 0.79793596, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.73144531, + "step": 4412, + "time_per_iteration": 2.9538323879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113752, + "balance_loss_mlp": 1.06432509, + "epoch": 0.8489803770681031, + "flos": 504671852544.0, + "grad_norm": 0.03662625673681008, + "language_loss": 0.84200561, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.8533808, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.73193359, + "step": 4413, + "time_per_iteration": 2.631989002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137281, + "balance_loss_mlp": 1.06408703, + "epoch": 0.8491727587533667, + "flos": 564349056000.0, + "grad_norm": 0.0365734841662918, + "language_loss": 0.81773579, + "learning_rate": 5.84807086750247e-05, + "loss": 0.82910866, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.73193359, + "step": 4414, + "time_per_iteration": 2.7764105796813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137553, + "balance_loss_mlp": 1.06435871, + "epoch": 0.8493651404386302, + "flos": 460748878848.0, + "grad_norm": 0.050320136156211864, + "language_loss": 0.83642417, + "learning_rate": 5.833458746159243e-05, + "loss": 0.84779972, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.73193359, + "step": 4415, + "time_per_iteration": 2.55906343460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136357, + "balance_loss_mlp": 1.06321061, + "epoch": 0.8495575221238938, + "flos": 462144763392.0, + "grad_norm": 0.042827503999962074, + "language_loss": 0.86903214, + "learning_rate": 5.818863771788013e-05, + "loss": 0.88039577, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.73193359, + "step": 4416, + "time_per_iteration": 2.7008659839630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141524, + "balance_loss_mlp": 1.06790054, + "epoch": 0.8497499038091574, + "flos": 872152975872.0, + "grad_norm": 0.03663907725736085, + "language_loss": 0.85962868, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.87104392, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.734375, + "step": 4417, + "time_per_iteration": 3.1430251598358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113909, + "balance_loss_mlp": 1.06594312, + "epoch": 0.849942285494421, + "flos": 780974770176.0, + "grad_norm": 0.037432401008812614, + "language_loss": 0.82071102, + "learning_rate": 5.789725286620018e-05, + "loss": 0.83210188, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.73193359, + "step": 4418, + "time_per_iteration": 3.003854990005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138933, + "balance_loss_mlp": 1.0654043, + "epoch": 0.8501346671796844, + "flos": 514907672064.0, + "grad_norm": 0.035344238090593685, + "language_loss": 0.8925063, + "learning_rate": 5.775181787135819e-05, + "loss": 0.90389562, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.73388672, + "step": 4419, + "time_per_iteration": 2.6802642345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140063, + "balance_loss_mlp": 1.06663048, + "epoch": 0.850327048864948, + "flos": 622634377728.0, + "grad_norm": 0.045521781734965405, + "language_loss": 0.87826395, + "learning_rate": 5.76065545724877e-05, + "loss": 0.88966453, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.73339844, + "step": 4420, + "time_per_iteration": 2.812560558319092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.06595683, + "epoch": 0.8505194305502116, + "flos": 775549685760.0, + "grad_norm": 0.03647510347249887, + "language_loss": 0.84107387, + "learning_rate": 5.746146302598454e-05, + "loss": 0.85246778, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.73339844, + "step": 4421, + "time_per_iteration": 3.0192792415618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140317, + "balance_loss_mlp": 1.06697929, + "epoch": 0.8507118122354752, + "flos": 466212894720.0, + "grad_norm": 0.037024341612432836, + "language_loss": 0.90897202, + "learning_rate": 5.731654328817859e-05, + "loss": 0.92037523, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.73291016, + "step": 4422, + "time_per_iteration": 2.584484100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139141, + "balance_loss_mlp": 1.06580317, + "epoch": 0.8509041939207388, + "flos": 535469908992.0, + "grad_norm": 0.035199882567299716, + "language_loss": 0.8991701, + "learning_rate": 5.717179541533257e-05, + "loss": 0.9105615, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.73291016, + "step": 4423, + "time_per_iteration": 2.732942819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139224, + "balance_loss_mlp": 1.06588686, + "epoch": 0.8510965756060023, + "flos": 584828700672.0, + "grad_norm": 0.037111715680716625, + "language_loss": 0.89189512, + "learning_rate": 5.702721946364264e-05, + "loss": 0.90328735, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.73291016, + "step": 4424, + "time_per_iteration": 2.698284864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139191, + "balance_loss_mlp": 1.0658536, + "epoch": 0.8512889572912659, + "flos": 602017746432.0, + "grad_norm": 0.06811401099002824, + "language_loss": 0.81721288, + "learning_rate": 5.688281548923796e-05, + "loss": 0.82860482, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.73291016, + "step": 4425, + "time_per_iteration": 2.8075883388519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137527, + "balance_loss_mlp": 1.06409407, + "epoch": 0.8514813389765294, + "flos": 656065749504.0, + "grad_norm": 0.035446247672874326, + "language_loss": 0.82858717, + "learning_rate": 5.673858354818151e-05, + "loss": 0.83996248, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.73388672, + "step": 4426, + "time_per_iteration": 2.880490303039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136979, + "balance_loss_mlp": 1.06359351, + "epoch": 0.851673720661793, + "flos": 430658497536.0, + "grad_norm": 0.03977079168614994, + "language_loss": 0.84184194, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.8532117, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.73388672, + "step": 4427, + "time_per_iteration": 2.5517382621765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.06356657, + "epoch": 0.8518661023470565, + "flos": 642758183424.0, + "grad_norm": 0.03736572659166184, + "language_loss": 0.84144545, + "learning_rate": 5.645063599002875e-05, + "loss": 0.85281491, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.73388672, + "step": 4428, + "time_per_iteration": 2.7877635955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136697, + "balance_loss_mlp": 1.06307364, + "epoch": 0.8520584840323201, + "flos": 563198220288.0, + "grad_norm": 0.038754285899443935, + "language_loss": 0.83934295, + "learning_rate": 5.630692048472363e-05, + "loss": 0.85070992, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.73535156, + "step": 4429, + "time_per_iteration": 2.690920352935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137169, + "balance_loss_mlp": 1.06344974, + "epoch": 0.8522508657175837, + "flos": 528080252928.0, + "grad_norm": 0.04107244986742461, + "language_loss": 0.83775079, + "learning_rate": 5.61633772363489e-05, + "loss": 0.84912252, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.73583984, + "step": 4430, + "time_per_iteration": 2.6325595378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136253, + "balance_loss_mlp": 1.06272459, + "epoch": 0.8524432474028473, + "flos": 500102163456.0, + "grad_norm": 0.03352438353947398, + "language_loss": 0.84562439, + "learning_rate": 5.602000630063298e-05, + "loss": 0.85698688, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.73486328, + "step": 4431, + "time_per_iteration": 2.6214230060577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135919, + "balance_loss_mlp": 1.06239092, + "epoch": 0.8526356290881109, + "flos": 422216060928.0, + "grad_norm": 0.048049255454419064, + "language_loss": 0.86048019, + "learning_rate": 5.587680773323706e-05, + "loss": 0.8718394, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.73486328, + "step": 4432, + "time_per_iteration": 2.535344362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.06349599, + "epoch": 0.8528280107733743, + "flos": 508329750528.0, + "grad_norm": 0.034970015630649706, + "language_loss": 0.8575263, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.86889607, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.73388672, + "step": 4433, + "time_per_iteration": 2.598065137863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136944, + "balance_loss_mlp": 1.06360638, + "epoch": 0.8530203924586379, + "flos": 446816229888.0, + "grad_norm": 0.03606846672239564, + "language_loss": 0.87374574, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.88511515, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.73339844, + "step": 4434, + "time_per_iteration": 2.5089426040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136784, + "balance_loss_mlp": 1.06330335, + "epoch": 0.8532127741439015, + "flos": 658989775872.0, + "grad_norm": 0.0360649650839633, + "language_loss": 0.88019717, + "learning_rate": 5.54482467965825e-05, + "loss": 0.89156508, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.73388672, + "step": 4435, + "time_per_iteration": 2.8504323959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137737, + "balance_loss_mlp": 1.06420863, + "epoch": 0.8534051558291651, + "flos": 537098107392.0, + "grad_norm": 0.03019065878399416, + "language_loss": 0.87391806, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.88529551, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.734375, + "step": 4436, + "time_per_iteration": 2.728482246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137806, + "balance_loss_mlp": 1.06399131, + "epoch": 0.8535975375144286, + "flos": 534037094400.0, + "grad_norm": 0.04283357460488269, + "language_loss": 0.84772766, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.85910571, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.73632812, + "step": 4437, + "time_per_iteration": 2.6375861167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137872, + "balance_loss_mlp": 1.06405759, + "epoch": 0.8537899191996922, + "flos": 575268355584.0, + "grad_norm": 0.04299966443974174, + "language_loss": 0.8751781, + "learning_rate": 5.502123917219848e-05, + "loss": 0.88655686, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.73632812, + "step": 4438, + "time_per_iteration": 2.698176145553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137907, + "balance_loss_mlp": 1.0640924, + "epoch": 0.8539823008849557, + "flos": 466006777344.0, + "grad_norm": 0.03463807162353114, + "language_loss": 0.87774605, + "learning_rate": 5.48792487359433e-05, + "loss": 0.88912511, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.73632812, + "step": 4439, + "time_per_iteration": 2.6831352710723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137679, + "balance_loss_mlp": 1.06410253, + "epoch": 0.8541746825702193, + "flos": 555806562816.0, + "grad_norm": 0.03867022608803846, + "language_loss": 0.86941582, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.88079262, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.73486328, + "step": 4440, + "time_per_iteration": 2.6928815841674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136981, + "balance_loss_mlp": 1.06331003, + "epoch": 0.8543670642554829, + "flos": 547557508608.0, + "grad_norm": 0.03384967972445922, + "language_loss": 0.81909108, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.83046091, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.73535156, + "step": 4441, + "time_per_iteration": 2.747842311859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137327, + "balance_loss_mlp": 1.06360793, + "epoch": 0.8545594459407464, + "flos": 513075357696.0, + "grad_norm": 0.032234703238349205, + "language_loss": 0.86772889, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.87910211, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.73583984, + "step": 4442, + "time_per_iteration": 2.6481122970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136922, + "balance_loss_mlp": 1.0632025, + "epoch": 0.85475182762601, + "flos": 422085805056.0, + "grad_norm": 0.03761893009858474, + "language_loss": 0.86693609, + "learning_rate": 5.431301565318786e-05, + "loss": 0.87830532, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.73583984, + "step": 4443, + "time_per_iteration": 2.4967923164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136971, + "balance_loss_mlp": 1.06339502, + "epoch": 0.8549442093112736, + "flos": 390291363840.0, + "grad_norm": 0.04115905585379076, + "language_loss": 0.82256216, + "learning_rate": 5.41718898228542e-05, + "loss": 0.83393186, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.73486328, + "step": 4444, + "time_per_iteration": 2.5440807342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137026, + "balance_loss_mlp": 1.0632118, + "epoch": 0.8551365909965372, + "flos": 607154121216.0, + "grad_norm": 0.035375940453208764, + "language_loss": 0.84474754, + "learning_rate": 5.403093707834334e-05, + "loss": 0.85611778, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.73632812, + "step": 4445, + "time_per_iteration": 2.843111515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136392, + "balance_loss_mlp": 1.0628165, + "epoch": 0.8553289726818007, + "flos": 505155945984.0, + "grad_norm": 0.03917502988089021, + "language_loss": 0.83616102, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.84752494, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.73486328, + "step": 4446, + "time_per_iteration": 2.59338641166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135534, + "balance_loss_mlp": 1.06162477, + "epoch": 0.8555213543670642, + "flos": 558105506304.0, + "grad_norm": 0.03523140729629835, + "language_loss": 0.80791306, + "learning_rate": 5.374955106561324e-05, + "loss": 0.81926841, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.73779297, + "step": 4447, + "time_per_iteration": 2.766433000564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135051, + "balance_loss_mlp": 1.06114113, + "epoch": 0.8557137360523278, + "flos": 549152779776.0, + "grad_norm": 0.042335426638136726, + "language_loss": 0.80681795, + "learning_rate": 5.360911790663775e-05, + "loss": 0.81816846, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.73779297, + "step": 4448, + "time_per_iteration": 2.69462251663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135659, + "balance_loss_mlp": 1.0617491, + "epoch": 0.8559061177375914, + "flos": 729503686656.0, + "grad_norm": 0.03336299345483442, + "language_loss": 0.82454473, + "learning_rate": 5.346885805197238e-05, + "loss": 0.83590126, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.73779297, + "step": 4449, + "time_per_iteration": 3.009009838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136011, + "balance_loss_mlp": 1.06238735, + "epoch": 0.856098499422855, + "flos": 536976583680.0, + "grad_norm": 0.039322841970345704, + "language_loss": 0.88345414, + "learning_rate": 5.332877155607085e-05, + "loss": 0.89481425, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.73583984, + "step": 4450, + "time_per_iteration": 2.6745853424072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135355, + "balance_loss_mlp": 1.06163609, + "epoch": 0.8562908811081185, + "flos": 574775529984.0, + "grad_norm": 0.03966072419835989, + "language_loss": 0.88200045, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.893354, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.73681641, + "step": 4451, + "time_per_iteration": 2.7596144676208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136075, + "balance_loss_mlp": 1.06249857, + "epoch": 0.856483262793382, + "flos": 783215316480.0, + "grad_norm": 0.03609964177893848, + "language_loss": 0.85612303, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.86748379, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.73535156, + "step": 4452, + "time_per_iteration": 3.099785089492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.06311262, + "epoch": 0.8566756444786456, + "flos": 456756609024.0, + "grad_norm": 0.03265431385486569, + "language_loss": 0.89154232, + "learning_rate": 5.290955276447651e-05, + "loss": 0.90290874, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.73486328, + "step": 4453, + "time_per_iteration": 2.553025007247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135846, + "balance_loss_mlp": 1.06236541, + "epoch": 0.8568680261639092, + "flos": 450315674112.0, + "grad_norm": 0.036031278358889064, + "language_loss": 0.88903332, + "learning_rate": 5.277016024682091e-05, + "loss": 0.9003917, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.73486328, + "step": 4454, + "time_per_iteration": 2.578143835067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142006, + "balance_loss_mlp": 1.0684303, + "epoch": 0.8570604078491728, + "flos": 480937812480.0, + "grad_norm": 0.0381879382744482, + "language_loss": 0.87082827, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.88224834, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.73583984, + "step": 4455, + "time_per_iteration": 2.5473132133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141253, + "balance_loss_mlp": 1.06762922, + "epoch": 0.8572527895344363, + "flos": 506933865984.0, + "grad_norm": 0.04281102576641978, + "language_loss": 0.8965286, + "learning_rate": 5.249189615562627e-05, + "loss": 0.9079411, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.73632812, + "step": 4456, + "time_per_iteration": 2.581775665283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_mlp": 1.06748414, + "epoch": 0.8574451712196999, + "flos": 788475216384.0, + "grad_norm": 0.03185344103864885, + "language_loss": 0.87001526, + "learning_rate": 5.235302469011905e-05, + "loss": 0.88142449, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.734375, + "step": 4457, + "time_per_iteration": 3.0588223934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_mlp": 1.06711328, + "epoch": 0.8576375529049635, + "flos": 510346715136.0, + "grad_norm": 0.037812671186274974, + "language_loss": 0.79738897, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.80879498, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.73486328, + "step": 4458, + "time_per_iteration": 2.681156635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146698, + "balance_loss_mlp": 1.07498169, + "epoch": 0.857829934590227, + "flos": 1463888692224.0, + "grad_norm": 0.008556411684699908, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85913986, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.71875, + "step": 4459, + "time_per_iteration": 4.9717326164245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143149, + "balance_loss_mlp": 1.06976426, + "epoch": 0.8580223162754905, + "flos": 480258335232.0, + "grad_norm": 0.03181762715741318, + "language_loss": 0.93217885, + "learning_rate": 5.193745326073118e-05, + "loss": 0.94361031, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.73388672, + "step": 4460, + "time_per_iteration": 2.633009672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142727, + "balance_loss_mlp": 1.06934178, + "epoch": 0.8582146979607541, + "flos": 707456242176.0, + "grad_norm": 0.040093751457138914, + "language_loss": 0.83515179, + "learning_rate": 5.179927728591227e-05, + "loss": 0.84657907, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.73388672, + "step": 4461, + "time_per_iteration": 2.835998773574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142655, + "balance_loss_mlp": 1.06922185, + "epoch": 0.8584070796460177, + "flos": 766492899840.0, + "grad_norm": 0.04020414939935447, + "language_loss": 0.87611806, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.88754463, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.734375, + "step": 4462, + "time_per_iteration": 2.9936819076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142285, + "balance_loss_mlp": 1.06885219, + "epoch": 0.8585994613312813, + "flos": 588009235968.0, + "grad_norm": 0.034025859465722855, + "language_loss": 0.905936, + "learning_rate": 5.152344741070919e-05, + "loss": 0.91735888, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.734375, + "step": 4463, + "time_per_iteration": 2.7997395992279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142411, + "balance_loss_mlp": 1.06912124, + "epoch": 0.8587918430165449, + "flos": 609509460480.0, + "grad_norm": 0.03526777948899912, + "language_loss": 0.83016932, + "learning_rate": 5.138579361741169e-05, + "loss": 0.8415935, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.73291016, + "step": 4464, + "time_per_iteration": 2.799365520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141716, + "balance_loss_mlp": 1.06833065, + "epoch": 0.8589842247018084, + "flos": 590069134848.0, + "grad_norm": 0.038611970938618144, + "language_loss": 0.86071271, + "learning_rate": 5.124831399159535e-05, + "loss": 0.87212992, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.73388672, + "step": 4465, + "time_per_iteration": 2.7324819564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139539, + "balance_loss_mlp": 1.06610572, + "epoch": 0.8591766063870719, + "flos": 544963124736.0, + "grad_norm": 0.04312248482760193, + "language_loss": 0.83882284, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.85021818, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.734375, + "step": 4466, + "time_per_iteration": 2.703601360321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137187, + "balance_loss_mlp": 1.06365895, + "epoch": 0.8593689880723355, + "flos": 494785867776.0, + "grad_norm": 0.03941312585989275, + "language_loss": 0.86610931, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.87748122, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.73486328, + "step": 4467, + "time_per_iteration": 2.681820869445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136714, + "balance_loss_mlp": 1.06309068, + "epoch": 0.8595613697575991, + "flos": 534940153344.0, + "grad_norm": 0.03756039109661357, + "language_loss": 0.88946462, + "learning_rate": 5.083692065243822e-05, + "loss": 0.9008317, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.73535156, + "step": 4468, + "time_per_iteration": 2.6121115684509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136802, + "balance_loss_mlp": 1.06327391, + "epoch": 0.8597537514428626, + "flos": 618754899456.0, + "grad_norm": 0.03953585832407336, + "language_loss": 0.80730748, + "learning_rate": 5.070013822961328e-05, + "loss": 0.81867552, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.73486328, + "step": 4469, + "time_per_iteration": 2.729743719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136854, + "balance_loss_mlp": 1.06332588, + "epoch": 0.8599461331281262, + "flos": 609856567296.0, + "grad_norm": 0.039611412927669135, + "language_loss": 0.88193107, + "learning_rate": 5.056353024046462e-05, + "loss": 0.89329958, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.73486328, + "step": 4470, + "time_per_iteration": 2.747981071472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139553, + "balance_loss_mlp": 1.06573892, + "epoch": 0.8601385148133898, + "flos": 552344048640.0, + "grad_norm": 0.036428161077625955, + "language_loss": 0.87615812, + "learning_rate": 5.042709673802786e-05, + "loss": 0.88755369, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.73632812, + "step": 4471, + "time_per_iteration": 2.732907772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_mlp": 1.071509, + "epoch": 0.8603308964986534, + "flos": 582378034176.0, + "grad_norm": 0.031295899789225104, + "language_loss": 0.85058415, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.86203361, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.734375, + "step": 4472, + "time_per_iteration": 2.87262225151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144552, + "balance_loss_mlp": 1.07111919, + "epoch": 0.8605232781839169, + "flos": 630147558912.0, + "grad_norm": 0.04037586195823243, + "language_loss": 0.79786807, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.8093136, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.734375, + "step": 4473, + "time_per_iteration": 2.787599802017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144625, + "balance_loss_mlp": 1.07119215, + "epoch": 0.8607156598691804, + "flos": 469089257472.0, + "grad_norm": 0.040989177055780444, + "language_loss": 0.82621419, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.83766043, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.734375, + "step": 4474, + "time_per_iteration": 2.5246458053588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143896, + "balance_loss_mlp": 1.0704627, + "epoch": 0.860908041554444, + "flos": 489406445568.0, + "grad_norm": 0.03447505359677043, + "language_loss": 0.87655497, + "learning_rate": 4.988310865374945e-05, + "loss": 0.88799393, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.734375, + "step": 4475, + "time_per_iteration": 2.644601583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143481, + "balance_loss_mlp": 1.06995285, + "epoch": 0.8611004232397076, + "flos": 593169079296.0, + "grad_norm": 0.04484226543219231, + "language_loss": 0.85604751, + "learning_rate": 4.974754837804057e-05, + "loss": 0.8674823, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.73535156, + "step": 4476, + "time_per_iteration": 2.718604326248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143628, + "balance_loss_mlp": 1.07014692, + "epoch": 0.8612928049249712, + "flos": 775621544448.0, + "grad_norm": 0.035398978535946514, + "language_loss": 0.90864736, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.92008364, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.73486328, + "step": 4477, + "time_per_iteration": 3.0402839183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143583, + "balance_loss_mlp": 1.07014966, + "epoch": 0.8614851866102347, + "flos": 538606783488.0, + "grad_norm": 0.045897520744467304, + "language_loss": 0.878411, + "learning_rate": 4.947695228969718e-05, + "loss": 0.88984686, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.734375, + "step": 4478, + "time_per_iteration": 2.653444528579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141916, + "balance_loss_mlp": 1.06838739, + "epoch": 0.8616775682954982, + "flos": 566995106304.0, + "grad_norm": 0.04005533562206663, + "language_loss": 0.84431696, + "learning_rate": 4.934191658211729e-05, + "loss": 0.85573614, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.73486328, + "step": 4479, + "time_per_iteration": 2.6883862018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114245, + "balance_loss_mlp": 1.06844449, + "epoch": 0.8618699499807618, + "flos": 482557278720.0, + "grad_norm": 0.04408793841080807, + "language_loss": 0.87477684, + "learning_rate": 4.92070558355221e-05, + "loss": 0.88620138, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.73828125, + "step": 4480, + "time_per_iteration": 2.6091084480285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142157, + "balance_loss_mlp": 1.06815219, + "epoch": 0.8620623316660254, + "flos": 650679596544.0, + "grad_norm": 0.04658475745596792, + "language_loss": 0.80903435, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.82045591, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.73828125, + "step": 4481, + "time_per_iteration": 2.7939393520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.06806207, + "epoch": 0.862254713351289, + "flos": 753081274368.0, + "grad_norm": 0.03549223597269206, + "language_loss": 0.90676355, + "learning_rate": 4.893785943464801e-05, + "loss": 0.91817951, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.73535156, + "step": 4482, + "time_per_iteration": 2.9854023456573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141507, + "balance_loss_mlp": 1.06788337, + "epoch": 0.8624470950365525, + "flos": 843135567360.0, + "grad_norm": 0.03295717035983083, + "language_loss": 0.82174349, + "learning_rate": 4.880352388488024e-05, + "loss": 0.83315861, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.73583984, + "step": 4483, + "time_per_iteration": 3.2930996417999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141344, + "balance_loss_mlp": 1.0677681, + "epoch": 0.8626394767218161, + "flos": 756087892992.0, + "grad_norm": 0.03698694034231399, + "language_loss": 0.87834418, + "learning_rate": 4.866936350511969e-05, + "loss": 0.88975763, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.73535156, + "step": 4484, + "time_per_iteration": 2.905592918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141202, + "balance_loss_mlp": 1.06767344, + "epoch": 0.8628318584070797, + "flos": 704857855488.0, + "grad_norm": 0.040701360718788646, + "language_loss": 0.86439824, + "learning_rate": 4.853537834745203e-05, + "loss": 0.87581027, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.73535156, + "step": 4485, + "time_per_iteration": 2.876677989959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141051, + "balance_loss_mlp": 1.0674752, + "epoch": 0.8630242400923432, + "flos": 472197934080.0, + "grad_norm": 0.0356487521331988, + "language_loss": 0.82481432, + "learning_rate": 4.840156846389487e-05, + "loss": 0.83622479, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.73583984, + "step": 4486, + "time_per_iteration": 2.5704009532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141683, + "balance_loss_mlp": 1.06810677, + "epoch": 0.8632166217776067, + "flos": 965962495488.0, + "grad_norm": 0.042485813473706315, + "language_loss": 0.82875609, + "learning_rate": 4.826793390639783e-05, + "loss": 0.84017289, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.73535156, + "step": 4487, + "time_per_iteration": 3.2337405681610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141296, + "balance_loss_mlp": 1.06772029, + "epoch": 0.8634090034628703, + "flos": 769239006720.0, + "grad_norm": 0.03930910636761154, + "language_loss": 0.82854676, + "learning_rate": 4.813447472684246e-05, + "loss": 0.83995974, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.73583984, + "step": 4488, + "time_per_iteration": 3.0039660930633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114056, + "balance_loss_mlp": 1.06693602, + "epoch": 0.8636013851481339, + "flos": 521719908864.0, + "grad_norm": 0.035635459683833186, + "language_loss": 0.88014925, + "learning_rate": 4.800119097704214e-05, + "loss": 0.89155483, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.73583984, + "step": 4489, + "time_per_iteration": 2.762113332748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141983, + "balance_loss_mlp": 1.06826377, + "epoch": 0.8637937668333975, + "flos": 633293165568.0, + "grad_norm": 0.0371692275655829, + "language_loss": 0.85686231, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.86828208, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.73681641, + "step": 4490, + "time_per_iteration": 2.7638096809387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114124, + "balance_loss_mlp": 1.06771219, + "epoch": 0.8639861485186611, + "flos": 857521383936.0, + "grad_norm": 0.03348350646617803, + "language_loss": 0.80966526, + "learning_rate": 4.773514997362e-05, + "loss": 0.8210777, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.73535156, + "step": 4491, + "time_per_iteration": 3.1014699935913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141741, + "balance_loss_mlp": 1.06826007, + "epoch": 0.8641785302039245, + "flos": 482240371200.0, + "grad_norm": 0.04238731422676562, + "language_loss": 0.83083439, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.84225178, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.73486328, + "step": 4492, + "time_per_iteration": 2.5285348892211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114229, + "balance_loss_mlp": 1.06871402, + "epoch": 0.8643709118891881, + "flos": 505648771584.0, + "grad_norm": 0.03789343460075339, + "language_loss": 0.85717583, + "learning_rate": 4.746981130927675e-05, + "loss": 0.86859876, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.73583984, + "step": 4493, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141993, + "balance_loss_mlp": 1.06856048, + "epoch": 0.8645632935744517, + "flos": 553551280128.0, + "grad_norm": 0.03757320956431773, + "language_loss": 0.86991334, + "learning_rate": 4.733740548306908e-05, + "loss": 0.88133329, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.734375, + "step": 4494, + "time_per_iteration": 2.798293352127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142343, + "balance_loss_mlp": 1.06876707, + "epoch": 0.8647556752597153, + "flos": 525735647232.0, + "grad_norm": 0.037128189922481854, + "language_loss": 0.88154763, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.89297104, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.73583984, + "step": 4495, + "time_per_iteration": 2.585801124572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114259, + "balance_loss_mlp": 1.06915712, + "epoch": 0.8649480569449788, + "flos": 789237285888.0, + "grad_norm": 0.036509667691993125, + "language_loss": 0.87320912, + "learning_rate": 4.707312109960471e-05, + "loss": 0.88463509, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.734375, + "step": 4496, + "time_per_iteration": 3.11242413520813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.06903481, + "epoch": 0.8651404386302424, + "flos": 765199073280.0, + "grad_norm": 0.037756570686122495, + "language_loss": 0.81536937, + "learning_rate": 4.694124264495225e-05, + "loss": 0.82679451, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.73486328, + "step": 4497, + "time_per_iteration": 3.061692476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141615, + "balance_loss_mlp": 1.06813455, + "epoch": 0.865332820315506, + "flos": 540988319232.0, + "grad_norm": 0.03448303115707208, + "language_loss": 0.86013806, + "learning_rate": 4.680954008330851e-05, + "loss": 0.87155426, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.73486328, + "step": 4498, + "time_per_iteration": 2.7776076793670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146858, + "balance_loss_mlp": 1.07495117, + "epoch": 0.8655252020007695, + "flos": 1479677124096.0, + "grad_norm": 0.010203912282881854, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.803213, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.72070312, + "step": 4499, + "time_per_iteration": 4.785112619400024 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139577, + "balance_loss_mlp": 1.06576228, + "epoch": 0.8657175836860331, + "flos": 518472244224.0, + "grad_norm": 0.03309637596200986, + "language_loss": 0.86845696, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.87985277, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.73632812, + "step": 4500, + "time_per_iteration": 2.7067041397094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140055, + "balance_loss_mlp": 1.06628788, + "epoch": 0.8659099653712966, + "flos": 591632205312.0, + "grad_norm": 0.036173409641408416, + "language_loss": 0.85177112, + "learning_rate": 4.641548826740394e-05, + "loss": 0.8631717, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.73583984, + "step": 4501, + "time_per_iteration": 2.7207436561584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140299, + "balance_loss_mlp": 1.06667542, + "epoch": 0.8661023470565602, + "flos": 591575809536.0, + "grad_norm": 0.03801706750898132, + "language_loss": 0.9136349, + "learning_rate": 4.628448978842731e-05, + "loss": 0.92503786, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.73535156, + "step": 4502, + "time_per_iteration": 2.809257745742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140286, + "balance_loss_mlp": 1.06647146, + "epoch": 0.8662947287418238, + "flos": 568736096256.0, + "grad_norm": 0.03693136535041395, + "language_loss": 0.84185296, + "learning_rate": 4.61536674574336e-05, + "loss": 0.85325581, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.73632812, + "step": 4503, + "time_per_iteration": 2.7448463439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141065, + "balance_loss_mlp": 1.06729817, + "epoch": 0.8664871104270874, + "flos": 517002499584.0, + "grad_norm": 0.029797244201928218, + "language_loss": 0.85579336, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.86720395, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.73583984, + "step": 4504, + "time_per_iteration": 2.771195411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141096, + "balance_loss_mlp": 1.06728137, + "epoch": 0.866679492112351, + "flos": 558429144576.0, + "grad_norm": 0.03508013517718755, + "language_loss": 0.82380766, + "learning_rate": 4.589255144248561e-05, + "loss": 0.83521855, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.73632812, + "step": 4505, + "time_per_iteration": 2.779545545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142726, + "balance_loss_mlp": 1.0692935, + "epoch": 0.8668718737976144, + "flos": 723661638144.0, + "grad_norm": 0.04291164810102399, + "language_loss": 0.87122786, + "learning_rate": 4.57622578599054e-05, + "loss": 0.88265514, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.73388672, + "step": 4506, + "time_per_iteration": 2.866483211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142832, + "balance_loss_mlp": 1.06935108, + "epoch": 0.867064255482878, + "flos": 601833096192.0, + "grad_norm": 0.044988032903290696, + "language_loss": 0.90554643, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.91697466, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.734375, + "step": 4507, + "time_per_iteration": 2.7110989093780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142288, + "balance_loss_mlp": 1.06880796, + "epoch": 0.8672566371681416, + "flos": 804932391936.0, + "grad_norm": 0.03964357174424219, + "language_loss": 0.81517231, + "learning_rate": 4.550219979745529e-05, + "loss": 0.82659519, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.734375, + "step": 4508, + "time_per_iteration": 3.0471880435943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142367, + "balance_loss_mlp": 1.06883836, + "epoch": 0.8674490188534052, + "flos": 628554289152.0, + "grad_norm": 0.035932979941611695, + "language_loss": 0.88173008, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.89315367, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.734375, + "step": 4509, + "time_per_iteration": 2.7578866481781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114294, + "balance_loss_mlp": 1.06941152, + "epoch": 0.8676414005386687, + "flos": 729204243456.0, + "grad_norm": 0.03320098636721179, + "language_loss": 0.90483028, + "learning_rate": 4.524284754171615e-05, + "loss": 0.91625965, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.734375, + "step": 4510, + "time_per_iteration": 2.963334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142596, + "balance_loss_mlp": 1.06901991, + "epoch": 0.8678337822239323, + "flos": 541162235904.0, + "grad_norm": 0.03785696811984203, + "language_loss": 0.85416347, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.86558938, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.734375, + "step": 4511, + "time_per_iteration": 2.8185770511627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142668, + "balance_loss_mlp": 1.06894934, + "epoch": 0.8680261639091958, + "flos": 508525134336.0, + "grad_norm": 0.039845679615304476, + "language_loss": 0.84207547, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.85350215, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.73583984, + "step": 4512, + "time_per_iteration": 2.585066795349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141066, + "balance_loss_mlp": 1.06729949, + "epoch": 0.8682185455944594, + "flos": 488149549056.0, + "grad_norm": 0.038660182567623144, + "language_loss": 0.85638297, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.86779356, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.73583984, + "step": 4513, + "time_per_iteration": 2.633535861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143624, + "balance_loss_mlp": 1.07019103, + "epoch": 0.868410927279723, + "flos": 604802784768.0, + "grad_norm": 0.04017621150999441, + "language_loss": 0.86356068, + "learning_rate": 4.472626206030528e-05, + "loss": 0.8749969, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.734375, + "step": 4514, + "time_per_iteration": 2.7051877975463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143529, + "balance_loss_mlp": 1.07009649, + "epoch": 0.8686033089649865, + "flos": 1120720851456.0, + "grad_norm": 0.03707200576292934, + "language_loss": 0.88939041, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.90082574, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.734375, + "step": 4515, + "time_per_iteration": 3.379136562347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142134, + "balance_loss_mlp": 1.06870151, + "epoch": 0.8687956906502501, + "flos": 569098665984.0, + "grad_norm": 0.03976409225750092, + "language_loss": 0.89278877, + "learning_rate": 4.446902963685862e-05, + "loss": 0.90421009, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.734375, + "step": 4516, + "time_per_iteration": 2.688634157180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140065, + "balance_loss_mlp": 1.06663203, + "epoch": 0.8689880723355137, + "flos": 545410288128.0, + "grad_norm": 0.03650159916701781, + "language_loss": 0.89403987, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.90544057, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.734375, + "step": 4517, + "time_per_iteration": 2.6805598735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114102, + "balance_loss_mlp": 1.06763518, + "epoch": 0.8691804540207773, + "flos": 458384807424.0, + "grad_norm": 0.03496696120486749, + "language_loss": 0.90544659, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.91685677, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.73388672, + "step": 4518, + "time_per_iteration": 2.601616859436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141562, + "balance_loss_mlp": 1.06812906, + "epoch": 0.8693728357060407, + "flos": 592999891968.0, + "grad_norm": 0.035835180224579856, + "language_loss": 0.8468256, + "learning_rate": 4.40845075221456e-05, + "loss": 0.8582412, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.734375, + "step": 4519, + "time_per_iteration": 2.711921215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141689, + "balance_loss_mlp": 1.06835151, + "epoch": 0.8695652173913043, + "flos": 681523315200.0, + "grad_norm": 0.03942817475285988, + "language_loss": 0.84818816, + "learning_rate": 4.395668742181164e-05, + "loss": 0.85960507, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.73339844, + "step": 4520, + "time_per_iteration": 2.9093902111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140477, + "balance_loss_mlp": 1.06709146, + "epoch": 0.8697575990765679, + "flos": 493335588864.0, + "grad_norm": 0.037682038057646666, + "language_loss": 0.83001059, + "learning_rate": 4.38290443731934e-05, + "loss": 0.84141541, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.73388672, + "step": 4521, + "time_per_iteration": 2.5499000549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140213, + "balance_loss_mlp": 1.06682801, + "epoch": 0.8699499807618315, + "flos": 527986927104.0, + "grad_norm": 0.03154316551914982, + "language_loss": 0.85485643, + "learning_rate": 4.370157842584671e-05, + "loss": 0.86625856, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.73388672, + "step": 4522, + "time_per_iteration": 2.7108314037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140429, + "balance_loss_mlp": 1.06699562, + "epoch": 0.8701423624470951, + "flos": 815793294336.0, + "grad_norm": 0.03787775248383424, + "language_loss": 0.84961677, + "learning_rate": 4.357428962925808e-05, + "loss": 0.86102104, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.734375, + "step": 4523, + "time_per_iteration": 3.114084482192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140204, + "balance_loss_mlp": 1.06681871, + "epoch": 0.8703347441323586, + "flos": 557873192448.0, + "grad_norm": 0.037626849509955144, + "language_loss": 0.93374288, + "learning_rate": 4.344717803284542e-05, + "loss": 0.94514489, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.73388672, + "step": 4524, + "time_per_iteration": 2.702937602996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139648, + "balance_loss_mlp": 1.06631005, + "epoch": 0.8705271258176221, + "flos": 586613351424.0, + "grad_norm": 0.0317274327667996, + "language_loss": 0.88659638, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.89799285, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.73339844, + "step": 4525, + "time_per_iteration": 2.8281044960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140144, + "balance_loss_mlp": 1.06675887, + "epoch": 0.8707195075028857, + "flos": 670501957632.0, + "grad_norm": 0.03755252318995871, + "language_loss": 0.89142346, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.90282488, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.73388672, + "step": 4526, + "time_per_iteration": 2.8868792057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137755, + "balance_loss_mlp": 1.06436968, + "epoch": 0.8709118891881493, + "flos": 521470130688.0, + "grad_norm": 0.03465882180034492, + "language_loss": 0.88376933, + "learning_rate": 4.306690693781007e-05, + "loss": 0.89514691, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.73388672, + "step": 4527, + "time_per_iteration": 2.7600021362304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137372, + "balance_loss_mlp": 1.06384361, + "epoch": 0.8711042708734128, + "flos": 554271690240.0, + "grad_norm": 0.0382661525421971, + "language_loss": 0.86503428, + "learning_rate": 4.294050463490401e-05, + "loss": 0.87640798, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.73486328, + "step": 4528, + "time_per_iteration": 2.6349923610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137844, + "balance_loss_mlp": 1.06445885, + "epoch": 0.8712966525586764, + "flos": 503237036544.0, + "grad_norm": 0.04010187218615475, + "language_loss": 0.87453485, + "learning_rate": 4.281427977823094e-05, + "loss": 0.88591325, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.73388672, + "step": 4529, + "time_per_iteration": 2.699385166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113745, + "balance_loss_mlp": 1.06411278, + "epoch": 0.87148903424394, + "flos": 805527275520.0, + "grad_norm": 0.03499624240949085, + "language_loss": 0.7799021, + "learning_rate": 4.268823241679593e-05, + "loss": 0.79127657, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.73339844, + "step": 4530, + "time_per_iteration": 3.0969526767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113759, + "balance_loss_mlp": 1.06425273, + "epoch": 0.8716814159292036, + "flos": 774840009216.0, + "grad_norm": 0.04260127752626609, + "language_loss": 0.89968532, + "learning_rate": 4.256236259953489e-05, + "loss": 0.91106123, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.73339844, + "step": 4531, + "time_per_iteration": 3.010664224624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113744, + "balance_loss_mlp": 1.06405497, + "epoch": 0.8718737976144671, + "flos": 487797712896.0, + "grad_norm": 0.03878344757926045, + "language_loss": 0.9016605, + "learning_rate": 4.243667037531468e-05, + "loss": 0.91303492, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.73339844, + "step": 4532, + "time_per_iteration": 2.5791871547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137314, + "balance_loss_mlp": 1.06402397, + "epoch": 0.8720661792997306, + "flos": 585219468288.0, + "grad_norm": 0.034654863878580654, + "language_loss": 0.83150959, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.84288275, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.73291016, + "step": 4533, + "time_per_iteration": 2.711474657058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143066, + "balance_loss_mlp": 1.0713501, + "epoch": 0.8722585609849942, + "flos": 1498999928832.0, + "grad_norm": 0.008770633490120042, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.82109869, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.71875, + "step": 4534, + "time_per_iteration": 4.842734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137401, + "balance_loss_mlp": 1.06396782, + "epoch": 0.8724509426702578, + "flos": 597309069312.0, + "grad_norm": 0.03609431409406132, + "language_loss": 0.91708696, + "learning_rate": 4.206065974853479e-05, + "loss": 0.92846096, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.73388672, + "step": 4535, + "time_per_iteration": 2.740379810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140364, + "balance_loss_mlp": 1.06702685, + "epoch": 0.8726433243555214, + "flos": 444545484288.0, + "grad_norm": 0.042510018256880736, + "language_loss": 0.86475211, + "learning_rate": 4.193567838376888e-05, + "loss": 0.87615573, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.73339844, + "step": 4536, + "time_per_iteration": 2.634587526321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142129, + "balance_loss_mlp": 1.06907749, + "epoch": 0.8728357060407849, + "flos": 554234760192.0, + "grad_norm": 0.042982945041552326, + "language_loss": 0.87478817, + "learning_rate": 4.181087485534402e-05, + "loss": 0.88620949, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.73046875, + "step": 4537, + "time_per_iteration": 2.6632931232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.06878173, + "epoch": 0.8730280877260485, + "flos": 629018916864.0, + "grad_norm": 0.03625222734252447, + "language_loss": 0.8318783, + "learning_rate": 4.16862492117136e-05, + "loss": 0.8432976, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.73144531, + "step": 4538, + "time_per_iteration": 2.8200526237487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140845, + "balance_loss_mlp": 1.06750751, + "epoch": 0.873220469411312, + "flos": 536501222400.0, + "grad_norm": 0.03838073368509028, + "language_loss": 0.85009706, + "learning_rate": 4.156180150126143e-05, + "loss": 0.86150557, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.73339844, + "step": 4539, + "time_per_iteration": 2.720931053161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140177, + "balance_loss_mlp": 1.06688702, + "epoch": 0.8734128510965756, + "flos": 563000835072.0, + "grad_norm": 0.036962465734187516, + "language_loss": 0.89154851, + "learning_rate": 4.143753177230242e-05, + "loss": 0.90295029, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.73291016, + "step": 4540, + "time_per_iteration": 2.7204575538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140918, + "balance_loss_mlp": 1.06762838, + "epoch": 0.8736052327818392, + "flos": 687803794944.0, + "grad_norm": 0.05823857081406219, + "language_loss": 0.83594728, + "learning_rate": 4.131344007308224e-05, + "loss": 0.8473565, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.73291016, + "step": 4541, + "time_per_iteration": 3.0238983631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140844, + "balance_loss_mlp": 1.06750619, + "epoch": 0.8737976144671027, + "flos": 532832590848.0, + "grad_norm": 0.03481069740007844, + "language_loss": 0.85935038, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.87075877, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.73339844, + "step": 4542, + "time_per_iteration": 2.805119752883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141863, + "balance_loss_mlp": 1.06871605, + "epoch": 0.8739899961523663, + "flos": 576729368064.0, + "grad_norm": 0.03488368865297959, + "language_loss": 0.86241484, + "learning_rate": 4.106579095649649e-05, + "loss": 0.87383342, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.73144531, + "step": 4543, + "time_per_iteration": 2.8203420639038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011421, + "balance_loss_mlp": 1.06885803, + "epoch": 0.8741823778376299, + "flos": 732631828992.0, + "grad_norm": 0.04473609359833568, + "language_loss": 0.83021426, + "learning_rate": 4.094223363527666e-05, + "loss": 0.84163529, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.73242188, + "step": 4544, + "time_per_iteration": 2.9382483959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140903, + "balance_loss_mlp": 1.06766069, + "epoch": 0.8743747595228935, + "flos": 568221803520.0, + "grad_norm": 0.0362289518248913, + "language_loss": 0.88223737, + "learning_rate": 4.081885453608747e-05, + "loss": 0.89364642, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.73242188, + "step": 4545, + "time_per_iteration": 2.7575058937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140609, + "balance_loss_mlp": 1.06731939, + "epoch": 0.8745671412081569, + "flos": 494395100160.0, + "grad_norm": 0.03736605456447314, + "language_loss": 0.86481446, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.87622052, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.73291016, + "step": 4546, + "time_per_iteration": 2.600027322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141765, + "balance_loss_mlp": 1.06866539, + "epoch": 0.8747595228934205, + "flos": 525166960128.0, + "grad_norm": 0.03010216092213021, + "language_loss": 0.87510192, + "learning_rate": 4.057263119533233e-05, + "loss": 0.88651955, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.73095703, + "step": 4547, + "time_per_iteration": 2.6267926692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142092, + "balance_loss_mlp": 1.06899297, + "epoch": 0.8749519045786841, + "flos": 745752743424.0, + "grad_norm": 0.036693225963323806, + "language_loss": 0.849769, + "learning_rate": 4.044978704935853e-05, + "loss": 0.86118996, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.73095703, + "step": 4548, + "time_per_iteration": 3.072727918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.06758618, + "epoch": 0.8751442862639477, + "flos": 595383429120.0, + "grad_norm": 0.032788799851171016, + "language_loss": 0.84310943, + "learning_rate": 4.032712131660027e-05, + "loss": 0.85451728, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.73193359, + "step": 4549, + "time_per_iteration": 2.878819465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138154, + "balance_loss_mlp": 1.06486428, + "epoch": 0.8753366679492113, + "flos": 497514510336.0, + "grad_norm": 0.037587751687951164, + "language_loss": 0.83288509, + "learning_rate": 4.020463404468055e-05, + "loss": 0.84426665, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.73291016, + "step": 4550, + "time_per_iteration": 2.7538514137268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_mlp": 1.06475925, + "epoch": 0.8755290496344748, + "flos": 490849993728.0, + "grad_norm": 0.036673671086796596, + "language_loss": 0.87328094, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.88466096, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.73242188, + "step": 4551, + "time_per_iteration": 2.5642802715301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137849, + "balance_loss_mlp": 1.06446373, + "epoch": 0.8757214313197383, + "flos": 593071750656.0, + "grad_norm": 0.03525869575859479, + "language_loss": 0.86262238, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.87400079, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.73388672, + "step": 4552, + "time_per_iteration": 2.8446478843688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138023, + "balance_loss_mlp": 1.06473362, + "epoch": 0.8759138130050019, + "flos": 978399203328.0, + "grad_norm": 0.052190711444307536, + "language_loss": 0.83496857, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.84634876, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.73291016, + "step": 4553, + "time_per_iteration": 3.229661464691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138498, + "balance_loss_mlp": 1.06520855, + "epoch": 0.8761061946902655, + "flos": 804205251072.0, + "grad_norm": 0.0321030761247515, + "language_loss": 0.80983669, + "learning_rate": 3.971647051542243e-05, + "loss": 0.82122165, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.73291016, + "step": 4554, + "time_per_iteration": 3.0523788928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137491, + "balance_loss_mlp": 1.06420088, + "epoch": 0.8762985763755291, + "flos": 699847733760.0, + "grad_norm": 0.035078141939390024, + "language_loss": 0.80103445, + "learning_rate": 3.95948762596155e-05, + "loss": 0.8124094, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.73291016, + "step": 4555, + "time_per_iteration": 2.972339391708374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138129, + "balance_loss_mlp": 1.06488729, + "epoch": 0.8764909580607926, + "flos": 630927092736.0, + "grad_norm": 0.0358178830175899, + "language_loss": 0.85281265, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.86419404, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.73242188, + "step": 4556, + "time_per_iteration": 2.8507936000823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137844, + "balance_loss_mlp": 1.06455374, + "epoch": 0.8766833397460562, + "flos": 482537812992.0, + "grad_norm": 0.035589487880799825, + "language_loss": 0.85349488, + "learning_rate": 3.935222403050304e-05, + "loss": 0.86487329, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.73291016, + "step": 4557, + "time_per_iteration": 2.686985969543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138302, + "balance_loss_mlp": 1.06506014, + "epoch": 0.8768757214313198, + "flos": 408617783808.0, + "grad_norm": 0.03886308693669829, + "language_loss": 0.83731771, + "learning_rate": 3.923116615140354e-05, + "loss": 0.84870076, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.73242188, + "step": 4558, + "time_per_iteration": 2.5058376789093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.06545591, + "epoch": 0.8770681031165833, + "flos": 583656397824.0, + "grad_norm": 0.050661458115567146, + "language_loss": 0.87454987, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.88593686, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.73242188, + "step": 4559, + "time_per_iteration": 2.669386625289917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138391, + "balance_loss_mlp": 1.06495833, + "epoch": 0.8772604848018468, + "flos": 509688705024.0, + "grad_norm": 0.03644513335402904, + "language_loss": 0.85219496, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.86357886, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.73388672, + "step": 4560, + "time_per_iteration": 2.6710524559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138458, + "balance_loss_mlp": 1.06521559, + "epoch": 0.8774528664871104, + "flos": 409716226560.0, + "grad_norm": 0.04166962676030205, + "language_loss": 0.9057163, + "learning_rate": 3.886906601970913e-05, + "loss": 0.91710079, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.73242188, + "step": 4561, + "time_per_iteration": 2.4726264476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138891, + "balance_loss_mlp": 1.06574452, + "epoch": 0.877645248172374, + "flos": 501869349888.0, + "grad_norm": 0.03332122726470747, + "language_loss": 0.87716341, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.88855237, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.73144531, + "step": 4562, + "time_per_iteration": 2.6267993450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139069, + "balance_loss_mlp": 1.06582642, + "epoch": 0.8778376298576376, + "flos": 634298282496.0, + "grad_norm": 0.03625990929087617, + "language_loss": 0.82094103, + "learning_rate": 3.862856098834189e-05, + "loss": 0.83233178, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.73242188, + "step": 4563, + "time_per_iteration": 2.874626398086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138987, + "balance_loss_mlp": 1.06569707, + "epoch": 0.8780300115429012, + "flos": 535114070016.0, + "grad_norm": 0.033329550364358154, + "language_loss": 0.84246641, + "learning_rate": 3.850857712974976e-05, + "loss": 0.85385627, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.73291016, + "step": 4564, + "time_per_iteration": 2.865466833114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138742, + "balance_loss_mlp": 1.06550014, + "epoch": 0.8782223932281646, + "flos": 512667125760.0, + "grad_norm": 0.035748918412903466, + "language_loss": 0.81673437, + "learning_rate": 3.838877243801758e-05, + "loss": 0.82812178, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.73242188, + "step": 4565, + "time_per_iteration": 2.6305251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113867, + "balance_loss_mlp": 1.06547523, + "epoch": 0.8784147749134282, + "flos": 782245128192.0, + "grad_norm": 0.039934883887501355, + "language_loss": 0.74876142, + "learning_rate": 3.826914695965766e-05, + "loss": 0.76014817, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.73193359, + "step": 4566, + "time_per_iteration": 3.187756299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138841, + "balance_loss_mlp": 1.06550372, + "epoch": 0.8786071565986918, + "flos": 562071579648.0, + "grad_norm": 0.044145845900659855, + "language_loss": 0.81758606, + "learning_rate": 3.814970074111279e-05, + "loss": 0.82897443, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.73339844, + "step": 4567, + "time_per_iteration": 2.694370746612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138507, + "balance_loss_mlp": 1.06516922, + "epoch": 0.8787995382839554, + "flos": 604651061760.0, + "grad_norm": 0.03484451232050219, + "language_loss": 0.81663251, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.82801759, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.73291016, + "step": 4568, + "time_per_iteration": 2.8261232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137882, + "balance_loss_mlp": 1.06444907, + "epoch": 0.8789919199692189, + "flos": 561290044416.0, + "grad_norm": 0.034253757816549546, + "language_loss": 0.892627, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.90400583, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.73388672, + "step": 4569, + "time_per_iteration": 2.671189546585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114104, + "balance_loss_mlp": 1.06789315, + "epoch": 0.8791843016544825, + "flos": 540152389632.0, + "grad_norm": 0.03918561185928757, + "language_loss": 0.87219656, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.88360703, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.73144531, + "step": 4570, + "time_per_iteration": 2.627720355987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140909, + "balance_loss_mlp": 1.06780934, + "epoch": 0.8793766833397461, + "flos": 1010404491264.0, + "grad_norm": 0.03486713685308289, + "language_loss": 0.83421218, + "learning_rate": 3.767370939150167e-05, + "loss": 0.84562135, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.73095703, + "step": 4571, + "time_per_iteration": 3.3709144592285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_mlp": 1.06791723, + "epoch": 0.8795690650250096, + "flos": 679912581120.0, + "grad_norm": 0.03284343034146008, + "language_loss": 0.85293531, + "learning_rate": 3.755516016623628e-05, + "loss": 0.86434591, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.73144531, + "step": 4572, + "time_per_iteration": 2.883894205093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.06717467, + "epoch": 0.8797614467102732, + "flos": 454355607552.0, + "grad_norm": 0.038996415271177934, + "language_loss": 0.93823111, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.94963527, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.73242188, + "step": 4573, + "time_per_iteration": 2.5188074111938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114079, + "balance_loss_mlp": 1.06773829, + "epoch": 0.8799538283955367, + "flos": 551972746752.0, + "grad_norm": 0.03577674735145117, + "language_loss": 0.8895998, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.90100765, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.73046875, + "step": 4574, + "time_per_iteration": 2.6594581604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139479, + "balance_loss_mlp": 1.06614149, + "epoch": 0.8801462100808003, + "flos": 808859533824.0, + "grad_norm": 0.03486958865574067, + "language_loss": 0.89314497, + "learning_rate": 3.720058989624681e-05, + "loss": 0.90453982, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.73339844, + "step": 4575, + "time_per_iteration": 3.0489046573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138104, + "balance_loss_mlp": 1.06481373, + "epoch": 0.8803385917660639, + "flos": 770011809792.0, + "grad_norm": 0.035651765700735125, + "language_loss": 0.88622105, + "learning_rate": 3.708275909447079e-05, + "loss": 0.89760214, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.73291016, + "step": 4576, + "time_per_iteration": 2.9586453437805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138145, + "balance_loss_mlp": 1.06490231, + "epoch": 0.8805309734513275, + "flos": 568419188736.0, + "grad_norm": 0.032922624832929834, + "language_loss": 0.85456908, + "learning_rate": 3.696510801310632e-05, + "loss": 0.86595052, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.73242188, + "step": 4577, + "time_per_iteration": 2.719613790512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137266, + "balance_loss_mlp": 1.06397581, + "epoch": 0.880723355136591, + "flos": 680976095232.0, + "grad_norm": 0.03544954996381365, + "language_loss": 0.8560704, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.86744308, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.73291016, + "step": 4578, + "time_per_iteration": 2.8218014240264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137911, + "balance_loss_mlp": 1.06462061, + "epoch": 0.8809157368218545, + "flos": 566760791040.0, + "grad_norm": 0.03362495082799701, + "language_loss": 0.83221316, + "learning_rate": 3.673034519424734e-05, + "loss": 0.84359229, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.73291016, + "step": 4579, + "time_per_iteration": 2.7465338706970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139025, + "balance_loss_mlp": 1.06578302, + "epoch": 0.8811081185071181, + "flos": 516427081728.0, + "grad_norm": 0.03125001754888258, + "language_loss": 0.79574335, + "learning_rate": 3.661323354789586e-05, + "loss": 0.80713362, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.73242188, + "step": 4580, + "time_per_iteration": 2.690438985824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139132, + "balance_loss_mlp": 1.06589007, + "epoch": 0.8813005001923817, + "flos": 595448557056.0, + "grad_norm": 0.03786361904540541, + "language_loss": 0.8583113, + "learning_rate": 3.649630180424191e-05, + "loss": 0.86970258, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.73242188, + "step": 4581, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113771, + "balance_loss_mlp": 1.06446779, + "epoch": 0.8814928818776453, + "flos": 668185549824.0, + "grad_norm": 0.03829692440387713, + "language_loss": 0.82977974, + "learning_rate": 3.637955000868254e-05, + "loss": 0.84115684, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.73242188, + "step": 4582, + "time_per_iteration": 2.8873000144958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138319, + "balance_loss_mlp": 1.06507647, + "epoch": 0.8816852635629088, + "flos": 610275532800.0, + "grad_norm": 0.034998121361190335, + "language_loss": 0.90240663, + "learning_rate": 3.626297820654467e-05, + "loss": 0.91378981, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.73242188, + "step": 4583, + "time_per_iteration": 2.7176356315612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138968, + "balance_loss_mlp": 1.06567812, + "epoch": 0.8818776452481724, + "flos": 481374242304.0, + "grad_norm": 0.0376212060911988, + "language_loss": 0.86705077, + "learning_rate": 3.614658644308572e-05, + "loss": 0.87844038, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.73291016, + "step": 4584, + "time_per_iteration": 2.6146843433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138915, + "balance_loss_mlp": 1.0655303, + "epoch": 0.882070026933436, + "flos": 1047033136128.0, + "grad_norm": 0.040308027049788406, + "language_loss": 0.78901362, + "learning_rate": 3.60303747634928e-05, + "loss": 0.80040276, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.73388672, + "step": 4585, + "time_per_iteration": 3.30761456489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136732, + "balance_loss_mlp": 1.06344187, + "epoch": 0.8822624086186995, + "flos": 475434865152.0, + "grad_norm": 0.03393344724745408, + "language_loss": 0.84516394, + "learning_rate": 3.591434321288345e-05, + "loss": 0.8565312, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.73291016, + "step": 4586, + "time_per_iteration": 2.680474042892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113674, + "balance_loss_mlp": 1.06345069, + "epoch": 0.882454790303963, + "flos": 655221087744.0, + "grad_norm": 0.039082630684481784, + "language_loss": 0.86279416, + "learning_rate": 3.579849183630485e-05, + "loss": 0.87416154, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.73291016, + "step": 4587, + "time_per_iteration": 2.8492140769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136645, + "balance_loss_mlp": 1.06335557, + "epoch": 0.8826471719892266, + "flos": 471303607296.0, + "grad_norm": 0.039436934050180984, + "language_loss": 0.83528584, + "learning_rate": 3.568282067873468e-05, + "loss": 0.84665227, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.73291016, + "step": 4588, + "time_per_iteration": 2.562138319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136884, + "balance_loss_mlp": 1.06364226, + "epoch": 0.8828395536744902, + "flos": 469766733312.0, + "grad_norm": 0.033013862791337924, + "language_loss": 0.88277167, + "learning_rate": 3.556732978508048e-05, + "loss": 0.89414054, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.73242188, + "step": 4589, + "time_per_iteration": 2.7143378257751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141687, + "balance_loss_mlp": 1.06844449, + "epoch": 0.8830319353597538, + "flos": 722717646336.0, + "grad_norm": 0.03609529277559126, + "language_loss": 0.85748345, + "learning_rate": 3.545201920017971e-05, + "loss": 0.8689003, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.73242188, + "step": 4590, + "time_per_iteration": 2.939535140991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114124, + "balance_loss_mlp": 1.06790292, + "epoch": 0.8832243170450174, + "flos": 444191646720.0, + "grad_norm": 0.03979161587651804, + "language_loss": 0.85422397, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.86563635, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.73339844, + "step": 4591, + "time_per_iteration": 2.594569683074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141267, + "balance_loss_mlp": 1.06792951, + "epoch": 0.8834166987302808, + "flos": 567746442240.0, + "grad_norm": 0.04357275936054337, + "language_loss": 0.87711227, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.88852489, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.73339844, + "step": 4592, + "time_per_iteration": 2.7693564891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141211, + "balance_loss_mlp": 1.06763518, + "epoch": 0.8836090804155444, + "flos": 610497113088.0, + "grad_norm": 0.036235581662511764, + "language_loss": 0.86945099, + "learning_rate": 3.510716974532352e-05, + "loss": 0.88086307, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.73486328, + "step": 4593, + "time_per_iteration": 2.823115587234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141441, + "balance_loss_mlp": 1.06786549, + "epoch": 0.883801462100808, + "flos": 558116239872.0, + "grad_norm": 0.037409309315743274, + "language_loss": 0.84331363, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.85472804, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.73486328, + "step": 4594, + "time_per_iteration": 2.6731603145599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142652, + "balance_loss_mlp": 1.06917179, + "epoch": 0.8839938437860716, + "flos": 517199884800.0, + "grad_norm": 0.05623624543417451, + "language_loss": 0.82118529, + "learning_rate": 3.487817247139064e-05, + "loss": 0.8326118, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.734375, + "step": 4595, + "time_per_iteration": 2.643226385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142905, + "balance_loss_mlp": 1.06966281, + "epoch": 0.8841862254713351, + "flos": 714939224064.0, + "grad_norm": 0.03953602235880356, + "language_loss": 0.84327024, + "learning_rate": 3.47639446766777e-05, + "loss": 0.85469925, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.73242188, + "step": 4596, + "time_per_iteration": 2.8558902740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142903, + "balance_loss_mlp": 1.06966054, + "epoch": 0.8843786071565987, + "flos": 835378612224.0, + "grad_norm": 0.03630937996165782, + "language_loss": 0.8742218, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.88565087, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.73242188, + "step": 4597, + "time_per_iteration": 3.0525734424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142043, + "balance_loss_mlp": 1.06875324, + "epoch": 0.8845709888418622, + "flos": 658178041344.0, + "grad_norm": 0.03258789526355552, + "language_loss": 0.86930513, + "learning_rate": 3.453603099349462e-05, + "loss": 0.88072556, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.73291016, + "step": 4598, + "time_per_iteration": 2.912843704223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141884, + "balance_loss_mlp": 1.06859386, + "epoch": 0.8847633705271258, + "flos": 524483480064.0, + "grad_norm": 0.03479113833885251, + "language_loss": 0.84803116, + "learning_rate": 3.442234519350823e-05, + "loss": 0.85944992, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.73291016, + "step": 4599, + "time_per_iteration": 2.7513442039489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114178, + "balance_loss_mlp": 1.06844211, + "epoch": 0.8849557522123894, + "flos": 549636873216.0, + "grad_norm": 0.03798845472112611, + "language_loss": 0.88343596, + "learning_rate": 3.430884014679786e-05, + "loss": 0.89485371, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.73339844, + "step": 4600, + "time_per_iteration": 2.665273666381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141776, + "balance_loss_mlp": 1.06848598, + "epoch": 0.8851481338976529, + "flos": 623583098880.0, + "grad_norm": 0.03350151892147519, + "language_loss": 0.88500738, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.89642519, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.73291016, + "step": 4601, + "time_per_iteration": 2.8266654014587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139622, + "balance_loss_mlp": 1.0663321, + "epoch": 0.8853405155829165, + "flos": 445307553792.0, + "grad_norm": 0.035348073668552936, + "language_loss": 0.85571676, + "learning_rate": 3.408237248940088e-05, + "loss": 0.86711299, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.73291016, + "step": 4602, + "time_per_iteration": 2.556607246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141214, + "balance_loss_mlp": 1.06816256, + "epoch": 0.8855328972681801, + "flos": 731748235776.0, + "grad_norm": 0.03825998754316307, + "language_loss": 0.82411921, + "learning_rate": 3.396940996663683e-05, + "loss": 0.83553129, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.73046875, + "step": 4603, + "time_per_iteration": 2.8917107582092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140786, + "balance_loss_mlp": 1.06763935, + "epoch": 0.8857252789534437, + "flos": 488355666432.0, + "grad_norm": 0.038685533641598824, + "language_loss": 0.83611298, + "learning_rate": 3.385662837299375e-05, + "loss": 0.84752083, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.73144531, + "step": 4604, + "time_per_iteration": 2.548560857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140966, + "balance_loss_mlp": 1.067819, + "epoch": 0.8859176606387072, + "flos": 509621575680.0, + "grad_norm": 0.042063998825786784, + "language_loss": 0.87247568, + "learning_rate": 3.374402775225727e-05, + "loss": 0.88388538, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.73144531, + "step": 4605, + "time_per_iteration": 2.7407033443450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139586, + "balance_loss_mlp": 1.06634402, + "epoch": 0.8861100423239707, + "flos": 517664512512.0, + "grad_norm": 0.03414528469711758, + "language_loss": 0.89563382, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.90702963, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.73242188, + "step": 4606, + "time_per_iteration": 2.652094602584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113943, + "balance_loss_mlp": 1.06623542, + "epoch": 0.8863024240092343, + "flos": 628109127168.0, + "grad_norm": 0.03551682642921559, + "language_loss": 0.83570439, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.84709865, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.73193359, + "step": 4607, + "time_per_iteration": 2.725616455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113989, + "balance_loss_mlp": 1.06679058, + "epoch": 0.8864948056944979, + "flos": 768297016320.0, + "grad_norm": 0.030729524445201942, + "language_loss": 0.87768084, + "learning_rate": 3.340731216429083e-05, + "loss": 0.88907969, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.73095703, + "step": 4608, + "time_per_iteration": 3.0135393142700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143433, + "balance_loss_mlp": 1.07171631, + "epoch": 0.8866871873797615, + "flos": 1505665171968.0, + "grad_norm": 0.005181000489045015, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79974389, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.71875, + "step": 4609, + "time_per_iteration": 4.8497114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139247, + "balance_loss_mlp": 1.06619585, + "epoch": 0.886879569065025, + "flos": 812927665152.0, + "grad_norm": 0.03659826934115083, + "language_loss": 0.86593419, + "learning_rate": 3.3183740769755e-05, + "loss": 0.87732661, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.73046875, + "step": 4610, + "time_per_iteration": 3.0547640323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143288, + "balance_loss_mlp": 1.07176208, + "epoch": 0.8870719507502886, + "flos": 1586223521280.0, + "grad_norm": 0.0047245300791828836, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.78054118, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.71679688, + "step": 4611, + "time_per_iteration": 4.970493316650391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140218, + "balance_loss_mlp": 1.06716621, + "epoch": 0.8872643324355521, + "flos": 635164411392.0, + "grad_norm": 0.0365799977682868, + "language_loss": 0.79885757, + "learning_rate": 3.296089431172811e-05, + "loss": 0.8102597, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.73046875, + "step": 4612, + "time_per_iteration": 2.800936698913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140152, + "balance_loss_mlp": 1.06705284, + "epoch": 0.8874567141208157, + "flos": 536783201280.0, + "grad_norm": 0.03880516552904762, + "language_loss": 0.88008845, + "learning_rate": 3.284974304209532e-05, + "loss": 0.89148998, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.73095703, + "step": 4613, + "time_per_iteration": 2.6119205951690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139668, + "balance_loss_mlp": 1.06652081, + "epoch": 0.8876490958060793, + "flos": 1568717389824.0, + "grad_norm": 0.03468157692994687, + "language_loss": 0.83196402, + "learning_rate": 3.27387731362766e-05, + "loss": 0.84336072, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.73144531, + "step": 4614, + "time_per_iteration": 3.8848578929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_mlp": 1.06754243, + "epoch": 0.8878414774913428, + "flos": 637797726720.0, + "grad_norm": 0.03727846125722482, + "language_loss": 0.90132129, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.91272724, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.73046875, + "step": 4615, + "time_per_iteration": 2.821709156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140764, + "balance_loss_mlp": 1.0677129, + "epoch": 0.8880338591766064, + "flos": 497421184512.0, + "grad_norm": 0.04463724567610171, + "language_loss": 0.86964601, + "learning_rate": 3.251737758834084e-05, + "loss": 0.88105357, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.73046875, + "step": 4616, + "time_per_iteration": 2.6447269916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.06804681, + "epoch": 0.88822624086187, + "flos": 543912345600.0, + "grad_norm": 0.03827212430271638, + "language_loss": 0.84569329, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.85710424, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.73046875, + "step": 4617, + "time_per_iteration": 2.6946191787719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141215, + "balance_loss_mlp": 1.06816316, + "epoch": 0.8884186225471336, + "flos": 552875805696.0, + "grad_norm": 0.042682461995962664, + "language_loss": 0.88825953, + "learning_rate": 3.229670801173418e-05, + "loss": 0.89967167, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.73046875, + "step": 4618, + "time_per_iteration": 2.617229700088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144875, + "balance_loss_mlp": 1.073349, + "epoch": 0.888611004232397, + "flos": 1568659170816.0, + "grad_norm": 0.003196569224435078, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79657078, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.71679688, + "step": 4619, + "time_per_iteration": 5.0100486278533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140753, + "balance_loss_mlp": 1.06770194, + "epoch": 0.8888033859176606, + "flos": 768436004352.0, + "grad_norm": 0.031145339209085954, + "language_loss": 0.86609745, + "learning_rate": 3.207676474914301e-05, + "loss": 0.877505, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.73046875, + "step": 4620, + "time_per_iteration": 3.0852935314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140488, + "balance_loss_mlp": 1.06738901, + "epoch": 0.8889957676029242, + "flos": 935648532480.0, + "grad_norm": 0.034367536832817513, + "language_loss": 0.88588071, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.89728558, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.73095703, + "step": 4621, + "time_per_iteration": 3.1627614498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140338, + "balance_loss_mlp": 1.06728625, + "epoch": 0.8891881492881878, + "flos": 590792272896.0, + "grad_norm": 0.03508210471851401, + "language_loss": 0.86302722, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.87443054, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.73046875, + "step": 4622, + "time_per_iteration": 2.8091282844543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141091, + "balance_loss_mlp": 1.06803989, + "epoch": 0.8893805309734514, + "flos": 541843714560.0, + "grad_norm": 0.040725276818425686, + "language_loss": 0.87760389, + "learning_rate": 3.174821244088466e-05, + "loss": 0.88901484, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.73046875, + "step": 4623, + "time_per_iteration": 2.712893486022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138455, + "balance_loss_mlp": 1.06530809, + "epoch": 0.8895729126587149, + "flos": 561168520704.0, + "grad_norm": 0.036429232224768356, + "language_loss": 0.86250001, + "learning_rate": 3.163905853111054e-05, + "loss": 0.87388456, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.73144531, + "step": 4624, + "time_per_iteration": 2.683321475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138327, + "balance_loss_mlp": 1.06522739, + "epoch": 0.8897652943439784, + "flos": 611280649728.0, + "grad_norm": 0.034860067275865936, + "language_loss": 0.85074407, + "learning_rate": 3.153008645517996e-05, + "loss": 0.86212736, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.73144531, + "step": 4625, + "time_per_iteration": 2.78021240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140004, + "balance_loss_mlp": 1.06685686, + "epoch": 0.889957676029242, + "flos": 919423670784.0, + "grad_norm": 0.0398902332567692, + "language_loss": 0.81782848, + "learning_rate": 3.142129625539969e-05, + "loss": 0.82922852, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.73144531, + "step": 4626, + "time_per_iteration": 3.2139408588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138118, + "balance_loss_mlp": 1.06501937, + "epoch": 0.8901500577145056, + "flos": 489686423040.0, + "grad_norm": 0.038017552561291156, + "language_loss": 0.85747802, + "learning_rate": 3.131268797400588e-05, + "loss": 0.86885923, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.73095703, + "step": 4627, + "time_per_iteration": 2.599820852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138395, + "balance_loss_mlp": 1.06520021, + "epoch": 0.8903424393997691, + "flos": 734913308160.0, + "grad_norm": 0.040511574906705955, + "language_loss": 0.84754193, + "learning_rate": 3.120426165316398e-05, + "loss": 0.85892582, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.73193359, + "step": 4628, + "time_per_iteration": 3.002224922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138975, + "balance_loss_mlp": 1.0660187, + "epoch": 0.8905348210850327, + "flos": 520883979264.0, + "grad_norm": 0.035652036973535486, + "language_loss": 0.86524069, + "learning_rate": 3.109601733496881e-05, + "loss": 0.87663043, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.73046875, + "step": 4629, + "time_per_iteration": 2.6983273029327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138867, + "balance_loss_mlp": 1.0656724, + "epoch": 0.8907272027702963, + "flos": 580198612992.0, + "grad_norm": 0.03507449840097698, + "language_loss": 0.84010351, + "learning_rate": 3.098795506144458e-05, + "loss": 0.85149217, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.73193359, + "step": 4630, + "time_per_iteration": 2.8263354301452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138725, + "balance_loss_mlp": 1.06567347, + "epoch": 0.8909195844555599, + "flos": 895114212864.0, + "grad_norm": 0.03741426633430978, + "language_loss": 0.83983612, + "learning_rate": 3.088007487454475e-05, + "loss": 0.85122335, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.73095703, + "step": 4631, + "time_per_iteration": 3.1382222175598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138682, + "balance_loss_mlp": 1.06548715, + "epoch": 0.8911119661408234, + "flos": 550948164096.0, + "grad_norm": 0.036182534075673435, + "language_loss": 0.89434344, + "learning_rate": 3.077237681615208e-05, + "loss": 0.90573025, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.73193359, + "step": 4632, + "time_per_iteration": 2.678633689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138526, + "balance_loss_mlp": 1.06533146, + "epoch": 0.8913043478260869, + "flos": 482164509696.0, + "grad_norm": 0.04328943324944268, + "language_loss": 0.89203089, + "learning_rate": 3.066486092807874e-05, + "loss": 0.90341616, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.73193359, + "step": 4633, + "time_per_iteration": 2.677217483520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138784, + "balance_loss_mlp": 1.06568491, + "epoch": 0.8914967295113505, + "flos": 485644488192.0, + "grad_norm": 0.03105234799668386, + "language_loss": 0.88713467, + "learning_rate": 3.055752725206601e-05, + "loss": 0.8985225, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.73144531, + "step": 4634, + "time_per_iteration": 2.649566411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113871, + "balance_loss_mlp": 1.06561065, + "epoch": 0.8916891111966141, + "flos": 446592648192.0, + "grad_norm": 0.03682675744399267, + "language_loss": 0.86206222, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.87344927, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.73095703, + "step": 4635, + "time_per_iteration": 2.5900418758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138801, + "balance_loss_mlp": 1.06560659, + "epoch": 0.8918814928818777, + "flos": 565078198272.0, + "grad_norm": 0.03804470729703714, + "language_loss": 0.82817543, + "learning_rate": 3.034340670283453e-05, + "loss": 0.83956349, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.73193359, + "step": 4636, + "time_per_iteration": 2.741692543029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137613, + "balance_loss_mlp": 1.06460917, + "epoch": 0.8920738745671412, + "flos": 577028811264.0, + "grad_norm": 0.032886435040047124, + "language_loss": 0.85431588, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.86569202, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.73046875, + "step": 4637, + "time_per_iteration": 2.67724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137852, + "balance_loss_mlp": 1.06470549, + "epoch": 0.8922662562524047, + "flos": 621314354688.0, + "grad_norm": 0.033521285935624826, + "language_loss": 0.88356864, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.89494717, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.73144531, + "step": 4638, + "time_per_iteration": 2.7949366569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137013, + "balance_loss_mlp": 1.06396186, + "epoch": 0.8924586379376683, + "flos": 584807233536.0, + "grad_norm": 0.03559045547501193, + "language_loss": 0.84122229, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.85259241, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.73046875, + "step": 4639, + "time_per_iteration": 2.7761454582214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137562, + "balance_loss_mlp": 1.06441486, + "epoch": 0.8926510196229319, + "flos": 526200274944.0, + "grad_norm": 0.03227679041862644, + "language_loss": 0.85516953, + "learning_rate": 2.991735397786538e-05, + "loss": 0.8665452, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.73144531, + "step": 4640, + "time_per_iteration": 2.7680256366729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137327, + "balance_loss_mlp": 1.06422806, + "epoch": 0.8928434013081955, + "flos": 487639259136.0, + "grad_norm": 0.040770764772957185, + "language_loss": 0.85741651, + "learning_rate": 2.981129694909146e-05, + "loss": 0.86878973, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.73095703, + "step": 4641, + "time_per_iteration": 2.579289674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140709, + "balance_loss_mlp": 1.06918335, + "epoch": 0.893035782993459, + "flos": 1451199478272.0, + "grad_norm": 0.004510853592179409, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81471562, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.71679688, + "step": 4642, + "time_per_iteration": 4.69758939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136696, + "balance_loss_mlp": 1.06345379, + "epoch": 0.8932281646787226, + "flos": 612444220416.0, + "grad_norm": 0.03833301661243837, + "language_loss": 0.86010414, + "learning_rate": 2.95997305629786e-05, + "loss": 0.87147105, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.73242188, + "step": 4643, + "time_per_iteration": 2.8750672340393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136763, + "balance_loss_mlp": 1.06352127, + "epoch": 0.8934205463639862, + "flos": 566827920384.0, + "grad_norm": 0.03653494632059431, + "language_loss": 0.89745998, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.90882766, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.73242188, + "step": 4644, + "time_per_iteration": 2.695143222808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136836, + "balance_loss_mlp": 1.06359375, + "epoch": 0.8936129280492497, + "flos": 489434643456.0, + "grad_norm": 0.042946143094068745, + "language_loss": 0.83516526, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.84653366, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.73242188, + "step": 4645, + "time_per_iteration": 2.6457924842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137122, + "balance_loss_mlp": 1.06388009, + "epoch": 0.8938053097345132, + "flos": 888074391552.0, + "grad_norm": 0.03223112269549949, + "language_loss": 0.84166312, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.85303438, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.73242188, + "step": 4646, + "time_per_iteration": 3.3025524616241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135843, + "balance_loss_mlp": 1.06260049, + "epoch": 0.8939976914197768, + "flos": 594432706560.0, + "grad_norm": 0.03742744544217847, + "language_loss": 0.88538921, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.89674759, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.73242188, + "step": 4647, + "time_per_iteration": 2.732344627380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136223, + "balance_loss_mlp": 1.06307614, + "epoch": 0.8941900731050404, + "flos": 524309563392.0, + "grad_norm": 0.041291033915724536, + "language_loss": 0.8619101, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.87327242, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.73144531, + "step": 4648, + "time_per_iteration": 2.6516520977020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136671, + "balance_loss_mlp": 1.0635246, + "epoch": 0.894382454790304, + "flos": 801927774720.0, + "grad_norm": 0.03416583650485881, + "language_loss": 0.85338318, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.86474991, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.73144531, + "step": 4649, + "time_per_iteration": 3.0272881984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136105, + "balance_loss_mlp": 1.06305349, + "epoch": 0.8945748364755676, + "flos": 480060950016.0, + "grad_norm": 0.033926472362053865, + "language_loss": 0.88941896, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.90078008, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.73046875, + "step": 4650, + "time_per_iteration": 2.660466194152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136079, + "balance_loss_mlp": 1.06293249, + "epoch": 0.894767218160831, + "flos": 509853889536.0, + "grad_norm": 0.038719839462236214, + "language_loss": 0.87774134, + "learning_rate": 2.876077330953042e-05, + "loss": 0.8891021, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.73144531, + "step": 4651, + "time_per_iteration": 2.9371914863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137185, + "balance_loss_mlp": 1.06408608, + "epoch": 0.8949595998460946, + "flos": 687063192576.0, + "grad_norm": 0.035863421919143566, + "language_loss": 0.8627305, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.87410235, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.73095703, + "step": 4652, + "time_per_iteration": 2.943850040435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113756, + "balance_loss_mlp": 1.0645082, + "epoch": 0.8951519815313582, + "flos": 801293959680.0, + "grad_norm": 0.037185614582169284, + "language_loss": 0.81720811, + "learning_rate": 2.855286269747981e-05, + "loss": 0.82858372, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.73046875, + "step": 4653, + "time_per_iteration": 3.2343595027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113662, + "balance_loss_mlp": 1.06347299, + "epoch": 0.8953443632166218, + "flos": 667935771648.0, + "grad_norm": 0.03649889337751892, + "language_loss": 0.90619528, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.91756141, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.73144531, + "step": 4654, + "time_per_iteration": 2.87142014503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113725, + "balance_loss_mlp": 1.06419849, + "epoch": 0.8955367449018854, + "flos": 646209964032.0, + "grad_norm": 0.036322322502662, + "language_loss": 0.8830961, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.89446861, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.73046875, + "step": 4655, + "time_per_iteration": 2.8817336559295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137331, + "balance_loss_mlp": 1.06423211, + "epoch": 0.8957291265871489, + "flos": 810162092544.0, + "grad_norm": 0.03904529135922208, + "language_loss": 0.82293046, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.83430374, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.73095703, + "step": 4656, + "time_per_iteration": 3.0670013427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137676, + "balance_loss_mlp": 1.06452966, + "epoch": 0.8959215082724125, + "flos": 519963456000.0, + "grad_norm": 0.036966986897206296, + "language_loss": 0.81371593, + "learning_rate": 2.813923817903391e-05, + "loss": 0.82509267, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.73144531, + "step": 4657, + "time_per_iteration": 2.6919400691986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137331, + "balance_loss_mlp": 1.06423163, + "epoch": 0.896113889957676, + "flos": 477911728128.0, + "grad_norm": 0.03989276240480501, + "language_loss": 0.82006389, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.83143717, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.73095703, + "step": 4658, + "time_per_iteration": 2.607644557952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137714, + "balance_loss_mlp": 1.06456733, + "epoch": 0.8963062716429396, + "flos": 519173188608.0, + "grad_norm": 0.033528793307819646, + "language_loss": 0.87108302, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.88246012, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.73144531, + "step": 4659, + "time_per_iteration": 2.6183245182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113748, + "balance_loss_mlp": 1.06433296, + "epoch": 0.8964986533282031, + "flos": 509502053376.0, + "grad_norm": 0.037292402625012336, + "language_loss": 0.86541545, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.87679029, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.73144531, + "step": 4660, + "time_per_iteration": 2.6836605072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137687, + "balance_loss_mlp": 1.06454027, + "epoch": 0.8966910350134667, + "flos": 537108840960.0, + "grad_norm": 0.04787249223130083, + "language_loss": 0.87312889, + "learning_rate": 2.77285447406756e-05, + "loss": 0.88450575, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.73144531, + "step": 4661, + "time_per_iteration": 2.6272199153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137712, + "balance_loss_mlp": 1.0647558, + "epoch": 0.8968834166987303, + "flos": 724497567744.0, + "grad_norm": 0.03914389932725733, + "language_loss": 0.88940513, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.90078223, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.72998047, + "step": 4662, + "time_per_iteration": 2.923161268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137766, + "balance_loss_mlp": 1.06495285, + "epoch": 0.8970757983839939, + "flos": 682947397632.0, + "grad_norm": 0.02836643100094979, + "language_loss": 0.87210166, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.88347936, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.72949219, + "step": 4663, + "time_per_iteration": 2.9333901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137571, + "balance_loss_mlp": 1.06451964, + "epoch": 0.8972681800692575, + "flos": 614157012480.0, + "grad_norm": 0.04594668712214378, + "language_loss": 0.82504487, + "learning_rate": 2.742244971856006e-05, + "loss": 0.83642054, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.73095703, + "step": 4664, + "time_per_iteration": 2.7572762966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136784, + "balance_loss_mlp": 1.06363738, + "epoch": 0.8974605617545209, + "flos": 573499167744.0, + "grad_norm": 0.03351248965112738, + "language_loss": 0.87172771, + "learning_rate": 2.732078493352913e-05, + "loss": 0.8830955, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.73144531, + "step": 4665, + "time_per_iteration": 2.7434494495391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.0637176, + "epoch": 0.8976529434397845, + "flos": 521507060736.0, + "grad_norm": 0.03367433914500393, + "language_loss": 0.92143202, + "learning_rate": 2.721930365884434e-05, + "loss": 0.93280119, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.73193359, + "step": 4666, + "time_per_iteration": 2.816922664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136738, + "balance_loss_mlp": 1.06359124, + "epoch": 0.8978453251250481, + "flos": 472282527744.0, + "grad_norm": 0.03434454323546124, + "language_loss": 0.8620975, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.87346482, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.73144531, + "step": 4667, + "time_per_iteration": 2.7096781730651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113737, + "balance_loss_mlp": 1.06441426, + "epoch": 0.8980377068103117, + "flos": 592821972480.0, + "grad_norm": 0.030419293496563398, + "language_loss": 0.86279666, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.8741703, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.72998047, + "step": 4668, + "time_per_iteration": 3.019211530685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.06427431, + "epoch": 0.8982300884955752, + "flos": 768950297088.0, + "grad_norm": 0.03649542042278363, + "language_loss": 0.87581873, + "learning_rate": 2.691596129049556e-05, + "loss": 0.88719249, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.73095703, + "step": 4669, + "time_per_iteration": 3.122833728790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.064219, + "epoch": 0.8984224701808388, + "flos": 846124721664.0, + "grad_norm": 0.0371250323019601, + "language_loss": 0.81804687, + "learning_rate": 2.681521445046775e-05, + "loss": 0.82941949, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.73046875, + "step": 4670, + "time_per_iteration": 3.369352340698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138035, + "balance_loss_mlp": 1.06484008, + "epoch": 0.8986148518661023, + "flos": 759099240960.0, + "grad_norm": 0.03474578852123265, + "language_loss": 0.80345845, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.81483877, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.73193359, + "step": 4671, + "time_per_iteration": 3.138118267059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113809, + "balance_loss_mlp": 1.06484783, + "epoch": 0.8988072335513659, + "flos": 564146941440.0, + "grad_norm": 0.03574727497124782, + "language_loss": 0.81828249, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.8296634, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.73242188, + "step": 4672, + "time_per_iteration": 2.695159673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136953, + "balance_loss_mlp": 1.06366277, + "epoch": 0.8989996152366295, + "flos": 493661228544.0, + "grad_norm": 0.03847010617944712, + "language_loss": 0.91765416, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.92902374, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.73291016, + "step": 4673, + "time_per_iteration": 2.5458626747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136749, + "balance_loss_mlp": 1.0634588, + "epoch": 0.899191996921893, + "flos": 543623635968.0, + "grad_norm": 0.04589399919654321, + "language_loss": 0.80554837, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.81691587, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.73291016, + "step": 4674, + "time_per_iteration": 2.642505168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136848, + "balance_loss_mlp": 1.06355786, + "epoch": 0.8993843786071566, + "flos": 472308724224.0, + "grad_norm": 0.03589158641039823, + "language_loss": 0.84531856, + "learning_rate": 2.631423662948984e-05, + "loss": 0.85668707, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.73291016, + "step": 4675, + "time_per_iteration": 2.6165904998779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136516, + "balance_loss_mlp": 1.0631305, + "epoch": 0.8995767602924202, + "flos": 527817739776.0, + "grad_norm": 0.0341476422766562, + "language_loss": 0.86405528, + "learning_rate": 2.621459261342196e-05, + "loss": 0.87542045, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.73388672, + "step": 4676, + "time_per_iteration": 2.719243049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137023, + "balance_loss_mlp": 1.06363761, + "epoch": 0.8997691419776838, + "flos": 558711123456.0, + "grad_norm": 0.0347905092588009, + "language_loss": 0.88358057, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.89495075, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.73388672, + "step": 4677, + "time_per_iteration": 2.7013773918151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136646, + "balance_loss_mlp": 1.06321299, + "epoch": 0.8999615236629472, + "flos": 640253122560.0, + "grad_norm": 0.03439496525861691, + "language_loss": 0.84559703, + "learning_rate": 2.601585643932436e-05, + "loss": 0.85696346, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.734375, + "step": 4678, + "time_per_iteration": 2.8610715866088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139397, + "balance_loss_mlp": 1.06768036, + "epoch": 0.9001539053482108, + "flos": 1434588578304.0, + "grad_norm": 0.0055187474782550615, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.8692342, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.71875, + "step": 4679, + "time_per_iteration": 4.780034780502319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136508, + "balance_loss_mlp": 1.06321776, + "epoch": 0.9003462870334744, + "flos": 568035151872.0, + "grad_norm": 0.039726434733231085, + "language_loss": 0.84240907, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.85377413, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.73291016, + "step": 4680, + "time_per_iteration": 2.8599278926849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137032, + "balance_loss_mlp": 1.06369436, + "epoch": 0.900538668718738, + "flos": 539705226240.0, + "grad_norm": 0.03640877309681453, + "language_loss": 0.82617021, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.83754051, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.73339844, + "step": 4681, + "time_per_iteration": 2.7729578018188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137597, + "balance_loss_mlp": 1.06435442, + "epoch": 0.9007310504040016, + "flos": 489352051200.0, + "grad_norm": 0.0368162628628219, + "language_loss": 0.90017235, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.91154826, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.73242188, + "step": 4682, + "time_per_iteration": 2.755814552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113733, + "balance_loss_mlp": 1.06403971, + "epoch": 0.9009234320892651, + "flos": 654140109312.0, + "grad_norm": 0.038557562175802175, + "language_loss": 0.83839279, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.84976614, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.73291016, + "step": 4683, + "time_per_iteration": 2.8485474586486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137175, + "balance_loss_mlp": 1.06393278, + "epoch": 0.9011158137745287, + "flos": 546638986752.0, + "grad_norm": 0.033729496474815886, + "language_loss": 0.89113462, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.90250635, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.73242188, + "step": 4684, + "time_per_iteration": 2.6239471435546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137283, + "balance_loss_mlp": 1.06404042, + "epoch": 0.9013081954597922, + "flos": 560786485248.0, + "grad_norm": 0.03771517464908619, + "language_loss": 0.87072444, + "learning_rate": 2.532607837883011e-05, + "loss": 0.88209724, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.73242188, + "step": 4685, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136586, + "balance_loss_mlp": 1.06320047, + "epoch": 0.9015005771450558, + "flos": 729942117888.0, + "grad_norm": 0.031716062736378385, + "language_loss": 0.84871745, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.86008328, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.73388672, + "step": 4686, + "time_per_iteration": 2.903815507888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113734, + "balance_loss_mlp": 1.06419337, + "epoch": 0.9016929588303193, + "flos": 518491709952.0, + "grad_norm": 0.037159626638255984, + "language_loss": 0.85474777, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.86612117, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.73144531, + "step": 4687, + "time_per_iteration": 2.7882134914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137561, + "balance_loss_mlp": 1.0643189, + "epoch": 0.9018853405155829, + "flos": 623554900992.0, + "grad_norm": 0.030497030476657076, + "language_loss": 0.90075636, + "learning_rate": 2.503322271810171e-05, + "loss": 0.91213191, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.73242188, + "step": 4688, + "time_per_iteration": 2.863872766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137148, + "balance_loss_mlp": 1.06381035, + "epoch": 0.9020777222008465, + "flos": 524337761280.0, + "grad_norm": 0.0356508664141184, + "language_loss": 0.82390887, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.8352803, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.73339844, + "step": 4689, + "time_per_iteration": 2.6352643966674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137658, + "balance_loss_mlp": 1.06451106, + "epoch": 0.9022701038861101, + "flos": 634893166080.0, + "grad_norm": 0.03217572249444131, + "language_loss": 0.85226208, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.8636387, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.73144531, + "step": 4690, + "time_per_iteration": 2.7964348793029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137305, + "balance_loss_mlp": 1.06411064, + "epoch": 0.9024624855713737, + "flos": 514332254208.0, + "grad_norm": 0.03518480344616928, + "language_loss": 0.8914479, + "learning_rate": 2.474202664305253e-05, + "loss": 0.90282094, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.73193359, + "step": 4691, + "time_per_iteration": 2.6292026042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113622, + "balance_loss_mlp": 1.06283426, + "epoch": 0.9026548672566371, + "flos": 478450215936.0, + "grad_norm": 0.034512274724425716, + "language_loss": 0.8996951, + "learning_rate": 2.464533025754673e-05, + "loss": 0.91105729, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.73388672, + "step": 4692, + "time_per_iteration": 2.6084232330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.0630908, + "epoch": 0.9028472489419007, + "flos": 663170698752.0, + "grad_norm": 0.04470517923282093, + "language_loss": 0.78629088, + "learning_rate": 2.454881842109058e-05, + "loss": 0.79765511, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.73339844, + "step": 4693, + "time_per_iteration": 2.81938099861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136126, + "balance_loss_mlp": 1.06288338, + "epoch": 0.9030396306271643, + "flos": 535619630592.0, + "grad_norm": 0.03960598704445331, + "language_loss": 0.87602615, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.88738739, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.73242188, + "step": 4694, + "time_per_iteration": 2.607726812362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135271, + "balance_loss_mlp": 1.06193364, + "epoch": 0.9032320123124279, + "flos": 802383670272.0, + "grad_norm": 0.03396233932640605, + "language_loss": 0.86772144, + "learning_rate": 2.43563485451328e-05, + "loss": 0.87907416, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.73339844, + "step": 4695, + "time_per_iteration": 2.946852684020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135645, + "balance_loss_mlp": 1.06221211, + "epoch": 0.9034243939976914, + "flos": 555025027584.0, + "grad_norm": 0.04144086028744623, + "language_loss": 0.81962967, + "learning_rate": 2.426039058035451e-05, + "loss": 0.83098608, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.734375, + "step": 4696, + "time_per_iteration": 2.6538476943969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137462, + "balance_loss_mlp": 1.06417239, + "epoch": 0.903616775682955, + "flos": 504895434240.0, + "grad_norm": 0.04262123189824164, + "language_loss": 0.88294876, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.89432335, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.73291016, + "step": 4697, + "time_per_iteration": 2.611482620239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011379, + "balance_loss_mlp": 1.06465781, + "epoch": 0.9038091573682185, + "flos": 437255884800.0, + "grad_norm": 0.03845558802533531, + "language_loss": 0.83261943, + "learning_rate": 2.406902878347017e-05, + "loss": 0.84399843, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.73242188, + "step": 4698, + "time_per_iteration": 2.6136317253112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137364, + "balance_loss_mlp": 1.0641216, + "epoch": 0.9040015390534821, + "flos": 533989430784.0, + "grad_norm": 0.043430243161230425, + "language_loss": 0.86828995, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.87966359, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.73242188, + "step": 4699, + "time_per_iteration": 2.619580030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137638, + "balance_loss_mlp": 1.06434846, + "epoch": 0.9041939207387457, + "flos": 565430034432.0, + "grad_norm": 0.038139504979905946, + "language_loss": 0.85678428, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.86816067, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.73291016, + "step": 4700, + "time_per_iteration": 2.775902509689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135693, + "balance_loss_mlp": 1.06230736, + "epoch": 0.9043863024240092, + "flos": 516520407552.0, + "grad_norm": 0.042725558603523235, + "language_loss": 0.8274883, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.83884525, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.73388672, + "step": 4701, + "time_per_iteration": 2.564540386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139694, + "balance_loss_mlp": 1.06797791, + "epoch": 0.9045786841092728, + "flos": 1280782946304.0, + "grad_norm": 0.004733647973715265, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.74069482, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.71875, + "step": 4702, + "time_per_iteration": 4.953817367553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135708, + "balance_loss_mlp": 1.0623225, + "epoch": 0.9047710657945364, + "flos": 586932260352.0, + "grad_norm": 0.037178314529548034, + "language_loss": 0.87704772, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.88840485, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.73388672, + "step": 4703, + "time_per_iteration": 2.657202959060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135469, + "balance_loss_mlp": 1.06203628, + "epoch": 0.9049634474798, + "flos": 573071470080.0, + "grad_norm": 0.03668762127255847, + "language_loss": 0.83756787, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.84892261, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.734375, + "step": 4704, + "time_per_iteration": 2.7138781547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136388, + "balance_loss_mlp": 1.06295526, + "epoch": 0.9051558291650635, + "flos": 573687820800.0, + "grad_norm": 0.04230135132201795, + "language_loss": 0.80652225, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.81788611, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.73388672, + "step": 4705, + "time_per_iteration": 2.652484655380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136762, + "balance_loss_mlp": 1.06342399, + "epoch": 0.905348210850327, + "flos": 541576472064.0, + "grad_norm": 0.03894186985703792, + "language_loss": 0.8417691, + "learning_rate": 2.331097546131783e-05, + "loss": 0.85313666, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.73339844, + "step": 4706, + "time_per_iteration": 2.646650791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136958, + "balance_loss_mlp": 1.0637157, + "epoch": 0.9055405925355906, + "flos": 517395268608.0, + "grad_norm": 0.03706201229587213, + "language_loss": 0.86367965, + "learning_rate": 2.321705121319956e-05, + "loss": 0.87504923, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.73242188, + "step": 4707, + "time_per_iteration": 2.578150510787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136926, + "balance_loss_mlp": 1.0636363, + "epoch": 0.9057329742208542, + "flos": 916221668352.0, + "grad_norm": 0.027988535833480262, + "language_loss": 0.8856324, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.89700168, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.73291016, + "step": 4708, + "time_per_iteration": 3.2058019638061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137016, + "balance_loss_mlp": 1.06363082, + "epoch": 0.9059253559061178, + "flos": 906776116224.0, + "grad_norm": 0.040646490031674046, + "language_loss": 0.87692308, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.88829321, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.73388672, + "step": 4709, + "time_per_iteration": 3.121534824371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136893, + "balance_loss_mlp": 1.06355548, + "epoch": 0.9061177375913813, + "flos": 665802012672.0, + "grad_norm": 0.05005893347039075, + "language_loss": 0.82001013, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.83137906, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.73339844, + "step": 4710, + "time_per_iteration": 2.845099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137048, + "balance_loss_mlp": 1.06366277, + "epoch": 0.9063101192766448, + "flos": 566778255360.0, + "grad_norm": 0.03738260666061765, + "language_loss": 0.87451136, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.88588178, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.73388672, + "step": 4711, + "time_per_iteration": 2.749617338180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136766, + "balance_loss_mlp": 1.06342876, + "epoch": 0.9065025009619084, + "flos": 728630827008.0, + "grad_norm": 0.03643976718461331, + "language_loss": 0.82941359, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.84078121, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.73339844, + "step": 4712, + "time_per_iteration": 2.8681652545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136904, + "balance_loss_mlp": 1.06370974, + "epoch": 0.906694882647172, + "flos": 532547884032.0, + "grad_norm": 0.041072095585484664, + "language_loss": 0.85065079, + "learning_rate": 2.265739417041418e-05, + "loss": 0.86201984, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.73193359, + "step": 4713, + "time_per_iteration": 2.6370742321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113708, + "balance_loss_mlp": 1.06388533, + "epoch": 0.9068872643324356, + "flos": 430695427584.0, + "grad_norm": 0.035065691956439445, + "language_loss": 0.89791685, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.90928769, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.73193359, + "step": 4714, + "time_per_iteration": 2.588728427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136841, + "balance_loss_mlp": 1.06350315, + "epoch": 0.9070796460176991, + "flos": 589454785536.0, + "grad_norm": 0.04403478134734124, + "language_loss": 0.84667605, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.85804451, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.73339844, + "step": 4715, + "time_per_iteration": 2.7413907051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136965, + "balance_loss_mlp": 1.06357956, + "epoch": 0.9072720277029627, + "flos": 572654505984.0, + "grad_norm": 0.03890461174208685, + "language_loss": 0.8084088, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.81977844, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.73388672, + "step": 4716, + "time_per_iteration": 2.7009265422821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136751, + "balance_loss_mlp": 1.06350887, + "epoch": 0.9074644093882263, + "flos": 556859343360.0, + "grad_norm": 0.035784983001337665, + "language_loss": 0.92963278, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.94100022, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.73242188, + "step": 4717, + "time_per_iteration": 2.658792734146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.06334293, + "epoch": 0.9076567910734898, + "flos": 642172032000.0, + "grad_norm": 0.035461342657685004, + "language_loss": 0.87066031, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.88202703, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.73339844, + "step": 4718, + "time_per_iteration": 2.774747371673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113687, + "balance_loss_mlp": 1.06353295, + "epoch": 0.9078491727587533, + "flos": 735456525312.0, + "grad_norm": 0.03318515468824905, + "language_loss": 0.86531991, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.8766886, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.73339844, + "step": 4719, + "time_per_iteration": 3.110316753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136868, + "balance_loss_mlp": 1.06357777, + "epoch": 0.9080415544440169, + "flos": 656020087296.0, + "grad_norm": 0.03252250742039747, + "language_loss": 0.90962839, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.92099708, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.73291016, + "step": 4720, + "time_per_iteration": 2.8136444091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137209, + "balance_loss_mlp": 1.06382358, + "epoch": 0.9082339361292805, + "flos": 598602895872.0, + "grad_norm": 0.03508499547859316, + "language_loss": 0.84060097, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.85197306, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.73388672, + "step": 4721, + "time_per_iteration": 2.728701114654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137056, + "balance_loss_mlp": 1.0636704, + "epoch": 0.9084263178145441, + "flos": 505425189888.0, + "grad_norm": 0.03720654975675441, + "language_loss": 0.89186943, + "learning_rate": 2.183042016731457e-05, + "loss": 0.90324003, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.73339844, + "step": 4722, + "time_per_iteration": 2.6093122959136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137238, + "balance_loss_mlp": 1.06380546, + "epoch": 0.9086186994998077, + "flos": 551106617856.0, + "grad_norm": 0.03925189384717369, + "language_loss": 0.84773749, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.85910988, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.734375, + "step": 4723, + "time_per_iteration": 2.706056594848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137174, + "balance_loss_mlp": 1.06393194, + "epoch": 0.9088110811850711, + "flos": 1135908395520.0, + "grad_norm": 0.031906636087630606, + "language_loss": 0.78563046, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.7970022, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.73242188, + "step": 4724, + "time_per_iteration": 3.5388522148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137122, + "balance_loss_mlp": 1.06388009, + "epoch": 0.9090034628703347, + "flos": 558059844096.0, + "grad_norm": 0.038899730458288276, + "language_loss": 0.81937408, + "learning_rate": 2.155810244111628e-05, + "loss": 0.83074534, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.73242188, + "step": 4725, + "time_per_iteration": 2.6709446907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136742, + "balance_loss_mlp": 1.06350017, + "epoch": 0.9091958445555983, + "flos": 545065182720.0, + "grad_norm": 0.034504955767497236, + "language_loss": 0.89321834, + "learning_rate": 2.146770131403658e-05, + "loss": 0.90458584, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.73242188, + "step": 4726, + "time_per_iteration": 2.685490608215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137086, + "balance_loss_mlp": 1.06379664, + "epoch": 0.9093882262408619, + "flos": 527140263936.0, + "grad_norm": 0.040107209375530216, + "language_loss": 0.86455953, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.87593037, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.73291016, + "step": 4727, + "time_per_iteration": 2.6698527336120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137417, + "balance_loss_mlp": 1.06417525, + "epoch": 0.9095806079261254, + "flos": 549571745280.0, + "grad_norm": 0.03978461900871093, + "language_loss": 0.86923885, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.88061309, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.73242188, + "step": 4728, + "time_per_iteration": 2.621840476989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113738, + "balance_loss_mlp": 1.06413734, + "epoch": 0.909772989611389, + "flos": 573640157184.0, + "grad_norm": 0.036584315059023036, + "language_loss": 0.89296705, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.90434086, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.73242188, + "step": 4729, + "time_per_iteration": 2.700291395187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136817, + "balance_loss_mlp": 1.06352687, + "epoch": 0.9099653712966526, + "flos": 562881312768.0, + "grad_norm": 0.0404955741903976, + "language_loss": 0.85047817, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.86184633, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.73291016, + "step": 4730, + "time_per_iteration": 2.678140640258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136149, + "balance_loss_mlp": 1.06271577, + "epoch": 0.9101577529819161, + "flos": 1095497601024.0, + "grad_norm": 0.03929606258638513, + "language_loss": 0.84986031, + "learning_rate": 2.101848311877069e-05, + "loss": 0.86122179, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.734375, + "step": 4731, + "time_per_iteration": 3.3611509799957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135878, + "balance_loss_mlp": 1.06249321, + "epoch": 0.9103501346671797, + "flos": 446360334336.0, + "grad_norm": 0.04307227071554131, + "language_loss": 0.87402189, + "learning_rate": 2.092919721190678e-05, + "loss": 0.88538074, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.73388672, + "step": 4732, + "time_per_iteration": 2.5086095333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135658, + "balance_loss_mlp": 1.06227303, + "epoch": 0.9105425163524432, + "flos": 501812954112.0, + "grad_norm": 0.03966317690451211, + "language_loss": 0.8330757, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.84443229, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.73388672, + "step": 4733, + "time_per_iteration": 2.6233813762664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011355, + "balance_loss_mlp": 1.06211519, + "epoch": 0.9107348980377068, + "flos": 658774926336.0, + "grad_norm": 0.0391604867021726, + "language_loss": 0.88541472, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.89676976, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.73388672, + "step": 4734, + "time_per_iteration": 2.8359274864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137103, + "balance_loss_mlp": 1.06381297, + "epoch": 0.9109272797229704, + "flos": 554718853632.0, + "grad_norm": 0.03421844082243491, + "language_loss": 0.8903842, + "learning_rate": 2.066245558029256e-05, + "loss": 0.90175527, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.73291016, + "step": 4735, + "time_per_iteration": 2.6057238578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06352627, + "epoch": 0.911119661408234, + "flos": 520011119616.0, + "grad_norm": 0.03846629204542353, + "language_loss": 0.89047289, + "learning_rate": 2.057391384781182e-05, + "loss": 0.90184104, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.73291016, + "step": 4736, + "time_per_iteration": 2.633537530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136729, + "balance_loss_mlp": 1.06348717, + "epoch": 0.9113120430934974, + "flos": 555435260928.0, + "grad_norm": 0.039830009072267566, + "language_loss": 0.87907994, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.89044726, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.73242188, + "step": 4737, + "time_per_iteration": 2.6331467628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136847, + "balance_loss_mlp": 1.06350923, + "epoch": 0.911504424778761, + "flos": 502957059072.0, + "grad_norm": 0.03552190117680254, + "language_loss": 0.85479963, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.86616814, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.73339844, + "step": 4738, + "time_per_iteration": 2.657090663909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137184, + "balance_loss_mlp": 1.06394231, + "epoch": 0.9116968064640246, + "flos": 612211906560.0, + "grad_norm": 0.03175859953298452, + "language_loss": 0.85633034, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.86770225, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.73242188, + "step": 4739, + "time_per_iteration": 2.7278242111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137098, + "balance_loss_mlp": 1.06385577, + "epoch": 0.9118891881492882, + "flos": 574094051328.0, + "grad_norm": 0.029792698419162895, + "language_loss": 0.86312258, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.87449354, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.73242188, + "step": 4740, + "time_per_iteration": 2.763500452041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113712, + "balance_loss_mlp": 1.06378198, + "epoch": 0.9120815698345518, + "flos": 637172643840.0, + "grad_norm": 0.034763930832622233, + "language_loss": 0.82391727, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.83528852, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.73339844, + "step": 4741, + "time_per_iteration": 2.817636489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137084, + "balance_loss_mlp": 1.06384206, + "epoch": 0.9122739515198153, + "flos": 703555296768.0, + "grad_norm": 0.038607205451932886, + "language_loss": 0.90239573, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.91376662, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.73242188, + "step": 4742, + "time_per_iteration": 2.8458170890808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138421, + "balance_loss_mlp": 1.06508315, + "epoch": 0.9124663332050789, + "flos": 525716181504.0, + "grad_norm": 0.04055009874504829, + "language_loss": 0.93180835, + "learning_rate": 1.995933526832239e-05, + "loss": 0.9431926, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.73291016, + "step": 4743, + "time_per_iteration": 2.59576678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138396, + "balance_loss_mlp": 1.06501067, + "epoch": 0.9126587148903424, + "flos": 564370523136.0, + "grad_norm": 0.03672916386573753, + "language_loss": 0.8672806, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.87866455, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.73339844, + "step": 4744, + "time_per_iteration": 2.640869379043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137273, + "balance_loss_mlp": 1.06398344, + "epoch": 0.912851096575606, + "flos": 506933865984.0, + "grad_norm": 0.03925828506694119, + "language_loss": 0.84253651, + "learning_rate": 1.978541819374574e-05, + "loss": 0.85390925, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.73291016, + "step": 4745, + "time_per_iteration": 2.6787405014038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.06389141, + "epoch": 0.9130434782608695, + "flos": 551768630784.0, + "grad_norm": 0.03898701708502903, + "language_loss": 0.87371671, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.88508856, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.73291016, + "step": 4746, + "time_per_iteration": 2.6251258850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137732, + "balance_loss_mlp": 1.064538, + "epoch": 0.9132358599461331, + "flos": 469935920640.0, + "grad_norm": 0.037506103614932354, + "language_loss": 0.87836325, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.88974053, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.73193359, + "step": 4747, + "time_per_iteration": 2.536179542541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137446, + "balance_loss_mlp": 1.06415629, + "epoch": 0.9134282416313967, + "flos": 507101051904.0, + "grad_norm": 0.038265188221768345, + "language_loss": 0.84132433, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.8526988, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.73291016, + "step": 4748, + "time_per_iteration": 2.6317527294158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.06479192, + "epoch": 0.9136206233166603, + "flos": 605938157568.0, + "grad_norm": 0.038780374815894, + "language_loss": 0.88831162, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.89969194, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.73242188, + "step": 4749, + "time_per_iteration": 2.7483599185943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113715, + "balance_loss_mlp": 1.06390798, + "epoch": 0.9138130050019239, + "flos": 562824916992.0, + "grad_norm": 0.03593036056465836, + "language_loss": 0.87584126, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.88721275, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.73242188, + "step": 4750, + "time_per_iteration": 2.6593310832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137274, + "balance_loss_mlp": 1.06398451, + "epoch": 0.9140053866871873, + "flos": 691344172032.0, + "grad_norm": 0.033756057406165677, + "language_loss": 0.94630772, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.95768046, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.73291016, + "step": 4751, + "time_per_iteration": 2.8558006286621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136083, + "balance_loss_mlp": 1.06279266, + "epoch": 0.9141977683724509, + "flos": 552129199104.0, + "grad_norm": 0.0351497110671635, + "language_loss": 0.88143069, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.89279151, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.73291016, + "step": 4752, + "time_per_iteration": 2.6890971660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135831, + "balance_loss_mlp": 1.06258917, + "epoch": 0.9143901500577145, + "flos": 541120576512.0, + "grad_norm": 0.039948380347975404, + "language_loss": 0.80258191, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.81394029, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.73242188, + "step": 4753, + "time_per_iteration": 2.660975217819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135888, + "balance_loss_mlp": 1.06264615, + "epoch": 0.9145825317429781, + "flos": 529793044992.0, + "grad_norm": 0.03802405513720637, + "language_loss": 0.85889542, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.87025428, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.73242188, + "step": 4754, + "time_per_iteration": 2.6234130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135989, + "balance_loss_mlp": 1.06269932, + "epoch": 0.9147749134282416, + "flos": 515812732416.0, + "grad_norm": 0.0330610975308954, + "language_loss": 0.83169824, + "learning_rate": 1.892702433097776e-05, + "loss": 0.84305817, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.73291016, + "step": 4755, + "time_per_iteration": 2.6349074840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136034, + "balance_loss_mlp": 1.06293452, + "epoch": 0.9149672951135052, + "flos": 515513289216.0, + "grad_norm": 0.03561497864158172, + "language_loss": 0.90493286, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.91629314, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.73095703, + "step": 4756, + "time_per_iteration": 2.672971725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137657, + "balance_loss_mlp": 1.06441462, + "epoch": 0.9151596767987688, + "flos": 578227310592.0, + "grad_norm": 0.0357639019467354, + "language_loss": 0.86071813, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.87209469, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.73242188, + "step": 4757, + "time_per_iteration": 2.7354896068573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113794, + "balance_loss_mlp": 1.06484115, + "epoch": 0.9153520584840323, + "flos": 620476423680.0, + "grad_norm": 0.033473586287839016, + "language_loss": 0.87076652, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.88214588, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.73095703, + "step": 4758, + "time_per_iteration": 2.744753122329712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137943, + "balance_loss_mlp": 1.06470096, + "epoch": 0.9155444401692959, + "flos": 469862060544.0, + "grad_norm": 0.03673386334031248, + "language_loss": 0.87150836, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.88288778, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.73242188, + "step": 4759, + "time_per_iteration": 2.589590311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142845, + "balance_loss_mlp": 1.07131958, + "epoch": 0.9157368218545594, + "flos": 1413839689728.0, + "grad_norm": 0.005825750154504474, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75961918, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.71679688, + "step": 4760, + "time_per_iteration": 4.916935682296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143028, + "balance_loss_mlp": 1.07150269, + "epoch": 0.915929203539823, + "flos": 1525324349952.0, + "grad_norm": 0.00593786079998211, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.8071909, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.71679688, + "step": 4761, + "time_per_iteration": 4.881082534790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136307, + "balance_loss_mlp": 1.06306517, + "epoch": 0.9161215852250866, + "flos": 536846327808.0, + "grad_norm": 0.03600435736689933, + "language_loss": 0.85723937, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.86860245, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.73242188, + "step": 4762, + "time_per_iteration": 2.6991817951202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011356, + "balance_loss_mlp": 1.06245291, + "epoch": 0.9163139669103502, + "flos": 591725531136.0, + "grad_norm": 0.04011016842573452, + "language_loss": 0.86041784, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.87177384, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.73144531, + "step": 4763, + "time_per_iteration": 2.699273109436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134482, + "balance_loss_mlp": 1.06114411, + "epoch": 0.9165063485956138, + "flos": 823371603456.0, + "grad_norm": 0.03153796906678494, + "language_loss": 0.88287377, + "learning_rate": 1.817043762598397e-05, + "loss": 0.89421856, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.73339844, + "step": 4764, + "time_per_iteration": 3.0631844997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113468, + "balance_loss_mlp": 1.0613898, + "epoch": 0.9166987302808772, + "flos": 526245937152.0, + "grad_norm": 0.03701950876229616, + "language_loss": 0.87147516, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.88282192, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.73291016, + "step": 4765, + "time_per_iteration": 2.6542019844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.06152701, + "epoch": 0.9168911119661408, + "flos": 656345726976.0, + "grad_norm": 0.033448675815540965, + "language_loss": 0.88564223, + "learning_rate": 1.800436410449058e-05, + "loss": 0.89699042, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.73291016, + "step": 4766, + "time_per_iteration": 2.9484171867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134985, + "balance_loss_mlp": 1.06174314, + "epoch": 0.9170834936514044, + "flos": 492721239552.0, + "grad_norm": 0.03145874781003063, + "language_loss": 0.89064819, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.90199804, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.73242188, + "step": 4767, + "time_per_iteration": 2.54239821434021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134486, + "balance_loss_mlp": 1.06124353, + "epoch": 0.917275875336668, + "flos": 629179372032.0, + "grad_norm": 0.03937996598674544, + "language_loss": 0.85276043, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.86410534, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.73242188, + "step": 4768, + "time_per_iteration": 2.788365125656128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139496, + "balance_loss_mlp": 1.06777954, + "epoch": 0.9174682570219315, + "flos": 1521212557824.0, + "grad_norm": 0.003465998436582984, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79319733, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.71875, + "step": 4769, + "time_per_iteration": 4.939180850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134868, + "balance_loss_mlp": 1.06157768, + "epoch": 0.917660638707195, + "flos": 561112124928.0, + "grad_norm": 0.03362556891440619, + "language_loss": 0.8936972, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.90504587, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.73291016, + "step": 4770, + "time_per_iteration": 2.734116315841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113516, + "balance_loss_mlp": 1.06186974, + "epoch": 0.9178530203924586, + "flos": 448175184384.0, + "grad_norm": 0.03565950552809895, + "language_loss": 0.88209128, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.89344281, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.73291016, + "step": 4771, + "time_per_iteration": 2.482034683227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134749, + "balance_loss_mlp": 1.06150699, + "epoch": 0.9180454020777222, + "flos": 466974964224.0, + "grad_norm": 0.033285195978275374, + "language_loss": 0.83965075, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.85099828, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.73242188, + "step": 4772, + "time_per_iteration": 2.5653374195098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_mlp": 1.06174767, + "epoch": 0.9182377837629858, + "flos": 597484987392.0, + "grad_norm": 0.03234819221060202, + "language_loss": 0.91231674, + "learning_rate": 1.74290029706784e-05, + "loss": 0.92366672, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.73242188, + "step": 4773, + "time_per_iteration": 2.758915901184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134829, + "balance_loss_mlp": 1.06139612, + "epoch": 0.9184301654482493, + "flos": 998360552448.0, + "grad_norm": 0.03268667368696316, + "language_loss": 0.87101263, + "learning_rate": 1.734755767142876e-05, + "loss": 0.88236094, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.734375, + "step": 4774, + "time_per_iteration": 3.328178644180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134649, + "balance_loss_mlp": 1.06140733, + "epoch": 0.9186225471335129, + "flos": 509901553152.0, + "grad_norm": 0.029942945001472855, + "language_loss": 0.87889773, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.89024425, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.73242188, + "step": 4775, + "time_per_iteration": 2.658120632171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134933, + "balance_loss_mlp": 1.06164348, + "epoch": 0.9188149288187765, + "flos": 942076732416.0, + "grad_norm": 0.03844935783294636, + "language_loss": 0.83205068, + "learning_rate": 1.718522925136551e-05, + "loss": 0.8434, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.73291016, + "step": 4776, + "time_per_iteration": 3.2743020057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134929, + "balance_loss_mlp": 1.06173444, + "epoch": 0.91900731050404, + "flos": 584763572736.0, + "grad_norm": 0.03633610266670935, + "language_loss": 0.87877005, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.89011931, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.73193359, + "step": 4777, + "time_per_iteration": 2.6747422218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136109, + "balance_loss_mlp": 1.06277132, + "epoch": 0.9191996921893035, + "flos": 582306175488.0, + "grad_norm": 0.04168169923395777, + "language_loss": 0.85453916, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.86590028, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.73339844, + "step": 4778, + "time_per_iteration": 2.6795010566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136418, + "balance_loss_mlp": 1.06317592, + "epoch": 0.9193920738745671, + "flos": 910416549888.0, + "grad_norm": 0.03761875549388394, + "language_loss": 0.84188634, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.8532505, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.73242188, + "step": 4779, + "time_per_iteration": 3.1361474990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142517, + "balance_loss_mlp": 1.07080078, + "epoch": 0.9195844555598307, + "flos": 1561644819456.0, + "grad_norm": 0.005775441395861982, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.8093791, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.71875, + "step": 4780, + "time_per_iteration": 4.66200065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136054, + "balance_loss_mlp": 1.06271684, + "epoch": 0.9197768372450943, + "flos": 475017901056.0, + "grad_norm": 0.042723214120450784, + "language_loss": 0.83727241, + "learning_rate": 1.678268904252317e-05, + "loss": 0.84863299, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.73339844, + "step": 4781, + "time_per_iteration": 2.5478897094726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134824, + "balance_loss_mlp": 1.06143892, + "epoch": 0.9199692189303579, + "flos": 858596358144.0, + "grad_norm": 0.044037253062345634, + "language_loss": 0.89346141, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.90480959, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.73388672, + "step": 4782, + "time_per_iteration": 3.2057340145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134616, + "balance_loss_mlp": 1.06137359, + "epoch": 0.9201616006156214, + "flos": 505379527680.0, + "grad_norm": 0.03661647161350629, + "language_loss": 0.82697654, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.83832264, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.73242188, + "step": 4783, + "time_per_iteration": 2.646583080291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134557, + "balance_loss_mlp": 1.06117201, + "epoch": 0.9203539823008849, + "flos": 549895383552.0, + "grad_norm": 0.04183695528673719, + "language_loss": 0.89185143, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.90319705, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.73388672, + "step": 4784, + "time_per_iteration": 2.70615816116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135056, + "balance_loss_mlp": 1.06176567, + "epoch": 0.9205463639861485, + "flos": 541072912896.0, + "grad_norm": 0.038118566916411155, + "language_loss": 0.86795676, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.87930727, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.73291016, + "step": 4785, + "time_per_iteration": 2.640362501144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134779, + "balance_loss_mlp": 1.06144154, + "epoch": 0.9207387456714121, + "flos": 801161702400.0, + "grad_norm": 0.03691419431117059, + "language_loss": 0.82699919, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.83834696, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.73339844, + "step": 4786, + "time_per_iteration": 3.035921573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134791, + "balance_loss_mlp": 1.06150079, + "epoch": 0.9209311273566756, + "flos": 503816457216.0, + "grad_norm": 0.03887199086882918, + "language_loss": 0.8393299, + "learning_rate": 1.630583198044333e-05, + "loss": 0.85067785, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.73291016, + "step": 4787, + "time_per_iteration": 2.648547887802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136404, + "balance_loss_mlp": 1.06316197, + "epoch": 0.9211235090419392, + "flos": 570383760384.0, + "grad_norm": 0.034570845531176744, + "language_loss": 0.86524636, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.8766104, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.73242188, + "step": 4788, + "time_per_iteration": 2.6737005710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.0629878, + "epoch": 0.9213158907272028, + "flos": 807930278400.0, + "grad_norm": 0.038736420027196794, + "language_loss": 0.88138419, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.89274597, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.73193359, + "step": 4789, + "time_per_iteration": 2.984248161315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136426, + "balance_loss_mlp": 1.06308794, + "epoch": 0.9215082724124664, + "flos": 491650994688.0, + "grad_norm": 0.03447141076986377, + "language_loss": 0.80724669, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.81861091, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.73339844, + "step": 4790, + "time_per_iteration": 2.5614049434661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139565, + "balance_loss_mlp": 1.06765747, + "epoch": 0.9217006540977299, + "flos": 1517893761024.0, + "grad_norm": 0.0033789664426223543, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78209823, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.72070312, + "step": 4791, + "time_per_iteration": 4.978902578353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134776, + "balance_loss_mlp": 1.06153357, + "epoch": 0.9218930357829934, + "flos": 745086001152.0, + "grad_norm": 0.03665734830285374, + "language_loss": 0.809376, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.82072377, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.73242188, + "step": 4792, + "time_per_iteration": 2.9215128421783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134751, + "balance_loss_mlp": 1.06146133, + "epoch": 0.922085417468257, + "flos": 453973572096.0, + "grad_norm": 0.04198200068683094, + "language_loss": 0.85471809, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.86606556, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.73291016, + "step": 4793, + "time_per_iteration": 2.502872943878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134773, + "balance_loss_mlp": 1.06148362, + "epoch": 0.9222777991535206, + "flos": 501237536256.0, + "grad_norm": 0.04225847617164951, + "language_loss": 0.89807576, + "learning_rate": 1.575804349061616e-05, + "loss": 0.90942347, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.73291016, + "step": 4794, + "time_per_iteration": 2.576061964035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134907, + "balance_loss_mlp": 1.06147456, + "epoch": 0.9224701808387842, + "flos": 528983311872.0, + "grad_norm": 0.03721796107962599, + "language_loss": 0.8360222, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.84737134, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.734375, + "step": 4795, + "time_per_iteration": 2.583193778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134896, + "balance_loss_mlp": 1.06160617, + "epoch": 0.9226625625240477, + "flos": 876117047808.0, + "grad_norm": 0.03443008595735349, + "language_loss": 0.79559839, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.80694729, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.73291016, + "step": 4796, + "time_per_iteration": 3.1217310428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134541, + "balance_loss_mlp": 1.06125164, + "epoch": 0.9228549442093112, + "flos": 503760061440.0, + "grad_norm": 0.036776332050838995, + "language_loss": 0.92655843, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.93790388, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.73291016, + "step": 4797, + "time_per_iteration": 2.5615105628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134953, + "balance_loss_mlp": 1.06161523, + "epoch": 0.9230473258945748, + "flos": 601125421056.0, + "grad_norm": 0.033291935221544965, + "language_loss": 0.89235032, + "learning_rate": 1.544915681564829e-05, + "loss": 0.90369982, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.73339844, + "step": 4798, + "time_per_iteration": 2.877967596054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134901, + "balance_loss_mlp": 1.06165874, + "epoch": 0.9232397075798384, + "flos": 823875162624.0, + "grad_norm": 0.038339368705079924, + "language_loss": 0.84685349, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.85820246, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.73242188, + "step": 4799, + "time_per_iteration": 3.0926709175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135028, + "balance_loss_mlp": 1.06173778, + "epoch": 0.923432089265102, + "flos": 708274707456.0, + "grad_norm": 0.03568827047974618, + "language_loss": 0.89519256, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.9065429, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.73291016, + "step": 4800, + "time_per_iteration": 2.881060838699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136822, + "balance_loss_mlp": 1.06362712, + "epoch": 0.9236244709503655, + "flos": 703090669056.0, + "grad_norm": 0.0411673786427115, + "language_loss": 0.82487589, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.83624411, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.73193359, + "step": 4801, + "time_per_iteration": 2.840782403945923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136403, + "balance_loss_mlp": 1.0632081, + "epoch": 0.9238168526356291, + "flos": 516081976320.0, + "grad_norm": 0.03540606312834152, + "language_loss": 0.88255292, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.89391702, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.73193359, + "step": 4802, + "time_per_iteration": 2.6457712650299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136586, + "balance_loss_mlp": 1.06334352, + "epoch": 0.9240092343208927, + "flos": 492964286976.0, + "grad_norm": 0.04044553968836264, + "language_loss": 0.86154222, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.87290812, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.73242188, + "step": 4803, + "time_per_iteration": 2.580083131790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135778, + "balance_loss_mlp": 1.06239247, + "epoch": 0.9242016160061562, + "flos": 648435047424.0, + "grad_norm": 0.0375252651835897, + "language_loss": 0.78042829, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.79178602, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.73388672, + "step": 4804, + "time_per_iteration": 2.8536152839660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136397, + "balance_loss_mlp": 1.0632025, + "epoch": 0.9243939976914197, + "flos": 730778047488.0, + "grad_norm": 0.037173114265174334, + "language_loss": 0.84313226, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.85449624, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.73193359, + "step": 4805, + "time_per_iteration": 2.968522310256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.0618844, + "epoch": 0.9245863793766833, + "flos": 453209501184.0, + "grad_norm": 0.03394031409690086, + "language_loss": 0.94972181, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.96107405, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.73339844, + "step": 4806, + "time_per_iteration": 2.591217517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135015, + "balance_loss_mlp": 1.06162941, + "epoch": 0.9247787610619469, + "flos": 756365869056.0, + "grad_norm": 0.03881181193239194, + "language_loss": 0.82753104, + "learning_rate": 1.476516966469732e-05, + "loss": 0.83888113, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.73388672, + "step": 4807, + "time_per_iteration": 2.9434964656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135205, + "balance_loss_mlp": 1.06186795, + "epoch": 0.9249711427472105, + "flos": 563083427328.0, + "grad_norm": 0.034947383902908004, + "language_loss": 0.89372003, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.90507203, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.73339844, + "step": 4808, + "time_per_iteration": 2.770357370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134947, + "balance_loss_mlp": 1.06156158, + "epoch": 0.9251635244324741, + "flos": 527780809728.0, + "grad_norm": 0.03910850874583782, + "language_loss": 0.89453298, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.90588242, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.73388672, + "step": 4809, + "time_per_iteration": 2.6631083488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135004, + "balance_loss_mlp": 1.0615716, + "epoch": 0.9253559061177375, + "flos": 612479149056.0, + "grad_norm": 0.03802586190927124, + "language_loss": 0.83715951, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.84850955, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.734375, + "step": 4810, + "time_per_iteration": 2.8262386322021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139725, + "balance_loss_mlp": 1.06800842, + "epoch": 0.9255482878030011, + "flos": 1554461280768.0, + "grad_norm": 0.004137695643331225, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.78065115, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.71875, + "step": 4811, + "time_per_iteration": 4.7207818031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.06151867, + "epoch": 0.9257406694882647, + "flos": 767802189312.0, + "grad_norm": 0.03858144301478165, + "language_loss": 0.85714322, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.86849177, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.73339844, + "step": 4812, + "time_per_iteration": 3.0623562335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136229, + "balance_loss_mlp": 1.06298673, + "epoch": 0.9259330511735283, + "flos": 498966790656.0, + "grad_norm": 0.03833501566517131, + "language_loss": 0.8808893, + "learning_rate": 1.431765421986686e-05, + "loss": 0.89225155, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.73242188, + "step": 4813, + "time_per_iteration": 2.6300573348999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136541, + "balance_loss_mlp": 1.06339419, + "epoch": 0.9261254328587919, + "flos": 628015801344.0, + "grad_norm": 0.036925045587933254, + "language_loss": 0.8380208, + "learning_rate": 1.424372809925273e-05, + "loss": 0.84938622, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.73144531, + "step": 4814, + "time_per_iteration": 2.739515542984009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136367, + "balance_loss_mlp": 1.06312442, + "epoch": 0.9263178145440554, + "flos": 598492105728.0, + "grad_norm": 0.036427674031464095, + "language_loss": 0.89815581, + "learning_rate": 1.416999056594831e-05, + "loss": 0.90951943, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.73242188, + "step": 4815, + "time_per_iteration": 2.7244396209716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113381, + "balance_loss_mlp": 1.06042469, + "epoch": 0.926510196229319, + "flos": 389416502784.0, + "grad_norm": 0.03761333342393075, + "language_loss": 0.88639969, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.8977378, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.73388672, + "step": 4816, + "time_per_iteration": 2.497323513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134114, + "balance_loss_mlp": 1.06082404, + "epoch": 0.9267025779145825, + "flos": 546862568448.0, + "grad_norm": 0.04104132157625523, + "language_loss": 0.8884635, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.89980459, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.73291016, + "step": 4817, + "time_per_iteration": 2.657047986984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134159, + "balance_loss_mlp": 1.06086874, + "epoch": 0.9268949595998461, + "flos": 500790372864.0, + "grad_norm": 0.03579000656747544, + "language_loss": 0.86026472, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.87160635, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.73291016, + "step": 4818, + "time_per_iteration": 2.6788973808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135314, + "balance_loss_mlp": 1.06202364, + "epoch": 0.9270873412851096, + "flos": 433738976256.0, + "grad_norm": 0.03546119064203232, + "language_loss": 0.86793125, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.87928438, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.73291016, + "step": 4819, + "time_per_iteration": 2.6300439834594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135322, + "balance_loss_mlp": 1.06217515, + "epoch": 0.9272797229703732, + "flos": 467802161664.0, + "grad_norm": 0.039403892128954024, + "language_loss": 0.9138974, + "learning_rate": 1.380413270847164e-05, + "loss": 0.92525059, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.73144531, + "step": 4820, + "time_per_iteration": 2.6474528312683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134919, + "balance_loss_mlp": 1.06172454, + "epoch": 0.9274721046556368, + "flos": 706249737216.0, + "grad_norm": 0.036493835710477124, + "language_loss": 0.83149821, + "learning_rate": 1.373152729763938e-05, + "loss": 0.84284735, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.73193359, + "step": 4821, + "time_per_iteration": 3.0488803386688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140076, + "balance_loss_mlp": 1.06835938, + "epoch": 0.9276644863409004, + "flos": 1405342858752.0, + "grad_norm": 0.0042348399486481225, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83520538, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.71875, + "step": 4822, + "time_per_iteration": 4.881706237792969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113384, + "balance_loss_mlp": 1.06054974, + "epoch": 0.927856868026164, + "flos": 743136892416.0, + "grad_norm": 0.036665981072277615, + "language_loss": 0.84963113, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.86096954, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.73291016, + "step": 4823, + "time_per_iteration": 3.027317523956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133668, + "balance_loss_mlp": 1.06028235, + "epoch": 0.9280492497114274, + "flos": 413122344960.0, + "grad_norm": 0.044707757388090734, + "language_loss": 0.79886949, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.81020617, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.73388672, + "step": 4824, + "time_per_iteration": 2.4648141860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133648, + "balance_loss_mlp": 1.06035805, + "epoch": 0.928241631396691, + "flos": 647664245760.0, + "grad_norm": 0.0394631241951201, + "language_loss": 0.90115678, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.91249329, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.73291016, + "step": 4825, + "time_per_iteration": 2.8606808185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133488, + "balance_loss_mlp": 1.06019819, + "epoch": 0.9284340130819546, + "flos": 698128210944.0, + "grad_norm": 0.037269219229585766, + "language_loss": 0.85544008, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.86677498, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.73291016, + "step": 4826, + "time_per_iteration": 2.960580348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135082, + "balance_loss_mlp": 1.06174421, + "epoch": 0.9286263947672182, + "flos": 760542789120.0, + "grad_norm": 0.033270395094925145, + "language_loss": 0.88126981, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.89262056, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.73339844, + "step": 4827, + "time_per_iteration": 3.026780366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135119, + "balance_loss_mlp": 1.06187654, + "epoch": 0.9288187764524817, + "flos": 674140389888.0, + "grad_norm": 0.03346604176423535, + "language_loss": 0.85396868, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.86531985, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.73242188, + "step": 4828, + "time_per_iteration": 2.9438445568084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113508, + "balance_loss_mlp": 1.06183743, + "epoch": 0.9290111581377453, + "flos": 501469850112.0, + "grad_norm": 0.03828039220289202, + "language_loss": 0.87901628, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.89036709, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.73242188, + "step": 4829, + "time_per_iteration": 2.577852725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113945, + "balance_loss_mlp": 1.06773376, + "epoch": 0.9292035398230089, + "flos": 1567057168896.0, + "grad_norm": 0.003695990156438286, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73261511, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.71875, + "step": 4830, + "time_per_iteration": 4.9167375564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139557, + "balance_loss_mlp": 1.06784058, + "epoch": 0.9293959215082724, + "flos": 1522063950336.0, + "grad_norm": 0.003745427392518177, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80651391, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.71875, + "step": 4831, + "time_per_iteration": 4.895474195480347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133447, + "balance_loss_mlp": 1.06006205, + "epoch": 0.929588303193536, + "flos": 558897775104.0, + "grad_norm": 0.05587972312929897, + "language_loss": 0.89084888, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.90218329, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.73388672, + "step": 4832, + "time_per_iteration": 2.6715126037597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134017, + "balance_loss_mlp": 1.06063223, + "epoch": 0.9297806848787995, + "flos": 479550660096.0, + "grad_norm": 0.04310011942892276, + "language_loss": 0.85959709, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.87093729, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.73388672, + "step": 4833, + "time_per_iteration": 2.5311076641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133753, + "balance_loss_mlp": 1.06036782, + "epoch": 0.9299730665640631, + "flos": 565653616128.0, + "grad_norm": 0.03259048154405644, + "language_loss": 0.84302491, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.85436249, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.73388672, + "step": 4834, + "time_per_iteration": 2.917907476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133453, + "balance_loss_mlp": 1.06006742, + "epoch": 0.9301654482493267, + "flos": 561342437376.0, + "grad_norm": 0.03578896280013595, + "language_loss": 0.87560201, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.88693655, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.73388672, + "step": 4835, + "time_per_iteration": 3.398090362548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113652, + "balance_loss_mlp": 1.06480408, + "epoch": 0.9303578299345903, + "flos": 1523488032768.0, + "grad_norm": 0.004265178869550273, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.7798897, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.71875, + "step": 4836, + "time_per_iteration": 5.208449840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113369, + "balance_loss_mlp": 1.06040013, + "epoch": 0.9305502116198537, + "flos": 531859674624.0, + "grad_norm": 0.03622258066971115, + "language_loss": 0.88041896, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.89175594, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.73291016, + "step": 4837, + "time_per_iteration": 2.700305461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133586, + "balance_loss_mlp": 1.06020057, + "epoch": 0.9307425933051173, + "flos": 475855832064.0, + "grad_norm": 0.0423398747183289, + "language_loss": 0.86512882, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.87646472, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.73388672, + "step": 4838, + "time_per_iteration": 2.5601203441619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135535, + "balance_loss_mlp": 1.06219733, + "epoch": 0.9309349749903809, + "flos": 586064130048.0, + "grad_norm": 0.03684050044649056, + "language_loss": 0.90734005, + "learning_rate": 1.245693929549213e-05, + "loss": 0.91869539, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.73339844, + "step": 4839, + "time_per_iteration": 2.814164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_mlp": 1.06168175, + "epoch": 0.9311273566756445, + "flos": 863141852160.0, + "grad_norm": 0.031996461961234596, + "language_loss": 0.80324173, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.81459093, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.73242188, + "step": 4840, + "time_per_iteration": 3.157158374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134839, + "balance_loss_mlp": 1.06154966, + "epoch": 0.9313197383609081, + "flos": 549161511936.0, + "grad_norm": 0.037830595917140816, + "language_loss": 0.87318212, + "learning_rate": 1.231910112890411e-05, + "loss": 0.88453048, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.73291016, + "step": 4841, + "time_per_iteration": 2.7342753410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134557, + "balance_loss_mlp": 1.0612191, + "epoch": 0.9315121200461716, + "flos": 469703606784.0, + "grad_norm": 0.04359539081936152, + "language_loss": 0.86872697, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.88007247, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.73339844, + "step": 4842, + "time_per_iteration": 2.5657942295074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113488, + "balance_loss_mlp": 1.06154215, + "epoch": 0.9317045017314352, + "flos": 418558162944.0, + "grad_norm": 0.03823873856936876, + "language_loss": 0.82610798, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.83745676, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.73339844, + "step": 4843, + "time_per_iteration": 2.549171209335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134819, + "balance_loss_mlp": 1.06152916, + "epoch": 0.9318968834166987, + "flos": 541620132864.0, + "grad_norm": 0.03905937038896375, + "language_loss": 0.82102406, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.83237225, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.73291016, + "step": 4844, + "time_per_iteration": 2.782175302505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135109, + "balance_loss_mlp": 1.06172371, + "epoch": 0.9320892651019623, + "flos": 522346993152.0, + "grad_norm": 0.03778476990300089, + "language_loss": 0.84996724, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.86131835, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.73388672, + "step": 4845, + "time_per_iteration": 2.640185832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135247, + "balance_loss_mlp": 1.06205273, + "epoch": 0.9322816467872258, + "flos": 583252895232.0, + "grad_norm": 0.03215108173886952, + "language_loss": 0.84850752, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.85986006, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.73193359, + "step": 4846, + "time_per_iteration": 2.77775239944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135397, + "balance_loss_mlp": 1.06215477, + "epoch": 0.9324740284724894, + "flos": 485802941952.0, + "grad_norm": 0.03897238462940964, + "language_loss": 0.85641253, + "learning_rate": 1.191013150742537e-05, + "loss": 0.8677665, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.73242188, + "step": 4847, + "time_per_iteration": 2.7562150955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113494, + "balance_loss_mlp": 1.06160247, + "epoch": 0.932666410157753, + "flos": 734023710720.0, + "grad_norm": 0.035990757069540615, + "language_loss": 0.87008613, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.88143551, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.73339844, + "step": 4848, + "time_per_iteration": 3.0684380531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134114, + "balance_loss_mlp": 1.06077683, + "epoch": 0.9328587918430166, + "flos": 967180460544.0, + "grad_norm": 0.03473747152051204, + "language_loss": 0.83081275, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.84215385, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.73339844, + "step": 4849, + "time_per_iteration": 3.298288583755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133382, + "balance_loss_mlp": 1.06004477, + "epoch": 0.9330511735282802, + "flos": 615683152896.0, + "grad_norm": 0.04047956220186344, + "language_loss": 0.85783911, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.86917299, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.73339844, + "step": 4850, + "time_per_iteration": 2.7613956928253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.06080151, + "epoch": 0.9332435552135436, + "flos": 560217798144.0, + "grad_norm": 0.03457415903450117, + "language_loss": 0.89681101, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.90815145, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.73242188, + "step": 4851, + "time_per_iteration": 2.7369134426116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.06153619, + "epoch": 0.9334359368988072, + "flos": 516557337600.0, + "grad_norm": 0.035468780719106426, + "language_loss": 0.8622269, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.87357557, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.73339844, + "step": 4852, + "time_per_iteration": 2.609017848968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134841, + "balance_loss_mlp": 1.06159878, + "epoch": 0.9336283185840708, + "flos": 540940655616.0, + "grad_norm": 0.03211276800808927, + "language_loss": 0.86742085, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.87876928, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.73242188, + "step": 4853, + "time_per_iteration": 2.800187587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139191, + "balance_loss_mlp": 1.06747437, + "epoch": 0.9338207002693344, + "flos": 1566121182720.0, + "grad_norm": 0.003325990500550125, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79594207, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.71875, + "step": 4854, + "time_per_iteration": 4.910603046417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134905, + "balance_loss_mlp": 1.06161559, + "epoch": 0.9340130819545979, + "flos": 646507405824.0, + "grad_norm": 0.032821826781519965, + "language_loss": 0.85680681, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.86815584, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.73291016, + "step": 4855, + "time_per_iteration": 2.939924478530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.06061697, + "epoch": 0.9342054636398615, + "flos": 504511397376.0, + "grad_norm": 0.037538841009704504, + "language_loss": 0.8107596, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.8221001, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.734375, + "step": 4856, + "time_per_iteration": 2.6526336669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133343, + "balance_loss_mlp": 1.05995786, + "epoch": 0.934397845325125, + "flos": 594235321344.0, + "grad_norm": 0.029967610162658413, + "language_loss": 0.88165474, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.89298815, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.73388672, + "step": 4857, + "time_per_iteration": 2.887981414794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134666, + "balance_loss_mlp": 1.06137609, + "epoch": 0.9345902270103886, + "flos": 500883698688.0, + "grad_norm": 0.036598265959695855, + "language_loss": 0.84688962, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.85823631, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.73291016, + "step": 4858, + "time_per_iteration": 2.704299211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139069, + "balance_loss_mlp": 1.06735229, + "epoch": 0.9347826086956522, + "flos": 1523404713984.0, + "grad_norm": 0.00324066268031166, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.77126789, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.71875, + "step": 4859, + "time_per_iteration": 4.773599147796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134721, + "balance_loss_mlp": 1.06147838, + "epoch": 0.9349749903809157, + "flos": 505664234496.0, + "grad_norm": 0.033069357773198756, + "language_loss": 0.8570931, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.86844027, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.73242188, + "step": 4860, + "time_per_iteration": 2.8723926544189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133433, + "balance_loss_mlp": 1.06009555, + "epoch": 0.9351673720661793, + "flos": 569964794880.0, + "grad_norm": 0.046471377956300595, + "language_loss": 0.84156215, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.85289651, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.73339844, + "step": 4861, + "time_per_iteration": 2.6781229972839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.06011534, + "epoch": 0.9353597537514429, + "flos": 545662067712.0, + "grad_norm": 0.03778800547137944, + "language_loss": 0.90822428, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.91955978, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.734375, + "step": 4862, + "time_per_iteration": 2.66455340385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135025, + "balance_loss_mlp": 1.06173515, + "epoch": 0.9355521354367065, + "flos": 520019851776.0, + "grad_norm": 0.029824520949781164, + "language_loss": 0.88586128, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.89721155, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.73291016, + "step": 4863, + "time_per_iteration": 2.698141098022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.06215656, + "epoch": 0.93574451712197, + "flos": 447235195392.0, + "grad_norm": 0.037674472562729544, + "language_loss": 0.83579856, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.84715259, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.73242188, + "step": 4864, + "time_per_iteration": 2.512160062789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135421, + "balance_loss_mlp": 1.06217897, + "epoch": 0.9359368988072335, + "flos": 481495766016.0, + "grad_norm": 0.046001044108411895, + "language_loss": 0.81934822, + "learning_rate": 1.072417553472832e-05, + "loss": 0.83070242, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.73242188, + "step": 4865, + "time_per_iteration": 2.5373268127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135169, + "balance_loss_mlp": 1.06197476, + "epoch": 0.9361292804924971, + "flos": 498091929600.0, + "grad_norm": 0.04032803456119548, + "language_loss": 0.90056789, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.91191959, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.73193359, + "step": 4866, + "time_per_iteration": 2.6002197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135017, + "balance_loss_mlp": 1.06187046, + "epoch": 0.9363216621777607, + "flos": 619293387264.0, + "grad_norm": 0.03580675503506335, + "language_loss": 0.88945127, + "learning_rate": 1.059619902982184e-05, + "loss": 0.90080142, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.73144531, + "step": 4867, + "time_per_iteration": 2.777174711227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113549, + "balance_loss_mlp": 1.06377411, + "epoch": 0.9365140438630243, + "flos": 1418980067328.0, + "grad_norm": 0.003775098340926471, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.8033883, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.71875, + "step": 4868, + "time_per_iteration": 4.925109624862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134027, + "balance_loss_mlp": 1.06068969, + "epoch": 0.9367064255482878, + "flos": 591649669632.0, + "grad_norm": 0.03396019612935237, + "language_loss": 0.85704494, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.8683852, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.73339844, + "step": 4869, + "time_per_iteration": 2.752171754837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133687, + "balance_loss_mlp": 1.06044507, + "epoch": 0.9368988072335513, + "flos": 527652555264.0, + "grad_norm": 0.03952131288198883, + "language_loss": 0.86593235, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.87726915, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.73242188, + "step": 4870, + "time_per_iteration": 2.7232959270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134053, + "balance_loss_mlp": 1.06071544, + "epoch": 0.9370911889188149, + "flos": 744508581888.0, + "grad_norm": 0.03463108069269443, + "language_loss": 0.83654445, + "learning_rate": 1.034252625822113e-05, + "loss": 0.84788495, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.73339844, + "step": 4871, + "time_per_iteration": 2.9093987941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135096, + "balance_loss_mlp": 1.06199658, + "epoch": 0.9372835706040785, + "flos": 547077417984.0, + "grad_norm": 0.039804478611465105, + "language_loss": 0.82813054, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.83948147, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.73095703, + "step": 4872, + "time_per_iteration": 2.61991286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135188, + "balance_loss_mlp": 1.06194568, + "epoch": 0.9374759522893421, + "flos": 492699772416.0, + "grad_norm": 0.03924108622044038, + "language_loss": 0.8609668, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.87231869, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.73242188, + "step": 4873, + "time_per_iteration": 2.662440061569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135351, + "balance_loss_mlp": 1.06206155, + "epoch": 0.9376683339746056, + "flos": 579531870720.0, + "grad_norm": 0.040838494467933396, + "language_loss": 0.87158096, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.88293445, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.73291016, + "step": 4874, + "time_per_iteration": 2.6864585876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134875, + "balance_loss_mlp": 1.06153762, + "epoch": 0.9378607156598692, + "flos": 507296435712.0, + "grad_norm": 0.041653799515210505, + "language_loss": 0.86168003, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.87302876, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.73339844, + "step": 4875, + "time_per_iteration": 2.6001012325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136122, + "balance_loss_mlp": 1.06302249, + "epoch": 0.9380530973451328, + "flos": 521070630912.0, + "grad_norm": 0.05437496502115945, + "language_loss": 0.82745492, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.83881617, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.73095703, + "step": 4876, + "time_per_iteration": 2.6492278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135619, + "balance_loss_mlp": 1.06242442, + "epoch": 0.9382454790303963, + "flos": 558869577216.0, + "grad_norm": 0.035653858877996346, + "language_loss": 0.89391607, + "learning_rate": 9.967720642029999e-06, + "loss": 0.90527225, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.73193359, + "step": 4877, + "time_per_iteration": 2.6514732837677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134863, + "balance_loss_mlp": 1.06166816, + "epoch": 0.9384378607156598, + "flos": 696786720768.0, + "grad_norm": 0.03491740156282248, + "language_loss": 0.85915047, + "learning_rate": 9.905918764418153e-06, + "loss": 0.87049913, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.73193359, + "step": 4878, + "time_per_iteration": 2.908747673034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134971, + "balance_loss_mlp": 1.06182373, + "epoch": 0.9386302424009234, + "flos": 555834760704.0, + "grad_norm": 0.040753856632951786, + "language_loss": 0.85157609, + "learning_rate": 9.844307158203058e-06, + "loss": 0.86292583, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.73144531, + "step": 4879, + "time_per_iteration": 2.6491734981536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134453, + "balance_loss_mlp": 1.06116271, + "epoch": 0.938822624086187, + "flos": 568065351168.0, + "grad_norm": 0.04395633401499817, + "language_loss": 0.8441397, + "learning_rate": 9.782885847304469e-06, + "loss": 0.85548419, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.73291016, + "step": 4880, + "time_per_iteration": 2.7252390384674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134587, + "balance_loss_mlp": 1.06153524, + "epoch": 0.9390150057714506, + "flos": 418547429376.0, + "grad_norm": 0.03347739941940771, + "language_loss": 0.8443892, + "learning_rate": 9.721654855568196e-06, + "loss": 0.85573506, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.73046875, + "step": 4881, + "time_per_iteration": 2.583867311477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06301677, + "epoch": 0.9392073874567142, + "flos": 1556082570240.0, + "grad_norm": 0.03746627101283315, + "language_loss": 0.80632669, + "learning_rate": 9.660614206766394e-06, + "loss": 0.81768787, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.73095703, + "step": 4882, + "time_per_iteration": 3.714630126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135223, + "balance_loss_mlp": 1.06198061, + "epoch": 0.9393997691419776, + "flos": 653731877376.0, + "grad_norm": 0.0382645062266071, + "language_loss": 0.82485741, + "learning_rate": 9.59976392459705e-06, + "loss": 0.83620965, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.73242188, + "step": 4883, + "time_per_iteration": 3.2966370582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138214, + "balance_loss_mlp": 1.0664978, + "epoch": 0.9395921508272412, + "flos": 1556562839040.0, + "grad_norm": 0.003695333595737308, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79308891, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.71875, + "step": 4884, + "time_per_iteration": 5.404622554779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135325, + "balance_loss_mlp": 1.06212997, + "epoch": 0.9397845325125048, + "flos": 499197103104.0, + "grad_norm": 0.03656984791754246, + "language_loss": 0.82897919, + "learning_rate": 9.478634554578314e-06, + "loss": 0.84033239, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.73193359, + "step": 4885, + "time_per_iteration": 2.7291576862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135657, + "balance_loss_mlp": 1.06251049, + "epoch": 0.9399769141977684, + "flos": 499589872128.0, + "grad_norm": 0.036644251179858374, + "language_loss": 0.88491553, + "learning_rate": 9.418355513755638e-06, + "loss": 0.89627206, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.73144531, + "step": 4886, + "time_per_iteration": 2.620981216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135735, + "balance_loss_mlp": 1.06401825, + "epoch": 0.9401692958830319, + "flos": 1405675229184.0, + "grad_norm": 0.003512744995628987, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80467921, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.71875, + "step": 4887, + "time_per_iteration": 4.856574296951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133934, + "balance_loss_mlp": 1.06073952, + "epoch": 0.9403616775682955, + "flos": 541211900928.0, + "grad_norm": 0.0305164549996701, + "language_loss": 0.88444626, + "learning_rate": 9.298368837495575e-06, + "loss": 0.89578557, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.73193359, + "step": 4888, + "time_per_iteration": 2.739971399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135735, + "balance_loss_mlp": 1.06401825, + "epoch": 0.9405540592535591, + "flos": 1324938233856.0, + "grad_norm": 0.0035002189725473307, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76305169, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.71875, + "step": 4889, + "time_per_iteration": 4.893186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135497, + "balance_loss_mlp": 1.06230211, + "epoch": 0.9407464409388226, + "flos": 573427309056.0, + "grad_norm": 0.04031631625697337, + "language_loss": 0.88505602, + "learning_rate": 9.179144190235799e-06, + "loss": 0.896411, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.73193359, + "step": 4890, + "time_per_iteration": 2.6828339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135232, + "balance_loss_mlp": 1.06199026, + "epoch": 0.9409388226240862, + "flos": 512348216832.0, + "grad_norm": 0.03147351995793952, + "language_loss": 0.81225574, + "learning_rate": 9.119817685386112e-06, + "loss": 0.82360804, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.73242188, + "step": 4891, + "time_per_iteration": 2.752286911010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140076, + "balance_loss_mlp": 1.06835938, + "epoch": 0.9411312043093497, + "flos": 1573276523520.0, + "grad_norm": 0.004486626700418182, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81381959, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.71875, + "step": 4892, + "time_per_iteration": 4.878049850463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136128, + "balance_loss_mlp": 1.06298077, + "epoch": 0.9413235859946133, + "flos": 570559678464.0, + "grad_norm": 0.041259003272787025, + "language_loss": 0.831617, + "learning_rate": 9.001736428410234e-06, + "loss": 0.84297824, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.73144531, + "step": 4893, + "time_per_iteration": 2.7614989280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134748, + "balance_loss_mlp": 1.06150591, + "epoch": 0.9415159676798769, + "flos": 783264981504.0, + "grad_norm": 0.04024659681002993, + "language_loss": 0.84358162, + "learning_rate": 8.942981722127263e-06, + "loss": 0.85492909, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.73242188, + "step": 4894, + "time_per_iteration": 3.074845552444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113637, + "balance_loss_mlp": 1.06312764, + "epoch": 0.9417083493651405, + "flos": 850872330240.0, + "grad_norm": 0.02979508524031529, + "language_loss": 0.84446144, + "learning_rate": 8.884417661086331e-06, + "loss": 0.85582519, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.73242188, + "step": 4895, + "time_per_iteration": 3.244321346282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135905, + "balance_loss_mlp": 1.06280613, + "epoch": 0.941900731050404, + "flos": 530451055104.0, + "grad_norm": 0.03415903081576368, + "language_loss": 0.90385509, + "learning_rate": 8.826044268024025e-06, + "loss": 0.91521418, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.73095703, + "step": 4896, + "time_per_iteration": 2.7122864723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134586, + "balance_loss_mlp": 1.06134343, + "epoch": 0.9420931127356675, + "flos": 558170634240.0, + "grad_norm": 0.03438694613546509, + "language_loss": 0.84335274, + "learning_rate": 8.767861565602997e-06, + "loss": 0.85469854, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.73242188, + "step": 4897, + "time_per_iteration": 2.7777915000915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134624, + "balance_loss_mlp": 1.06142986, + "epoch": 0.9422854944209311, + "flos": 653786271744.0, + "grad_norm": 0.03610817623575041, + "language_loss": 0.90061867, + "learning_rate": 8.709869576411733e-06, + "loss": 0.91196489, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.73193359, + "step": 4898, + "time_per_iteration": 2.8397042751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136714, + "balance_loss_mlp": 1.06351972, + "epoch": 0.9424778761061947, + "flos": 554764515840.0, + "grad_norm": 0.032200962869082285, + "language_loss": 0.88306475, + "learning_rate": 8.65206832296478e-06, + "loss": 0.89443189, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.73193359, + "step": 4899, + "time_per_iteration": 2.758490800857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136296, + "balance_loss_mlp": 1.06314933, + "epoch": 0.9426702577914583, + "flos": 589650169344.0, + "grad_norm": 0.04146685259937853, + "language_loss": 0.84754741, + "learning_rate": 8.594457827702406e-06, + "loss": 0.85891032, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.73144531, + "step": 4900, + "time_per_iteration": 2.6957013607025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06292105, + "epoch": 0.9428626394767218, + "flos": 617812909056.0, + "grad_norm": 0.04053390873945447, + "language_loss": 0.83133346, + "learning_rate": 8.537038112991114e-06, + "loss": 0.84269458, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.73193359, + "step": 4901, + "time_per_iteration": 2.8101513385772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136485, + "balance_loss_mlp": 1.06329107, + "epoch": 0.9430550211619854, + "flos": 611541161472.0, + "grad_norm": 0.036057292363132605, + "language_loss": 0.86717069, + "learning_rate": 8.479809201123178e-06, + "loss": 0.87853551, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.73193359, + "step": 4902, + "time_per_iteration": 2.7493042945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136839, + "balance_loss_mlp": 1.06364477, + "epoch": 0.943247402847249, + "flos": 567051502080.0, + "grad_norm": 0.03817021033168505, + "language_loss": 0.82748675, + "learning_rate": 8.422771114316885e-06, + "loss": 0.83885515, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.73193359, + "step": 4903, + "time_per_iteration": 2.731077194213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135115, + "balance_loss_mlp": 1.06187308, + "epoch": 0.9434397845325125, + "flos": 528088985088.0, + "grad_norm": 0.04132634874172125, + "language_loss": 0.86513394, + "learning_rate": 8.365923874716297e-06, + "loss": 0.87648505, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.73242188, + "step": 4904, + "time_per_iteration": 2.6607890129089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135447, + "balance_loss_mlp": 1.06229973, + "epoch": 0.943632166217776, + "flos": 594591160320.0, + "grad_norm": 0.03589040439105028, + "language_loss": 0.87627959, + "learning_rate": 8.309267504391593e-06, + "loss": 0.88763404, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.73144531, + "step": 4905, + "time_per_iteration": 2.725121021270752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135257, + "balance_loss_mlp": 1.06206262, + "epoch": 0.9438245479030396, + "flos": 573981259776.0, + "grad_norm": 0.028116257659022252, + "language_loss": 0.88786232, + "learning_rate": 8.252802025338623e-06, + "loss": 0.89921498, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.73193359, + "step": 4906, + "time_per_iteration": 2.84151291847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137071, + "balance_loss_mlp": 1.06387651, + "epoch": 0.9440169295883032, + "flos": 489221795328.0, + "grad_norm": 0.03908331871996133, + "language_loss": 0.86816639, + "learning_rate": 8.196527459479242e-06, + "loss": 0.87953711, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.73193359, + "step": 4907, + "time_per_iteration": 2.593106269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136761, + "balance_loss_mlp": 1.06361377, + "epoch": 0.9442093112735668, + "flos": 733122653184.0, + "grad_norm": 0.03263207151306975, + "language_loss": 0.78277397, + "learning_rate": 8.140443828661137e-06, + "loss": 0.79414153, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.73144531, + "step": 4908, + "time_per_iteration": 2.9979734420776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136747, + "balance_loss_mlp": 1.06355298, + "epoch": 0.9444016929588304, + "flos": 572105284608.0, + "grad_norm": 0.039051820427737964, + "language_loss": 0.86598486, + "learning_rate": 8.084551154658004e-06, + "loss": 0.8773523, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.73193359, + "step": 4909, + "time_per_iteration": 2.6849961280822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136144, + "balance_loss_mlp": 1.06299686, + "epoch": 0.9445940746440938, + "flos": 510311786496.0, + "grad_norm": 0.03853248508401035, + "language_loss": 0.91414893, + "learning_rate": 8.028849459169318e-06, + "loss": 0.92551035, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.73144531, + "step": 4910, + "time_per_iteration": 2.5958712100982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136817, + "balance_loss_mlp": 1.06357515, + "epoch": 0.9447864563293574, + "flos": 625797448704.0, + "grad_norm": 0.03483487859921532, + "language_loss": 0.85226071, + "learning_rate": 7.97333876382028e-06, + "loss": 0.86362892, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.73242188, + "step": 4911, + "time_per_iteration": 2.8528859615325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134822, + "balance_loss_mlp": 1.06158018, + "epoch": 0.944978838014621, + "flos": 506308783104.0, + "grad_norm": 0.03612723857831656, + "language_loss": 0.85505927, + "learning_rate": 7.918019090162098e-06, + "loss": 0.86640745, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.73242188, + "step": 4912, + "time_per_iteration": 2.7557713985443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.06826782, + "epoch": 0.9451712196998846, + "flos": 1487551600128.0, + "grad_norm": 0.004706549025358334, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79427326, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.71875, + "step": 4913, + "time_per_iteration": 4.964468955993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135191, + "balance_loss_mlp": 1.06194913, + "epoch": 0.9453636013851482, + "flos": 522151609344.0, + "grad_norm": 0.03617704302923612, + "language_loss": 0.95077229, + "learning_rate": 7.80795289375219e-06, + "loss": 0.96212423, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.73242188, + "step": 4914, + "time_per_iteration": 2.6678929328918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138802, + "balance_loss_mlp": 1.06708527, + "epoch": 0.9455559830704117, + "flos": 1500283748352.0, + "grad_norm": 0.004548904069174758, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84701157, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.71875, + "step": 4915, + "time_per_iteration": 4.94046950340271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113512, + "balance_loss_mlp": 1.06187737, + "epoch": 0.9457483647556753, + "flos": 499151440896.0, + "grad_norm": 0.034056935768259265, + "language_loss": 0.86546624, + "learning_rate": 7.698651040865534e-06, + "loss": 0.87681735, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.73242188, + "step": 4916, + "time_per_iteration": 2.6402246952056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136151, + "balance_loss_mlp": 1.0630039, + "epoch": 0.9459407464409388, + "flos": 1021117673472.0, + "grad_norm": 0.03091693708004351, + "language_loss": 0.86156452, + "learning_rate": 7.644286796333222e-06, + "loss": 0.872926, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.73144531, + "step": 4917, + "time_per_iteration": 3.370896816253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136157, + "balance_loss_mlp": 1.06300974, + "epoch": 0.9461331281262024, + "flos": 514620963840.0, + "grad_norm": 0.03805401706614232, + "language_loss": 0.86857271, + "learning_rate": 7.590113701241075e-06, + "loss": 0.87993431, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.73144531, + "step": 4918, + "time_per_iteration": 2.6039915084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136067, + "balance_loss_mlp": 1.06282437, + "epoch": 0.9463255098114659, + "flos": 529048439808.0, + "grad_norm": 0.04139599350872911, + "language_loss": 0.83497351, + "learning_rate": 7.536131776620936e-06, + "loss": 0.84633422, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.73242188, + "step": 4919, + "time_per_iteration": 2.6238739490509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.06283832, + "epoch": 0.9465178914967295, + "flos": 507027191808.0, + "grad_norm": 0.044536709524851746, + "language_loss": 0.88624299, + "learning_rate": 7.482341043430485e-06, + "loss": 0.89760286, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.73144531, + "step": 4920, + "time_per_iteration": 2.5972156524658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134582, + "balance_loss_mlp": 1.06133986, + "epoch": 0.9467102731819931, + "flos": 661538497536.0, + "grad_norm": 0.045944769490510115, + "language_loss": 0.89346719, + "learning_rate": 7.428741522553184e-06, + "loss": 0.90481305, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.73242188, + "step": 4921, + "time_per_iteration": 2.878498077392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134699, + "balance_loss_mlp": 1.06145644, + "epoch": 0.9469026548672567, + "flos": 676504461312.0, + "grad_norm": 0.03622409343837378, + "language_loss": 0.93210799, + "learning_rate": 7.375333234798054e-06, + "loss": 0.94345504, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.73242188, + "step": 4922, + "time_per_iteration": 2.9211013317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136193, + "balance_loss_mlp": 1.06295109, + "epoch": 0.9470950365525203, + "flos": 515020463616.0, + "grad_norm": 0.07987170801903949, + "language_loss": 0.84155279, + "learning_rate": 7.32211620090012e-06, + "loss": 0.85291469, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.73242188, + "step": 4923, + "time_per_iteration": 2.6229920387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136284, + "balance_loss_mlp": 1.06304216, + "epoch": 0.9472874182377837, + "flos": 551226140160.0, + "grad_norm": 0.03359870786609723, + "language_loss": 0.85794783, + "learning_rate": 7.269090441520132e-06, + "loss": 0.86931068, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.73242188, + "step": 4924, + "time_per_iteration": 4.327451705932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136476, + "balance_loss_mlp": 1.06332874, + "epoch": 0.9474797999230473, + "flos": 543810287616.0, + "grad_norm": 0.04461289962569648, + "language_loss": 0.84685075, + "learning_rate": 7.216255977244457e-06, + "loss": 0.85821545, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.73144531, + "step": 4925, + "time_per_iteration": 2.628014326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136189, + "balance_loss_mlp": 1.06294644, + "epoch": 0.9476721816083109, + "flos": 846063596544.0, + "grad_norm": 0.03676518142184114, + "language_loss": 0.90082932, + "learning_rate": 7.163612828585242e-06, + "loss": 0.91219121, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.73242188, + "step": 4926, + "time_per_iteration": 3.1086716651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136732, + "balance_loss_mlp": 1.06368101, + "epoch": 0.9478645632935745, + "flos": 639147949056.0, + "grad_norm": 0.037886935855288933, + "language_loss": 0.83596742, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.84733474, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.73046875, + "step": 4927, + "time_per_iteration": 2.840261220932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134894, + "balance_loss_mlp": 1.06174707, + "epoch": 0.948056944978838, + "flos": 658041054720.0, + "grad_norm": 0.03537137119366953, + "language_loss": 0.80161017, + "learning_rate": 7.058900559793469e-06, + "loss": 0.81295913, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.73193359, + "step": 4928, + "time_per_iteration": 2.820704936981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134616, + "balance_loss_mlp": 1.06137371, + "epoch": 0.9482493266641016, + "flos": 441836307456.0, + "grad_norm": 0.03955323262094278, + "language_loss": 0.87748522, + "learning_rate": 7.00683148031378e-06, + "loss": 0.88883138, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.73242188, + "step": 4929, + "time_per_iteration": 2.5240581035614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136268, + "balance_loss_mlp": 1.06302619, + "epoch": 0.9484417083493651, + "flos": 547121078784.0, + "grad_norm": 0.03887739915879212, + "language_loss": 0.82831037, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.83967304, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.73242188, + "step": 4930, + "time_per_iteration": 2.7851428985595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136252, + "balance_loss_mlp": 1.0630095, + "epoch": 0.9486340900346287, + "flos": 539694492672.0, + "grad_norm": 0.0339786344788922, + "language_loss": 0.83988905, + "learning_rate": 6.903267532262003e-06, + "loss": 0.8512516, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.73242188, + "step": 4931, + "time_per_iteration": 2.6893911361694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135873, + "balance_loss_mlp": 1.06267822, + "epoch": 0.9488264717198923, + "flos": 682901735424.0, + "grad_norm": 0.03750385652355195, + "language_loss": 0.90455812, + "learning_rate": 6.851772703896975e-06, + "loss": 0.91591686, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.73193359, + "step": 4932, + "time_per_iteration": 2.870084762573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136544, + "balance_loss_mlp": 1.06330168, + "epoch": 0.9490188534051558, + "flos": 463560113664.0, + "grad_norm": 0.04146699354604264, + "language_loss": 0.93162906, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.94299448, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.73242188, + "step": 4933, + "time_per_iteration": 2.523359775543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136279, + "balance_loss_mlp": 1.06308496, + "epoch": 0.9492112350904194, + "flos": 544218519552.0, + "grad_norm": 0.03412343034174357, + "language_loss": 0.87004709, + "learning_rate": 6.7493574384489e-06, + "loss": 0.88140994, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.73193359, + "step": 4934, + "time_per_iteration": 2.6940860748291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136518, + "balance_loss_mlp": 1.06332338, + "epoch": 0.949403616775683, + "flos": 551458454016.0, + "grad_norm": 0.03617720765095602, + "language_loss": 0.8781929, + "learning_rate": 6.698437041126992e-06, + "loss": 0.88955808, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.73193359, + "step": 4935, + "time_per_iteration": 2.790689706802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134956, + "balance_loss_mlp": 1.06171405, + "epoch": 0.9495959984609466, + "flos": 599497222656.0, + "grad_norm": 0.032619945002332076, + "language_loss": 0.86929369, + "learning_rate": 6.647708160456678e-06, + "loss": 0.88064325, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.73242188, + "step": 4936, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113519, + "balance_loss_mlp": 1.06194746, + "epoch": 0.94978838014621, + "flos": 609530927616.0, + "grad_norm": 0.03651321025229267, + "language_loss": 0.87489212, + "learning_rate": 6.597170816132702e-06, + "loss": 0.88624406, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.73242188, + "step": 4937, + "time_per_iteration": 2.800729513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136433, + "balance_loss_mlp": 1.0631907, + "epoch": 0.9499807618314736, + "flos": 541865181696.0, + "grad_norm": 0.03285741477727048, + "language_loss": 0.90760124, + "learning_rate": 6.546825027775427e-06, + "loss": 0.91896558, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.73242188, + "step": 4938, + "time_per_iteration": 2.683340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136285, + "balance_loss_mlp": 1.0631386, + "epoch": 0.9501731435167372, + "flos": 595709068800.0, + "grad_norm": 0.03334591399320501, + "language_loss": 0.86523139, + "learning_rate": 6.496670814930717e-06, + "loss": 0.87659431, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.73144531, + "step": 4939, + "time_per_iteration": 2.82743763923645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136188, + "balance_loss_mlp": 1.06304121, + "epoch": 0.9503655252020008, + "flos": 455072014848.0, + "grad_norm": 0.03930006662979796, + "language_loss": 0.85443276, + "learning_rate": 6.446708197070161e-06, + "loss": 0.86579466, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.73144531, + "step": 4940, + "time_per_iteration": 2.613368034362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113632, + "balance_loss_mlp": 1.06307828, + "epoch": 0.9505579068872644, + "flos": 669127540224.0, + "grad_norm": 0.0356696809458609, + "language_loss": 0.89633119, + "learning_rate": 6.396937193591079e-06, + "loss": 0.90769434, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.73242188, + "step": 4941, + "time_per_iteration": 2.8095662593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.06147814, + "epoch": 0.9507502885725279, + "flos": 403079907840.0, + "grad_norm": 0.038919580018142184, + "language_loss": 0.87087023, + "learning_rate": 6.347357823816235e-06, + "loss": 0.88221788, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.73291016, + "step": 4942, + "time_per_iteration": 2.473461627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113482, + "balance_loss_mlp": 1.06157768, + "epoch": 0.9509426702577914, + "flos": 701736443904.0, + "grad_norm": 0.03427667838843753, + "language_loss": 0.84288859, + "learning_rate": 6.297970106994011e-06, + "loss": 0.85423684, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.73242188, + "step": 4943, + "time_per_iteration": 2.9936366081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135576, + "balance_loss_mlp": 1.06233358, + "epoch": 0.951135051943055, + "flos": 502401106944.0, + "grad_norm": 0.03656450632617296, + "language_loss": 0.86557579, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.87693161, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.73242188, + "step": 4944, + "time_per_iteration": 2.610600233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136645, + "balance_loss_mlp": 1.06354642, + "epoch": 0.9513274336283186, + "flos": 615865801728.0, + "grad_norm": 0.03295078964621213, + "language_loss": 0.85542595, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.86679238, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.73095703, + "step": 4945, + "time_per_iteration": 2.9333925247192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136839, + "balance_loss_mlp": 1.06369233, + "epoch": 0.9515198153135821, + "flos": 520597271040.0, + "grad_norm": 0.04029361545540468, + "language_loss": 0.86667025, + "learning_rate": 6.150957065611363e-06, + "loss": 0.87803864, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.73144531, + "step": 4946, + "time_per_iteration": 2.5970242023468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136296, + "balance_loss_mlp": 1.06314898, + "epoch": 0.9517121969988457, + "flos": 666284104704.0, + "grad_norm": 0.033604894008419074, + "language_loss": 0.80945677, + "learning_rate": 6.102336151595667e-06, + "loss": 0.82081974, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.73144531, + "step": 4947, + "time_per_iteration": 2.9714138507843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138525, + "balance_loss_mlp": 1.06537843, + "epoch": 0.9519045786841093, + "flos": 677615639040.0, + "grad_norm": 0.040926124550095325, + "language_loss": 0.8053059, + "learning_rate": 6.053906985658553e-06, + "loss": 0.81669116, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.73144531, + "step": 4948, + "time_per_iteration": 2.809159278869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138321, + "balance_loss_mlp": 1.06507838, + "epoch": 0.9520969603693729, + "flos": 654140109312.0, + "grad_norm": 0.03095345074034261, + "language_loss": 0.84655893, + "learning_rate": 6.005669586601814e-06, + "loss": 0.8579421, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.73242188, + "step": 4949, + "time_per_iteration": 2.910127878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138692, + "balance_loss_mlp": 1.06554544, + "epoch": 0.9522893420546364, + "flos": 744682498560.0, + "grad_norm": 0.032408881572200024, + "language_loss": 0.87415892, + "learning_rate": 5.957623973152748e-06, + "loss": 0.88554585, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.73144531, + "step": 4950, + "time_per_iteration": 3.021373987197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.06521976, + "epoch": 0.9524817237398999, + "flos": 763030385664.0, + "grad_norm": 0.03881087544404618, + "language_loss": 0.85428655, + "learning_rate": 5.909770163964545e-06, + "loss": 0.86567014, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.73144531, + "step": 4951, + "time_per_iteration": 2.9622764587402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138142, + "balance_loss_mlp": 1.06499469, + "epoch": 0.9526741054251635, + "flos": 530146882560.0, + "grad_norm": 0.038541049170088305, + "language_loss": 0.85973597, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.87111747, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.73144531, + "step": 4952, + "time_per_iteration": 2.5878281593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136477, + "balance_loss_mlp": 1.06337738, + "epoch": 0.9528664871104271, + "flos": 489425911296.0, + "grad_norm": 0.03895213525755141, + "language_loss": 0.86453646, + "learning_rate": 5.814638032609787e-06, + "loss": 0.87590122, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.73095703, + "step": 4953, + "time_per_iteration": 2.6211817264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136503, + "balance_loss_mlp": 1.06340432, + "epoch": 0.9530588687956907, + "flos": 518871744000.0, + "grad_norm": 0.033652335193776035, + "language_loss": 0.8942554, + "learning_rate": 5.76735974737691e-06, + "loss": 0.90562046, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.73095703, + "step": 4954, + "time_per_iteration": 2.7593400478363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134917, + "balance_loss_mlp": 1.06167483, + "epoch": 0.9532512504809542, + "flos": 676413136896.0, + "grad_norm": 0.040464559019193894, + "language_loss": 0.86070359, + "learning_rate": 5.720273340271864e-06, + "loss": 0.87205279, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.73242188, + "step": 4955, + "time_per_iteration": 2.8816840648651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134706, + "balance_loss_mlp": 1.06146348, + "epoch": 0.9534436321662177, + "flos": 490541818368.0, + "grad_norm": 0.03782014800574082, + "language_loss": 0.88387191, + "learning_rate": 5.673378829575249e-06, + "loss": 0.89521897, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.73242188, + "step": 4956, + "time_per_iteration": 2.583472967147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134542, + "balance_loss_mlp": 1.06129992, + "epoch": 0.9536360138514813, + "flos": 497588370432.0, + "grad_norm": 0.03567484815320272, + "language_loss": 0.86718768, + "learning_rate": 5.626676233493167e-06, + "loss": 0.87853312, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.73242188, + "step": 4957, + "time_per_iteration": 2.6281793117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113481, + "balance_loss_mlp": 1.06156778, + "epoch": 0.9538283955367449, + "flos": 802857030144.0, + "grad_norm": 0.03957427847301793, + "language_loss": 0.87529492, + "learning_rate": 5.580165570157114e-06, + "loss": 0.88664305, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.73242188, + "step": 4958, + "time_per_iteration": 3.0466809272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136277, + "balance_loss_mlp": 1.06317747, + "epoch": 0.9540207772220085, + "flos": 557797330944.0, + "grad_norm": 0.03074291397770573, + "language_loss": 0.83816719, + "learning_rate": 5.533846857624203e-06, + "loss": 0.84952998, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.73095703, + "step": 4959, + "time_per_iteration": 2.7519495487213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.0633173, + "epoch": 0.954213158907272, + "flos": 685758632448.0, + "grad_norm": 0.035505648918623366, + "language_loss": 0.86093831, + "learning_rate": 5.487720113876882e-06, + "loss": 0.87230206, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.73046875, + "step": 4960, + "time_per_iteration": 2.910886764526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136287, + "balance_loss_mlp": 1.06318796, + "epoch": 0.9544055405925356, + "flos": 536846327808.0, + "grad_norm": 0.04174534847869379, + "language_loss": 0.87276769, + "learning_rate": 5.441785356823214e-06, + "loss": 0.88413054, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.73095703, + "step": 4961, + "time_per_iteration": 2.7283856868743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135918, + "balance_loss_mlp": 1.06281853, + "epoch": 0.9545979222777992, + "flos": 826923440640.0, + "grad_norm": 0.04693224510811112, + "language_loss": 0.84321594, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.8545751, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.73095703, + "step": 4962, + "time_per_iteration": 3.1215646266937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135961, + "balance_loss_mlp": 1.0628618, + "epoch": 0.9547903039630627, + "flos": 763156638720.0, + "grad_norm": 0.0399330944835338, + "language_loss": 0.81885892, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.83021849, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.73095703, + "step": 4963, + "time_per_iteration": 3.1090612411499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136424, + "balance_loss_mlp": 1.06332457, + "epoch": 0.9549826856483262, + "flos": 516333755904.0, + "grad_norm": 0.03824273588558422, + "language_loss": 0.87225604, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.88362026, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.73095703, + "step": 4964, + "time_per_iteration": 2.620351552963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134867, + "balance_loss_mlp": 1.06172025, + "epoch": 0.9551750673335898, + "flos": 644266859520.0, + "grad_norm": 0.03397371897405953, + "language_loss": 0.87095642, + "learning_rate": 5.259966551095341e-06, + "loss": 0.88230509, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.73193359, + "step": 4965, + "time_per_iteration": 2.814934015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134469, + "balance_loss_mlp": 1.06127489, + "epoch": 0.9553674490188534, + "flos": 473174853120.0, + "grad_norm": 0.03543650438605603, + "language_loss": 0.8735832, + "learning_rate": 5.214991993520546e-06, + "loss": 0.88492787, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.73193359, + "step": 4966, + "time_per_iteration": 2.6101207733154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134749, + "balance_loss_mlp": 1.06150663, + "epoch": 0.955559830704117, + "flos": 529337149440.0, + "grad_norm": 0.04293839693076082, + "language_loss": 0.87281948, + "learning_rate": 5.170209528521763e-06, + "loss": 0.88416696, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.73242188, + "step": 4967, + "time_per_iteration": 2.5984079837799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135008, + "balance_loss_mlp": 1.06181312, + "epoch": 0.9557522123893806, + "flos": 549217907712.0, + "grad_norm": 0.038038109123601054, + "language_loss": 0.88284183, + "learning_rate": 5.125619173485196e-06, + "loss": 0.89419186, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.73193359, + "step": 4968, + "time_per_iteration": 2.634786605834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.06175089, + "epoch": 0.955944594074644, + "flos": 510524634624.0, + "grad_norm": 0.029523963923908957, + "language_loss": 0.85467374, + "learning_rate": 5.08122094572222e-06, + "loss": 0.86602366, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.73242188, + "step": 4969, + "time_per_iteration": 2.6917636394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136238, + "balance_loss_mlp": 1.0630914, + "epoch": 0.9561369757599076, + "flos": 528710065152.0, + "grad_norm": 0.036722318154549516, + "language_loss": 0.84130347, + "learning_rate": 5.037014862469824e-06, + "loss": 0.85266584, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.73144531, + "step": 4970, + "time_per_iteration": 2.764472723007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136347, + "balance_loss_mlp": 1.06329584, + "epoch": 0.9563293574451712, + "flos": 499207836672.0, + "grad_norm": 0.035098427244714854, + "language_loss": 0.83948302, + "learning_rate": 4.993000940890391e-06, + "loss": 0.85084653, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.73046875, + "step": 4971, + "time_per_iteration": 2.6011996269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141472, + "balance_loss_mlp": 1.06994629, + "epoch": 0.9565217391304348, + "flos": 1411744135680.0, + "grad_norm": 0.0046380775984094435, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82915032, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.71679688, + "step": 4972, + "time_per_iteration": 4.86350417137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136441, + "balance_loss_mlp": 1.06329453, + "epoch": 0.9567141208156984, + "flos": 504884700672.0, + "grad_norm": 0.036181124300498, + "language_loss": 0.8206802, + "learning_rate": 4.905549651026464e-06, + "loss": 0.8320446, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.73144531, + "step": 4973, + "time_per_iteration": 2.7482728958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113485, + "balance_loss_mlp": 1.06160808, + "epoch": 0.9569065025009619, + "flos": 434129743872.0, + "grad_norm": 0.045997872643652196, + "language_loss": 0.84962678, + "learning_rate": 4.86211231669359e-06, + "loss": 0.86097533, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.73242188, + "step": 4974, + "time_per_iteration": 2.470872163772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134551, + "balance_loss_mlp": 1.06130922, + "epoch": 0.9570988841862255, + "flos": 591154842624.0, + "grad_norm": 0.0403367254829792, + "language_loss": 0.84212631, + "learning_rate": 4.818867211936806e-06, + "loss": 0.85347188, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.73242188, + "step": 4975, + "time_per_iteration": 2.7816882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135143, + "balance_loss_mlp": 1.06190073, + "epoch": 0.957291265871489, + "flos": 768642121728.0, + "grad_norm": 0.04652333923499507, + "language_loss": 0.835931, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.84728247, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.73242188, + "step": 4976, + "time_per_iteration": 2.957157850265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.0615381, + "epoch": 0.9574836475567526, + "flos": 640246391808.0, + "grad_norm": 0.03712988268786209, + "language_loss": 0.89267516, + "learning_rate": 4.732953758233849e-06, + "loss": 0.90402251, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.73193359, + "step": 4977, + "time_per_iteration": 2.803529977798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_mlp": 1.06980896, + "epoch": 0.9576760292420161, + "flos": 1579398549504.0, + "grad_norm": 0.004511171675373937, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79748785, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.71875, + "step": 4978, + "time_per_iteration": 4.911847352981567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134672, + "balance_loss_mlp": 1.0614301, + "epoch": 0.9578684109272797, + "flos": 497373520896.0, + "grad_norm": 0.03570297995537699, + "language_loss": 0.91898167, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.93032837, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.73242188, + "step": 4979, + "time_per_iteration": 2.59523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135113, + "balance_loss_mlp": 1.06182265, + "epoch": 0.9580607926125433, + "flos": 430853881344.0, + "grad_norm": 0.043029309448741025, + "language_loss": 0.91334265, + "learning_rate": 4.605525716805337e-06, + "loss": 0.92469382, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.73291016, + "step": 4980, + "time_per_iteration": 2.4755971431732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136154, + "balance_loss_mlp": 1.0630554, + "epoch": 0.9582531742978069, + "flos": 1129131087360.0, + "grad_norm": 0.042821653988821394, + "language_loss": 0.8443023, + "learning_rate": 4.563434339466599e-06, + "loss": 0.8556639, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.73095703, + "step": 4981, + "time_per_iteration": 3.5472586154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136224, + "balance_loss_mlp": 1.06312537, + "epoch": 0.9584455559830705, + "flos": 525555726336.0, + "grad_norm": 0.03335114170802168, + "language_loss": 0.83248258, + "learning_rate": 4.521535307661085e-06, + "loss": 0.84384483, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.73095703, + "step": 4982, + "time_per_iteration": 2.6682260036468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113622, + "balance_loss_mlp": 1.06307316, + "epoch": 0.9586379376683339, + "flos": 635449118208.0, + "grad_norm": 0.03182275504909025, + "language_loss": 0.84402609, + "learning_rate": 4.479828637655392e-06, + "loss": 0.85538828, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.73144531, + "step": 4983, + "time_per_iteration": 2.840589761734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136281, + "balance_loss_mlp": 1.06313407, + "epoch": 0.9588303193535975, + "flos": 416984358912.0, + "grad_norm": 0.03935201485071488, + "language_loss": 0.88144433, + "learning_rate": 4.438314345641459e-06, + "loss": 0.89280713, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.73144531, + "step": 4984, + "time_per_iteration": 2.549217700958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136295, + "balance_loss_mlp": 1.06310058, + "epoch": 0.9590227010388611, + "flos": 482659336704.0, + "grad_norm": 0.03510699411251916, + "language_loss": 0.82830805, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.83967102, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.73193359, + "step": 4985, + "time_per_iteration": 2.6106717586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134857, + "balance_loss_mlp": 1.06180549, + "epoch": 0.9592150827241247, + "flos": 685849956864.0, + "grad_norm": 0.034999035587186825, + "language_loss": 0.84885329, + "learning_rate": 4.355862959983359e-06, + "loss": 0.86020184, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.73095703, + "step": 4986, + "time_per_iteration": 2.933217763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135131, + "balance_loss_mlp": 1.06198394, + "epoch": 0.9594074644093882, + "flos": 575630925312.0, + "grad_norm": 0.04204182022141106, + "language_loss": 0.74685031, + "learning_rate": 4.314925898349642e-06, + "loss": 0.7582016, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.73193359, + "step": 4987, + "time_per_iteration": 2.726092576980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134792, + "balance_loss_mlp": 1.06155026, + "epoch": 0.9595998460946518, + "flos": 547987207680.0, + "grad_norm": 0.03775455227306167, + "language_loss": 0.82959723, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.84094512, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.73242188, + "step": 4988, + "time_per_iteration": 2.7773516178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135054, + "balance_loss_mlp": 1.06181157, + "epoch": 0.9597922277799154, + "flos": 475026633216.0, + "grad_norm": 0.041401816345422476, + "language_loss": 0.82861459, + "learning_rate": 4.233629116938809e-06, + "loss": 0.83996511, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.73242188, + "step": 4989, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134589, + "balance_loss_mlp": 1.06134653, + "epoch": 0.9599846094651789, + "flos": 515719406592.0, + "grad_norm": 0.052249401603679996, + "language_loss": 0.90226066, + "learning_rate": 4.193269428723889e-06, + "loss": 0.91360652, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.73242188, + "step": 4990, + "time_per_iteration": 2.641939163208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134583, + "balance_loss_mlp": 1.06134093, + "epoch": 0.9601769911504425, + "flos": 596162962944.0, + "grad_norm": 0.03785738083806385, + "language_loss": 0.82735097, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.83869678, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.73242188, + "step": 4991, + "time_per_iteration": 2.772304058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136451, + "balance_loss_mlp": 1.06330407, + "epoch": 0.960369372835706, + "flos": 494041262592.0, + "grad_norm": 0.034704241516027634, + "language_loss": 0.83890998, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.85027456, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.73144531, + "step": 4992, + "time_per_iteration": 2.6465232372283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136442, + "balance_loss_mlp": 1.06339037, + "epoch": 0.9605617545209696, + "flos": 580406731776.0, + "grad_norm": 0.033359643790349336, + "language_loss": 0.86629891, + "learning_rate": 4.073345361845171e-06, + "loss": 0.87766337, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.73046875, + "step": 4993, + "time_per_iteration": 2.689033269882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135961, + "balance_loss_mlp": 1.06290936, + "epoch": 0.9607541362062332, + "flos": 929298921984.0, + "grad_norm": 0.029146870910398723, + "language_loss": 0.89981806, + "learning_rate": 4.033755723872767e-06, + "loss": 0.91117764, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.73046875, + "step": 4994, + "time_per_iteration": 3.2702882289886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136365, + "balance_loss_mlp": 1.06312311, + "epoch": 0.9609465178914968, + "flos": 574280702976.0, + "grad_norm": 0.03393299990449358, + "language_loss": 0.80548346, + "learning_rate": 3.994358637073036e-06, + "loss": 0.81684709, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.73242188, + "step": 4995, + "time_per_iteration": 2.7817986011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.0630002, + "epoch": 0.9611388995767602, + "flos": 531914068992.0, + "grad_norm": 0.033026252404674224, + "language_loss": 0.89345288, + "learning_rate": 3.955154116741244e-06, + "loss": 0.90481436, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.73144531, + "step": 4996, + "time_per_iteration": 2.655974864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113658, + "balance_loss_mlp": 1.06343305, + "epoch": 0.9613312812620238, + "flos": 647403734016.0, + "grad_norm": 0.0373910335582963, + "language_loss": 0.87061286, + "learning_rate": 3.916142178097881e-06, + "loss": 0.88197875, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.73144531, + "step": 4997, + "time_per_iteration": 2.7723019123077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136218, + "balance_loss_mlp": 1.06311882, + "epoch": 0.9615236629472874, + "flos": 497178137088.0, + "grad_norm": 0.03336855538209936, + "language_loss": 0.81832653, + "learning_rate": 3.877322836288888e-06, + "loss": 0.82968867, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.73095703, + "step": 4998, + "time_per_iteration": 2.844299554824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136341, + "balance_loss_mlp": 1.06319392, + "epoch": 0.961716044632551, + "flos": 514006614528.0, + "grad_norm": 0.03899261635106141, + "language_loss": 0.80403006, + "learning_rate": 3.838696106385153e-06, + "loss": 0.81539345, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.73144531, + "step": 4999, + "time_per_iteration": 2.6195151805877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136197, + "balance_loss_mlp": 1.0630976, + "epoch": 0.9619084263178146, + "flos": 502084199424.0, + "grad_norm": 0.03786304088384279, + "language_loss": 0.85582483, + "learning_rate": 3.800262003382904e-06, + "loss": 0.86718684, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.73095703, + "step": 5000, + "time_per_iteration": 2.5949509143829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134465, + "balance_loss_mlp": 1.06122255, + "epoch": 0.9621008080030781, + "flos": 596805510144.0, + "grad_norm": 0.041941865277851494, + "language_loss": 0.80558175, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.81692636, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.73242188, + "step": 5001, + "time_per_iteration": 2.773188829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134714, + "balance_loss_mlp": 1.0614723, + "epoch": 0.9622931896883417, + "flos": 503247770112.0, + "grad_norm": 0.04000138367761118, + "language_loss": 0.87168002, + "learning_rate": 3.723971737693899e-06, + "loss": 0.88302714, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.73242188, + "step": 5002, + "time_per_iteration": 2.6144204139709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.06153464, + "epoch": 0.9624855713736052, + "flos": 608449949184.0, + "grad_norm": 0.03656605710173359, + "language_loss": 0.85194814, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.86329585, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.73242188, + "step": 5003, + "time_per_iteration": 2.772636890411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136303, + "balance_loss_mlp": 1.06320393, + "epoch": 0.9626779530588688, + "flos": 511735868928.0, + "grad_norm": 0.044650316551590984, + "language_loss": 0.89575279, + "learning_rate": 3.648452157695936e-06, + "loss": 0.90711582, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.73095703, + "step": 5004, + "time_per_iteration": 2.5866780281066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136273, + "balance_loss_mlp": 1.06322193, + "epoch": 0.9628703347441323, + "flos": 628497893376.0, + "grad_norm": 0.037572642245888015, + "language_loss": 0.87363774, + "learning_rate": 3.610981411526937e-06, + "loss": 0.88500047, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.73046875, + "step": 5005, + "time_per_iteration": 2.814835548400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113604, + "balance_loss_mlp": 1.06294048, + "epoch": 0.9630627164293959, + "flos": 631897281024.0, + "grad_norm": 0.03692802527340189, + "language_loss": 0.82178611, + "learning_rate": 3.573703380666149e-06, + "loss": 0.83314651, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.73095703, + "step": 5006, + "time_per_iteration": 2.7788455486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113652, + "balance_loss_mlp": 1.06346869, + "epoch": 0.9632550981146595, + "flos": 571729979904.0, + "grad_norm": 0.03764323441994214, + "language_loss": 0.82586932, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.83723456, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.73046875, + "step": 5007, + "time_per_iteration": 2.8145768642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134652, + "balance_loss_mlp": 1.06141019, + "epoch": 0.9634474797999231, + "flos": 467159614464.0, + "grad_norm": 0.03643507504396426, + "language_loss": 0.86381149, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.87515807, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.73242188, + "step": 5008, + "time_per_iteration": 2.641538619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134761, + "balance_loss_mlp": 1.06151867, + "epoch": 0.9636398614851867, + "flos": 527624357376.0, + "grad_norm": 0.03653594954025797, + "language_loss": 0.89453661, + "learning_rate": 3.463025724284974e-06, + "loss": 0.90588421, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.73242188, + "step": 5009, + "time_per_iteration": 2.6100451946258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135987, + "balance_loss_mlp": 1.06284046, + "epoch": 0.9638322431704501, + "flos": 565942325760.0, + "grad_norm": 0.035991126690817755, + "language_loss": 0.79672241, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.80808234, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.73144531, + "step": 5010, + "time_per_iteration": 2.768517255783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136278, + "balance_loss_mlp": 1.06317854, + "epoch": 0.9640246248557137, + "flos": 478740926976.0, + "grad_norm": 0.03726077990698358, + "language_loss": 0.89582598, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.90718877, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.73095703, + "step": 5011, + "time_per_iteration": 2.578130006790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135669, + "balance_loss_mlp": 1.06252217, + "epoch": 0.9642170065409773, + "flos": 540339041280.0, + "grad_norm": 0.036587267985256175, + "language_loss": 0.92892486, + "learning_rate": 3.354083022201859e-06, + "loss": 0.94028151, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.73144531, + "step": 5012, + "time_per_iteration": 2.626784563064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136225, + "balance_loss_mlp": 1.06317353, + "epoch": 0.9644093882262409, + "flos": 524776192512.0, + "grad_norm": 0.03589608787010189, + "language_loss": 0.88128811, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.89265037, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.73046875, + "step": 5013, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137481, + "balance_loss_mlp": 1.06452537, + "epoch": 0.9646017699115044, + "flos": 575381147136.0, + "grad_norm": 0.036469182684706475, + "language_loss": 0.83875465, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.85012949, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.72998047, + "step": 5014, + "time_per_iteration": 2.6983656883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135531, + "balance_loss_mlp": 1.06238461, + "epoch": 0.964794151596768, + "flos": 637956180480.0, + "grad_norm": 0.040034570453418294, + "language_loss": 0.89572299, + "learning_rate": 3.246875655074588e-06, + "loss": 0.90707827, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.73193359, + "step": 5015, + "time_per_iteration": 2.774064064025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136038, + "balance_loss_mlp": 1.06279588, + "epoch": 0.9649865332820315, + "flos": 618559515648.0, + "grad_norm": 0.038560774155918465, + "language_loss": 0.90913039, + "learning_rate": 3.211525560941675e-06, + "loss": 0.92049074, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.73242188, + "step": 5016, + "time_per_iteration": 2.7157909870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135584, + "balance_loss_mlp": 1.06243753, + "epoch": 0.9651789149672951, + "flos": 517326137856.0, + "grad_norm": 0.03416472134449421, + "language_loss": 0.85285097, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.86420679, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.73193359, + "step": 5017, + "time_per_iteration": 2.729053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136208, + "balance_loss_mlp": 1.06315696, + "epoch": 0.9653712966525587, + "flos": 493921740288.0, + "grad_norm": 0.04119563726090097, + "language_loss": 0.85390657, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.86526859, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.73046875, + "step": 5018, + "time_per_iteration": 2.563650131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113648, + "balance_loss_mlp": 1.06338084, + "epoch": 0.9655636783378222, + "flos": 537656060928.0, + "grad_norm": 0.03021172693995666, + "language_loss": 0.85570192, + "learning_rate": 3.106632555409328e-06, + "loss": 0.86706674, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.73095703, + "step": 5019, + "time_per_iteration": 2.7251713275909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136099, + "balance_loss_mlp": 1.06290472, + "epoch": 0.9657560600230858, + "flos": 459958611456.0, + "grad_norm": 0.03436013437508305, + "language_loss": 0.86592716, + "learning_rate": 3.072054024435167e-06, + "loss": 0.87728816, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.73193359, + "step": 5020, + "time_per_iteration": 2.6252498626708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136711, + "balance_loss_mlp": 1.06356394, + "epoch": 0.9659484417083494, + "flos": 687388832256.0, + "grad_norm": 0.043622735099904504, + "language_loss": 0.88656896, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.89793605, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.73144531, + "step": 5021, + "time_per_iteration": 2.8548264503479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140694, + "balance_loss_mlp": 1.06916809, + "epoch": 0.966140823393613, + "flos": 1505456326656.0, + "grad_norm": 0.004755883898104752, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81834936, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.71679688, + "step": 5022, + "time_per_iteration": 4.785803556442261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011346, + "balance_loss_mlp": 1.06135833, + "epoch": 0.9663332050788765, + "flos": 465859057152.0, + "grad_norm": 0.04060247816118618, + "language_loss": 0.85319602, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.86454201, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.73242188, + "step": 5023, + "time_per_iteration": 2.615492820739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134536, + "balance_loss_mlp": 1.06129432, + "epoch": 0.96652558676414, + "flos": 501878082048.0, + "grad_norm": 0.036856046559520406, + "language_loss": 0.90339649, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.91474187, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.73242188, + "step": 5024, + "time_per_iteration": 2.659637451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134714, + "balance_loss_mlp": 1.06147206, + "epoch": 0.9667179684494036, + "flos": 425743703040.0, + "grad_norm": 0.04260558113745741, + "language_loss": 0.88175714, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.89310426, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.73242188, + "step": 5025, + "time_per_iteration": 2.48905611038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134596, + "balance_loss_mlp": 1.06149662, + "epoch": 0.9669103501346672, + "flos": 518009617920.0, + "grad_norm": 0.03460776123355322, + "language_loss": 0.90789652, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.91924238, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.73144531, + "step": 5026, + "time_per_iteration": 2.6764590740203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134794, + "balance_loss_mlp": 1.06155145, + "epoch": 0.9671027318199308, + "flos": 457175574528.0, + "grad_norm": 0.04902786366657777, + "language_loss": 0.82283497, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.83418286, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.73242188, + "step": 5027, + "time_per_iteration": 2.595550537109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134603, + "balance_loss_mlp": 1.06136048, + "epoch": 0.9672951135051943, + "flos": 526061286912.0, + "grad_norm": 0.03802823500081439, + "language_loss": 0.84784377, + "learning_rate": 2.802372171957057e-06, + "loss": 0.85918975, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.73242188, + "step": 5028, + "time_per_iteration": 2.674833059310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136455, + "balance_loss_mlp": 1.06335628, + "epoch": 0.9674874951904578, + "flos": 575101169664.0, + "grad_norm": 0.03757979852199149, + "language_loss": 0.84332544, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.85469002, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.73095703, + "step": 5029, + "time_per_iteration": 2.787973403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136141, + "balance_loss_mlp": 1.06289899, + "epoch": 0.9676798768757214, + "flos": 630423533568.0, + "grad_norm": 0.03236731472285776, + "language_loss": 0.83900696, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.85036838, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.73242188, + "step": 5030, + "time_per_iteration": 2.92444109916687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140633, + "balance_loss_mlp": 1.06910706, + "epoch": 0.967872258560985, + "flos": 1467114889728.0, + "grad_norm": 0.004700971558181271, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.7670399, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.71679688, + "step": 5031, + "time_per_iteration": 4.658704519271851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136242, + "balance_loss_mlp": 1.0631907, + "epoch": 0.9680646402462486, + "flos": 566567408640.0, + "grad_norm": 0.045787284444390154, + "language_loss": 0.85227001, + "learning_rate": 2.672163531181049e-06, + "loss": 0.86363238, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.73046875, + "step": 5032, + "time_per_iteration": 2.662707805633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137848, + "balance_loss_mlp": 1.06632233, + "epoch": 0.9682570219315121, + "flos": 1437647589888.0, + "grad_norm": 0.0038661012253674927, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79212654, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.71679688, + "step": 5033, + "time_per_iteration": 4.825839519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134757, + "balance_loss_mlp": 1.06156242, + "epoch": 0.9684494036167757, + "flos": 585703561728.0, + "grad_norm": 0.037836121912765926, + "language_loss": 0.86821753, + "learning_rate": 2.608217639166688e-06, + "loss": 0.87956512, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.73193359, + "step": 5034, + "time_per_iteration": 2.733405351638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134865, + "balance_loss_mlp": 1.0616231, + "epoch": 0.9686417853020393, + "flos": 560189600256.0, + "grad_norm": 0.033762716228182665, + "language_loss": 0.88299072, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.89433932, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.73242188, + "step": 5035, + "time_per_iteration": 2.694063186645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113452, + "balance_loss_mlp": 1.06132543, + "epoch": 0.9688341669873028, + "flos": 786262867968.0, + "grad_norm": 0.040583945106096336, + "language_loss": 0.88091248, + "learning_rate": 2.545044165539745e-06, + "loss": 0.89225769, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.73193359, + "step": 5036, + "time_per_iteration": 2.9456684589385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_mlp": 1.06168199, + "epoch": 0.9690265486725663, + "flos": 396769228800.0, + "grad_norm": 0.038331219578498374, + "language_loss": 0.8455385, + "learning_rate": 2.513747116326126e-06, + "loss": 0.8568877, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.73242188, + "step": 5037, + "time_per_iteration": 2.523125648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134975, + "balance_loss_mlp": 1.06173313, + "epoch": 0.9692189303578299, + "flos": 477416901120.0, + "grad_norm": 0.041475216157481225, + "language_loss": 0.82368696, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.83503664, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.73242188, + "step": 5038, + "time_per_iteration": 2.7746524810791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134895, + "balance_loss_mlp": 1.06170058, + "epoch": 0.9694113120430935, + "flos": 598687489536.0, + "grad_norm": 0.040629799686120044, + "language_loss": 0.83335608, + "learning_rate": 2.451732453851385e-06, + "loss": 0.84470499, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.73193359, + "step": 5039, + "time_per_iteration": 4.120795726776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113648, + "balance_loss_mlp": 1.06338096, + "epoch": 0.9696036937283571, + "flos": 501897547776.0, + "grad_norm": 0.033826903503827166, + "language_loss": 0.86580127, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.87716603, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.73095703, + "step": 5040, + "time_per_iteration": 2.607876777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135933, + "balance_loss_mlp": 1.06278634, + "epoch": 0.9697960754136207, + "flos": 433189754880.0, + "grad_norm": 0.04362735320956941, + "language_loss": 0.92283428, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.93419361, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.73144531, + "step": 5041, + "time_per_iteration": 2.4580559730529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136172, + "balance_loss_mlp": 1.06307268, + "epoch": 0.9699884570988841, + "flos": 569674083840.0, + "grad_norm": 0.03235624014830649, + "language_loss": 0.89051294, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.90187466, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.73095703, + "step": 5042, + "time_per_iteration": 2.713972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136359, + "balance_loss_mlp": 1.06321263, + "epoch": 0.9701808387841477, + "flos": 517236814848.0, + "grad_norm": 0.03727061706685101, + "language_loss": 0.85871363, + "learning_rate": 2.33002120820458e-06, + "loss": 0.87007725, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.73144531, + "step": 5043, + "time_per_iteration": 2.6875967979431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113646, + "balance_loss_mlp": 1.06326568, + "epoch": 0.9703732204694113, + "flos": 492497657856.0, + "grad_norm": 0.03840937503625704, + "language_loss": 0.80693823, + "learning_rate": 2.300076399000206e-06, + "loss": 0.81830281, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.73193359, + "step": 5044, + "time_per_iteration": 2.5949554443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113635, + "balance_loss_mlp": 1.06320333, + "epoch": 0.9705656021546749, + "flos": 627279928320.0, + "grad_norm": 0.03683083642331674, + "language_loss": 0.85812724, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.8694908, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.73144531, + "step": 5045, + "time_per_iteration": 2.8123650550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136233, + "balance_loss_mlp": 1.06308591, + "epoch": 0.9707579838399384, + "flos": 472393317888.0, + "grad_norm": 0.03632831837945052, + "language_loss": 0.87609589, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.88745821, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.73144531, + "step": 5046, + "time_per_iteration": 2.5618367195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113635, + "balance_loss_mlp": 1.06315589, + "epoch": 0.970950365525202, + "flos": 493138203648.0, + "grad_norm": 0.038642032843630054, + "language_loss": 0.85051489, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.8618784, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.73193359, + "step": 5047, + "time_per_iteration": 4.11439061164856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136071, + "balance_loss_mlp": 1.06301963, + "epoch": 0.9711427472104656, + "flos": 558376751616.0, + "grad_norm": 0.04056698166765332, + "language_loss": 0.85194492, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.86330569, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.73046875, + "step": 5048, + "time_per_iteration": 2.6787452697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134824, + "balance_loss_mlp": 1.06158209, + "epoch": 0.9713351288957291, + "flos": 627100007424.0, + "grad_norm": 0.030987251047231726, + "language_loss": 0.87520432, + "learning_rate": 2.153250946564489e-06, + "loss": 0.88655257, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.73242188, + "step": 5049, + "time_per_iteration": 2.9055373668670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134922, + "balance_loss_mlp": 1.0616796, + "epoch": 0.9715275105809927, + "flos": 500082697728.0, + "grad_norm": 0.03604755550471877, + "language_loss": 0.86542779, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.87677705, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.73242188, + "step": 5050, + "time_per_iteration": 2.7245774269104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113493, + "balance_loss_mlp": 1.06168818, + "epoch": 0.9717198922662562, + "flos": 478480415232.0, + "grad_norm": 0.03989506366730262, + "language_loss": 0.82222277, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.83357209, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.73242188, + "step": 5051, + "time_per_iteration": 2.549938201904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134999, + "balance_loss_mlp": 1.06180418, + "epoch": 0.9719122739515198, + "flos": 554549666304.0, + "grad_norm": 0.03271132462984947, + "language_loss": 0.82110488, + "learning_rate": 2.067474959040916e-06, + "loss": 0.83245492, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.73193359, + "step": 5052, + "time_per_iteration": 2.7398674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.06178653, + "epoch": 0.9721046556367834, + "flos": 566929978368.0, + "grad_norm": 0.03652890903263657, + "language_loss": 0.85459185, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.86594218, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.73242188, + "step": 5053, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135966, + "balance_loss_mlp": 1.06291485, + "epoch": 0.972297037322047, + "flos": 561400834560.0, + "grad_norm": 0.04122701334842068, + "language_loss": 0.8283239, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.83968359, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.73046875, + "step": 5054, + "time_per_iteration": 2.773737907409668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136236, + "balance_loss_mlp": 1.06304181, + "epoch": 0.9724894190073105, + "flos": 513503055360.0, + "grad_norm": 0.04021059743942707, + "language_loss": 0.8332113, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.84457362, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.73193359, + "step": 5055, + "time_per_iteration": 2.712599992752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136328, + "balance_loss_mlp": 1.06313324, + "epoch": 0.972681800692574, + "flos": 615038604288.0, + "grad_norm": 0.04232559781751974, + "language_loss": 0.85386884, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.86523211, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.73193359, + "step": 5056, + "time_per_iteration": 2.8266754150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.06325388, + "epoch": 0.9728741823778376, + "flos": 835313484288.0, + "grad_norm": 0.03448022317319212, + "language_loss": 0.87796867, + "learning_rate": 1.92838141509849e-06, + "loss": 0.88933218, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.73095703, + "step": 5057, + "time_per_iteration": 3.078075885772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136379, + "balance_loss_mlp": 1.06323254, + "epoch": 0.9730665640631012, + "flos": 572587376640.0, + "grad_norm": 0.03571508034746827, + "language_loss": 0.89210469, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.90346849, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.73144531, + "step": 5058, + "time_per_iteration": 2.743687391281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136509, + "balance_loss_mlp": 1.06345737, + "epoch": 0.9732589457483648, + "flos": 507520017408.0, + "grad_norm": 0.03560266740855486, + "language_loss": 0.82347834, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.83484346, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.73046875, + "step": 5059, + "time_per_iteration": 2.603219985961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113618, + "balance_loss_mlp": 1.06308138, + "epoch": 0.9734513274336283, + "flos": 928482458112.0, + "grad_norm": 0.03831156338681025, + "language_loss": 0.84692299, + "learning_rate": 1.84724562509897e-06, + "loss": 0.85828483, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.73095703, + "step": 5060, + "time_per_iteration": 3.1661386489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134848, + "balance_loss_mlp": 1.06165326, + "epoch": 0.9736437091188919, + "flos": 492925355520.0, + "grad_norm": 0.03299060222462335, + "language_loss": 0.81984901, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.8311975, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.73193359, + "step": 5061, + "time_per_iteration": 2.7532899379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134743, + "balance_loss_mlp": 1.06150103, + "epoch": 0.9738360908041555, + "flos": 614454454272.0, + "grad_norm": 0.044137149894814875, + "language_loss": 0.88850021, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.89984763, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.73242188, + "step": 5062, + "time_per_iteration": 2.7332098484039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138641, + "balance_loss_mlp": 1.06692505, + "epoch": 0.974028472489419, + "flos": 1552731024384.0, + "grad_norm": 0.003870058232261716, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77130735, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.71875, + "step": 5063, + "time_per_iteration": 4.949795484542847 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138702, + "balance_loss_mlp": 1.06698608, + "epoch": 0.9742208541746825, + "flos": 1414178064384.0, + "grad_norm": 0.0038928950863822815, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80816418, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.71875, + "step": 5064, + "time_per_iteration": 4.926048994064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134852, + "balance_loss_mlp": 1.06160998, + "epoch": 0.9744132358599461, + "flos": 676098230784.0, + "grad_norm": 0.030093067662967588, + "language_loss": 0.80718327, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.81853181, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.73242188, + "step": 5065, + "time_per_iteration": 2.866382360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135156, + "balance_loss_mlp": 1.06196105, + "epoch": 0.9746056175452097, + "flos": 599597279232.0, + "grad_norm": 0.03459907750020676, + "language_loss": 0.82514048, + "learning_rate": 1.690196122544896e-06, + "loss": 0.836492, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.73193359, + "step": 5066, + "time_per_iteration": 2.8023762702941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135035, + "balance_loss_mlp": 1.06179249, + "epoch": 0.9747979992304733, + "flos": 733532886528.0, + "grad_norm": 0.03471604647902471, + "language_loss": 0.86751151, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.8788619, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.73242188, + "step": 5067, + "time_per_iteration": 3.010525941848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134934, + "balance_loss_mlp": 1.06174004, + "epoch": 0.9749903809157369, + "flos": 617619526656.0, + "grad_norm": 0.04453093202467409, + "language_loss": 0.81295151, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.82430089, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.73193359, + "step": 5068, + "time_per_iteration": 2.7091329097747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134722, + "balance_loss_mlp": 1.06148005, + "epoch": 0.9751827626010003, + "flos": 469349769216.0, + "grad_norm": 0.03581121919344097, + "language_loss": 0.88265562, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.89400285, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.73242188, + "step": 5069, + "time_per_iteration": 2.6038756370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134695, + "balance_loss_mlp": 1.06145287, + "epoch": 0.9753751442862639, + "flos": 600407012352.0, + "grad_norm": 0.04136761381890335, + "language_loss": 0.91069138, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.92203832, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.73242188, + "step": 5070, + "time_per_iteration": 2.826425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136024, + "balance_loss_mlp": 1.06292439, + "epoch": 0.9755675259715275, + "flos": 652090944000.0, + "grad_norm": 0.03089674785401136, + "language_loss": 0.86145902, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.87281919, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.73095703, + "step": 5071, + "time_per_iteration": 2.940932512283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011359, + "balance_loss_mlp": 1.06275284, + "epoch": 0.9757599076567911, + "flos": 564724360704.0, + "grad_norm": 0.04003681230801716, + "language_loss": 0.83221871, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.84357774, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.73144531, + "step": 5072, + "time_per_iteration": 2.683784246444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135923, + "balance_loss_mlp": 1.06287193, + "epoch": 0.9759522893420547, + "flos": 505648771584.0, + "grad_norm": 0.03649073694406785, + "language_loss": 0.85017049, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.86152965, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.73046875, + "step": 5073, + "time_per_iteration": 2.621758222579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.06280482, + "epoch": 0.9761446710273182, + "flos": 584837432832.0, + "grad_norm": 0.04240200515467586, + "language_loss": 0.86889368, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.88025272, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.73095703, + "step": 5074, + "time_per_iteration": 2.754220485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135867, + "balance_loss_mlp": 1.06276762, + "epoch": 0.9763370527125818, + "flos": 483171628032.0, + "grad_norm": 0.041938466654606696, + "language_loss": 0.87228501, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.88364369, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.73095703, + "step": 5075, + "time_per_iteration": 2.660871982574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135876, + "balance_loss_mlp": 1.06272912, + "epoch": 0.9765294343978453, + "flos": 620113853952.0, + "grad_norm": 0.034349586662843025, + "language_loss": 0.82321155, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.83457041, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.73144531, + "step": 5076, + "time_per_iteration": 2.7837605476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136031, + "balance_loss_mlp": 1.06293166, + "epoch": 0.9767218160831089, + "flos": 527587427328.0, + "grad_norm": 0.03936217857713211, + "language_loss": 0.89625615, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.9076165, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.73095703, + "step": 5077, + "time_per_iteration": 2.5941243171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135825, + "balance_loss_mlp": 1.06263041, + "epoch": 0.9769141977683724, + "flos": 526245937152.0, + "grad_norm": 0.034114352455168806, + "language_loss": 0.88253415, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.89389241, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.73193359, + "step": 5078, + "time_per_iteration": 2.6603527069091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135907, + "balance_loss_mlp": 1.06276, + "epoch": 0.977106579453636, + "flos": 458643317760.0, + "grad_norm": 0.03736684310262229, + "language_loss": 0.84752488, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.85888398, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.73144531, + "step": 5079, + "time_per_iteration": 2.8190555572509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113459, + "balance_loss_mlp": 1.06134772, + "epoch": 0.9772989611388996, + "flos": 533134035456.0, + "grad_norm": 0.03786927366079968, + "language_loss": 0.86551404, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.87685996, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.73242188, + "step": 5080, + "time_per_iteration": 2.6154069900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134748, + "balance_loss_mlp": 1.06150591, + "epoch": 0.9774913428241632, + "flos": 756754635264.0, + "grad_norm": 0.037853043258092404, + "language_loss": 0.8976739, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.90902144, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.73242188, + "step": 5081, + "time_per_iteration": 3.048454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138092, + "balance_loss_mlp": 1.06637573, + "epoch": 0.9776837245094268, + "flos": 1557668012544.0, + "grad_norm": 0.00376334878312987, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.80033588, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.71875, + "step": 5082, + "time_per_iteration": 5.043825149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134654, + "balance_loss_mlp": 1.06141222, + "epoch": 0.9778761061946902, + "flos": 593633707008.0, + "grad_norm": 0.04337083767470995, + "language_loss": 0.89383692, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.90518343, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.73242188, + "step": 5083, + "time_per_iteration": 2.7039098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135781, + "balance_loss_mlp": 1.06263411, + "epoch": 0.9780684878799538, + "flos": 415831521792.0, + "grad_norm": 0.03593395529924556, + "language_loss": 0.86301732, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.8743751, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.73144531, + "step": 5084, + "time_per_iteration": 4.800846815109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136122, + "balance_loss_mlp": 1.06302321, + "epoch": 0.9782608695652174, + "flos": 569543827968.0, + "grad_norm": 0.03668547357374544, + "language_loss": 0.89074433, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.90210557, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.73095703, + "step": 5085, + "time_per_iteration": 2.7352962493896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135976, + "balance_loss_mlp": 1.06287682, + "epoch": 0.978453251250481, + "flos": 691761136128.0, + "grad_norm": 0.03913427215526849, + "language_loss": 0.87911779, + "learning_rate": 1.217009190543239e-06, + "loss": 0.89047754, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.73095703, + "step": 5086, + "time_per_iteration": 2.8892364501953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135825, + "balance_loss_mlp": 1.06263065, + "epoch": 0.9786456329357445, + "flos": 503571408384.0, + "grad_norm": 0.034620175401031496, + "language_loss": 0.81605828, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.82741642, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.73193359, + "step": 5087, + "time_per_iteration": 2.67069149017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134413, + "balance_loss_mlp": 1.06117117, + "epoch": 0.9788380146210081, + "flos": 864604866048.0, + "grad_norm": 0.039272428340046274, + "language_loss": 0.85826278, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.86960691, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.73242188, + "step": 5088, + "time_per_iteration": 3.05206561088562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134797, + "balance_loss_mlp": 1.06155455, + "epoch": 0.9790303963062716, + "flos": 513746102784.0, + "grad_norm": 0.034545752771366, + "language_loss": 0.88846779, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.8998158, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.73242188, + "step": 5089, + "time_per_iteration": 2.6102468967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134619, + "balance_loss_mlp": 1.06132865, + "epoch": 0.9792227779915352, + "flos": 495410950656.0, + "grad_norm": 0.042612868246076144, + "language_loss": 0.91103876, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.92238486, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.73291016, + "step": 5090, + "time_per_iteration": 2.5904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.06275654, + "epoch": 0.9794151596767988, + "flos": 609483264000.0, + "grad_norm": 0.038327834107812486, + "language_loss": 0.86390072, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.87525976, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.73144531, + "step": 5091, + "time_per_iteration": 2.7942652702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135987, + "balance_loss_mlp": 1.06284022, + "epoch": 0.9796075413620623, + "flos": 479196822528.0, + "grad_norm": 0.04242679412713505, + "language_loss": 0.91551888, + "learning_rate": 1.09015417612357e-06, + "loss": 0.92687881, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.73144531, + "step": 5092, + "time_per_iteration": 2.6029610633850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113605, + "balance_loss_mlp": 1.06285572, + "epoch": 0.9797999230473259, + "flos": 593362461696.0, + "grad_norm": 0.038287668132117786, + "language_loss": 0.88482207, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.8961826, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.73193359, + "step": 5093, + "time_per_iteration": 2.7648696899414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134535, + "balance_loss_mlp": 1.06129241, + "epoch": 0.9799923047325895, + "flos": 557563015680.0, + "grad_norm": 0.03420994841763029, + "language_loss": 0.86238348, + "learning_rate": 1.049418636655919e-06, + "loss": 0.87372881, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.73242188, + "step": 5094, + "time_per_iteration": 2.912834644317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136088, + "balance_loss_mlp": 1.06284571, + "epoch": 0.9801846864178531, + "flos": 580628312064.0, + "grad_norm": 0.03371993676263859, + "language_loss": 0.89129627, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.90265721, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.73242188, + "step": 5095, + "time_per_iteration": 2.773477792739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134792, + "balance_loss_mlp": 1.06154943, + "epoch": 0.9803770681031165, + "flos": 516210230784.0, + "grad_norm": 0.034566414625280935, + "language_loss": 0.83682495, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.8481729, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.73242188, + "step": 5096, + "time_per_iteration": 2.712693691253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.06315458, + "epoch": 0.9805694497883801, + "flos": 568119745536.0, + "grad_norm": 0.03425876820589903, + "language_loss": 0.82894945, + "learning_rate": 9.897681702160654e-07, + "loss": 0.840312, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.73095703, + "step": 5097, + "time_per_iteration": 2.737246036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.06241155, + "epoch": 0.9807618314736437, + "flos": 480332195328.0, + "grad_norm": 0.04046674037063813, + "language_loss": 0.78180015, + "learning_rate": 9.702721370922208e-07, + "loss": 0.79315621, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.73193359, + "step": 5098, + "time_per_iteration": 2.652815341949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135541, + "balance_loss_mlp": 1.0623461, + "epoch": 0.9809542131589073, + "flos": 546341544960.0, + "grad_norm": 0.04086563357176875, + "language_loss": 0.85544622, + "learning_rate": 9.509698444908344e-07, + "loss": 0.86680162, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.73193359, + "step": 5099, + "time_per_iteration": 2.6499040126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134438, + "balance_loss_mlp": 1.06119621, + "epoch": 0.9811465948441709, + "flos": 521862899712.0, + "grad_norm": 0.04248805685521767, + "language_loss": 0.85820013, + "learning_rate": 9.318612999057452e-07, + "loss": 0.86954451, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.73242188, + "step": 5100, + "time_per_iteration": 2.6109817028045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134571, + "balance_loss_mlp": 1.06132865, + "epoch": 0.9813389765294344, + "flos": 542321077248.0, + "grad_norm": 0.03689155006089091, + "language_loss": 0.84802127, + "learning_rate": 9.129465107554635e-07, + "loss": 0.85936701, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.73242188, + "step": 5101, + "time_per_iteration": 2.646704912185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134579, + "balance_loss_mlp": 1.06133687, + "epoch": 0.981531358214698, + "flos": 568464850944.0, + "grad_norm": 0.03755425810198059, + "language_loss": 0.88694, + "learning_rate": 8.942254843834485e-07, + "loss": 0.89828575, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.73242188, + "step": 5102, + "time_per_iteration": 2.7322897911071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136296, + "balance_loss_mlp": 1.06314886, + "epoch": 0.9817237398999615, + "flos": 578413962240.0, + "grad_norm": 0.03455798640068261, + "language_loss": 0.85217297, + "learning_rate": 8.756982280578307e-07, + "loss": 0.86353588, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.73144531, + "step": 5103, + "time_per_iteration": 2.751131057739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136072, + "balance_loss_mlp": 1.06282985, + "epoch": 0.9819161215852251, + "flos": 702854352384.0, + "grad_norm": 0.03555623235695427, + "language_loss": 0.85993326, + "learning_rate": 8.573647489714676e-07, + "loss": 0.87129396, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.73242188, + "step": 5104, + "time_per_iteration": 2.951957941055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135655, + "balance_loss_mlp": 1.0624609, + "epoch": 0.9821085032704886, + "flos": 625452343296.0, + "grad_norm": 0.03465418860850988, + "language_loss": 0.88711596, + "learning_rate": 8.392250542421653e-07, + "loss": 0.89847255, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.73193359, + "step": 5105, + "time_per_iteration": 2.886805772781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136322, + "balance_loss_mlp": 1.06327093, + "epoch": 0.9823008849557522, + "flos": 500492931072.0, + "grad_norm": 0.03689529509653958, + "language_loss": 0.86079448, + "learning_rate": 8.212791509122353e-07, + "loss": 0.87215769, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.73046875, + "step": 5106, + "time_per_iteration": 2.687134265899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134787, + "balance_loss_mlp": 1.06154442, + "epoch": 0.9824932666410158, + "flos": 524904446976.0, + "grad_norm": 0.040173053897464624, + "language_loss": 0.78432387, + "learning_rate": 8.035270459489929e-07, + "loss": 0.79567176, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.73242188, + "step": 5107, + "time_per_iteration": 2.6810905933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.06178021, + "epoch": 0.9826856483262794, + "flos": 503675467776.0, + "grad_norm": 0.03566590525509119, + "language_loss": 0.87364811, + "learning_rate": 7.859687462443698e-07, + "loss": 0.88499832, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.73242188, + "step": 5108, + "time_per_iteration": 2.653001546859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134775, + "balance_loss_mlp": 1.06153297, + "epoch": 0.982878030011543, + "flos": 563213683200.0, + "grad_norm": 0.04574005448539413, + "language_loss": 0.88620985, + "learning_rate": 7.686042586151354e-07, + "loss": 0.89755762, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.73242188, + "step": 5109, + "time_per_iteration": 2.8465735912323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.06331754, + "epoch": 0.9830704116968064, + "flos": 538214014464.0, + "grad_norm": 0.034798278837774685, + "language_loss": 0.8696683, + "learning_rate": 7.514335898027857e-07, + "loss": 0.88103199, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.73046875, + "step": 5110, + "time_per_iteration": 2.779977321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113608, + "balance_loss_mlp": 1.06298041, + "epoch": 0.98326279338207, + "flos": 459902215680.0, + "grad_norm": 0.03838898388533907, + "language_loss": 0.88750166, + "learning_rate": 7.344567464735441e-07, + "loss": 0.89886248, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.73095703, + "step": 5111, + "time_per_iteration": 2.5905652046203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136156, + "balance_loss_mlp": 1.06310439, + "epoch": 0.9834551750673336, + "flos": 642189496320.0, + "grad_norm": 0.03516170903549916, + "language_loss": 0.83847117, + "learning_rate": 7.17673735218416e-07, + "loss": 0.84983265, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.73046875, + "step": 5112, + "time_per_iteration": 2.8230271339416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135233, + "balance_loss_mlp": 1.06199098, + "epoch": 0.9836475567525972, + "flos": 1073548211712.0, + "grad_norm": 0.03562811843552658, + "language_loss": 0.83895671, + "learning_rate": 7.010845625530782e-07, + "loss": 0.85030913, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.73242188, + "step": 5113, + "time_per_iteration": 3.4172170162200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134582, + "balance_loss_mlp": 1.0613873, + "epoch": 0.9838399384378607, + "flos": 566278699008.0, + "grad_norm": 0.043401730302991125, + "language_loss": 0.81372494, + "learning_rate": 6.846892349181566e-07, + "loss": 0.82507074, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.73193359, + "step": 5114, + "time_per_iteration": 2.6795566082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134656, + "balance_loss_mlp": 1.061414, + "epoch": 0.9840323201231242, + "flos": 774179997696.0, + "grad_norm": 0.042339759208220466, + "language_loss": 0.85027516, + "learning_rate": 6.684877586787819e-07, + "loss": 0.86162174, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.73242188, + "step": 5115, + "time_per_iteration": 3.0095579624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136423, + "balance_loss_mlp": 1.06322873, + "epoch": 0.9842247018083878, + "flos": 473248713216.0, + "grad_norm": 0.0363602378953053, + "language_loss": 0.89681566, + "learning_rate": 6.524801401249225e-07, + "loss": 0.90817988, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.73193359, + "step": 5116, + "time_per_iteration": 2.5631868839263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136374, + "balance_loss_mlp": 1.06332254, + "epoch": 0.9844170834936514, + "flos": 526311065088.0, + "grad_norm": 0.035086314947572486, + "language_loss": 0.8950007, + "learning_rate": 6.366663854713295e-07, + "loss": 0.90636444, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.73046875, + "step": 5117, + "time_per_iteration": 2.6976704597473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139755, + "balance_loss_mlp": 1.06803894, + "epoch": 0.984609465178915, + "flos": 1570623742464.0, + "grad_norm": 0.005251722325346967, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78302276, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.71875, + "step": 5118, + "time_per_iteration": 4.95673942565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134864, + "balance_loss_mlp": 1.06166935, + "epoch": 0.9848018468641785, + "flos": 520569073152.0, + "grad_norm": 0.04534599796839803, + "language_loss": 0.8812722, + "learning_rate": 6.056204923473584e-07, + "loss": 0.8926208, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.73193359, + "step": 5119, + "time_per_iteration": 2.6061763763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134973, + "balance_loss_mlp": 1.06173038, + "epoch": 0.9849942285494421, + "flos": 493986868224.0, + "grad_norm": 0.034301666318635994, + "language_loss": 0.87063777, + "learning_rate": 5.903883659301167e-07, + "loss": 0.88198745, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.73242188, + "step": 5120, + "time_per_iteration": 2.6077840328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134811, + "balance_loss_mlp": 1.06161654, + "epoch": 0.9851866102347057, + "flos": 547049220096.0, + "grad_norm": 0.03687618838408007, + "language_loss": 0.85899603, + "learning_rate": 5.753501275193029e-07, + "loss": 0.87034416, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.73193359, + "step": 5121, + "time_per_iteration": 2.6531834602355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113473, + "balance_loss_mlp": 1.06148791, + "epoch": 0.9853789919199692, + "flos": 477214786560.0, + "grad_norm": 0.04121503477449517, + "language_loss": 0.85198522, + "learning_rate": 5.605057829531912e-07, + "loss": 0.86333251, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.73242188, + "step": 5122, + "time_per_iteration": 2.5439565181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134707, + "balance_loss_mlp": 1.06146467, + "epoch": 0.9855713736052328, + "flos": 1034307718656.0, + "grad_norm": 0.03796282782398555, + "language_loss": 0.80304152, + "learning_rate": 5.458553379950049e-07, + "loss": 0.81438863, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.73242188, + "step": 5123, + "time_per_iteration": 3.3912107944488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.06169021, + "epoch": 0.9857637552904963, + "flos": 496079694336.0, + "grad_norm": 0.0481766672977676, + "language_loss": 0.8670826, + "learning_rate": 5.31398798332472e-07, + "loss": 0.87843192, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.73242188, + "step": 5124, + "time_per_iteration": 2.6348800659179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136248, + "balance_loss_mlp": 1.06314898, + "epoch": 0.9859561369757599, + "flos": 593381927424.0, + "grad_norm": 0.042122648622967405, + "language_loss": 0.89123881, + "learning_rate": 5.17136169578103e-07, + "loss": 0.9026013, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.73095703, + "step": 5125, + "time_per_iteration": 2.7288503646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136176, + "balance_loss_mlp": 1.06298196, + "epoch": 0.9861485186610235, + "flos": 487982363136.0, + "grad_norm": 0.0358846591177453, + "language_loss": 0.83094305, + "learning_rate": 5.030674572691907e-07, + "loss": 0.84230483, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.73193359, + "step": 5126, + "time_per_iteration": 2.660942792892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113627, + "balance_loss_mlp": 1.06317127, + "epoch": 0.9863409003462871, + "flos": 519833200128.0, + "grad_norm": 0.030624136680643108, + "language_loss": 0.86946189, + "learning_rate": 4.891926668676994e-07, + "loss": 0.88082457, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.73095703, + "step": 5127, + "time_per_iteration": 2.7073521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139725, + "balance_loss_mlp": 1.06800842, + "epoch": 0.9865332820315506, + "flos": 1489294591488.0, + "grad_norm": 0.005262688675018299, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80322456, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.71875, + "step": 5128, + "time_per_iteration": 4.899366617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113637, + "balance_loss_mlp": 1.06327081, + "epoch": 0.9867256637168141, + "flos": 583217966592.0, + "grad_norm": 0.03678420177070357, + "language_loss": 0.83516836, + "learning_rate": 4.620248732582488e-07, + "loss": 0.84653205, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.73095703, + "step": 5129, + "time_per_iteration": 2.7090418338775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135775, + "balance_loss_mlp": 1.06272316, + "epoch": 0.9869180454020777, + "flos": 960926177280.0, + "grad_norm": 0.03558291852194016, + "language_loss": 0.904948, + "learning_rate": 4.487318805977969e-07, + "loss": 0.91630578, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.73046875, + "step": 5130, + "time_per_iteration": 3.30485463142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134312, + "balance_loss_mlp": 1.06107008, + "epoch": 0.9871104270873413, + "flos": 772113368064.0, + "grad_norm": 0.03765358627123921, + "language_loss": 0.87391722, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8852604, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.73242188, + "step": 5131, + "time_per_iteration": 2.9843320846557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134246, + "balance_loss_mlp": 1.06100392, + "epoch": 0.9873028087726049, + "flos": 447365451264.0, + "grad_norm": 0.043947923730938386, + "language_loss": 0.84125459, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.852597, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.73242188, + "step": 5132, + "time_per_iteration": 2.4963319301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135167, + "balance_loss_mlp": 1.06192493, + "epoch": 0.9874951904578684, + "flos": 508627192320.0, + "grad_norm": 0.035291470132473204, + "language_loss": 0.90447533, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.91582704, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.73242188, + "step": 5133, + "time_per_iteration": 2.5896787643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134492, + "balance_loss_mlp": 1.06124949, + "epoch": 0.987687572143132, + "flos": 718037167104.0, + "grad_norm": 0.034260144513400544, + "language_loss": 0.86916608, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.88051105, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.73242188, + "step": 5134, + "time_per_iteration": 2.9246342182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138206, + "balance_loss_mlp": 1.06629944, + "epoch": 0.9878799538283956, + "flos": 1541957443584.0, + "grad_norm": 0.003697455186378142, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80956161, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.72070312, + "step": 5135, + "time_per_iteration": 4.907610654830933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134617, + "balance_loss_mlp": 1.06142259, + "epoch": 0.9880723355136591, + "flos": 722737112064.0, + "grad_norm": 0.03189445878324839, + "language_loss": 0.85751259, + "learning_rate": 3.730469030412964e-07, + "loss": 0.86885875, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.73193359, + "step": 5136, + "time_per_iteration": 2.918485164642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135851, + "balance_loss_mlp": 1.06279981, + "epoch": 0.9882647171989226, + "flos": 558413681664.0, + "grad_norm": 0.032326338581805884, + "language_loss": 0.88415384, + "learning_rate": 3.611116155572969e-07, + "loss": 0.89551234, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.73046875, + "step": 5137, + "time_per_iteration": 2.6782608032226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136054, + "balance_loss_mlp": 1.06290746, + "epoch": 0.9884570988841862, + "flos": 563940824064.0, + "grad_norm": 0.041268271106656235, + "language_loss": 0.85345703, + "learning_rate": 3.493703050927999e-07, + "loss": 0.86481762, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.73144531, + "step": 5138, + "time_per_iteration": 2.737701416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113618, + "balance_loss_mlp": 1.06303346, + "epoch": 0.9886494805694498, + "flos": 432668731392.0, + "grad_norm": 0.04045018787743159, + "language_loss": 0.91157293, + "learning_rate": 3.378229762062146e-07, + "loss": 0.92293483, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.73144531, + "step": 5139, + "time_per_iteration": 2.5153446197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136158, + "balance_loss_mlp": 1.06310701, + "epoch": 0.9888418622547134, + "flos": 593240937984.0, + "grad_norm": 0.0339250061411206, + "language_loss": 0.94499457, + "learning_rate": 3.264696333806771e-07, + "loss": 0.95635617, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.73046875, + "step": 5140, + "time_per_iteration": 2.8330492973327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136272, + "balance_loss_mlp": 1.06322026, + "epoch": 0.989034243939977, + "flos": 1136865848832.0, + "grad_norm": 0.048311873953814935, + "language_loss": 0.84138036, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.85274303, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.73046875, + "step": 5141, + "time_per_iteration": 3.5308704376220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136536, + "balance_loss_mlp": 1.06334126, + "epoch": 0.9892266256252404, + "flos": 567730979328.0, + "grad_norm": 0.035998364171371054, + "language_loss": 0.85842848, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.86979377, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.73193359, + "step": 5142, + "time_per_iteration": 2.7318220138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136331, + "balance_loss_mlp": 1.06323171, + "epoch": 0.989419007310504, + "flos": 641870587392.0, + "grad_norm": 0.04445949933168621, + "language_loss": 0.88850874, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.899872, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.73095703, + "step": 5143, + "time_per_iteration": 2.9219346046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136317, + "balance_loss_mlp": 1.0632174, + "epoch": 0.9896113889957676, + "flos": 456448433664.0, + "grad_norm": 0.03712500975558181, + "language_loss": 0.85754621, + "learning_rate": 2.829962097138372e-07, + "loss": 0.86890936, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.73095703, + "step": 5144, + "time_per_iteration": 2.6135852336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113489, + "balance_loss_mlp": 1.06164801, + "epoch": 0.9898037706810312, + "flos": 568419188736.0, + "grad_norm": 0.036970241662831894, + "language_loss": 0.85173666, + "learning_rate": 2.726128618033008e-07, + "loss": 0.86308557, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.73242188, + "step": 5145, + "time_per_iteration": 2.728771209716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.06702423, + "epoch": 0.9899961523662947, + "flos": 1553447431680.0, + "grad_norm": 0.0039494611042856405, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79285163, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.71875, + "step": 5146, + "time_per_iteration": 4.958428382873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135081, + "balance_loss_mlp": 1.06183898, + "epoch": 0.9901885340515583, + "flos": 611947392000.0, + "grad_norm": 0.03732558697558194, + "language_loss": 0.89710462, + "learning_rate": 2.524282040715642e-07, + "loss": 0.90845543, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.73242188, + "step": 5147, + "time_per_iteration": 2.9494400024414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135311, + "balance_loss_mlp": 1.06206846, + "epoch": 0.9903809157368219, + "flos": 518493711360.0, + "grad_norm": 0.03472325618842919, + "language_loss": 0.86850142, + "learning_rate": 2.426269020866512e-07, + "loss": 0.87985462, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.73242188, + "step": 5148, + "time_per_iteration": 2.606642007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113491, + "balance_loss_mlp": 1.06166744, + "epoch": 0.9905732974220854, + "flos": 1102197046272.0, + "grad_norm": 0.03711196297456148, + "language_loss": 0.85352963, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.86487871, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.73242188, + "step": 5149, + "time_per_iteration": 3.4157660007476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134965, + "balance_loss_mlp": 1.06177092, + "epoch": 0.990765679107349, + "flos": 859492686336.0, + "grad_norm": 0.04154402943927612, + "language_loss": 0.89084303, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.90219271, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.73193359, + "step": 5150, + "time_per_iteration": 3.1477768421173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136347, + "balance_loss_mlp": 1.06329572, + "epoch": 0.9909580607926125, + "flos": 492274076160.0, + "grad_norm": 0.03777042366534936, + "language_loss": 0.84356183, + "learning_rate": 2.143871490925542e-07, + "loss": 0.85492527, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.73046875, + "step": 5151, + "time_per_iteration": 2.630377769470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136659, + "balance_loss_mlp": 1.06355977, + "epoch": 0.9911504424778761, + "flos": 586159457280.0, + "grad_norm": 0.03962254747551654, + "language_loss": 0.84528565, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.85665214, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.73095703, + "step": 5152, + "time_per_iteration": 2.711332321166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135988, + "balance_loss_mlp": 1.06279361, + "epoch": 0.9913428241631397, + "flos": 571100167680.0, + "grad_norm": 0.04036611749146896, + "language_loss": 0.8638401, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.87520003, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.73193359, + "step": 5153, + "time_per_iteration": 2.7309064865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136524, + "balance_loss_mlp": 1.06347251, + "epoch": 0.9915352058484033, + "flos": 490711005696.0, + "grad_norm": 0.03270171907202174, + "language_loss": 0.90234423, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.91370946, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.73046875, + "step": 5154, + "time_per_iteration": 2.650282382965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113634, + "balance_loss_mlp": 1.06319273, + "epoch": 0.9917275875336667, + "flos": 745409639424.0, + "grad_norm": 0.034109817330924164, + "language_loss": 0.86935675, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.88072014, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.73144531, + "step": 5155, + "time_per_iteration": 2.986468553543091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135757, + "balance_loss_mlp": 1.06256294, + "epoch": 0.9919199692189303, + "flos": 509324133888.0, + "grad_norm": 0.03639310073850552, + "language_loss": 0.84705198, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.85840952, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.73193359, + "step": 5156, + "time_per_iteration": 2.647678852081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136146, + "balance_loss_mlp": 1.06299901, + "epoch": 0.9921123509041939, + "flos": 545010788352.0, + "grad_norm": 0.03592115779060212, + "language_loss": 0.8875376, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.89889908, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.73144531, + "step": 5157, + "time_per_iteration": 2.6703507900238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.0613631, + "epoch": 0.9923047325894575, + "flos": 672757240320.0, + "grad_norm": 0.03851308781628141, + "language_loss": 0.82369369, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.83503973, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.73242188, + "step": 5158, + "time_per_iteration": 2.822913408279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135232, + "balance_loss_mlp": 1.06198978, + "epoch": 0.9924971142747211, + "flos": 467624242176.0, + "grad_norm": 0.03828859510253023, + "language_loss": 0.85407376, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.86542612, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.73242188, + "step": 5159, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135482, + "balance_loss_mlp": 1.06223953, + "epoch": 0.9926894959599846, + "flos": 492562785792.0, + "grad_norm": 0.035031095902323076, + "language_loss": 0.8758896, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.88724446, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.73242188, + "step": 5160, + "time_per_iteration": 2.5947694778442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134979, + "balance_loss_mlp": 1.06173706, + "epoch": 0.9928818776452482, + "flos": 493372518912.0, + "grad_norm": 0.0398290144943764, + "language_loss": 0.85975552, + "learning_rate": 1.328673533166902e-07, + "loss": 0.87110531, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.73242188, + "step": 5161, + "time_per_iteration": 2.580611228942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136266, + "balance_loss_mlp": 1.06311941, + "epoch": 0.9930742593305117, + "flos": 547466184192.0, + "grad_norm": 0.04374439834283326, + "language_loss": 0.88636076, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.89772344, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.73144531, + "step": 5162, + "time_per_iteration": 2.765444755554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136135, + "balance_loss_mlp": 1.06294107, + "epoch": 0.9932666410157753, + "flos": 586615352832.0, + "grad_norm": 0.03608446377738685, + "language_loss": 0.90740782, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.91876918, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.73193359, + "step": 5163, + "time_per_iteration": 2.8404746055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136098, + "balance_loss_mlp": 1.06295085, + "epoch": 0.9934590227010388, + "flos": 538105225728.0, + "grad_norm": 0.036108153087719384, + "language_loss": 0.88640219, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.89776313, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.73144531, + "step": 5164, + "time_per_iteration": 2.660189390182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136245, + "balance_loss_mlp": 1.06319404, + "epoch": 0.9936514043863024, + "flos": 519060397056.0, + "grad_norm": 0.036393144495114126, + "language_loss": 0.91024756, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.92161, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.73046875, + "step": 5165, + "time_per_iteration": 2.62958025932312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136227, + "balance_loss_mlp": 1.06308019, + "epoch": 0.993843786071566, + "flos": 745995790848.0, + "grad_norm": 0.039050084286539895, + "language_loss": 0.85854822, + "learning_rate": 9.938472493803419e-08, + "loss": 0.86991048, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.73144531, + "step": 5166, + "time_per_iteration": 3.0344748497009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136102, + "balance_loss_mlp": 1.06305063, + "epoch": 0.9940361677568296, + "flos": 527008006656.0, + "grad_norm": 0.038807373304902144, + "language_loss": 0.87782025, + "learning_rate": 9.327042513251893e-08, + "loss": 0.88918126, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.73046875, + "step": 5167, + "time_per_iteration": 2.6882591247558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136249, + "balance_loss_mlp": 1.06310236, + "epoch": 0.9942285494420932, + "flos": 556746551808.0, + "grad_norm": 0.03797309079451297, + "language_loss": 0.85039365, + "learning_rate": 8.735020633177104e-08, + "loss": 0.86175615, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.73144531, + "step": 5168, + "time_per_iteration": 2.7696192264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134782, + "balance_loss_mlp": 1.06153989, + "epoch": 0.9944209311273566, + "flos": 587099446272.0, + "grad_norm": 0.03338211410978879, + "language_loss": 0.86810982, + "learning_rate": 8.162407083411872e-08, + "loss": 0.87945765, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.73242188, + "step": 5169, + "time_per_iteration": 2.7250516414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113501, + "balance_loss_mlp": 1.06176758, + "epoch": 0.9946133128126202, + "flos": 736856412672.0, + "grad_norm": 0.03340787079875126, + "language_loss": 0.8653456, + "learning_rate": 7.609202086272804e-08, + "loss": 0.87669569, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.73242188, + "step": 5170, + "time_per_iteration": 2.9989120960235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134876, + "balance_loss_mlp": 1.06163335, + "epoch": 0.9948056944978838, + "flos": 647180152320.0, + "grad_norm": 0.038233740097927245, + "language_loss": 0.86638784, + "learning_rate": 7.075405856526995e-08, + "loss": 0.87773657, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.73242188, + "step": 5171, + "time_per_iteration": 2.8077123165130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113494, + "balance_loss_mlp": 1.06169748, + "epoch": 0.9949980761831474, + "flos": 446796764160.0, + "grad_norm": 0.03800509693543743, + "language_loss": 0.90174496, + "learning_rate": 6.561018601414226e-08, + "loss": 0.91309434, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.73242188, + "step": 5172, + "time_per_iteration": 2.5135178565979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136048, + "balance_loss_mlp": 1.06285322, + "epoch": 0.995190457868411, + "flos": 436558943232.0, + "grad_norm": 0.036425615927118446, + "language_loss": 0.90128154, + "learning_rate": 6.066040520641414e-08, + "loss": 0.91264206, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.73193359, + "step": 5173, + "time_per_iteration": 2.5291202068328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136192, + "balance_loss_mlp": 1.06309295, + "epoch": 0.9953828395536745, + "flos": 515189650944.0, + "grad_norm": 0.03877686635677472, + "language_loss": 0.85795176, + "learning_rate": 5.590471806377062e-08, + "loss": 0.8693136, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.73095703, + "step": 5174, + "time_per_iteration": 2.562049150466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113637, + "balance_loss_mlp": 1.06331813, + "epoch": 0.995575221238938, + "flos": 480807556608.0, + "grad_norm": 0.03833934527177391, + "language_loss": 0.86279237, + "learning_rate": 5.134312643245709e-08, + "loss": 0.87415606, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.73046875, + "step": 5175, + "time_per_iteration": 2.563511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136236, + "balance_loss_mlp": 1.06304121, + "epoch": 0.9957676029242016, + "flos": 588931760640.0, + "grad_norm": 0.04190279888706188, + "language_loss": 0.81519473, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.82655716, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.73193359, + "step": 5176, + "time_per_iteration": 2.7635369300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136434, + "balance_loss_mlp": 1.0632391, + "epoch": 0.9959599846094652, + "flos": 427354437120.0, + "grad_norm": 0.03983399888286843, + "language_loss": 0.84399128, + "learning_rate": 4.280223671243588e-08, + "loss": 0.85535556, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.73193359, + "step": 5177, + "time_per_iteration": 2.482015371322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136347, + "balance_loss_mlp": 1.06315267, + "epoch": 0.9961523662947287, + "flos": 612850450944.0, + "grad_norm": 0.03375587395159785, + "language_loss": 0.84842086, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.85978431, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.73193359, + "step": 5178, + "time_per_iteration": 2.859119415283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113476, + "balance_loss_mlp": 1.0615176, + "epoch": 0.9963447479799923, + "flos": 551842490880.0, + "grad_norm": 0.036286768119618104, + "language_loss": 0.78752828, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.79887587, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.73242188, + "step": 5179, + "time_per_iteration": 2.689319372177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134711, + "balance_loss_mlp": 1.0614686, + "epoch": 0.9965371296652559, + "flos": 627010684416.0, + "grad_norm": 0.0387871810816858, + "language_loss": 0.93553257, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.94687963, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.73242188, + "step": 5180, + "time_per_iteration": 2.787081480026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134841, + "balance_loss_mlp": 1.0615989, + "epoch": 0.9967295113505195, + "flos": 640791610368.0, + "grad_norm": 0.033783594667719394, + "language_loss": 0.86376369, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.87511212, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.73242188, + "step": 5181, + "time_per_iteration": 2.886129856109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134863, + "balance_loss_mlp": 1.06162131, + "epoch": 0.996921893035783, + "flos": 608543275008.0, + "grad_norm": 0.03960364803100891, + "language_loss": 0.8131901, + "learning_rate": 2.484679859793282e-08, + "loss": 0.82453877, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.73242188, + "step": 5182, + "time_per_iteration": 2.773259162902832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135034, + "balance_loss_mlp": 1.06179142, + "epoch": 0.9971142747210465, + "flos": 645345836544.0, + "grad_norm": 0.03666439365730574, + "language_loss": 0.86077094, + "learning_rate": 2.183802848243488e-08, + "loss": 0.87212121, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.73242188, + "step": 5183, + "time_per_iteration": 2.7957136631011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.06157458, + "epoch": 0.9973066564063101, + "flos": 1042461445632.0, + "grad_norm": 0.035212511344882604, + "language_loss": 0.85020685, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.86155498, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.73242188, + "step": 5184, + "time_per_iteration": 3.393714427947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134906, + "balance_loss_mlp": 1.06166399, + "epoch": 0.9974990380915737, + "flos": 666342501888.0, + "grad_norm": 0.03904258073685639, + "language_loss": 0.89533353, + "learning_rate": 1.640281555587153e-08, + "loss": 0.90668261, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.73242188, + "step": 5185, + "time_per_iteration": 2.8711843490600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134943, + "balance_loss_mlp": 1.06170106, + "epoch": 0.9976914197768373, + "flos": 719378657280.0, + "grad_norm": 0.03669739544295146, + "language_loss": 0.82640398, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.83775342, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.73242188, + "step": 5186, + "time_per_iteration": 2.8739511966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134603, + "balance_loss_mlp": 1.06136048, + "epoch": 0.9978838014621008, + "flos": 519331642368.0, + "grad_norm": 0.038670541148839846, + "language_loss": 0.84187782, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.8532238, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.73242188, + "step": 5187, + "time_per_iteration": 2.594606637954712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134963, + "balance_loss_mlp": 1.06172121, + "epoch": 0.9980761831473643, + "flos": 604605399552.0, + "grad_norm": 0.03068761649528877, + "language_loss": 0.88198936, + "learning_rate": 9.70582968801148e-09, + "loss": 0.89333904, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.73242188, + "step": 5188, + "time_per_iteration": 2.778276205062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134568, + "balance_loss_mlp": 1.06132579, + "epoch": 0.9982685648326279, + "flos": 454457665536.0, + "grad_norm": 0.03724729407224267, + "language_loss": 0.94649714, + "learning_rate": 7.861726879943021e-09, + "loss": 0.95784283, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.73242188, + "step": 5189, + "time_per_iteration": 2.542572259902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134686, + "balance_loss_mlp": 1.06144357, + "epoch": 0.9984609465178915, + "flos": 482461951488.0, + "grad_norm": 0.036682028146604845, + "language_loss": 0.83087814, + "learning_rate": 6.211738235173403e-09, + "loss": 0.84222496, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.73242188, + "step": 5190, + "time_per_iteration": 2.675111770629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134834, + "balance_loss_mlp": 1.06159234, + "epoch": 0.9986533282031551, + "flos": 478011784704.0, + "grad_norm": 0.03381508269385847, + "language_loss": 0.87848723, + "learning_rate": 4.755864394301312e-09, + "loss": 0.8898356, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.73242188, + "step": 5191, + "time_per_iteration": 2.699894666671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134426, + "balance_loss_mlp": 1.06118381, + "epoch": 0.9988457098884186, + "flos": 643157683200.0, + "grad_norm": 0.03641547995983512, + "language_loss": 0.90973437, + "learning_rate": 3.494105922541291e-09, + "loss": 0.92107868, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.73242188, + "step": 5192, + "time_per_iteration": 2.7941293716430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134587, + "balance_loss_mlp": 1.06139255, + "epoch": 0.9990380915736822, + "flos": 397188194304.0, + "grad_norm": 0.039725697909644885, + "language_loss": 0.93135947, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.94270533, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.73193359, + "step": 5193, + "time_per_iteration": 2.439404010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134917, + "balance_loss_mlp": 1.06172252, + "epoch": 0.9992304732589458, + "flos": 577296053760.0, + "grad_norm": 0.03644077357659133, + "language_loss": 0.88674903, + "learning_rate": 1.552936970405927e-09, + "loss": 0.89809811, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.73193359, + "step": 5194, + "time_per_iteration": 2.783804178237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135031, + "balance_loss_mlp": 1.06178868, + "epoch": 0.9994228549442093, + "flos": 545390822400.0, + "grad_norm": 0.047086410884293904, + "language_loss": 0.81329274, + "learning_rate": 8.735272437054853e-10, + "loss": 0.82464302, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.73242188, + "step": 5195, + "time_per_iteration": 2.6740100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134949, + "balance_loss_mlp": 1.06170666, + "epoch": 0.9996152366294728, + "flos": 1473468324864.0, + "grad_norm": 0.039118675807487395, + "language_loss": 0.8557514, + "learning_rate": 3.882343933003796e-10, + "loss": 0.86710095, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.73242188, + "step": 5196, + "time_per_iteration": 3.72202467918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134237, + "balance_loss_mlp": 1.06137657, + "epoch": 0.9998076183147364, + "flos": 620085656064.0, + "grad_norm": 0.07900250756549031, + "language_loss": 0.7408278, + "learning_rate": 9.70586077619906e-11, + "loss": 0.75217021, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.72851562, + "step": 5197, + "time_per_iteration": 4.020706653594971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140059, + "balance_loss_mlp": 1.0678184, + "epoch": 1.0, + "flos": 1293860926464.0, + "grad_norm": 0.020340605077202825, + "language_loss": 0.85357249, + "learning_rate": 0.0, + "loss": 0.86497313, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.72412109, + "step": 5198, + "time_per_iteration": 5.7421464920043945 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1743145354461184e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/training_args.bin b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dec1b7e0db130318069c72434f32c2789119b732 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c077e5103b778b39b648e3a5a2e73e36256d052f444290e14e15f87c36156cb +size 7992 diff --git a/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_perturbed/config.json b/sft_pretrain/Full_smoe_perturbed/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6f3e6e6a4e52c5512d32eaabc990b49b745e76fa --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_perturbed", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_perturbed/generation_config.json b/sft_pretrain/Full_smoe_perturbed/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_perturbed/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_perturbed/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_perturbed/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4128b762c290a8c7cb6627a17f8505d154b92d4d --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f860cdc59cd5cce9d24bbc4d9e72e861be5a033df830dd664ddc3ec6244d240 +size 3759043888 diff --git a/sft_pretrain/Full_smoe_perturbed/model.safetensors.index.json b/sft_pretrain/Full_smoe_perturbed/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..01fe755c95da02467d97df3e39228dbbb26b065f --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/model.safetensors.index.json @@ -0,0 +1,674 @@ +{ + "metadata": { + "total_size": 8731443232 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/special_tokens_map.json b/sft_pretrain/Full_smoe_perturbed/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_perturbed/tokenizer.model b/sft_pretrain/Full_smoe_perturbed/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_perturbed/tokenizer_config.json b/sft_pretrain/Full_smoe_perturbed/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_perturbed/trainer_state.json b/sft_pretrain/Full_smoe_perturbed/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..df88d2dab6a55e3781488f0b66152e115086fcec --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/trainer_state.json @@ -0,0 +1,78013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02574398, + "balance_loss_mlp": 1.85189414, + "epoch": 0.00019238168526356292, + "flos": 471022176768.0, + "grad_norm": 12.86455737221305, + "language_loss": 2.79777646, + "learning_rate": 0.0, + "loss": 1.8614465, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 7.2109375, + "step": 1, + "time_per_iteration": 21.83068585395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02254613, + "balance_loss_mlp": 1.76785779, + "epoch": 0.00038476337052712584, + "flos": 505537981440.0, + "grad_norm": 51.581369656319104, + "language_loss": 12.34714699, + "learning_rate": 0.00013726078121135892, + "loss": 12.3696928, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 4.875, + "step": 2, + "time_per_iteration": 2.6192572116851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235864, + "balance_loss_mlp": 1.75177932, + "epoch": 0.0005771450557906887, + "flos": 600333152256.0, + "grad_norm": 53.41660983156924, + "language_loss": 12.32898235, + "learning_rate": 0.00021755319103969496, + "loss": 12.35134125, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 4.84765625, + "step": 3, + "time_per_iteration": 2.887979030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02281771, + "balance_loss_mlp": 1.79577887, + "epoch": 0.0007695267410542517, + "flos": 581496442368.0, + "grad_norm": 15.812083363335244, + "language_loss": 9.24414825, + "learning_rate": 0.00027452156242271784, + "loss": 9.26696682, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 4.8671875, + "step": 4, + "time_per_iteration": 2.6792547702789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02454864, + "balance_loss_mlp": 1.95551991, + "epoch": 0.0009619084263178145, + "flos": 487153164288.0, + "grad_norm": 10.3691594005885, + "language_loss": 9.1886158, + "learning_rate": 0.0003187096642208417, + "loss": 9.21316433, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 4.98828125, + "step": 5, + "time_per_iteration": 2.627883195877075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0247156, + "balance_loss_mlp": 1.97450531, + "epoch": 0.0011542901115813775, + "flos": 561166519296.0, + "grad_norm": 9.061082825397735, + "language_loss": 9.31672573, + "learning_rate": 0.0003548139722510539, + "loss": 9.34144115, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 4.96875, + "step": 6, + "time_per_iteration": 2.697327136993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02496704, + "balance_loss_mlp": 1.9977417, + "epoch": 0.0013466717968449403, + "flos": 534950886912.0, + "grad_norm": 5.1401213461899875, + "language_loss": 8.45638084, + "learning_rate": 0.00038533972973918044, + "loss": 8.48134804, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 4.984375, + "step": 7, + "time_per_iteration": 2.6605119705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02367166, + "balance_loss_mlp": 1.8800292, + "epoch": 0.0015390534821085034, + "flos": 493333587456.0, + "grad_norm": 4.765795170053606, + "language_loss": 7.86978722, + "learning_rate": 0.0004117823436340768, + "loss": 7.89345884, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 4.87890625, + "step": 8, + "time_per_iteration": 2.60813570022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02377529, + "balance_loss_mlp": 1.89153647, + "epoch": 0.0017314351673720662, + "flos": 565775139840.0, + "grad_norm": 2.6394105736579268, + "language_loss": 7.60834789, + "learning_rate": 0.00043510638207938993, + "loss": 7.63212299, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 4.8671875, + "step": 9, + "time_per_iteration": 2.871943712234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0239868, + "balance_loss_mlp": 1.91802776, + "epoch": 0.001923816852635629, + "flos": 594508568064.0, + "grad_norm": 2.7082435786924752, + "language_loss": 7.06748104, + "learning_rate": 0.00045597044543220066, + "loss": 7.09146786, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 4.8125, + "step": 10, + "time_per_iteration": 2.671294689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02381293, + "balance_loss_mlp": 1.90254807, + "epoch": 0.002116198537899192, + "flos": 610894611456.0, + "grad_norm": 2.113301815517677, + "language_loss": 6.83692646, + "learning_rate": 0.00047484428652143135, + "loss": 6.86073971, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.79296875, + "step": 11, + "time_per_iteration": 2.885416269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02427226, + "balance_loss_mlp": 1.95687437, + "epoch": 0.002308580223162755, + "flos": 546174359040.0, + "grad_norm": 1.7416212933802626, + "language_loss": 6.4295001, + "learning_rate": 0.0004920747534624128, + "loss": 6.45377207, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.70703125, + "step": 12, + "time_per_iteration": 2.6201112270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02503769, + "balance_loss_mlp": 2.03265429, + "epoch": 0.002500961908426318, + "flos": 645923255808.0, + "grad_norm": 2.43618245016211, + "language_loss": 6.0048914, + "learning_rate": 0.0005079252465375872, + "loss": 6.02992916, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.71484375, + "step": 13, + "time_per_iteration": 2.852263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02634854, + "balance_loss_mlp": 2.15916157, + "epoch": 0.0026933435936898806, + "flos": 488848492032.0, + "grad_norm": 4.143842376760835, + "language_loss": 5.42230844, + "learning_rate": 0.0005226005109505393, + "loss": 5.44865704, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 4.76171875, + "step": 14, + "time_per_iteration": 2.5524611473083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02844198, + "balance_loss_mlp": 2.3646903, + "epoch": 0.0028857252789534437, + "flos": 435525628416.0, + "grad_norm": 5.672862092220106, + "language_loss": 4.15845776, + "learning_rate": 0.0005362628552605367, + "loss": 4.18689966, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 4.80078125, + "step": 15, + "time_per_iteration": 2.7353649139404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03208902, + "balance_loss_mlp": 2.72252893, + "epoch": 0.0030781069642170067, + "flos": 597840826368.0, + "grad_norm": 3.947061509829782, + "language_loss": 2.26971245, + "learning_rate": 0.0005490431248454357, + "loss": 2.30180168, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 4.87109375, + "step": 16, + "time_per_iteration": 2.676703929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03601284, + "balance_loss_mlp": 3.10232162, + "epoch": 0.0032704886494805694, + "flos": 1541510280192.0, + "grad_norm": 0.6213816402988768, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.793064, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 5.0, + "step": 17, + "time_per_iteration": 6.1610119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334326, + "balance_loss_mlp": 2.85841203, + "epoch": 0.0034628703347441324, + "flos": 474970237440.0, + "grad_norm": 2.8341915883282045, + "language_loss": 1.71282685, + "learning_rate": 0.0005723671632907488, + "loss": 1.74625945, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 4.85546875, + "step": 18, + "time_per_iteration": 2.638371467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02881518, + "balance_loss_mlp": 2.39934015, + "epoch": 0.0036552520200076955, + "flos": 449477743104.0, + "grad_norm": 2.8867361132515086, + "language_loss": 1.68530536, + "learning_rate": 0.0005830738490244919, + "loss": 1.71412063, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.828125, + "step": 19, + "time_per_iteration": 2.56374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02402526, + "balance_loss_mlp": 1.92301893, + "epoch": 0.003847633705271258, + "flos": 637350563328.0, + "grad_norm": 0.6925173808128176, + "language_loss": 1.38203168, + "learning_rate": 0.0005932312266435596, + "loss": 1.406057, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.80078125, + "step": 20, + "time_per_iteration": 2.763998508453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02421171, + "balance_loss_mlp": 1.94814897, + "epoch": 0.004040015390534821, + "flos": 590590158336.0, + "grad_norm": 1.6265477944222306, + "language_loss": 1.40919662, + "learning_rate": 0.0006028929207788754, + "loss": 1.43340826, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.734375, + "step": 21, + "time_per_iteration": 2.746016502380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575294, + "balance_loss_mlp": 2.10036469, + "epoch": 0.004232397075798384, + "flos": 757865812992.0, + "grad_norm": 1.576079326940489, + "language_loss": 1.40810275, + "learning_rate": 0.0006121050677327902, + "loss": 1.43385565, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.75390625, + "step": 22, + "time_per_iteration": 2.9607386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550906, + "balance_loss_mlp": 2.07025433, + "epoch": 0.004424778761061947, + "flos": 527726415360.0, + "grad_norm": 0.6323448080178445, + "language_loss": 1.22419024, + "learning_rate": 0.0006209076479463684, + "loss": 1.24969923, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.8125, + "step": 23, + "time_per_iteration": 2.5966527462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02511897, + "balance_loss_mlp": 2.02285314, + "epoch": 0.00461716044632551, + "flos": 549217907712.0, + "grad_norm": 0.22573529074246063, + "language_loss": 1.26396596, + "learning_rate": 0.0006293355346737718, + "loss": 1.28908491, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.8984375, + "step": 24, + "time_per_iteration": 2.672264575958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02557217, + "balance_loss_mlp": 2.05978036, + "epoch": 0.004809542131589073, + "flos": 568751559168.0, + "grad_norm": 0.10471299124135865, + "language_loss": 1.20974565, + "learning_rate": 0.0006374193284416834, + "loss": 1.23531783, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.96875, + "step": 25, + "time_per_iteration": 2.7392375469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02658191, + "balance_loss_mlp": 2.15503263, + "epoch": 0.005001923816852636, + "flos": 471583584768.0, + "grad_norm": 0.16888144752152706, + "language_loss": 1.20314312, + "learning_rate": 0.0006451860277489461, + "loss": 1.22972512, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.02734375, + "step": 26, + "time_per_iteration": 2.6047253608703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02722422, + "balance_loss_mlp": 2.21582985, + "epoch": 0.005194305502116198, + "flos": 416380743168.0, + "grad_norm": 0.22424567034217777, + "language_loss": 1.28844571, + "learning_rate": 0.0006526595731190848, + "loss": 1.31566989, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.0625, + "step": 27, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02743244, + "balance_loss_mlp": 2.2351265, + "epoch": 0.005386687187379761, + "flos": 629995835904.0, + "grad_norm": 0.15642653525507078, + "language_loss": 1.18914986, + "learning_rate": 0.0006598612921618983, + "loss": 1.2165823, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.078125, + "step": 28, + "time_per_iteration": 2.8519153594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02748247, + "balance_loss_mlp": 2.24051118, + "epoch": 0.005579068872643324, + "flos": 888019997184.0, + "grad_norm": 0.1209301216257677, + "language_loss": 1.12191987, + "learning_rate": 0.0006668102665011454, + "loss": 1.14940238, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.07421875, + "step": 29, + "time_per_iteration": 3.2244889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02691091, + "balance_loss_mlp": 2.18411779, + "epoch": 0.005771450557906887, + "flos": 548657952768.0, + "grad_norm": 0.1098895199150706, + "language_loss": 1.21368051, + "learning_rate": 0.0006735236364718957, + "loss": 1.24059153, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.06640625, + "step": 30, + "time_per_iteration": 2.642730474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02653145, + "balance_loss_mlp": 2.14769816, + "epoch": 0.00596383224317045, + "flos": 533068907520.0, + "grad_norm": 0.11046596793449442, + "language_loss": 1.1970098, + "learning_rate": 0.0006800168558381346, + "loss": 1.22354114, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.05078125, + "step": 31, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0257592, + "balance_loss_mlp": 2.07123542, + "epoch": 0.0061562139284340135, + "flos": 590162460672.0, + "grad_norm": 0.10949645130098669, + "language_loss": 1.22987807, + "learning_rate": 0.0006863039060567947, + "loss": 1.25563729, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.04296875, + "step": 32, + "time_per_iteration": 2.733224868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505923, + "balance_loss_mlp": 2.00390816, + "epoch": 0.006348595613697576, + "flos": 619441107456.0, + "grad_norm": 0.0835016489973258, + "language_loss": 1.14437437, + "learning_rate": 0.0006923974775611263, + "loss": 1.16943359, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.015625, + "step": 33, + "time_per_iteration": 2.820788621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02482464, + "balance_loss_mlp": 1.98159432, + "epoch": 0.006540977298961139, + "flos": 779298908160.0, + "grad_norm": 0.08776573315434787, + "language_loss": 1.10869515, + "learning_rate": 0.0006983091239737814, + "loss": 1.13351965, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.00390625, + "step": 34, + "time_per_iteration": 2.9917590618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02373805, + "balance_loss_mlp": 1.87636864, + "epoch": 0.006733358984224702, + "flos": 668372201472.0, + "grad_norm": 0.0744368555221442, + "language_loss": 1.09626412, + "learning_rate": 0.0007040493939600222, + "loss": 1.12000227, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 4.96875, + "step": 35, + "time_per_iteration": 2.813040256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308046, + "balance_loss_mlp": 1.81175399, + "epoch": 0.006925740669488265, + "flos": 565495162368.0, + "grad_norm": 0.06560236116646054, + "language_loss": 1.0974791, + "learning_rate": 0.0007096279445021078, + "loss": 1.12055957, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 4.95703125, + "step": 36, + "time_per_iteration": 2.715013027191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02240602, + "balance_loss_mlp": 1.74888754, + "epoch": 0.007118122354751828, + "flos": 551111347200.0, + "grad_norm": 0.05581405617561486, + "language_loss": 1.16120386, + "learning_rate": 0.0007150536386503726, + "loss": 1.18360972, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.91015625, + "step": 37, + "time_per_iteration": 2.8262643814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218804, + "balance_loss_mlp": 1.7293781, + "epoch": 0.007310504040015391, + "flos": 703813807104.0, + "grad_norm": 0.06412720029508237, + "language_loss": 1.08394384, + "learning_rate": 0.0007203346302358509, + "loss": 1.10613179, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.890625, + "step": 38, + "time_per_iteration": 2.9149320125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0220325, + "balance_loss_mlp": 1.71954608, + "epoch": 0.007502885725278953, + "flos": 600500338176.0, + "grad_norm": 0.08018675586540955, + "language_loss": 1.13587177, + "learning_rate": 0.000725478437577282, + "loss": 1.15790427, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.84375, + "step": 39, + "time_per_iteration": 2.7649383544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194939, + "balance_loss_mlp": 1.71237946, + "epoch": 0.007695267410542516, + "flos": 561427031040.0, + "grad_norm": 0.11080304178085185, + "language_loss": 1.08546591, + "learning_rate": 0.0007304920078549186, + "loss": 1.10741532, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.83203125, + "step": 40, + "time_per_iteration": 2.7245187759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02164234, + "balance_loss_mlp": 1.68548942, + "epoch": 0.007887649095806078, + "flos": 509230808064.0, + "grad_norm": 0.12864951336881933, + "language_loss": 1.10053396, + "learning_rate": 0.0007353817735343603, + "loss": 1.12217629, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.79296875, + "step": 41, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02109951, + "balance_loss_mlp": 1.63425827, + "epoch": 0.008080030781069641, + "flos": 504904166400.0, + "grad_norm": 0.0888118324595499, + "language_loss": 1.05816543, + "learning_rate": 0.0007401537019902344, + "loss": 1.07926488, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.76171875, + "step": 42, + "time_per_iteration": 2.583256244659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065976, + "balance_loss_mlp": 1.59219027, + "epoch": 0.008272412466333205, + "flos": 519106059264.0, + "grad_norm": 0.08974821197730459, + "language_loss": 1.0785954, + "learning_rate": 0.0007448133392900729, + "loss": 1.09925508, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.7421875, + "step": 43, + "time_per_iteration": 2.677175998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955434, + "balance_loss_mlp": 1.4839375, + "epoch": 0.008464794151596768, + "flos": 609183820800.0, + "grad_norm": 0.06237767914218564, + "language_loss": 1.03785229, + "learning_rate": 0.0007493658489441491, + "loss": 1.05740666, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.71875, + "step": 44, + "time_per_iteration": 2.8553237915039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01864539, + "balance_loss_mlp": 1.39800107, + "epoch": 0.00865717583686033, + "flos": 539006283264.0, + "grad_norm": 0.049849947719683325, + "language_loss": 1.08088911, + "learning_rate": 0.0007538160463002316, + "loss": 1.09953451, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.66796875, + "step": 45, + "time_per_iteration": 2.637796640396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01780353, + "balance_loss_mlp": 1.31572247, + "epoch": 0.008849557522123894, + "flos": 509009227776.0, + "grad_norm": 0.046919324832442044, + "language_loss": 1.11748755, + "learning_rate": 0.0007581684291577274, + "loss": 1.1352911, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.6484375, + "step": 46, + "time_per_iteration": 2.5655901432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764453, + "balance_loss_mlp": 1.30211222, + "epoch": 0.009041939207387457, + "flos": 626507125248.0, + "grad_norm": 0.05937298040562763, + "language_loss": 1.13580585, + "learning_rate": 0.0007624272050891776, + "loss": 1.15345049, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.625, + "step": 47, + "time_per_iteration": 2.804643392562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776852, + "balance_loss_mlp": 1.31908798, + "epoch": 0.00923432089265102, + "flos": 550609789440.0, + "grad_norm": 0.07500714899038924, + "language_loss": 1.03489327, + "learning_rate": 0.0007665963158851307, + "loss": 1.05266178, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.578125, + "step": 48, + "time_per_iteration": 2.781435489654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01771411, + "balance_loss_mlp": 1.3170805, + "epoch": 0.009426702577914583, + "flos": 563678310912.0, + "grad_norm": 0.07921486390615404, + "language_loss": 1.12758589, + "learning_rate": 0.0007706794594783609, + "loss": 1.14529991, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.54296875, + "step": 49, + "time_per_iteration": 2.739976644515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017484, + "balance_loss_mlp": 1.29483247, + "epoch": 0.009619084263178146, + "flos": 617925700608.0, + "grad_norm": 0.05671895540127436, + "language_loss": 1.10915053, + "learning_rate": 0.0007746801096530423, + "loss": 1.12663448, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.53515625, + "step": 50, + "time_per_iteration": 2.7333760261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01715641, + "balance_loss_mlp": 1.2616924, + "epoch": 0.009811465948441709, + "flos": 542488263168.0, + "grad_norm": 0.04785443300923319, + "language_loss": 1.16231108, + "learning_rate": 0.0007786015338021173, + "loss": 1.17946756, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.5390625, + "step": 51, + "time_per_iteration": 2.681392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01700387, + "balance_loss_mlp": 1.24720073, + "epoch": 0.010003847633705272, + "flos": 536976583680.0, + "grad_norm": 0.04536583817216675, + "language_loss": 1.08076, + "learning_rate": 0.0007824468089603051, + "loss": 1.0977639, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.53125, + "step": 52, + "time_per_iteration": 2.6839513778686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01675834, + "balance_loss_mlp": 1.2218852, + "epoch": 0.010196229318968833, + "flos": 910805316096.0, + "grad_norm": 0.04374839581732082, + "language_loss": 1.0833261, + "learning_rate": 0.0007862188363098669, + "loss": 1.10008454, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.5390625, + "step": 53, + "time_per_iteration": 3.1748838424682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01650634, + "balance_loss_mlp": 1.19477725, + "epoch": 0.010388611004232396, + "flos": 586969190400.0, + "grad_norm": 0.045477377455174536, + "language_loss": 1.08262885, + "learning_rate": 0.0007899203543304438, + "loss": 1.09913516, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.55859375, + "step": 54, + "time_per_iteration": 2.7011117935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01588572, + "balance_loss_mlp": 1.13195276, + "epoch": 0.01058099268949596, + "flos": 503471351808.0, + "grad_norm": 0.05216939031034974, + "language_loss": 1.22650576, + "learning_rate": 0.0007935539507422731, + "loss": 1.24239147, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.56640625, + "step": 55, + "time_per_iteration": 2.6142656803131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553155, + "balance_loss_mlp": 1.09462798, + "epoch": 0.010773374374759523, + "flos": 545558008320.0, + "grad_norm": 0.04278176221573414, + "language_loss": 1.12836909, + "learning_rate": 0.0007971220733732573, + "loss": 1.14390063, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.5859375, + "step": 56, + "time_per_iteration": 2.718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586959, + "balance_loss_mlp": 1.1318655, + "epoch": 0.010965756060023086, + "flos": 527285982720.0, + "grad_norm": 0.06958617519474361, + "language_loss": 1.08844507, + "learning_rate": 0.0008006270400641869, + "loss": 1.10431468, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.55078125, + "step": 57, + "time_per_iteration": 2.702324628829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01576177, + "balance_loss_mlp": 1.12375367, + "epoch": 0.011158137745286649, + "flos": 578097054720.0, + "grad_norm": 0.08376433329063605, + "language_loss": 1.09231043, + "learning_rate": 0.0008040710477125043, + "loss": 1.10807228, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.5234375, + "step": 58, + "time_per_iteration": 2.733733892440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587306, + "balance_loss_mlp": 1.13793492, + "epoch": 0.011350519430550212, + "flos": 530314068480.0, + "grad_norm": 0.056261163559927586, + "language_loss": 1.098104, + "learning_rate": 0.0008074561805429771, + "loss": 1.11397719, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.4921875, + "step": 59, + "time_per_iteration": 2.604173183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_mlp": 1.0886867, + "epoch": 0.011542901115813775, + "flos": 556970133504.0, + "grad_norm": 0.07546157909609297, + "language_loss": 1.07214928, + "learning_rate": 0.0008107844176832545, + "loss": 1.08748412, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.45703125, + "step": 60, + "time_per_iteration": 2.670180082321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01515203, + "balance_loss_mlp": 1.07155395, + "epoch": 0.011735282801077338, + "flos": 573175529472.0, + "grad_norm": 0.06932920743779293, + "language_loss": 1.09267807, + "learning_rate": 0.0008140576401132568, + "loss": 1.10783005, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.44921875, + "step": 61, + "time_per_iteration": 2.635917901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01537914, + "balance_loss_mlp": 1.0965538, + "epoch": 0.0119276644863409, + "flos": 616716467712.0, + "grad_norm": 0.056166475672555005, + "language_loss": 1.10548615, + "learning_rate": 0.0008172776370494935, + "loss": 1.12086535, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.42578125, + "step": 62, + "time_per_iteration": 2.709764242172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.015397, + "balance_loss_mlp": 1.10024714, + "epoch": 0.012120046171604464, + "flos": 502084199424.0, + "grad_norm": 0.046962065793300374, + "language_loss": 1.17909575, + "learning_rate": 0.0008204461118185703, + "loss": 1.19449282, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.40625, + "step": 63, + "time_per_iteration": 2.5971004962921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545078, + "balance_loss_mlp": 1.10943925, + "epoch": 0.012312427856868027, + "flos": 474301493760.0, + "grad_norm": 0.04671162143151921, + "language_loss": 1.07277906, + "learning_rate": 0.0008235646872681536, + "loss": 1.08822989, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 4.3671875, + "step": 64, + "time_per_iteration": 2.567622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01534227, + "balance_loss_mlp": 1.10240316, + "epoch": 0.012504809542131588, + "flos": 539470910976.0, + "grad_norm": 0.04435006978162803, + "language_loss": 1.0673492, + "learning_rate": 0.0008266349107584288, + "loss": 1.08269131, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 4.328125, + "step": 65, + "time_per_iteration": 2.6833384037017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149994, + "balance_loss_mlp": 1.07345641, + "epoch": 0.012697191227395151, + "flos": 609856567296.0, + "grad_norm": 0.04524096047594039, + "language_loss": 1.09403265, + "learning_rate": 0.0008296582587724851, + "loss": 1.10903215, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 4.2734375, + "step": 66, + "time_per_iteration": 2.692337989807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01482262, + "balance_loss_mlp": 1.05806744, + "epoch": 0.012889572912658714, + "flos": 769397460480.0, + "grad_norm": 0.04198159389490698, + "language_loss": 1.06809163, + "learning_rate": 0.0008326361411800136, + "loss": 1.08291411, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 4.25, + "step": 67, + "time_per_iteration": 2.923720598220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474655, + "balance_loss_mlp": 1.05503809, + "epoch": 0.013081954597922277, + "flos": 535020744192.0, + "grad_norm": 0.041919130945389606, + "language_loss": 1.07100165, + "learning_rate": 0.0008355699051851403, + "loss": 1.0857482, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 4.203125, + "step": 68, + "time_per_iteration": 2.7417044639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462817, + "balance_loss_mlp": 1.04701531, + "epoch": 0.01327433628318584, + "flos": 574180646400.0, + "grad_norm": 0.041322055356332446, + "language_loss": 1.14468551, + "learning_rate": 0.0008384608389860635, + "loss": 1.15931368, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 4.1640625, + "step": 69, + "time_per_iteration": 2.6545376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450151, + "balance_loss_mlp": 1.03930819, + "epoch": 0.013466717968449404, + "flos": 498259115520.0, + "grad_norm": 0.039605765449237204, + "language_loss": 1.04742777, + "learning_rate": 0.000841310175171381, + "loss": 1.06192923, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 4.11328125, + "step": 70, + "time_per_iteration": 2.5687999725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441096, + "balance_loss_mlp": 1.03101599, + "epoch": 0.013659099653712967, + "flos": 566621803008.0, + "grad_norm": 0.03646297128801074, + "language_loss": 1.03104186, + "learning_rate": 0.000844119093875517, + "loss": 1.04545283, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 4.1015625, + "step": 71, + "time_per_iteration": 2.698259115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433469, + "balance_loss_mlp": 1.02720368, + "epoch": 0.01385148133897653, + "flos": 574942715904.0, + "grad_norm": 0.02854119406997066, + "language_loss": 1.07372236, + "learning_rate": 0.0008468887257134666, + "loss": 1.08805704, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 4.06445312, + "step": 72, + "time_per_iteration": 2.7074387073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422625, + "balance_loss_mlp": 1.01941192, + "epoch": 0.014043863024240093, + "flos": 577958066688.0, + "grad_norm": 0.03113282173853564, + "language_loss": 1.10314119, + "learning_rate": 0.0008496201545131264, + "loss": 1.11736751, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 4.03515625, + "step": 73, + "time_per_iteration": 2.725660562515259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425762, + "balance_loss_mlp": 1.02655351, + "epoch": 0.014236244709503656, + "flos": 940263883776.0, + "grad_norm": 0.033199488198319166, + "language_loss": 1.07624495, + "learning_rate": 0.0008523144198617317, + "loss": 1.0905025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.99414062, + "step": 74, + "time_per_iteration": 3.2577481269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437934, + "balance_loss_mlp": 1.04139662, + "epoch": 0.014428626394767219, + "flos": 529495603200.0, + "grad_norm": 0.03119178099318558, + "language_loss": 1.07016373, + "learning_rate": 0.0008549725194813783, + "loss": 1.08454299, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.96679688, + "step": 75, + "time_per_iteration": 2.727982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01437754, + "balance_loss_mlp": 1.0446496, + "epoch": 0.014621008080030782, + "flos": 805282226688.0, + "grad_norm": 0.02968258762679391, + "language_loss": 1.06415534, + "learning_rate": 0.0008575954114472099, + "loss": 1.07853293, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.93164062, + "step": 76, + "time_per_iteration": 3.172807455062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143975, + "balance_loss_mlp": 1.04950643, + "epoch": 0.014813389765294343, + "flos": 698356521984.0, + "grad_norm": 0.031905123056971844, + "language_loss": 1.03629625, + "learning_rate": 0.0008601840162606118, + "loss": 1.05069387, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.90234375, + "step": 77, + "time_per_iteration": 3.029114007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438585, + "balance_loss_mlp": 1.05158365, + "epoch": 0.015005771450557906, + "flos": 598164464640.0, + "grad_norm": 0.026994348673938514, + "language_loss": 1.09661531, + "learning_rate": 0.000862739218788641, + "loss": 1.11100101, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.86914062, + "step": 78, + "time_per_iteration": 2.795952320098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440626, + "balance_loss_mlp": 1.05705774, + "epoch": 0.01519815313582147, + "flos": 550492268544.0, + "grad_norm": 0.029495859587709627, + "language_loss": 1.07574832, + "learning_rate": 0.0008652618700799138, + "loss": 1.09015465, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.83789062, + "step": 79, + "time_per_iteration": 2.6552224159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430975, + "balance_loss_mlp": 1.05084014, + "epoch": 0.015390534821085032, + "flos": 431440032768.0, + "grad_norm": 0.037998818197719206, + "language_loss": 1.07206631, + "learning_rate": 0.0008677527890662774, + "loss": 1.08637595, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.80664062, + "step": 80, + "time_per_iteration": 2.530073881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424927, + "balance_loss_mlp": 1.04727161, + "epoch": 0.015582916506348595, + "flos": 525184424448.0, + "grad_norm": 0.03521308344632083, + "language_loss": 1.08168781, + "learning_rate": 0.0008702127641587799, + "loss": 1.09593713, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.78125, + "step": 81, + "time_per_iteration": 2.6248533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01426595, + "balance_loss_mlp": 1.05141926, + "epoch": 0.015775298191612157, + "flos": 576616576512.0, + "grad_norm": 0.026523126631237747, + "language_loss": 1.036394, + "learning_rate": 0.0008726425547457192, + "loss": 1.05065989, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.75585938, + "step": 82, + "time_per_iteration": 2.759159564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424967, + "balance_loss_mlp": 1.05303442, + "epoch": 0.01596767987687572, + "flos": 611439103488.0, + "grad_norm": 0.03656915183129864, + "language_loss": 1.03032446, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457414, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.72265625, + "step": 83, + "time_per_iteration": 2.739105224609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431924, + "balance_loss_mlp": 1.06151688, + "epoch": 0.016160061562139283, + "flos": 568232537088.0, + "grad_norm": 0.03323001720600938, + "language_loss": 1.08511543, + "learning_rate": 0.0008774144832015932, + "loss": 1.09943461, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.70703125, + "step": 84, + "time_per_iteration": 2.7144806385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02085876, + "balance_loss_mlp": 1.68762207, + "epoch": 0.016352443247402846, + "flos": 1414499701248.0, + "grad_norm": 0.1388747380481991, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76860189, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.984375, + "step": 85, + "time_per_iteration": 4.569611072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450774, + "balance_loss_mlp": 1.08532572, + "epoch": 0.01654482493266641, + "flos": 731785165824.0, + "grad_norm": 0.04601998260491519, + "language_loss": 1.03772068, + "learning_rate": 0.0008820741205014318, + "loss": 1.05222845, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.65625, + "step": 86, + "time_per_iteration": 2.8604419231414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014606, + "balance_loss_mlp": 1.09744096, + "epoch": 0.016737206617929972, + "flos": 537404281344.0, + "grad_norm": 0.03433335749497543, + "language_loss": 1.05140662, + "learning_rate": 0.0008843634575408404, + "loss": 1.06601262, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.62695312, + "step": 87, + "time_per_iteration": 2.677731513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145769, + "balance_loss_mlp": 1.09777355, + "epoch": 0.016929588303193535, + "flos": 538129420800.0, + "grad_norm": 0.05036212092144492, + "language_loss": 1.06815004, + "learning_rate": 0.0008866266301555082, + "loss": 1.08272696, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.59765625, + "step": 88, + "time_per_iteration": 2.7037875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145347, + "balance_loss_mlp": 1.09622347, + "epoch": 0.017121969988457098, + "flos": 527791543296.0, + "grad_norm": 0.030252065691096418, + "language_loss": 1.07441962, + "learning_rate": 0.0008888642296509615, + "loss": 1.08895445, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.56445312, + "step": 89, + "time_per_iteration": 2.590280771255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145473, + "balance_loss_mlp": 1.10034442, + "epoch": 0.01731435167372066, + "flos": 626767636992.0, + "grad_norm": 0.041554939890322294, + "language_loss": 1.12743318, + "learning_rate": 0.0008910768275115906, + "loss": 1.14198053, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.54101562, + "step": 90, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145373, + "balance_loss_mlp": 1.10220587, + "epoch": 0.017506733358984224, + "flos": 497384254464.0, + "grad_norm": 0.05646737130307679, + "language_loss": 1.07978606, + "learning_rate": 0.0008932649762767675, + "loss": 1.0943234, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.50976562, + "step": 91, + "time_per_iteration": 2.5964808464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01457202, + "balance_loss_mlp": 1.10911036, + "epoch": 0.017699115044247787, + "flos": 747217758720.0, + "grad_norm": 0.04050166442287704, + "language_loss": 1.1018101, + "learning_rate": 0.0008954292103690864, + "loss": 1.11638212, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.47851562, + "step": 92, + "time_per_iteration": 2.9288997650146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01459372, + "balance_loss_mlp": 1.11395121, + "epoch": 0.01789149672951135, + "flos": 516520407552.0, + "grad_norm": 0.054281950557984966, + "language_loss": 1.12496912, + "learning_rate": 0.0008975700468778296, + "loss": 1.13956285, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.45117188, + "step": 93, + "time_per_iteration": 2.5800487995147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462727, + "balance_loss_mlp": 1.11978543, + "epoch": 0.018083878414774913, + "flos": 587229702144.0, + "grad_norm": 0.04557553976021738, + "language_loss": 1.05795836, + "learning_rate": 0.0008996879863005366, + "loss": 1.07258558, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.42578125, + "step": 94, + "time_per_iteration": 2.6668198108673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146929, + "balance_loss_mlp": 1.12882805, + "epoch": 0.018276260100038477, + "flos": 498369905664.0, + "grad_norm": 0.055406629054909326, + "language_loss": 1.06168532, + "learning_rate": 0.0009017835132453337, + "loss": 1.07637823, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.40234375, + "step": 95, + "time_per_iteration": 2.588728904724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146889, + "balance_loss_mlp": 1.1312896, + "epoch": 0.01846864178530204, + "flos": 641232043008.0, + "grad_norm": 0.04012691806662063, + "language_loss": 1.05874133, + "learning_rate": 0.0009038570970964896, + "loss": 1.0734303, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.37890625, + "step": 96, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464817, + "balance_loss_mlp": 1.12912345, + "epoch": 0.018661023470565603, + "flos": 512667125760.0, + "grad_norm": 0.027884025705687265, + "language_loss": 1.03269148, + "learning_rate": 0.0009059091926454854, + "loss": 1.04733968, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.359375, + "step": 97, + "time_per_iteration": 2.6100950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470726, + "balance_loss_mlp": 1.13694024, + "epoch": 0.018853405155829166, + "flos": 932696308224.0, + "grad_norm": 0.03936003805775877, + "language_loss": 1.02435613, + "learning_rate": 0.0009079402406897198, + "loss": 1.03906357, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.33984375, + "step": 98, + "time_per_iteration": 3.2489542961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467854, + "balance_loss_mlp": 1.13616598, + "epoch": 0.01904578684109273, + "flos": 577586764800.0, + "grad_norm": 0.036005296184057074, + "language_loss": 1.04073858, + "learning_rate": 0.0009099506686008212, + "loss": 1.05541718, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.31835938, + "step": 99, + "time_per_iteration": 2.7905051708221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467812, + "balance_loss_mlp": 1.13822246, + "epoch": 0.019238168526356292, + "flos": 559520856576.0, + "grad_norm": 0.02696843746399884, + "language_loss": 1.07409596, + "learning_rate": 0.0009119408908644013, + "loss": 1.08877409, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.296875, + "step": 100, + "time_per_iteration": 2.7075607776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456893, + "balance_loss_mlp": 1.12882876, + "epoch": 0.019430550211619855, + "flos": 725103184896.0, + "grad_norm": 0.03304065923870771, + "language_loss": 1.12780023, + "learning_rate": 0.0009139113095929519, + "loss": 1.14236927, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.28125, + "step": 101, + "time_per_iteration": 2.86230731010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460167, + "balance_loss_mlp": 1.13439226, + "epoch": 0.019622931896883418, + "flos": 500456001024.0, + "grad_norm": 0.030619133870748612, + "language_loss": 1.06594038, + "learning_rate": 0.0009158623150134762, + "loss": 1.08054209, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 3.2578125, + "step": 102, + "time_per_iteration": 2.563690185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458611, + "balance_loss_mlp": 1.13569677, + "epoch": 0.01981531358214698, + "flos": 510281587200.0, + "grad_norm": 0.03276303076426602, + "language_loss": 1.06164801, + "learning_rate": 0.000917794285931332, + "loss": 1.0762341, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 3.22851562, + "step": 103, + "time_per_iteration": 2.6599903106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462945, + "balance_loss_mlp": 1.1421293, + "epoch": 0.020007695267410544, + "flos": 522392655360.0, + "grad_norm": 0.026505304013468463, + "language_loss": 0.98227251, + "learning_rate": 0.0009197075901716639, + "loss": 0.99690199, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 3.20703125, + "step": 104, + "time_per_iteration": 2.726245880126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469463, + "balance_loss_mlp": 1.14998221, + "epoch": 0.020200076952674107, + "flos": 534443324928.0, + "grad_norm": 0.029933884589862427, + "language_loss": 1.08736229, + "learning_rate": 0.0009216025849997171, + "loss": 1.10205698, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 3.19335938, + "step": 105, + "time_per_iteration": 2.8023486137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468836, + "balance_loss_mlp": 1.15221632, + "epoch": 0.020392458637937667, + "flos": 686082270720.0, + "grad_norm": 0.024520994280375335, + "language_loss": 1.03054178, + "learning_rate": 0.0009234796175212258, + "loss": 1.04523015, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 3.1640625, + "step": 106, + "time_per_iteration": 2.9396088123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469456, + "balance_loss_mlp": 1.15512502, + "epoch": 0.02058484032320123, + "flos": 703414307328.0, + "grad_norm": 0.02898567585615155, + "language_loss": 1.07201982, + "learning_rate": 0.000925339025064007, + "loss": 1.08671439, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 3.140625, + "step": 107, + "time_per_iteration": 2.9473297595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_mlp": 1.16001439, + "epoch": 0.020777222008464793, + "flos": 640326982656.0, + "grad_norm": 0.02770789473723963, + "language_loss": 0.99879742, + "learning_rate": 0.0009271811355418027, + "loss": 1.01352561, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 3.125, + "step": 108, + "time_per_iteration": 2.8551387786865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469504, + "balance_loss_mlp": 1.15803361, + "epoch": 0.020969603693728356, + "flos": 683320700928.0, + "grad_norm": 0.029161506766480293, + "language_loss": 1.06637371, + "learning_rate": 0.0009290062678013548, + "loss": 1.08106875, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 3.11132812, + "step": 109, + "time_per_iteration": 2.821951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468675, + "balance_loss_mlp": 1.15949392, + "epoch": 0.02116198537899192, + "flos": 534419129856.0, + "grad_norm": 0.03188637458086245, + "language_loss": 1.05070233, + "learning_rate": 0.0009308147319536321, + "loss": 1.06538928, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 3.08789062, + "step": 110, + "time_per_iteration": 2.6315042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469018, + "balance_loss_mlp": 1.16212535, + "epoch": 0.021354367064255482, + "flos": 718727377920.0, + "grad_norm": 0.030955966903197116, + "language_loss": 1.11490715, + "learning_rate": 0.0009326068296900676, + "loss": 1.12959719, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 3.06445312, + "step": 111, + "time_per_iteration": 2.8208162784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474326, + "balance_loss_mlp": 1.16934085, + "epoch": 0.021546748749519045, + "flos": 520623467520.0, + "grad_norm": 0.027870670355515197, + "language_loss": 1.02138007, + "learning_rate": 0.0009343828545846161, + "loss": 1.03612328, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 3.04492188, + "step": 112, + "time_per_iteration": 2.759277105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474098, + "balance_loss_mlp": 1.17063916, + "epoch": 0.021739130434782608, + "flos": 506161062912.0, + "grad_norm": 0.03372988233582904, + "language_loss": 1.06662297, + "learning_rate": 0.0009361430923823841, + "loss": 1.08136404, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 3.02929688, + "step": 113, + "time_per_iteration": 2.565107822418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471087, + "balance_loss_mlp": 1.1693449, + "epoch": 0.02193151212004617, + "flos": 464426242560.0, + "grad_norm": 0.03803370713592907, + "language_loss": 1.10115385, + "learning_rate": 0.0009378878212755459, + "loss": 1.11586463, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 3.01171875, + "step": 114, + "time_per_iteration": 2.491929292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01471993, + "balance_loss_mlp": 1.17253923, + "epoch": 0.022123893805309734, + "flos": 553331701248.0, + "grad_norm": 0.029753755152528143, + "language_loss": 1.00006115, + "learning_rate": 0.0009396173121672103, + "loss": 1.014781, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.98828125, + "step": 115, + "time_per_iteration": 2.6869561672210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473585, + "balance_loss_mlp": 1.1754663, + "epoch": 0.022316275490573297, + "flos": 637378761216.0, + "grad_norm": 0.032022590728611564, + "language_loss": 1.0593642, + "learning_rate": 0.0009413318289238633, + "loss": 1.07410002, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.97460938, + "step": 116, + "time_per_iteration": 2.7639846801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474428, + "balance_loss_mlp": 1.17859828, + "epoch": 0.02250865717583686, + "flos": 800315039232.0, + "grad_norm": 0.032750944460810345, + "language_loss": 0.98115921, + "learning_rate": 0.0009430316286169771, + "loss": 0.99590349, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.95117188, + "step": 117, + "time_per_iteration": 3.020703077316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469481, + "balance_loss_mlp": 1.17536783, + "epoch": 0.022701038861100423, + "flos": 457062782976.0, + "grad_norm": 0.027209249322999743, + "language_loss": 1.0327785, + "learning_rate": 0.0009447169617543361, + "loss": 1.04747331, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.9375, + "step": 118, + "time_per_iteration": 2.5938501358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466386, + "balance_loss_mlp": 1.17437065, + "epoch": 0.022893420546363986, + "flos": 584186153472.0, + "grad_norm": 0.028075325054819567, + "language_loss": 1.10005641, + "learning_rate": 0.0009463880725016029, + "loss": 1.11472011, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.91992188, + "step": 119, + "time_per_iteration": 2.7082488536834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01467196, + "balance_loss_mlp": 1.17861414, + "epoch": 0.02308580223162755, + "flos": 562477810176.0, + "grad_norm": 0.032360539397207934, + "language_loss": 1.05048943, + "learning_rate": 0.0009480451988946134, + "loss": 1.06516147, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.89257812, + "step": 120, + "time_per_iteration": 2.808687686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461098, + "balance_loss_mlp": 1.17423272, + "epoch": 0.023278183916891113, + "flos": 772645125120.0, + "grad_norm": 0.033180722862994706, + "language_loss": 1.06113267, + "learning_rate": 0.0009496885730428627, + "loss": 1.07574379, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.875, + "step": 121, + "time_per_iteration": 3.0043137073516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466426, + "balance_loss_mlp": 1.18070555, + "epoch": 0.023470565602154676, + "flos": 554430144000.0, + "grad_norm": 0.030787275004595428, + "language_loss": 1.04567683, + "learning_rate": 0.0009513184213246156, + "loss": 1.06034112, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.86328125, + "step": 122, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462554, + "balance_loss_mlp": 1.17835939, + "epoch": 0.02366294728741824, + "flos": 561166519296.0, + "grad_norm": 0.030499039091632818, + "language_loss": 1.08099937, + "learning_rate": 0.0009529349645740552, + "loss": 1.09562504, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.84765625, + "step": 123, + "time_per_iteration": 2.69850492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460088, + "balance_loss_mlp": 1.17741883, + "epoch": 0.0238553289726818, + "flos": 469516955136.0, + "grad_norm": 0.026549221517309443, + "language_loss": 1.06623578, + "learning_rate": 0.0009545384182608524, + "loss": 1.08083653, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.83203125, + "step": 124, + "time_per_iteration": 2.5435874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01462583, + "balance_loss_mlp": 1.18144011, + "epoch": 0.024047710657945365, + "flos": 561103392768.0, + "grad_norm": 0.03287811385355005, + "language_loss": 1.04055512, + "learning_rate": 0.0009561289926625252, + "loss": 1.05518079, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.81640625, + "step": 125, + "time_per_iteration": 2.6661720275878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01464029, + "balance_loss_mlp": 1.18460226, + "epoch": 0.024240092343208928, + "flos": 505770295296.0, + "grad_norm": 0.030159442314643806, + "language_loss": 1.08985233, + "learning_rate": 0.0009577068930299292, + "loss": 1.10449266, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.79882812, + "step": 126, + "time_per_iteration": 2.596027135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01456959, + "balance_loss_mlp": 1.17944014, + "epoch": 0.02443247402847249, + "flos": 436752325632.0, + "grad_norm": 0.03465787530540315, + "language_loss": 1.04454637, + "learning_rate": 0.0009592723197462087, + "loss": 1.05911589, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.77929688, + "step": 127, + "time_per_iteration": 2.6355836391448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145855, + "balance_loss_mlp": 1.18236613, + "epoch": 0.024624855713736054, + "flos": 685068421632.0, + "grad_norm": 0.03103018628328697, + "language_loss": 1.00976562, + "learning_rate": 0.0009608254684795125, + "loss": 1.02435124, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.765625, + "step": 128, + "time_per_iteration": 2.956745147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01452077, + "balance_loss_mlp": 1.17741859, + "epoch": 0.024817237398999614, + "flos": 526113679872.0, + "grad_norm": 0.03378324138815482, + "language_loss": 1.03947771, + "learning_rate": 0.0009623665303297678, + "loss": 1.05399847, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.75, + "step": 129, + "time_per_iteration": 2.762612819671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0145448, + "balance_loss_mlp": 1.18115723, + "epoch": 0.025009619084263177, + "flos": 656886216192.0, + "grad_norm": 0.03318348770393379, + "language_loss": 1.08023834, + "learning_rate": 0.0009638956919697878, + "loss": 1.09478307, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.73339844, + "step": 130, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453293, + "balance_loss_mlp": 1.18130565, + "epoch": 0.02520200076952674, + "flos": 455369456640.0, + "grad_norm": 0.028803226470227133, + "language_loss": 1.00211501, + "learning_rate": 0.0009654131357809714, + "loss": 1.01664793, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.71875, + "step": 131, + "time_per_iteration": 2.593409776687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454951, + "balance_loss_mlp": 1.18534708, + "epoch": 0.025394382454790303, + "flos": 841268324352.0, + "grad_norm": 0.035993676074610494, + "language_loss": 1.09494662, + "learning_rate": 0.0009669190399838441, + "loss": 1.10949612, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.69824219, + "step": 132, + "time_per_iteration": 3.1307294368743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454062, + "balance_loss_mlp": 1.18588877, + "epoch": 0.025586764140053866, + "flos": 582228312576.0, + "grad_norm": 0.03305283337163912, + "language_loss": 1.02299893, + "learning_rate": 0.0009684135787636724, + "loss": 1.03753948, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.68359375, + "step": 133, + "time_per_iteration": 2.8118627071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454726, + "balance_loss_mlp": 1.18798327, + "epoch": 0.02577914582531743, + "flos": 791677218816.0, + "grad_norm": 0.03011124606519955, + "language_loss": 1.06380379, + "learning_rate": 0.0009698969223913726, + "loss": 1.07835102, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.66894531, + "step": 134, + "time_per_iteration": 3.0371806621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450237, + "balance_loss_mlp": 1.18454385, + "epoch": 0.025971527510580992, + "flos": 596062906368.0, + "grad_norm": 0.030569012833979448, + "language_loss": 1.08986592, + "learning_rate": 0.0009713692373399265, + "loss": 1.10436833, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.65820312, + "step": 135, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01684837, + "balance_loss_mlp": 1.39873505, + "epoch": 0.026163909195844555, + "flos": 1581074411520.0, + "grad_norm": 0.08870187959024729, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81141067, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.8671875, + "step": 136, + "time_per_iteration": 5.94019627571106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0161422, + "balance_loss_mlp": 1.33116913, + "epoch": 0.026356290881108118, + "flos": 1505160886272.0, + "grad_norm": 0.07212137850421584, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79425257, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.8359375, + "step": 137, + "time_per_iteration": 4.865153074264526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469938, + "balance_loss_mlp": 1.20901299, + "epoch": 0.02654867256637168, + "flos": 598340382720.0, + "grad_norm": 0.040535745966457745, + "language_loss": 1.01652551, + "learning_rate": 0.0009757216201974225, + "loss": 1.03122485, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.609375, + "step": 138, + "time_per_iteration": 2.8955435752868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487517, + "balance_loss_mlp": 1.22802222, + "epoch": 0.026741054251635244, + "flos": 546135427584.0, + "grad_norm": 0.04340470282065083, + "language_loss": 1.06732666, + "learning_rate": 0.0009771514130396581, + "loss": 1.08220184, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.59472656, + "step": 139, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01498511, + "balance_loss_mlp": 1.24044681, + "epoch": 0.026933435936898807, + "flos": 507845657088.0, + "grad_norm": 0.04879945782970011, + "language_loss": 1.07520163, + "learning_rate": 0.00097857095638274, + "loss": 1.09018672, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.58007812, + "step": 140, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01492411, + "balance_loss_mlp": 1.23558652, + "epoch": 0.02712581762216237, + "flos": 742253299200.0, + "grad_norm": 0.043929969627725114, + "language_loss": 0.98754954, + "learning_rate": 0.0009799803961288726, + "loss": 1.00247359, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.5703125, + "step": 141, + "time_per_iteration": 3.008998394012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01470778, + "balance_loss_mlp": 1.21567059, + "epoch": 0.027318199307425933, + "flos": 849777890304.0, + "grad_norm": 0.03716164217421175, + "language_loss": 1.04960537, + "learning_rate": 0.000981379875086876, + "loss": 1.06431305, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.55371094, + "step": 142, + "time_per_iteration": 3.057098865509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01469037, + "balance_loss_mlp": 1.21535933, + "epoch": 0.027510580992689496, + "flos": 576638043648.0, + "grad_norm": 0.03712962317624948, + "language_loss": 1.00046849, + "learning_rate": 0.0009827695330590185, + "loss": 1.01515889, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.5390625, + "step": 143, + "time_per_iteration": 2.638338327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450228, + "balance_loss_mlp": 1.19750416, + "epoch": 0.02770296267795306, + "flos": 773789230080.0, + "grad_norm": 0.030455330453953735, + "language_loss": 0.99027133, + "learning_rate": 0.0009841495069248256, + "loss": 1.00477362, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.52929688, + "step": 144, + "time_per_iteration": 2.981438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441391, + "balance_loss_mlp": 1.19009781, + "epoch": 0.027895344363216622, + "flos": 570448888320.0, + "grad_norm": 0.031624263879455494, + "language_loss": 0.98723662, + "learning_rate": 0.0009855199307219871, + "loss": 1.00165045, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.51464844, + "step": 145, + "time_per_iteration": 2.6923046112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440125, + "balance_loss_mlp": 1.1903578, + "epoch": 0.028087726048480186, + "flos": 548408174592.0, + "grad_norm": 0.029995844711875903, + "language_loss": 1.00586843, + "learning_rate": 0.0009868809357244854, + "loss": 1.02026975, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.49902344, + "step": 146, + "time_per_iteration": 2.6284868717193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01436833, + "balance_loss_mlp": 1.18782902, + "epoch": 0.02828010773374375, + "flos": 525872633856.0, + "grad_norm": 0.03288909570778387, + "language_loss": 1.05042541, + "learning_rate": 0.0009882326505180556, + "loss": 1.06479371, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.49121094, + "step": 147, + "time_per_iteration": 2.6588714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425728, + "balance_loss_mlp": 1.1783452, + "epoch": 0.02847248941900731, + "flos": 773771765760.0, + "grad_norm": 0.031738987003727674, + "language_loss": 1.02499485, + "learning_rate": 0.0009895752010730906, + "loss": 1.03925204, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.47460938, + "step": 148, + "time_per_iteration": 2.9316182136535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424571, + "balance_loss_mlp": 1.17785549, + "epoch": 0.028664871104270875, + "flos": 535469908992.0, + "grad_norm": 0.028294299214345536, + "language_loss": 1.0900923, + "learning_rate": 0.0009909087108150867, + "loss": 1.10433793, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.46777344, + "step": 149, + "time_per_iteration": 2.697423219680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014274, + "balance_loss_mlp": 1.18182933, + "epoch": 0.028857252789534438, + "flos": 368604487680.0, + "grad_norm": 0.03525963963400797, + "language_loss": 1.09753942, + "learning_rate": 0.0009922333006927371, + "loss": 1.11181331, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.45605469, + "step": 150, + "time_per_iteration": 2.483644723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433542, + "balance_loss_mlp": 1.18911529, + "epoch": 0.029049634474798, + "flos": 516483477504.0, + "grad_norm": 0.03341635886009217, + "language_loss": 1.03220332, + "learning_rate": 0.0009935490892437632, + "loss": 1.04653883, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.44433594, + "step": 151, + "time_per_iteration": 2.604599952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01438911, + "balance_loss_mlp": 1.19553363, + "epoch": 0.029242016160061564, + "flos": 589348724736.0, + "grad_norm": 0.030166761621646727, + "language_loss": 1.01782072, + "learning_rate": 0.0009948561926585687, + "loss": 1.03220987, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.43359375, + "step": 152, + "time_per_iteration": 2.7724709510803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445258, + "balance_loss_mlp": 1.20350146, + "epoch": 0.029434397845325123, + "flos": 553136317440.0, + "grad_norm": 0.030739210798008048, + "language_loss": 1.05873716, + "learning_rate": 0.0009961547248418122, + "loss": 1.07318974, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.41699219, + "step": 153, + "time_per_iteration": 2.6247737407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440878, + "balance_loss_mlp": 1.19988418, + "epoch": 0.029626779530588686, + "flos": 604607400960.0, + "grad_norm": 0.030186385343499288, + "language_loss": 1.02632022, + "learning_rate": 0.0009974447974719707, + "loss": 1.04072905, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.40917969, + "step": 154, + "time_per_iteration": 2.730053663253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01431891, + "balance_loss_mlp": 1.19194651, + "epoch": 0.02981916121585225, + "flos": 622217413632.0, + "grad_norm": 0.02801027733601246, + "language_loss": 1.04305005, + "learning_rate": 0.0009987265200589763, + "loss": 1.05736899, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.3984375, + "step": 155, + "time_per_iteration": 2.7198500633239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01423605, + "balance_loss_mlp": 1.18537688, + "epoch": 0.030011542901115813, + "flos": 662879987712.0, + "grad_norm": 0.0349007823819893, + "language_loss": 1.04218483, + "learning_rate": 0.001, + "loss": 1.05642092, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.38085938, + "step": 156, + "time_per_iteration": 2.8801028728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420835, + "balance_loss_mlp": 1.18289316, + "epoch": 0.030203924586379376, + "flos": 652818084864.0, + "grad_norm": 0.029403473562715665, + "language_loss": 1.01930022, + "learning_rate": 0.0009999999029413921, + "loss": 1.03350854, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.37792969, + "step": 157, + "time_per_iteration": 2.8549368381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415444, + "balance_loss_mlp": 1.17921925, + "epoch": 0.03039630627164294, + "flos": 532443824640.0, + "grad_norm": 0.03295212675068383, + "language_loss": 1.02716291, + "learning_rate": 0.0009999996117656068, + "loss": 1.04131734, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.36035156, + "step": 158, + "time_per_iteration": 2.6989729404449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_mlp": 1.17530584, + "epoch": 0.030588687956906502, + "flos": 587294830080.0, + "grad_norm": 0.0291076208082698, + "language_loss": 0.96305156, + "learning_rate": 0.0009999991264727564, + "loss": 0.97715545, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.34863281, + "step": 159, + "time_per_iteration": 2.7609338760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140999, + "balance_loss_mlp": 1.1752907, + "epoch": 0.030781069642170065, + "flos": 514286592000.0, + "grad_norm": 0.030494101007586163, + "language_loss": 1.0725081, + "learning_rate": 0.0009999984470630296, + "loss": 1.08660805, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.34472656, + "step": 160, + "time_per_iteration": 2.5805158615112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410287, + "balance_loss_mlp": 1.17711365, + "epoch": 0.030973451327433628, + "flos": 719559304704.0, + "grad_norm": 0.025032822394785544, + "language_loss": 0.95934659, + "learning_rate": 0.0009999975735366902, + "loss": 0.97344947, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.32910156, + "step": 161, + "time_per_iteration": 3.078343629837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409543, + "balance_loss_mlp": 1.17675149, + "epoch": 0.03116583301269719, + "flos": 1111614400512.0, + "grad_norm": 0.029903967107167622, + "language_loss": 0.98009437, + "learning_rate": 0.0009999965058940775, + "loss": 0.99418974, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.32519531, + "step": 162, + "time_per_iteration": 3.49137544631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_mlp": 1.17689729, + "epoch": 0.031358214697960754, + "flos": 451833082368.0, + "grad_norm": 0.11336845133687022, + "language_loss": 1.0463953, + "learning_rate": 0.0009999952441356057, + "loss": 1.06047678, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.30957031, + "step": 163, + "time_per_iteration": 2.531280755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406979, + "balance_loss_mlp": 1.17676246, + "epoch": 0.031550596383224314, + "flos": 1257085658112.0, + "grad_norm": 0.03183858769064714, + "language_loss": 1.05248928, + "learning_rate": 0.000999993788261765, + "loss": 1.06655908, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.30078125, + "step": 164, + "time_per_iteration": 3.5714328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408503, + "balance_loss_mlp": 1.17943025, + "epoch": 0.03174297806848788, + "flos": 669322924032.0, + "grad_norm": 0.03191781964215587, + "language_loss": 1.06263065, + "learning_rate": 0.00099999213827312, + "loss": 1.07671571, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.29101562, + "step": 165, + "time_per_iteration": 2.7947938442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409995, + "balance_loss_mlp": 1.18101788, + "epoch": 0.03193535975375144, + "flos": 552363514368.0, + "grad_norm": 0.03891580789868065, + "language_loss": 1.01044345, + "learning_rate": 0.000999990294170312, + "loss": 1.0245434, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.29003906, + "step": 166, + "time_per_iteration": 2.6462574005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140342, + "balance_loss_mlp": 1.17577803, + "epoch": 0.032127741439015006, + "flos": 544739543040.0, + "grad_norm": 0.03757156138401865, + "language_loss": 1.05309296, + "learning_rate": 0.0009999882559540566, + "loss": 1.06712723, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.27636719, + "step": 167, + "time_per_iteration": 2.629549503326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0140941, + "balance_loss_mlp": 1.18234003, + "epoch": 0.032320123124278566, + "flos": 549513348096.0, + "grad_norm": 0.028659149555752484, + "language_loss": 1.01791751, + "learning_rate": 0.000999986023625145, + "loss": 1.03201175, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.27050781, + "step": 168, + "time_per_iteration": 2.7051401138305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01589355, + "balance_loss_mlp": 1.35360718, + "epoch": 0.03251250480954213, + "flos": 1308815430144.0, + "grad_norm": 0.08201951270186027, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80513763, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 2.35546875, + "step": 169, + "time_per_iteration": 4.9428627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407645, + "balance_loss_mlp": 1.18257797, + "epoch": 0.03270488649480569, + "flos": 562201835520.0, + "grad_norm": 0.03970113019311383, + "language_loss": 1.02863848, + "learning_rate": 0.0009999809766328958, + "loss": 1.04271495, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.25, + "step": 170, + "time_per_iteration": 2.675811529159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415662, + "balance_loss_mlp": 1.19193029, + "epoch": 0.03289726818006926, + "flos": 483338813952.0, + "grad_norm": 0.03325277263778645, + "language_loss": 1.04760146, + "learning_rate": 0.0009999781619715177, + "loss": 1.06175804, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.23632812, + "step": 171, + "time_per_iteration": 2.5431392192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01419714, + "balance_loss_mlp": 1.1972214, + "epoch": 0.03308964986533282, + "flos": 675820254720.0, + "grad_norm": 0.02950894161591202, + "language_loss": 1.04164565, + "learning_rate": 0.000999975153201402, + "loss": 1.05584288, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.22363281, + "step": 172, + "time_per_iteration": 2.812837600708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01422366, + "balance_loss_mlp": 1.20044637, + "epoch": 0.033282031550596385, + "flos": 610340660736.0, + "grad_norm": 0.03086814843966846, + "language_loss": 1.02532911, + "learning_rate": 0.0009999719503237174, + "loss": 1.03955269, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 2.21777344, + "step": 173, + "time_per_iteration": 2.755462646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416936, + "balance_loss_mlp": 1.1959697, + "epoch": 0.033474413235859944, + "flos": 468995931648.0, + "grad_norm": 0.048603642070708566, + "language_loss": 1.1131072, + "learning_rate": 0.0009999685533397073, + "loss": 1.12727666, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 2.20800781, + "step": 174, + "time_per_iteration": 2.566751003265381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01414495, + "balance_loss_mlp": 1.19438744, + "epoch": 0.03366679492112351, + "flos": 580714907136.0, + "grad_norm": 0.03243683176756354, + "language_loss": 1.02908182, + "learning_rate": 0.00099996496225069, + "loss": 1.04322672, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 2.19921875, + "step": 175, + "time_per_iteration": 2.67861008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01407523, + "balance_loss_mlp": 1.1883682, + "epoch": 0.03385917660638707, + "flos": 638885435904.0, + "grad_norm": 0.029120554083078395, + "language_loss": 1.05784094, + "learning_rate": 0.0009999611770580604, + "loss": 1.0719161, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 2.18945312, + "step": 176, + "time_per_iteration": 2.8410942554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401607, + "balance_loss_mlp": 1.18302441, + "epoch": 0.03405155829165064, + "flos": 442739366400.0, + "grad_norm": 0.031490867136515936, + "language_loss": 1.04703283, + "learning_rate": 0.0009999571977632876, + "loss": 1.06104875, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 2.18359375, + "step": 177, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399051, + "balance_loss_mlp": 1.1813277, + "epoch": 0.034243939976914196, + "flos": 467274407424.0, + "grad_norm": 0.029366691437037535, + "language_loss": 1.0724479, + "learning_rate": 0.0009999530243679166, + "loss": 1.08643842, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 2.17480469, + "step": 178, + "time_per_iteration": 2.5423247814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01392432, + "balance_loss_mlp": 1.17556691, + "epoch": 0.03443632166217776, + "flos": 780712257024.0, + "grad_norm": 0.02507202069561695, + "language_loss": 1.01653552, + "learning_rate": 0.0009999486568735675, + "loss": 1.03045988, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 2.16601562, + "step": 179, + "time_per_iteration": 3.111632823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381684, + "balance_loss_mlp": 1.16567647, + "epoch": 0.03462870334744132, + "flos": 1265758407168.0, + "grad_norm": 0.027829136834509844, + "language_loss": 1.02053452, + "learning_rate": 0.0009999440952819362, + "loss": 1.03435147, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 2.15722656, + "step": 180, + "time_per_iteration": 3.6354756355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375883, + "balance_loss_mlp": 1.16035271, + "epoch": 0.03482108503270489, + "flos": 608302228992.0, + "grad_norm": 0.033531921209289, + "language_loss": 1.02966988, + "learning_rate": 0.0009999393395947935, + "loss": 1.04342866, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 2.15234375, + "step": 181, + "time_per_iteration": 2.8509652614593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372611, + "balance_loss_mlp": 1.15774834, + "epoch": 0.03501346671796845, + "flos": 539314458624.0, + "grad_norm": 0.029990628161131794, + "language_loss": 1.05946589, + "learning_rate": 0.0009999343898139858, + "loss": 1.07319212, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 2.14550781, + "step": 182, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375908, + "balance_loss_mlp": 1.16161704, + "epoch": 0.035205848403232015, + "flos": 519498828288.0, + "grad_norm": 0.03419998284579487, + "language_loss": 1.04830694, + "learning_rate": 0.0009999292459414348, + "loss": 1.06206608, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 2.13964844, + "step": 183, + "time_per_iteration": 2.563997983932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386507, + "balance_loss_mlp": 1.17269289, + "epoch": 0.035398230088495575, + "flos": 473333306880.0, + "grad_norm": 0.03346089667402367, + "language_loss": 1.09292293, + "learning_rate": 0.0009999239079791374, + "loss": 1.10678792, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 2.13476562, + "step": 184, + "time_per_iteration": 2.5561137199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387981, + "balance_loss_mlp": 1.17512131, + "epoch": 0.03559061177375914, + "flos": 513094823424.0, + "grad_norm": 0.03551516541146116, + "language_loss": 1.01857162, + "learning_rate": 0.0009999183759291659, + "loss": 1.03245139, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 2.125, + "step": 185, + "time_per_iteration": 2.689763307571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383562, + "balance_loss_mlp": 1.17108345, + "epoch": 0.0357829934590227, + "flos": 478350159360.0, + "grad_norm": 0.03945465081959485, + "language_loss": 1.04534364, + "learning_rate": 0.0009999126497936682, + "loss": 1.05917931, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 2.12109375, + "step": 186, + "time_per_iteration": 2.5142176151275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01375295, + "balance_loss_mlp": 1.16415167, + "epoch": 0.03597537514428627, + "flos": 645884324352.0, + "grad_norm": 0.029215470851159726, + "language_loss": 1.06864357, + "learning_rate": 0.0009999067295748676, + "loss": 1.08239663, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 2.10742188, + "step": 187, + "time_per_iteration": 2.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370561, + "balance_loss_mlp": 1.16056204, + "epoch": 0.03616775682954983, + "flos": 582269245440.0, + "grad_norm": 0.03159066859467708, + "language_loss": 1.0519886, + "learning_rate": 0.000999900615275062, + "loss": 1.06569433, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 2.09570312, + "step": 188, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01368603, + "balance_loss_mlp": 1.15898561, + "epoch": 0.03636013851481339, + "flos": 383264277504.0, + "grad_norm": 0.043734318168479426, + "language_loss": 1.10731864, + "learning_rate": 0.0009998943068966256, + "loss": 1.1210047, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 2.09179688, + "step": 189, + "time_per_iteration": 2.4394500255584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365543, + "balance_loss_mlp": 1.15668833, + "epoch": 0.03655252020007695, + "flos": 584307677184.0, + "grad_norm": 0.02577278402121573, + "language_loss": 1.05579162, + "learning_rate": 0.0009998878044420072, + "loss": 1.06944704, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 2.0859375, + "step": 190, + "time_per_iteration": 2.7022814750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365865, + "balance_loss_mlp": 1.15882242, + "epoch": 0.03674490188534051, + "flos": 472597433856.0, + "grad_norm": 0.03520388751206912, + "language_loss": 1.01277018, + "learning_rate": 0.0009998811079137318, + "loss": 1.02642882, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 2.07324219, + "step": 191, + "time_per_iteration": 2.5930585861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136447, + "balance_loss_mlp": 1.15742755, + "epoch": 0.03693728357060408, + "flos": 529411009536.0, + "grad_norm": 0.03125533686722731, + "language_loss": 1.02464271, + "learning_rate": 0.0009998742173143987, + "loss": 1.0382874, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 2.07324219, + "step": 192, + "time_per_iteration": 2.6235413551330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358793, + "balance_loss_mlp": 1.15222692, + "epoch": 0.03712966525586764, + "flos": 800345238528.0, + "grad_norm": 0.02848545485219292, + "language_loss": 1.02800548, + "learning_rate": 0.0009998671326466833, + "loss": 1.04159343, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 2.06835938, + "step": 193, + "time_per_iteration": 2.991110324859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351781, + "balance_loss_mlp": 1.1463598, + "epoch": 0.037322046941131205, + "flos": 831358144512.0, + "grad_norm": 0.03513998418582105, + "language_loss": 1.0392077, + "learning_rate": 0.0009998598539133362, + "loss": 1.05272543, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 2.05664062, + "step": 194, + "time_per_iteration": 3.0204203128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349328, + "balance_loss_mlp": 1.14371598, + "epoch": 0.037514428626394765, + "flos": 438588642816.0, + "grad_norm": 0.028816536284039847, + "language_loss": 1.04176903, + "learning_rate": 0.0009998523811171828, + "loss": 1.05526221, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 2.05859375, + "step": 195, + "time_per_iteration": 2.5615782737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345129, + "balance_loss_mlp": 1.14047015, + "epoch": 0.03770681031165833, + "flos": 512638927872.0, + "grad_norm": 0.030721230574493993, + "language_loss": 1.05052435, + "learning_rate": 0.0009998447142611248, + "loss": 1.06397557, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 2.04882812, + "step": 196, + "time_per_iteration": 2.6310269832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347072, + "balance_loss_mlp": 1.14289033, + "epoch": 0.03789919199692189, + "flos": 808842069504.0, + "grad_norm": 0.024329502455983587, + "language_loss": 0.97805226, + "learning_rate": 0.0009998368533481387, + "loss": 0.99152303, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 2.04394531, + "step": 197, + "time_per_iteration": 3.0467066764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344143, + "balance_loss_mlp": 1.14043784, + "epoch": 0.03809157368218546, + "flos": 691791335424.0, + "grad_norm": 0.028391473090668865, + "language_loss": 1.00891113, + "learning_rate": 0.0009998287983812762, + "loss": 1.0223527, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 2.0390625, + "step": 198, + "time_per_iteration": 2.8457672595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342798, + "balance_loss_mlp": 1.14023721, + "epoch": 0.03828395536744902, + "flos": 519004001280.0, + "grad_norm": 0.02890411668538335, + "language_loss": 1.07749867, + "learning_rate": 0.0009998205493636646, + "loss": 1.09092665, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 2.02734375, + "step": 199, + "time_per_iteration": 2.66135573387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336213, + "balance_loss_mlp": 1.13432038, + "epoch": 0.038476337052712584, + "flos": 582762071040.0, + "grad_norm": 0.025165239757241963, + "language_loss": 0.99723649, + "learning_rate": 0.0009998121062985063, + "loss": 1.01059866, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 2.02050781, + "step": 200, + "time_per_iteration": 2.70021915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340101, + "balance_loss_mlp": 1.13868463, + "epoch": 0.03866871873797614, + "flos": 578272972800.0, + "grad_norm": 0.025940014565947116, + "language_loss": 1.01401794, + "learning_rate": 0.0009998034691890794, + "loss": 1.02741897, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 2.015625, + "step": 201, + "time_per_iteration": 2.7596118450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134112, + "balance_loss_mlp": 1.14018106, + "epoch": 0.03886110042323971, + "flos": 541771855872.0, + "grad_norm": 0.03045868040347491, + "language_loss": 1.06763899, + "learning_rate": 0.0009997946380387369, + "loss": 1.08105016, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 2.01074219, + "step": 202, + "time_per_iteration": 2.6249613761901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341912, + "balance_loss_mlp": 1.14192665, + "epoch": 0.03905348210850327, + "flos": 719239669248.0, + "grad_norm": 0.02826530469295273, + "language_loss": 1.09111357, + "learning_rate": 0.0009997856128509076, + "loss": 1.1045326, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 2.00097656, + "step": 203, + "time_per_iteration": 2.8254761695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336015, + "balance_loss_mlp": 1.13660145, + "epoch": 0.039245863793766836, + "flos": 428396484096.0, + "grad_norm": 0.028264614074004907, + "language_loss": 1.0366801, + "learning_rate": 0.0009997763936290952, + "loss": 1.05004025, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.99511719, + "step": 204, + "time_per_iteration": 2.4907312393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334897, + "balance_loss_mlp": 1.13624632, + "epoch": 0.039438245479030395, + "flos": 664269141504.0, + "grad_norm": 0.0294297584821439, + "language_loss": 1.09143519, + "learning_rate": 0.0009997669803768789, + "loss": 1.10478401, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.98730469, + "step": 205, + "time_per_iteration": 2.787046194076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332958, + "balance_loss_mlp": 1.13497555, + "epoch": 0.03963062716429396, + "flos": 636495168000.0, + "grad_norm": 0.025164669035445293, + "language_loss": 1.04324186, + "learning_rate": 0.0009997573730979134, + "loss": 1.05657148, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.98242188, + "step": 206, + "time_per_iteration": 2.744339942932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388672, + "balance_loss_mlp": 1.18687439, + "epoch": 0.03982300884955752, + "flos": 1421587186176.0, + "grad_norm": 0.04225268457123109, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80581868, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 2.01953125, + "step": 207, + "time_per_iteration": 4.62822699546814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338974, + "balance_loss_mlp": 1.14251721, + "epoch": 0.04001539053482109, + "flos": 690519702528.0, + "grad_norm": 0.029734692172116686, + "language_loss": 1.02667236, + "learning_rate": 0.0009997375764747294, + "loss": 1.04006195, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.96875, + "step": 208, + "time_per_iteration": 3.0006470680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332342, + "balance_loss_mlp": 1.1360755, + "epoch": 0.04020777222008465, + "flos": 534751500288.0, + "grad_norm": 0.02521302149444487, + "language_loss": 1.00535607, + "learning_rate": 0.0009997273871381967, + "loss": 1.01867938, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.96679688, + "step": 209, + "time_per_iteration": 2.6790220737457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01335368, + "balance_loss_mlp": 1.14005554, + "epoch": 0.040400153905348214, + "flos": 568996608000.0, + "grad_norm": 0.04055154679799505, + "language_loss": 1.05331016, + "learning_rate": 0.0009997170037902862, + "loss": 1.06666374, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.95703125, + "step": 210, + "time_per_iteration": 2.748340129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331596, + "balance_loss_mlp": 1.13647389, + "epoch": 0.040592535590611774, + "flos": 714678712320.0, + "grad_norm": 0.0276705792773584, + "language_loss": 1.07916689, + "learning_rate": 0.0009997064264350292, + "loss": 1.09248281, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.95507812, + "step": 211, + "time_per_iteration": 2.8284339904785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01332545, + "balance_loss_mlp": 1.13761449, + "epoch": 0.04078491727587533, + "flos": 579206231040.0, + "grad_norm": 0.026753366885260317, + "language_loss": 1.01893198, + "learning_rate": 0.0009996956550765317, + "loss": 1.03225756, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.953125, + "step": 212, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330854, + "balance_loss_mlp": 1.13668597, + "epoch": 0.0409772989611389, + "flos": 553368631296.0, + "grad_norm": 0.03340351088011317, + "language_loss": 0.96620274, + "learning_rate": 0.0009996846897189762, + "loss": 0.97951126, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.9453125, + "step": 213, + "time_per_iteration": 2.62785005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327396, + "balance_loss_mlp": 1.13332307, + "epoch": 0.04116968064640246, + "flos": 556764016128.0, + "grad_norm": 0.026256493309422244, + "language_loss": 1.0283711, + "learning_rate": 0.0009996735303666193, + "loss": 1.04164505, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.94433594, + "step": 214, + "time_per_iteration": 2.745412588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324547, + "balance_loss_mlp": 1.13152313, + "epoch": 0.041362062331666026, + "flos": 579651393024.0, + "grad_norm": 0.025801807715809106, + "language_loss": 1.04973316, + "learning_rate": 0.0009996621770237937, + "loss": 1.06297863, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.93359375, + "step": 215, + "time_per_iteration": 2.7359023094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_mlp": 1.12657344, + "epoch": 0.041554444016929586, + "flos": 612700729344.0, + "grad_norm": 0.027594527286323677, + "language_loss": 1.00985026, + "learning_rate": 0.0009996506296949073, + "loss": 1.02304435, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.93164062, + "step": 216, + "time_per_iteration": 2.860781669616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320461, + "balance_loss_mlp": 1.12781918, + "epoch": 0.04174682570219315, + "flos": 529150497792.0, + "grad_norm": 0.030561981852332186, + "language_loss": 1.01172602, + "learning_rate": 0.0009996388883844428, + "loss": 1.02493072, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.9296875, + "step": 217, + "time_per_iteration": 2.614837169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315002, + "balance_loss_mlp": 1.12255037, + "epoch": 0.04193920738745671, + "flos": 512499939840.0, + "grad_norm": 0.024235201889365978, + "language_loss": 1.04092622, + "learning_rate": 0.0009996269530969588, + "loss": 1.05407631, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.92773438, + "step": 218, + "time_per_iteration": 2.5777087211608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317845, + "balance_loss_mlp": 1.1255846, + "epoch": 0.04213158907272028, + "flos": 572552448000.0, + "grad_norm": 0.03618883866707401, + "language_loss": 1.04623246, + "learning_rate": 0.0009996148238370888, + "loss": 1.05941105, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.92578125, + "step": 219, + "time_per_iteration": 2.723344564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319419, + "balance_loss_mlp": 1.12830234, + "epoch": 0.04232397075798384, + "flos": 965904098304.0, + "grad_norm": 0.02808123492922437, + "language_loss": 0.99962145, + "learning_rate": 0.0009996025006095421, + "loss": 1.01281559, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.9140625, + "step": 220, + "time_per_iteration": 3.297567844390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355408, + "balance_loss_mlp": 1.16314697, + "epoch": 0.042516352443247404, + "flos": 1472730628608.0, + "grad_norm": 0.031119874656221472, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.79138547, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.92578125, + "step": 221, + "time_per_iteration": 5.484851837158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132056, + "balance_loss_mlp": 1.13039756, + "epoch": 0.042708734128510964, + "flos": 655891832832.0, + "grad_norm": 0.027306518139410985, + "language_loss": 0.99887031, + "learning_rate": 0.0009995772722706307, + "loss": 1.0120759, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.90429688, + "step": 222, + "time_per_iteration": 2.801955461502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324867, + "balance_loss_mlp": 1.13518083, + "epoch": 0.04290111581377453, + "flos": 432733859328.0, + "grad_norm": 0.025166076900031344, + "language_loss": 1.13987851, + "learning_rate": 0.0009995643671690604, + "loss": 1.15312719, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.89941406, + "step": 223, + "time_per_iteration": 2.4589195251464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320058, + "balance_loss_mlp": 1.13142133, + "epoch": 0.04309349749903809, + "flos": 645866860032.0, + "grad_norm": 0.02470776233740571, + "language_loss": 1.01624262, + "learning_rate": 0.0009995512681194023, + "loss": 1.02944326, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.88867188, + "step": 224, + "time_per_iteration": 2.854653835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319788, + "balance_loss_mlp": 1.13124692, + "epoch": 0.04328587918430166, + "flos": 832895745024.0, + "grad_norm": 0.02898896961022835, + "language_loss": 0.98942387, + "learning_rate": 0.0009995379751267417, + "loss": 1.00262189, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.88769531, + "step": 225, + "time_per_iteration": 3.260105609893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01317885, + "balance_loss_mlp": 1.12943935, + "epoch": 0.043478260869565216, + "flos": 526115681280.0, + "grad_norm": 0.02601835272599882, + "language_loss": 1.00718379, + "learning_rate": 0.0009995244881962398, + "loss": 1.02036262, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.88671875, + "step": 226, + "time_per_iteration": 2.631685495376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320396, + "balance_loss_mlp": 1.13204539, + "epoch": 0.04367064255482878, + "flos": 440412225024.0, + "grad_norm": 0.02740546356326938, + "language_loss": 1.02089393, + "learning_rate": 0.0009995108073331323, + "loss": 1.03409791, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.88574219, + "step": 227, + "time_per_iteration": 2.6414895057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308962, + "balance_loss_mlp": 1.12156498, + "epoch": 0.04386302424009234, + "flos": 508466737152.0, + "grad_norm": 0.023646446246452554, + "language_loss": 1.04017711, + "learning_rate": 0.0009994969325427309, + "loss": 1.05326676, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.87597656, + "step": 228, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130646, + "balance_loss_mlp": 1.11906338, + "epoch": 0.04405540592535591, + "flos": 541743657984.0, + "grad_norm": 0.02642836262436834, + "language_loss": 1.00691068, + "learning_rate": 0.0009994828638304218, + "loss": 1.0199753, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.87597656, + "step": 229, + "time_per_iteration": 2.604616165161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305226, + "balance_loss_mlp": 1.11792421, + "epoch": 0.04424778761061947, + "flos": 447309055488.0, + "grad_norm": 0.039218098968292335, + "language_loss": 1.07079852, + "learning_rate": 0.0009994686012016675, + "loss": 1.08385086, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.875, + "step": 230, + "time_per_iteration": 2.568608045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130487, + "balance_loss_mlp": 1.1187129, + "epoch": 0.044440169295883035, + "flos": 701981492736.0, + "grad_norm": 0.02721662483758601, + "language_loss": 1.06240797, + "learning_rate": 0.000999454144662005, + "loss": 1.07545662, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.86328125, + "step": 231, + "time_per_iteration": 2.9104526042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295735, + "balance_loss_mlp": 1.10957813, + "epoch": 0.044632550981146595, + "flos": 589426587648.0, + "grad_norm": 0.02817980914561194, + "language_loss": 1.003865, + "learning_rate": 0.0009994394942170468, + "loss": 1.01682234, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.86328125, + "step": 232, + "time_per_iteration": 2.674896001815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302928, + "balance_loss_mlp": 1.11667526, + "epoch": 0.04482493266641016, + "flos": 555854226432.0, + "grad_norm": 0.029144066951330677, + "language_loss": 0.98161608, + "learning_rate": 0.0009994246498724808, + "loss": 0.99464536, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.86425781, + "step": 233, + "time_per_iteration": 2.674178123474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302597, + "balance_loss_mlp": 1.11682117, + "epoch": 0.04501731435167372, + "flos": 724069870080.0, + "grad_norm": 0.027038299766394356, + "language_loss": 1.00722432, + "learning_rate": 0.00099940961163407, + "loss": 1.02025032, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.859375, + "step": 234, + "time_per_iteration": 2.8427939414978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301098, + "balance_loss_mlp": 1.11608493, + "epoch": 0.04520969603693728, + "flos": 512797381632.0, + "grad_norm": 0.027022139799708383, + "language_loss": 1.02586675, + "learning_rate": 0.0009993943795076528, + "loss": 1.03887773, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.8515625, + "step": 235, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295671, + "balance_loss_mlp": 1.11094403, + "epoch": 0.04540207772220085, + "flos": 365877846528.0, + "grad_norm": 0.03212133053651388, + "language_loss": 1.0562067, + "learning_rate": 0.0009993789534991427, + "loss": 1.06916356, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.84863281, + "step": 236, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294151, + "balance_loss_mlp": 1.1095196, + "epoch": 0.045594459407464406, + "flos": 523723411968.0, + "grad_norm": 0.029471400038435007, + "language_loss": 1.00276268, + "learning_rate": 0.0009993633336145287, + "loss": 1.01570415, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.84765625, + "step": 237, + "time_per_iteration": 2.6279234886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296614, + "balance_loss_mlp": 1.11284053, + "epoch": 0.04578684109272797, + "flos": 673115807232.0, + "grad_norm": 0.032189822363292264, + "language_loss": 1.04537559, + "learning_rate": 0.0009993475198598752, + "loss": 1.05834174, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.83886719, + "step": 238, + "time_per_iteration": 2.98264741897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294154, + "balance_loss_mlp": 1.11047626, + "epoch": 0.04597922277799153, + "flos": 542620520448.0, + "grad_norm": 0.025834809881005002, + "language_loss": 1.01282692, + "learning_rate": 0.0009993315122413212, + "loss": 1.02576852, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.83789062, + "step": 239, + "time_per_iteration": 2.5969364643096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297016, + "balance_loss_mlp": 1.11333883, + "epoch": 0.0461716044632551, + "flos": 459993540096.0, + "grad_norm": 0.025301515003642434, + "language_loss": 1.01210213, + "learning_rate": 0.0009993153107650818, + "loss": 1.02507234, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.83789062, + "step": 240, + "time_per_iteration": 2.590198278427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297188, + "balance_loss_mlp": 1.11360526, + "epoch": 0.04636398614851866, + "flos": 456170457600.0, + "grad_norm": 0.0338801607583888, + "language_loss": 1.01026332, + "learning_rate": 0.0009992989154374468, + "loss": 1.0232352, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.83691406, + "step": 241, + "time_per_iteration": 2.5699570178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012963, + "balance_loss_mlp": 1.11271763, + "epoch": 0.046556367833782225, + "flos": 557901390336.0, + "grad_norm": 0.02656657647638049, + "language_loss": 1.0757494, + "learning_rate": 0.0009992823262647817, + "loss": 1.08871233, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.83691406, + "step": 242, + "time_per_iteration": 2.6949496269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293965, + "balance_loss_mlp": 1.11047852, + "epoch": 0.046748749519045785, + "flos": 594087601152.0, + "grad_norm": 0.02772781005565529, + "language_loss": 1.02479577, + "learning_rate": 0.0009992655432535264, + "loss": 1.03773546, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.8359375, + "step": 243, + "time_per_iteration": 2.7783396244049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286985, + "balance_loss_mlp": 1.10454702, + "epoch": 0.04694113120430935, + "flos": 570941713920.0, + "grad_norm": 0.021337056529223342, + "language_loss": 1.01771712, + "learning_rate": 0.0009992485664101973, + "loss": 1.03058696, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.82519531, + "step": 244, + "time_per_iteration": 2.679227590560913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286082, + "balance_loss_mlp": 1.10364425, + "epoch": 0.04713351288957291, + "flos": 865245411840.0, + "grad_norm": 0.03170954338904746, + "language_loss": 1.04355013, + "learning_rate": 0.000999231395741385, + "loss": 1.05641103, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.82519531, + "step": 245, + "time_per_iteration": 3.0976788997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287082, + "balance_loss_mlp": 1.10473943, + "epoch": 0.04732589457483648, + "flos": 538235481600.0, + "grad_norm": 0.02353809889700427, + "language_loss": 1.02393425, + "learning_rate": 0.0009992140312537557, + "loss": 1.03680515, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.82421875, + "step": 246, + "time_per_iteration": 2.6005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286378, + "balance_loss_mlp": 1.1048938, + "epoch": 0.04751827626010004, + "flos": 763271431680.0, + "grad_norm": 0.021903859990429042, + "language_loss": 0.96665001, + "learning_rate": 0.000999196472954051, + "loss": 0.97951376, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.81542969, + "step": 247, + "time_per_iteration": 2.95379638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319153, + "balance_loss_mlp": 1.13833618, + "epoch": 0.0477106579453636, + "flos": 1583125578240.0, + "grad_norm": 0.034344144576267104, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80744004, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.80859375, + "step": 248, + "time_per_iteration": 6.070216655731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286412, + "balance_loss_mlp": 1.10521388, + "epoch": 0.04790303963062716, + "flos": 458692982784.0, + "grad_norm": 0.024476775577385278, + "language_loss": 1.04631317, + "learning_rate": 0.0009991607749457578, + "loss": 1.05917728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.8125, + "step": 249, + "time_per_iteration": 2.5741825103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128623, + "balance_loss_mlp": 1.10503209, + "epoch": 0.04809542131589073, + "flos": 783786004992.0, + "grad_norm": 0.021665977114244464, + "language_loss": 1.0235486, + "learning_rate": 0.0009991426352510286, + "loss": 1.03641105, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.81152344, + "step": 250, + "time_per_iteration": 3.004519462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287109, + "balance_loss_mlp": 1.10648286, + "epoch": 0.04828780300115429, + "flos": 560321857536.0, + "grad_norm": 0.028059326531900755, + "language_loss": 1.04456568, + "learning_rate": 0.0009991243017719422, + "loss": 1.05743682, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.8046875, + "step": 251, + "time_per_iteration": 2.666212320327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283793, + "balance_loss_mlp": 1.10364354, + "epoch": 0.048480184686417856, + "flos": 502922130432.0, + "grad_norm": 0.02282661348297379, + "language_loss": 0.985008, + "learning_rate": 0.0009991057745156165, + "loss": 0.99784589, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.80078125, + "step": 252, + "time_per_iteration": 2.6053824424743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291534, + "balance_loss_mlp": 1.11186218, + "epoch": 0.048672566371681415, + "flos": 1539469120512.0, + "grad_norm": 0.022804524860740846, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83202517, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.796875, + "step": 253, + "time_per_iteration": 5.005317449569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285445, + "balance_loss_mlp": 1.10500991, + "epoch": 0.04886494805694498, + "flos": 538951888896.0, + "grad_norm": 0.028242285238858512, + "language_loss": 1.06865251, + "learning_rate": 0.0009990681387000943, + "loss": 1.08150697, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.80371094, + "step": 254, + "time_per_iteration": 2.743307590484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283321, + "balance_loss_mlp": 1.10317183, + "epoch": 0.04905732974220854, + "flos": 681484383744.0, + "grad_norm": 0.028658365214850164, + "language_loss": 1.02065015, + "learning_rate": 0.0009990490301555093, + "loss": 1.03348327, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.80126953, + "step": 255, + "time_per_iteration": 2.989856719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291977, + "balance_loss_mlp": 1.1134491, + "epoch": 0.04924971142747211, + "flos": 1424274895872.0, + "grad_norm": 0.01325206916769545, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80507129, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.78515625, + "step": 256, + "time_per_iteration": 4.888273477554321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281082, + "balance_loss_mlp": 1.10255432, + "epoch": 0.04944209311273567, + "flos": 1561236587520.0, + "grad_norm": 0.00993410716153638, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80523825, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.78515625, + "step": 257, + "time_per_iteration": 4.983605623245239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285439, + "balance_loss_mlp": 1.10786438, + "epoch": 0.04963447479799923, + "flos": 1574170850304.0, + "grad_norm": 0.014798835308040135, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71261322, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.77539062, + "step": 258, + "time_per_iteration": 4.888776540756226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_mlp": 1.10310864, + "epoch": 0.049826856483262794, + "flos": 626498393088.0, + "grad_norm": 0.032236291487241595, + "language_loss": 0.9680413, + "learning_rate": 0.0009989706585723202, + "loss": 0.98086333, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.79003906, + "step": 259, + "time_per_iteration": 2.776397705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280186, + "balance_loss_mlp": 1.10175359, + "epoch": 0.05001923816852635, + "flos": 505155945984.0, + "grad_norm": 0.03442249770662494, + "language_loss": 1.03026366, + "learning_rate": 0.0009989505813633442, + "loss": 1.04306555, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.78271484, + "step": 260, + "time_per_iteration": 2.651773691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281097, + "balance_loss_mlp": 1.10295069, + "epoch": 0.05021161985378992, + "flos": 588467132928.0, + "grad_norm": 0.024781843968885862, + "language_loss": 1.02880228, + "learning_rate": 0.000998930310444573, + "loss": 1.04161322, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.78125, + "step": 261, + "time_per_iteration": 2.730717420578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266116, + "balance_loss_mlp": 1.08796966, + "epoch": 0.05040400153905348, + "flos": 634402341888.0, + "grad_norm": 0.028473185138455738, + "language_loss": 1.01351452, + "learning_rate": 0.0009989098458238765, + "loss": 1.02617574, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.77929688, + "step": 262, + "time_per_iteration": 2.7717010974884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272128, + "balance_loss_mlp": 1.09407711, + "epoch": 0.050596383224317046, + "flos": 554808176640.0, + "grad_norm": 0.03464065468219783, + "language_loss": 1.00597906, + "learning_rate": 0.0009988891875091998, + "loss": 1.01870036, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.77880859, + "step": 263, + "time_per_iteration": 2.8842556476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012725, + "balance_loss_mlp": 1.09444928, + "epoch": 0.050788764909580605, + "flos": 550761512448.0, + "grad_norm": 0.02541343292713684, + "language_loss": 0.95014787, + "learning_rate": 0.0009988683355085636, + "loss": 0.96287298, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.77880859, + "step": 264, + "time_per_iteration": 2.7466378211975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272896, + "balance_loss_mlp": 1.09527469, + "epoch": 0.05098114659484417, + "flos": 606344388096.0, + "grad_norm": 0.02024934595994547, + "language_loss": 1.03858495, + "learning_rate": 0.000998847289830063, + "loss": 1.05131388, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.77587891, + "step": 265, + "time_per_iteration": 2.821997880935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285574, + "balance_loss_mlp": 1.10761857, + "epoch": 0.05117352828010773, + "flos": 439472236032.0, + "grad_norm": 0.026937538773041583, + "language_loss": 0.97004128, + "learning_rate": 0.0009988260504818682, + "loss": 0.98289704, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.77832031, + "step": 266, + "time_per_iteration": 2.557830333709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277028, + "balance_loss_mlp": 1.09907281, + "epoch": 0.0513659099653713, + "flos": 506030807040.0, + "grad_norm": 0.02494960853942852, + "language_loss": 1.03986156, + "learning_rate": 0.000998804617472226, + "loss": 1.05263186, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.77832031, + "step": 267, + "time_per_iteration": 2.644099235534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_mlp": 1.09254682, + "epoch": 0.05155829165063486, + "flos": 696714862080.0, + "grad_norm": 0.027664306986101984, + "language_loss": 0.98796493, + "learning_rate": 0.0009987829908094568, + "loss": 1.00066042, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.76953125, + "step": 268, + "time_per_iteration": 2.8291003704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265086, + "balance_loss_mlp": 1.08817983, + "epoch": 0.051750673335898424, + "flos": 1350300294144.0, + "grad_norm": 0.03385083640642466, + "language_loss": 1.06218576, + "learning_rate": 0.0009987611705019569, + "loss": 1.07483661, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.76855469, + "step": 269, + "time_per_iteration": 4.150776624679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264769, + "balance_loss_mlp": 1.08795822, + "epoch": 0.051943055021161984, + "flos": 490589481984.0, + "grad_norm": 0.028250493976035247, + "language_loss": 1.04104686, + "learning_rate": 0.0009987391565581978, + "loss": 1.05369449, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.76757812, + "step": 270, + "time_per_iteration": 2.5921454429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266977, + "balance_loss_mlp": 1.09092879, + "epoch": 0.05213543670642555, + "flos": 546880032768.0, + "grad_norm": 0.026669721507250346, + "language_loss": 0.96455419, + "learning_rate": 0.000998716948986726, + "loss": 0.97722399, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.75976562, + "step": 271, + "time_per_iteration": 2.7835500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268405, + "balance_loss_mlp": 1.09264266, + "epoch": 0.05232781839168911, + "flos": 604672528896.0, + "grad_norm": 0.03568520247936263, + "language_loss": 0.99334317, + "learning_rate": 0.0009986945477961633, + "loss": 1.00602722, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.75683594, + "step": 272, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271248, + "balance_loss_mlp": 1.0953902, + "epoch": 0.052520200076952676, + "flos": 539655561216.0, + "grad_norm": 0.02343402151836954, + "language_loss": 1.0317328, + "learning_rate": 0.0009986719529952066, + "loss": 1.04444528, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.7578125, + "step": 273, + "time_per_iteration": 2.908298969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126867, + "balance_loss_mlp": 1.09266984, + "epoch": 0.052712581762216236, + "flos": 464332916736.0, + "grad_norm": 0.028493663433316604, + "language_loss": 1.03350449, + "learning_rate": 0.000998649164592628, + "loss": 1.0461911, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.75927734, + "step": 274, + "time_per_iteration": 2.5805718898773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263404, + "balance_loss_mlp": 1.08735609, + "epoch": 0.0529049634474798, + "flos": 549105116160.0, + "grad_norm": 0.024462560446863554, + "language_loss": 1.01155043, + "learning_rate": 0.0009986261825972748, + "loss": 1.02418458, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.75976562, + "step": 275, + "time_per_iteration": 2.675705909729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269015, + "balance_loss_mlp": 1.09334803, + "epoch": 0.05309734513274336, + "flos": 619200061440.0, + "grad_norm": 0.026443817532743642, + "language_loss": 1.03055406, + "learning_rate": 0.000998603007018069, + "loss": 1.04324436, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.75585938, + "step": 276, + "time_per_iteration": 2.77298903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264217, + "balance_loss_mlp": 1.08893192, + "epoch": 0.05328972681800693, + "flos": 606617634816.0, + "grad_norm": 0.022439827576013177, + "language_loss": 1.00613213, + "learning_rate": 0.0009985796378640089, + "loss": 1.01877427, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.75195312, + "step": 277, + "time_per_iteration": 2.693049669265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264635, + "balance_loss_mlp": 1.08963549, + "epoch": 0.05348210850327049, + "flos": 605730038784.0, + "grad_norm": 0.02549683888178727, + "language_loss": 1.01102281, + "learning_rate": 0.0009985560751441665, + "loss": 1.02366924, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.74902344, + "step": 278, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262716, + "balance_loss_mlp": 1.08757329, + "epoch": 0.053674490188534055, + "flos": 631997337600.0, + "grad_norm": 0.025192100126554, + "language_loss": 1.03316271, + "learning_rate": 0.00099853231886769, + "loss": 1.04578984, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.75048828, + "step": 279, + "time_per_iteration": 2.8228564262390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262121, + "balance_loss_mlp": 1.08712184, + "epoch": 0.053866871873797614, + "flos": 480173741568.0, + "grad_norm": 0.02583251996588833, + "language_loss": 1.02629757, + "learning_rate": 0.0009985083690438024, + "loss": 1.03891873, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.74902344, + "step": 280, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260843, + "balance_loss_mlp": 1.08655906, + "epoch": 0.054059253559061174, + "flos": 789489065472.0, + "grad_norm": 0.023704628566171972, + "language_loss": 0.9340027, + "learning_rate": 0.0009984842256818016, + "loss": 0.94661117, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.74169922, + "step": 281, + "time_per_iteration": 3.084801435470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257985, + "balance_loss_mlp": 1.08379591, + "epoch": 0.05425163524432474, + "flos": 629505011712.0, + "grad_norm": 0.027462270528210347, + "language_loss": 1.04308844, + "learning_rate": 0.0009984598887910613, + "loss": 1.05566835, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.74072266, + "step": 282, + "time_per_iteration": 2.729063034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262745, + "balance_loss_mlp": 1.08855665, + "epoch": 0.0544440169295883, + "flos": 616992442368.0, + "grad_norm": 0.02580860229759897, + "language_loss": 0.99945354, + "learning_rate": 0.0009984353583810297, + "loss": 1.01208091, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.74072266, + "step": 283, + "time_per_iteration": 2.812309741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258383, + "balance_loss_mlp": 1.08433735, + "epoch": 0.05463639861485187, + "flos": 648929874432.0, + "grad_norm": 0.0290705298354334, + "language_loss": 1.01989841, + "learning_rate": 0.0009984106344612302, + "loss": 1.03248215, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.73925781, + "step": 284, + "time_per_iteration": 2.785377264022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126395, + "balance_loss_mlp": 1.0907625, + "epoch": 0.054828780300115426, + "flos": 798584782848.0, + "grad_norm": 0.03167011835004719, + "language_loss": 0.97435868, + "learning_rate": 0.0009983857170412615, + "loss": 0.9869982, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.73046875, + "step": 285, + "time_per_iteration": 2.9822604656219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258353, + "balance_loss_mlp": 1.08511817, + "epoch": 0.05502116198537899, + "flos": 550798442496.0, + "grad_norm": 0.02077828299254123, + "language_loss": 0.96197385, + "learning_rate": 0.000998360606130798, + "loss": 0.9745574, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.73095703, + "step": 286, + "time_per_iteration": 2.8340489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266281, + "balance_loss_mlp": 1.09461975, + "epoch": 0.05521354367064255, + "flos": 1410906931200.0, + "grad_norm": 0.010589673029146669, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70339394, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.71484375, + "step": 287, + "time_per_iteration": 4.893908500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126535, + "balance_loss_mlp": 1.09235394, + "epoch": 0.05540592535590612, + "flos": 646611465216.0, + "grad_norm": 0.04031113274469801, + "language_loss": 1.02544129, + "learning_rate": 0.0009983098038774552, + "loss": 1.03809476, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.72851562, + "step": 288, + "time_per_iteration": 2.800687551498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_mlp": 1.08712769, + "epoch": 0.05559830704116968, + "flos": 1514315727360.0, + "grad_norm": 0.011752943348929798, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79428822, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.71289062, + "step": 289, + "time_per_iteration": 4.802466630935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126164, + "balance_loss_mlp": 1.08869088, + "epoch": 0.055790688726433245, + "flos": 509334867456.0, + "grad_norm": 0.03460900762027919, + "language_loss": 1.00913107, + "learning_rate": 0.0009982582277800948, + "loss": 1.02174735, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.72802734, + "step": 290, + "time_per_iteration": 2.574007749557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255451, + "balance_loss_mlp": 1.08326483, + "epoch": 0.055983070411696804, + "flos": 659074369536.0, + "grad_norm": 0.03439417592421578, + "language_loss": 1.07703924, + "learning_rate": 0.0009982321495648908, + "loss": 1.08959377, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.72021484, + "step": 291, + "time_per_iteration": 2.8004326820373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257264, + "balance_loss_mlp": 1.08503067, + "epoch": 0.05617545209696037, + "flos": 588475865088.0, + "grad_norm": 0.024241847728240208, + "language_loss": 0.9905349, + "learning_rate": 0.0009982058779188115, + "loss": 1.00310755, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.72070312, + "step": 292, + "time_per_iteration": 2.763096570968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257503, + "balance_loss_mlp": 1.0853169, + "epoch": 0.05636783378222393, + "flos": 612787324416.0, + "grad_norm": 0.027188079674348095, + "language_loss": 1.06693649, + "learning_rate": 0.0009981794128520567, + "loss": 1.07951164, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.72021484, + "step": 293, + "time_per_iteration": 2.7630960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253426, + "balance_loss_mlp": 1.08123958, + "epoch": 0.0565602154674875, + "flos": 669422980608.0, + "grad_norm": 0.030197403892147204, + "language_loss": 1.03523457, + "learning_rate": 0.000998152754374901, + "loss": 1.04776871, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.72021484, + "step": 294, + "time_per_iteration": 2.8583314418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249713, + "balance_loss_mlp": 1.07743168, + "epoch": 0.05675259715275106, + "flos": 618364131840.0, + "grad_norm": 0.026289358543143387, + "language_loss": 0.99071473, + "learning_rate": 0.0009981259024976943, + "loss": 1.00321186, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.72119141, + "step": 295, + "time_per_iteration": 2.719881534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250566, + "balance_loss_mlp": 1.07814193, + "epoch": 0.05694497883801462, + "flos": 753153133056.0, + "grad_norm": 0.03148267511857758, + "language_loss": 0.97962338, + "learning_rate": 0.0009980988572308612, + "loss": 0.99212909, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.72265625, + "step": 296, + "time_per_iteration": 2.9828195571899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250905, + "balance_loss_mlp": 1.0789572, + "epoch": 0.05713736052327818, + "flos": 713380882944.0, + "grad_norm": 0.02524811137395651, + "language_loss": 1.00250125, + "learning_rate": 0.0009980716185849015, + "loss": 1.01501024, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.71777344, + "step": 297, + "time_per_iteration": 2.9749252796173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251066, + "balance_loss_mlp": 1.07959557, + "epoch": 0.05732974220854175, + "flos": 469935920640.0, + "grad_norm": 0.024054663695119705, + "language_loss": 0.96916056, + "learning_rate": 0.0009980441865703904, + "loss": 0.98167121, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.71289062, + "step": 298, + "time_per_iteration": 2.598325252532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250911, + "balance_loss_mlp": 1.07939255, + "epoch": 0.05752212389380531, + "flos": 602540771328.0, + "grad_norm": 0.025930022992042723, + "language_loss": 1.05563986, + "learning_rate": 0.000998016561197978, + "loss": 1.06814897, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.71337891, + "step": 299, + "time_per_iteration": 2.690300703048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250529, + "balance_loss_mlp": 1.07924938, + "epoch": 0.057714505579068875, + "flos": 679949511168.0, + "grad_norm": 0.025847674874905035, + "language_loss": 0.97115421, + "learning_rate": 0.0009979887424783895, + "loss": 0.98365951, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.7109375, + "step": 300, + "time_per_iteration": 2.863856554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249286, + "balance_loss_mlp": 1.07810116, + "epoch": 0.057906887264332435, + "flos": 597011627520.0, + "grad_norm": 0.02594453351976595, + "language_loss": 0.96475613, + "learning_rate": 0.0009979607304224248, + "loss": 0.97724897, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.70996094, + "step": 301, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248659, + "balance_loss_mlp": 1.0772841, + "epoch": 0.058099268949596, + "flos": 553164515328.0, + "grad_norm": 0.024492956239426298, + "language_loss": 1.0387162, + "learning_rate": 0.000997932525040959, + "loss": 1.05120289, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.71191406, + "step": 302, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252345, + "balance_loss_mlp": 1.08111238, + "epoch": 0.05829165063485956, + "flos": 509230808064.0, + "grad_norm": 0.038324718957869854, + "language_loss": 1.05616117, + "learning_rate": 0.000997904126344943, + "loss": 1.06868458, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.71044922, + "step": 303, + "time_per_iteration": 2.611621141433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125091, + "balance_loss_mlp": 1.080441, + "epoch": 0.05848403232012313, + "flos": 616362630144.0, + "grad_norm": 0.028818083574726525, + "language_loss": 1.02425826, + "learning_rate": 0.0009978755343454018, + "loss": 1.03676736, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.70263672, + "step": 304, + "time_per_iteration": 2.750213384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245805, + "balance_loss_mlp": 1.07490659, + "epoch": 0.05867641400538669, + "flos": 501079082496.0, + "grad_norm": 0.025195073137535502, + "language_loss": 1.02874422, + "learning_rate": 0.0009978467490534355, + "loss": 1.04120219, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.70703125, + "step": 305, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124905, + "balance_loss_mlp": 1.07853293, + "epoch": 0.05886879569065025, + "flos": 532378696704.0, + "grad_norm": 0.026491629776715375, + "language_loss": 0.99473399, + "learning_rate": 0.00099781777048022, + "loss": 1.00722456, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.703125, + "step": 306, + "time_per_iteration": 2.731084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012482, + "balance_loss_mlp": 1.07782638, + "epoch": 0.05906117737591381, + "flos": 490040260608.0, + "grad_norm": 0.025118942729794178, + "language_loss": 1.01122224, + "learning_rate": 0.0009977885986370057, + "loss": 1.02370417, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.70166016, + "step": 307, + "time_per_iteration": 2.548307418823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247075, + "balance_loss_mlp": 1.0766536, + "epoch": 0.05925355906117737, + "flos": 592709180928.0, + "grad_norm": 0.029001286226925486, + "language_loss": 0.96780527, + "learning_rate": 0.000997759233535118, + "loss": 0.98027599, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.70214844, + "step": 308, + "time_per_iteration": 2.7876322269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247056, + "balance_loss_mlp": 1.07668173, + "epoch": 0.05944594074644094, + "flos": 564787487232.0, + "grad_norm": 0.026648157056946717, + "language_loss": 1.03345561, + "learning_rate": 0.0009977296751859576, + "loss": 1.04592621, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.70166016, + "step": 309, + "time_per_iteration": 2.71488094329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_mlp": 1.07958508, + "epoch": 0.0596383224317045, + "flos": 539807284224.0, + "grad_norm": 0.023775477335694146, + "language_loss": 1.04459929, + "learning_rate": 0.0009976999236009998, + "loss": 1.05709469, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.69726562, + "step": 310, + "time_per_iteration": 2.7919182777404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255511, + "balance_loss_mlp": 1.08618629, + "epoch": 0.059830704116968066, + "flos": 562052113920.0, + "grad_norm": 0.02942700961653022, + "language_loss": 1.06853497, + "learning_rate": 0.0009976699787917955, + "loss": 1.08109009, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.69091797, + "step": 311, + "time_per_iteration": 2.6729257106781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012565, + "balance_loss_mlp": 1.08789062, + "epoch": 0.060023085802231625, + "flos": 1574047325184.0, + "grad_norm": 0.029063497479097016, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74699497, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.68359375, + "step": 312, + "time_per_iteration": 4.972649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249775, + "balance_loss_mlp": 1.08021212, + "epoch": 0.06021546748749519, + "flos": 483627523584.0, + "grad_norm": 0.0314235925459163, + "language_loss": 0.98280072, + "learning_rate": 0.0009976095095472243, + "loss": 0.9952985, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.69335938, + "step": 313, + "time_per_iteration": 2.5644209384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125234, + "balance_loss_mlp": 1.08287179, + "epoch": 0.06040784917275875, + "flos": 621423143424.0, + "grad_norm": 0.030123719928355924, + "language_loss": 0.99538821, + "learning_rate": 0.0009975789851353334, + "loss": 1.00791156, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.69238281, + "step": 314, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256592, + "balance_loss_mlp": 1.08741045, + "epoch": 0.06060023085802232, + "flos": 484602441216.0, + "grad_norm": 0.026992074473858402, + "language_loss": 1.01683283, + "learning_rate": 0.0009975482675461487, + "loss": 1.02939868, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.68945312, + "step": 315, + "time_per_iteration": 2.67146897315979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_mlp": 1.08054566, + "epoch": 0.06079261254328588, + "flos": 582985652736.0, + "grad_norm": 0.0292304668639163, + "language_loss": 0.99909455, + "learning_rate": 0.0009975173567915952, + "loss": 1.01158559, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.68310547, + "step": 316, + "time_per_iteration": 2.693526268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124983, + "balance_loss_mlp": 1.08131599, + "epoch": 0.060984994228549444, + "flos": 689008298496.0, + "grad_norm": 0.03272213432041067, + "language_loss": 0.93868685, + "learning_rate": 0.000997486252883674, + "loss": 0.95118511, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.68261719, + "step": 317, + "time_per_iteration": 2.837315082550049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252509, + "balance_loss_mlp": 1.08399427, + "epoch": 0.061177375913813004, + "flos": 1316747398656.0, + "grad_norm": 0.031012352820614663, + "language_loss": 0.98949343, + "learning_rate": 0.0009974549558344602, + "loss": 1.00201845, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.68261719, + "step": 318, + "time_per_iteration": 3.686920166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125178, + "balance_loss_mlp": 1.08321846, + "epoch": 0.06136975759907657, + "flos": 575400612864.0, + "grad_norm": 0.027925836735275204, + "language_loss": 1.08640313, + "learning_rate": 0.000997423465656105, + "loss": 1.09892082, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.68310547, + "step": 319, + "time_per_iteration": 2.7691538333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250553, + "balance_loss_mlp": 1.08218133, + "epoch": 0.06156213928434013, + "flos": 528564346368.0, + "grad_norm": 0.033042319608268485, + "language_loss": 1.06051123, + "learning_rate": 0.0009973917823608335, + "loss": 1.07301688, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.68115234, + "step": 320, + "time_per_iteration": 2.583859443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251303, + "balance_loss_mlp": 1.08364725, + "epoch": 0.061754520969603696, + "flos": 496589984256.0, + "grad_norm": 0.025351519610416894, + "language_loss": 0.99929821, + "learning_rate": 0.0009973599059609462, + "loss": 1.01181126, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.67382812, + "step": 321, + "time_per_iteration": 2.7139415740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246641, + "balance_loss_mlp": 1.07893777, + "epoch": 0.061946902654867256, + "flos": 441044038656.0, + "grad_norm": 0.025867704850659153, + "language_loss": 0.98033404, + "learning_rate": 0.000997327836468819, + "loss": 0.99280047, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.67431641, + "step": 322, + "time_per_iteration": 2.598400831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250362, + "balance_loss_mlp": 1.08280182, + "epoch": 0.06213928434013082, + "flos": 600042441216.0, + "grad_norm": 0.02535167136018297, + "language_loss": 1.01516175, + "learning_rate": 0.000997295573896902, + "loss": 1.02766538, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.67285156, + "step": 323, + "time_per_iteration": 2.8295648097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125071, + "balance_loss_mlp": 1.0847702, + "epoch": 0.06233166602539438, + "flos": 1453114384896.0, + "grad_norm": 0.012451454042686489, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82446748, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.65625, + "step": 324, + "time_per_iteration": 4.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244164, + "balance_loss_mlp": 1.07803345, + "epoch": 0.06252404771065795, + "flos": 1466628794880.0, + "grad_norm": 0.009026829376029815, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79816103, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.65820312, + "step": 325, + "time_per_iteration": 4.859014272689819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252677, + "balance_loss_mlp": 1.08535445, + "epoch": 0.06271642939592151, + "flos": 465235975680.0, + "grad_norm": 0.02899330239765154, + "language_loss": 0.95714885, + "learning_rate": 0.000997197627828043, + "loss": 0.96967566, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.67041016, + "step": 326, + "time_per_iteration": 2.5137081146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250284, + "balance_loss_mlp": 1.08343852, + "epoch": 0.06290881108118507, + "flos": 533431477248.0, + "grad_norm": 0.02712212536791958, + "language_loss": 0.90827119, + "learning_rate": 0.0009971645930629716, + "loss": 0.92077404, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.66552734, + "step": 327, + "time_per_iteration": 2.6867988109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249402, + "balance_loss_mlp": 1.08260453, + "epoch": 0.06310119276644863, + "flos": 674767474176.0, + "grad_norm": 0.026247049513885422, + "language_loss": 1.04735494, + "learning_rate": 0.0009971313652814872, + "loss": 1.0598489, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.66503906, + "step": 328, + "time_per_iteration": 2.845618724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245995, + "balance_loss_mlp": 1.07924485, + "epoch": 0.0632935744517122, + "flos": 772050241536.0, + "grad_norm": 0.03020034978800923, + "language_loss": 1.02482498, + "learning_rate": 0.0009970979444964903, + "loss": 1.03728485, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.66455078, + "step": 329, + "time_per_iteration": 2.967315196990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249674, + "balance_loss_mlp": 1.08316231, + "epoch": 0.06348595613697576, + "flos": 562974638592.0, + "grad_norm": 0.027434293654228625, + "language_loss": 1.03562641, + "learning_rate": 0.0009970643307209556, + "loss": 1.04812312, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.66210938, + "step": 330, + "time_per_iteration": 2.7991747856140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_mlp": 1.0814544, + "epoch": 0.06367833782223932, + "flos": 677383325184.0, + "grad_norm": 0.030236705728133754, + "language_loss": 1.00163436, + "learning_rate": 0.0009970305239679334, + "loss": 1.01411343, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.66162109, + "step": 331, + "time_per_iteration": 2.8012547492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243208, + "balance_loss_mlp": 1.07669675, + "epoch": 0.06387071950750288, + "flos": 496348938240.0, + "grad_norm": 0.029279450628507057, + "language_loss": 1.04491925, + "learning_rate": 0.0009969965242505483, + "loss": 1.05735123, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.66210938, + "step": 332, + "time_per_iteration": 2.658085584640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251001, + "balance_loss_mlp": 1.08463287, + "epoch": 0.06406310119276645, + "flos": 534556116480.0, + "grad_norm": 0.029350032940601952, + "language_loss": 1.00548685, + "learning_rate": 0.0009969623315820007, + "loss": 1.01799679, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.66064453, + "step": 333, + "time_per_iteration": 2.6670596599578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238877, + "balance_loss_mlp": 1.07246125, + "epoch": 0.06425548287803001, + "flos": 457164840960.0, + "grad_norm": 0.03277849846880731, + "language_loss": 1.00979996, + "learning_rate": 0.000996927945975565, + "loss": 1.02218866, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 1.66113281, + "step": 334, + "time_per_iteration": 2.5448765754699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245409, + "balance_loss_mlp": 1.0792315, + "epoch": 0.06444786456329357, + "flos": 561122858496.0, + "grad_norm": 0.03573042475309631, + "language_loss": 0.98108363, + "learning_rate": 0.0009968933674445906, + "loss": 0.99353766, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 1.65869141, + "step": 335, + "time_per_iteration": 2.679093360900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242425, + "balance_loss_mlp": 1.07672429, + "epoch": 0.06464024624855713, + "flos": 667356350976.0, + "grad_norm": 0.0316377115871937, + "language_loss": 0.99817598, + "learning_rate": 0.0009968585960025028, + "loss": 1.01060021, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 1.65380859, + "step": 336, + "time_per_iteration": 2.9642832279205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246223, + "balance_loss_mlp": 1.08085632, + "epoch": 0.0648326279338207, + "flos": 1524555549696.0, + "grad_norm": 0.012731648189289846, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78899413, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 1.65039062, + "step": 337, + "time_per_iteration": 4.799122333526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_mlp": 1.07683408, + "epoch": 0.06502500961908426, + "flos": 1145214959616.0, + "grad_norm": 0.030168792806873873, + "language_loss": 0.98216963, + "learning_rate": 0.0009967884744390583, + "loss": 0.99459207, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 1.65087891, + "step": 338, + "time_per_iteration": 3.513155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243978, + "balance_loss_mlp": 1.07865858, + "epoch": 0.06521739130434782, + "flos": 583693327872.0, + "grad_norm": 0.025823410577593665, + "language_loss": 0.98998213, + "learning_rate": 0.0009967531243449256, + "loss": 1.00242186, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 1.64990234, + "step": 339, + "time_per_iteration": 2.6683707237243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_mlp": 1.07453787, + "epoch": 0.06540977298961138, + "flos": 498658615296.0, + "grad_norm": 0.02384437782241591, + "language_loss": 1.06067204, + "learning_rate": 0.000996717581394126, + "loss": 1.07306671, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 1.64599609, + "step": 340, + "time_per_iteration": 2.5471885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236124, + "balance_loss_mlp": 1.07171023, + "epoch": 0.06560215467487496, + "flos": 543903613440.0, + "grad_norm": 0.02318937955413124, + "language_loss": 1.0712086, + "learning_rate": 0.000996681845600459, + "loss": 1.08356977, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 1.640625, + "step": 341, + "time_per_iteration": 2.651742458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240028, + "balance_loss_mlp": 1.07575738, + "epoch": 0.06579453636013852, + "flos": 414351043584.0, + "grad_norm": 0.026316803994829763, + "language_loss": 0.99228215, + "learning_rate": 0.0009966459169777982, + "loss": 1.00468254, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 1.63916016, + "step": 342, + "time_per_iteration": 2.4996230602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244627, + "balance_loss_mlp": 1.08045232, + "epoch": 0.06598691804540208, + "flos": 561680812032.0, + "grad_norm": 0.03097158399986616, + "language_loss": 1.07124209, + "learning_rate": 0.0009966097955400924, + "loss": 1.08368838, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 1.63818359, + "step": 343, + "time_per_iteration": 2.7243080139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238691, + "balance_loss_mlp": 1.07451606, + "epoch": 0.06617929973066564, + "flos": 573301782528.0, + "grad_norm": 0.022915441754152527, + "language_loss": 1.00964892, + "learning_rate": 0.0009965734813013652, + "loss": 1.02203584, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 1.63818359, + "step": 344, + "time_per_iteration": 2.8087360858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237027, + "balance_loss_mlp": 1.07375824, + "epoch": 0.06637168141592921, + "flos": 491464343040.0, + "grad_norm": 0.024444849604151265, + "language_loss": 1.03758335, + "learning_rate": 0.0009965369742757151, + "loss": 1.04995358, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 1.62890625, + "step": 345, + "time_per_iteration": 2.5691587924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237907, + "balance_loss_mlp": 1.07459044, + "epoch": 0.06656406310119277, + "flos": 1081037924352.0, + "grad_norm": 0.024807678995847144, + "language_loss": 0.99529493, + "learning_rate": 0.0009965002744773152, + "loss": 1.00767398, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 1.62939453, + "step": 346, + "time_per_iteration": 3.507969856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239522, + "balance_loss_mlp": 1.07611036, + "epoch": 0.06675644478645633, + "flos": 514723021824.0, + "grad_norm": 0.02663627628784384, + "language_loss": 0.97097999, + "learning_rate": 0.0009964633819204139, + "loss": 0.98337519, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 1.63037109, + "step": 347, + "time_per_iteration": 2.6551601886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261986, + "balance_loss_mlp": 1.09986115, + "epoch": 0.06694882647171989, + "flos": 1450534189056.0, + "grad_norm": 0.030948258254188146, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83063102, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 1.6171875, + "step": 348, + "time_per_iteration": 5.152506589889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236206, + "balance_loss_mlp": 1.07427216, + "epoch": 0.06714120815698346, + "flos": 1555397266944.0, + "grad_norm": 0.0077968992848742235, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76390088, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 1.61523438, + "step": 349, + "time_per_iteration": 4.909464120864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242005, + "balance_loss_mlp": 1.07873547, + "epoch": 0.06733358984224702, + "flos": 881615992320.0, + "grad_norm": 0.03432587789196913, + "language_loss": 0.97228402, + "learning_rate": 0.000996351547842304, + "loss": 0.98470408, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 1.62890625, + "step": 350, + "time_per_iteration": 3.1799545288085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240315, + "balance_loss_mlp": 1.0778569, + "epoch": 0.06752597152751058, + "flos": 519917793792.0, + "grad_norm": 0.030803186893757592, + "language_loss": 0.96182388, + "learning_rate": 0.0009963138843953744, + "loss": 0.97422707, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 1.62060547, + "step": 351, + "time_per_iteration": 2.5873348712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238163, + "balance_loss_mlp": 1.07565665, + "epoch": 0.06771835321277414, + "flos": 540882258432.0, + "grad_norm": 0.023778523337364334, + "language_loss": 0.99575555, + "learning_rate": 0.000996276028262306, + "loss": 1.00813723, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 1.62109375, + "step": 352, + "time_per_iteration": 2.7943532466888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238104, + "balance_loss_mlp": 1.07583654, + "epoch": 0.0679107348980377, + "flos": 461615007744.0, + "grad_norm": 0.02720743117278016, + "language_loss": 1.06749547, + "learning_rate": 0.0009962379794577964, + "loss": 1.07987642, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 1.61865234, + "step": 353, + "time_per_iteration": 2.589200973510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239427, + "balance_loss_mlp": 1.07711196, + "epoch": 0.06810311658330127, + "flos": 637207572480.0, + "grad_norm": 0.02321502152829773, + "language_loss": 0.95908678, + "learning_rate": 0.000996199737996617, + "loss": 0.97148108, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 1.61914062, + "step": 354, + "time_per_iteration": 2.8822708129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123871, + "balance_loss_mlp": 1.07687151, + "epoch": 0.06829549826856483, + "flos": 465626743296.0, + "grad_norm": 0.030894548658215056, + "language_loss": 1.05554581, + "learning_rate": 0.0009961613038936149, + "loss": 1.06793284, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 1.61425781, + "step": 355, + "time_per_iteration": 2.576930522918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.07456315, + "epoch": 0.06848787995382839, + "flos": 635896281600.0, + "grad_norm": 0.0286185110148739, + "language_loss": 0.9730283, + "learning_rate": 0.000996122677163711, + "loss": 0.98538941, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 1.61132812, + "step": 356, + "time_per_iteration": 2.850829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237686, + "balance_loss_mlp": 1.07637215, + "epoch": 0.06868026163909195, + "flos": 807780556800.0, + "grad_norm": 0.03078602082995562, + "language_loss": 1.03526855, + "learning_rate": 0.000996083857821902, + "loss": 1.04764557, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 1.60888672, + "step": 357, + "time_per_iteration": 3.124053716659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237273, + "balance_loss_mlp": 1.07605469, + "epoch": 0.06887264332435553, + "flos": 440151713280.0, + "grad_norm": 0.02263887650004652, + "language_loss": 1.01701617, + "learning_rate": 0.0009960448458832588, + "loss": 1.0293889, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 1.60791016, + "step": 358, + "time_per_iteration": 2.6918816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242041, + "balance_loss_mlp": 1.08077514, + "epoch": 0.06906502500961909, + "flos": 485785477632.0, + "grad_norm": 0.021707311176365728, + "language_loss": 1.01897752, + "learning_rate": 0.000996005641362927, + "loss": 1.03139794, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 1.60839844, + "step": 359, + "time_per_iteration": 2.601358652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238764, + "balance_loss_mlp": 1.07725942, + "epoch": 0.06925740669488265, + "flos": 734885110272.0, + "grad_norm": 0.024380378407611886, + "language_loss": 1.04387617, + "learning_rate": 0.0009959662442761274, + "loss": 1.05626392, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 1.61083984, + "step": 360, + "time_per_iteration": 2.9404215812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236823, + "balance_loss_mlp": 1.07589066, + "epoch": 0.0694497883801462, + "flos": 553570745856.0, + "grad_norm": 0.023221163769242582, + "language_loss": 0.97943044, + "learning_rate": 0.000995926654638155, + "loss": 0.99179876, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 1.60498047, + "step": 361, + "time_per_iteration": 2.811624526977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234495, + "balance_loss_mlp": 1.07413495, + "epoch": 0.06964217006540978, + "flos": 679243837440.0, + "grad_norm": 0.025577226237571565, + "language_loss": 1.00741839, + "learning_rate": 0.00099588687246438, + "loss": 1.01976323, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 1.59912109, + "step": 362, + "time_per_iteration": 2.826204538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235331, + "balance_loss_mlp": 1.0749228, + "epoch": 0.06983455175067334, + "flos": 525260285952.0, + "grad_norm": 0.054619150892928216, + "language_loss": 1.0805161, + "learning_rate": 0.0009958468977702471, + "loss": 1.09286952, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 1.59960938, + "step": 363, + "time_per_iteration": 2.5742297172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_mlp": 1.11000061, + "epoch": 0.0700269334359369, + "flos": 1580173353984.0, + "grad_norm": 0.0347214045967213, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81004167, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.59179688, + "step": 364, + "time_per_iteration": 4.815373182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234235, + "balance_loss_mlp": 1.07420838, + "epoch": 0.07021931512120046, + "flos": 1014856659456.0, + "grad_norm": 0.027565425727799023, + "language_loss": 0.95424879, + "learning_rate": 0.0009957663708830612, + "loss": 0.96659118, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 1.59667969, + "step": 365, + "time_per_iteration": 3.3032214641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238249, + "balance_loss_mlp": 1.07874703, + "epoch": 0.07041169680646403, + "flos": 824431114752.0, + "grad_norm": 0.03609893162101238, + "language_loss": 0.99641442, + "learning_rate": 0.0009957258187212714, + "loss": 1.00879693, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 1.59228516, + "step": 366, + "time_per_iteration": 3.143951654434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_mlp": 1.0748291, + "epoch": 0.07060407849172759, + "flos": 1417290743808.0, + "grad_norm": 0.015479474187128486, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80427808, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.578125, + "step": 367, + "time_per_iteration": 4.856614112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232866, + "balance_loss_mlp": 1.07417488, + "epoch": 0.07079646017699115, + "flos": 513941486592.0, + "grad_norm": 0.03158452537667852, + "language_loss": 0.9606331, + "learning_rate": 0.0009956441370400167, + "loss": 0.97296178, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 1.58398438, + "step": 368, + "time_per_iteration": 2.6471550464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231431, + "balance_loss_mlp": 1.07288289, + "epoch": 0.07098884186225471, + "flos": 541548274176.0, + "grad_norm": 0.03366854249700899, + "language_loss": 1.02536654, + "learning_rate": 0.0009956030075522636, + "loss": 1.03768086, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 1.58251953, + "step": 369, + "time_per_iteration": 2.764350175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_mlp": 1.07183695, + "epoch": 0.07118122354751828, + "flos": 549738931200.0, + "grad_norm": 0.025388205653796188, + "language_loss": 1.02520657, + "learning_rate": 0.0009955616856543587, + "loss": 1.03751087, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 1.58300781, + "step": 370, + "time_per_iteration": 2.6488449573516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233332, + "balance_loss_mlp": 1.07483125, + "epoch": 0.07137360523278184, + "flos": 622076424192.0, + "grad_norm": 0.025131147277089937, + "language_loss": 0.94016552, + "learning_rate": 0.0009955201713623448, + "loss": 0.95249885, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 1.58203125, + "step": 371, + "time_per_iteration": 2.7475128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231201, + "balance_loss_mlp": 1.07594299, + "epoch": 0.0715659869180454, + "flos": 1505973347328.0, + "grad_norm": 0.011087848535678398, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77903926, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 1.55664062, + "step": 372, + "time_per_iteration": 4.930227518081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.0769937, + "epoch": 0.07175836860330896, + "flos": 496481195520.0, + "grad_norm": 0.02946804107059058, + "language_loss": 1.07406306, + "learning_rate": 0.0009954365656605333, + "loss": 1.08641148, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 1.57910156, + "step": 373, + "time_per_iteration": 2.5494606494903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235693, + "balance_loss_mlp": 1.07862246, + "epoch": 0.07195075028857253, + "flos": 787081333248.0, + "grad_norm": 0.030340412148976308, + "language_loss": 1.00769055, + "learning_rate": 0.0009953944742831947, + "loss": 1.02004743, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 1.57519531, + "step": 374, + "time_per_iteration": 2.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234053, + "balance_loss_mlp": 1.07707787, + "epoch": 0.0721431319738361, + "flos": 594346111488.0, + "grad_norm": 0.024760984543104554, + "language_loss": 1.04227853, + "learning_rate": 0.0009953521905766642, + "loss": 1.05461907, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 1.57421875, + "step": 375, + "time_per_iteration": 2.9470102787017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233349, + "balance_loss_mlp": 1.07642198, + "epoch": 0.07233551365909965, + "flos": 549328697856.0, + "grad_norm": 0.025099095391344205, + "language_loss": 1.02903581, + "learning_rate": 0.0009953097145573577, + "loss": 1.04136944, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 1.57373047, + "step": 376, + "time_per_iteration": 2.656438112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232315, + "balance_loss_mlp": 1.0754832, + "epoch": 0.07252789534436321, + "flos": 959167723008.0, + "grad_norm": 0.028756244795243427, + "language_loss": 1.01008701, + "learning_rate": 0.000995267046241766, + "loss": 1.02241015, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 1.57275391, + "step": 377, + "time_per_iteration": 3.2601664066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226098, + "balance_loss_mlp": 1.06931448, + "epoch": 0.07272027702962677, + "flos": 508655390208.0, + "grad_norm": 0.025279277167219092, + "language_loss": 1.00209188, + "learning_rate": 0.0009952241856464547, + "loss": 1.01435292, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 1.57226562, + "step": 378, + "time_per_iteration": 2.616483688354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228279, + "balance_loss_mlp": 1.07159042, + "epoch": 0.07291265871489035, + "flos": 613551395328.0, + "grad_norm": 0.025059419305224793, + "language_loss": 1.0761106, + "learning_rate": 0.0009951811327880632, + "loss": 1.08839345, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 1.57128906, + "step": 379, + "time_per_iteration": 2.7666382789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_mlp": 1.07063651, + "epoch": 0.0731050404001539, + "flos": 496741707264.0, + "grad_norm": 0.032880990240464036, + "language_loss": 1.00766444, + "learning_rate": 0.0009951378876833063, + "loss": 1.01993108, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 1.56445312, + "step": 380, + "time_per_iteration": 2.5504086017608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230504, + "balance_loss_mlp": 1.07433975, + "epoch": 0.07329742208541747, + "flos": 641129985024.0, + "grad_norm": 0.0343074889031262, + "language_loss": 1.0780232, + "learning_rate": 0.0009950944503489736, + "loss": 1.0903281, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 1.56591797, + "step": 381, + "time_per_iteration": 2.7695260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231248, + "balance_loss_mlp": 1.07537043, + "epoch": 0.07348980377068103, + "flos": 817740401664.0, + "grad_norm": 0.027198888726283066, + "language_loss": 1.01785743, + "learning_rate": 0.0009950508208019285, + "loss": 1.03016996, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 1.56298828, + "step": 382, + "time_per_iteration": 2.9918277263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227944, + "balance_loss_mlp": 1.07187521, + "epoch": 0.0736821854559446, + "flos": 509669239296.0, + "grad_norm": 0.03113985633155724, + "language_loss": 1.05612254, + "learning_rate": 0.0009950069990591096, + "loss": 1.06840205, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 1.56494141, + "step": 383, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_mlp": 1.09392548, + "epoch": 0.07387456714120816, + "flos": 1558048046592.0, + "grad_norm": 0.03338671968111017, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77649409, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 1.54492188, + "step": 384, + "time_per_iteration": 4.854166269302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229749, + "balance_loss_mlp": 1.0736798, + "epoch": 0.07406694882647172, + "flos": 526643435520.0, + "grad_norm": 0.03274978311793036, + "language_loss": 0.98781282, + "learning_rate": 0.0009949187790542777, + "loss": 1.00011039, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 1.56494141, + "step": 385, + "time_per_iteration": 2.728701591491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123258, + "balance_loss_mlp": 1.07636821, + "epoch": 0.07425933051173528, + "flos": 498823799808.0, + "grad_norm": 0.026908846939264777, + "language_loss": 0.94723004, + "learning_rate": 0.0009948743808265148, + "loss": 0.95955586, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 1.56640625, + "step": 386, + "time_per_iteration": 2.6850693225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231135, + "balance_loss_mlp": 1.07511437, + "epoch": 0.07445171219699885, + "flos": 506057003520.0, + "grad_norm": 0.05633654869747302, + "language_loss": 1.04553366, + "learning_rate": 0.0009948297904714782, + "loss": 1.05784488, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 1.56445312, + "step": 387, + "time_per_iteration": 2.6746010780334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231627, + "balance_loss_mlp": 1.07555866, + "epoch": 0.07464409388226241, + "flos": 555116352000.0, + "grad_norm": 0.03450843374667126, + "language_loss": 0.9665134, + "learning_rate": 0.0009947850080064796, + "loss": 0.97882968, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 1.56494141, + "step": 388, + "time_per_iteration": 2.7839057445526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230193, + "balance_loss_mlp": 1.07431459, + "epoch": 0.07483647556752597, + "flos": 778274325504.0, + "grad_norm": 0.021592891008175935, + "language_loss": 1.01240289, + "learning_rate": 0.0009947400334489047, + "loss": 1.02470493, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 1.56298828, + "step": 389, + "time_per_iteration": 2.9945342540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_mlp": 1.07411718, + "epoch": 0.07502885725278953, + "flos": 613681651200.0, + "grad_norm": 0.023383004705128753, + "language_loss": 0.92341155, + "learning_rate": 0.0009946948668162145, + "loss": 0.93570244, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 1.55371094, + "step": 390, + "time_per_iteration": 2.7355024814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122989, + "balance_loss_mlp": 1.07496524, + "epoch": 0.0752212389380531, + "flos": 689854961664.0, + "grad_norm": 0.026752200694656208, + "language_loss": 0.97335494, + "learning_rate": 0.0009946495081259441, + "loss": 0.98565376, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 1.55322266, + "step": 391, + "time_per_iteration": 2.799938678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227768, + "balance_loss_mlp": 1.07303405, + "epoch": 0.07541362062331666, + "flos": 767050853376.0, + "grad_norm": 0.02596026064524479, + "language_loss": 1.01604676, + "learning_rate": 0.0009946039573957035, + "loss": 1.02832437, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 1.55126953, + "step": 392, + "time_per_iteration": 2.932504415512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123199, + "balance_loss_mlp": 1.07768571, + "epoch": 0.07560600230858022, + "flos": 589908679680.0, + "grad_norm": 0.028382748029943367, + "language_loss": 0.97495323, + "learning_rate": 0.000994558214643177, + "loss": 0.98727316, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 1.546875, + "step": 393, + "time_per_iteration": 2.752694845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228178, + "balance_loss_mlp": 1.07425475, + "epoch": 0.07579838399384378, + "flos": 751144900608.0, + "grad_norm": 0.028291982513743617, + "language_loss": 0.99160051, + "learning_rate": 0.000994512279886123, + "loss": 1.00388229, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 1.54296875, + "step": 394, + "time_per_iteration": 3.06592059135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228894, + "balance_loss_mlp": 1.07530475, + "epoch": 0.07599076567910736, + "flos": 524550609408.0, + "grad_norm": 0.023352712612718218, + "language_loss": 0.98641121, + "learning_rate": 0.0009944661531423758, + "loss": 0.99870014, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 1.53955078, + "step": 395, + "time_per_iteration": 2.6720728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122919, + "balance_loss_mlp": 1.07555354, + "epoch": 0.07618314736437092, + "flos": 552185594880.0, + "grad_norm": 0.026216962171459895, + "language_loss": 0.97914684, + "learning_rate": 0.000994419834429843, + "loss": 0.99143875, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 1.54003906, + "step": 396, + "time_per_iteration": 2.6652910709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226861, + "balance_loss_mlp": 1.07308066, + "epoch": 0.07637552904963447, + "flos": 699432771072.0, + "grad_norm": 0.029361663168223213, + "language_loss": 1.03114796, + "learning_rate": 0.0009943733237665069, + "loss": 1.0434165, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 1.54150391, + "step": 397, + "time_per_iteration": 2.808711290359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227023, + "balance_loss_mlp": 1.07329071, + "epoch": 0.07656791073489803, + "flos": 580635042816.0, + "grad_norm": 0.02000560632750303, + "language_loss": 1.01598048, + "learning_rate": 0.0009943266211704248, + "loss": 1.02825069, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 1.54101562, + "step": 398, + "time_per_iteration": 2.9420461654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226854, + "balance_loss_mlp": 1.0732646, + "epoch": 0.0767602924201616, + "flos": 418037139456.0, + "grad_norm": 0.02425852476792673, + "language_loss": 1.03237891, + "learning_rate": 0.000994279726659728, + "loss": 1.04464746, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 1.53955078, + "step": 399, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230296, + "balance_loss_mlp": 1.07675469, + "epoch": 0.07695267410542517, + "flos": 483888035328.0, + "grad_norm": 0.030174375239475117, + "language_loss": 1.02145576, + "learning_rate": 0.0009942326402526231, + "loss": 1.03375876, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 1.5390625, + "step": 400, + "time_per_iteration": 2.5265390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224857, + "balance_loss_mlp": 1.07184029, + "epoch": 0.07714505579068873, + "flos": 532026860544.0, + "grad_norm": 0.024483465572707617, + "language_loss": 0.99344772, + "learning_rate": 0.0009941853619673902, + "loss": 1.0056963, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 1.53369141, + "step": 401, + "time_per_iteration": 2.660491704940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224912, + "balance_loss_mlp": 1.07218146, + "epoch": 0.07733743747595229, + "flos": 806439066624.0, + "grad_norm": 0.032921156451595594, + "language_loss": 1.03587961, + "learning_rate": 0.0009941378918223844, + "loss": 1.04812872, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 1.53076172, + "step": 402, + "time_per_iteration": 3.078272819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222316, + "balance_loss_mlp": 1.06972802, + "epoch": 0.07752981916121585, + "flos": 623613298176.0, + "grad_norm": 0.02596227047756477, + "language_loss": 0.96322513, + "learning_rate": 0.0009940902298360354, + "loss": 0.97544825, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 1.52929688, + "step": 403, + "time_per_iteration": 2.78222918510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224993, + "balance_loss_mlp": 1.07288182, + "epoch": 0.07772220084647942, + "flos": 729542618112.0, + "grad_norm": 0.031231063897144088, + "language_loss": 1.06544566, + "learning_rate": 0.0009940423760268473, + "loss": 1.07769561, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 1.52441406, + "step": 404, + "time_per_iteration": 2.8572018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226552, + "balance_loss_mlp": 1.07472658, + "epoch": 0.07791458253174298, + "flos": 556468575744.0, + "grad_norm": 0.029548764371286118, + "language_loss": 0.99639893, + "learning_rate": 0.0009939943304133982, + "loss": 1.00866449, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 1.52148438, + "step": 405, + "time_per_iteration": 2.607412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226106, + "balance_loss_mlp": 1.07409084, + "epoch": 0.07810696421700654, + "flos": 554234760192.0, + "grad_norm": 0.031141101296471768, + "language_loss": 1.06411445, + "learning_rate": 0.0009939460930143416, + "loss": 1.07637548, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 1.5234375, + "step": 406, + "time_per_iteration": 2.6132876873016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223027, + "balance_loss_mlp": 1.07120168, + "epoch": 0.0782993459022701, + "flos": 651878095872.0, + "grad_norm": 0.023437908852709077, + "language_loss": 1.00106847, + "learning_rate": 0.0009938976638484043, + "loss": 1.01329875, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 1.52148438, + "step": 407, + "time_per_iteration": 2.905681610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218745, + "balance_loss_mlp": 1.06691968, + "epoch": 0.07849172758753367, + "flos": 497160672768.0, + "grad_norm": 0.02891290096917658, + "language_loss": 0.99991584, + "learning_rate": 0.0009938490429343887, + "loss": 1.01210332, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 1.52148438, + "step": 408, + "time_per_iteration": 2.539567708969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222677, + "balance_loss_mlp": 1.07066166, + "epoch": 0.07868410927279723, + "flos": 579075975168.0, + "grad_norm": 0.030601656563413092, + "language_loss": 0.99965751, + "learning_rate": 0.0009938002302911709, + "loss": 1.01188421, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 1.5234375, + "step": 409, + "time_per_iteration": 2.732064962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220028, + "balance_loss_mlp": 1.0680126, + "epoch": 0.07887649095806079, + "flos": 524066515968.0, + "grad_norm": 0.03256443285635905, + "language_loss": 1.03146362, + "learning_rate": 0.0009937512259377015, + "loss": 1.04366398, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 1.5234375, + "step": 410, + "time_per_iteration": 2.6500303745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221864, + "balance_loss_mlp": 1.07013464, + "epoch": 0.07906887264332435, + "flos": 558437876736.0, + "grad_norm": 0.023780630120827737, + "language_loss": 1.01466393, + "learning_rate": 0.000993702029893006, + "loss": 1.02688265, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 1.52050781, + "step": 411, + "time_per_iteration": 2.7921671867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221791, + "balance_loss_mlp": 1.07010949, + "epoch": 0.07926125432858792, + "flos": 823362871296.0, + "grad_norm": 0.04077078343290612, + "language_loss": 1.01153946, + "learning_rate": 0.0009936526421761838, + "loss": 1.02375734, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 1.52001953, + "step": 412, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217861, + "balance_loss_mlp": 1.06632257, + "epoch": 0.07945363601385148, + "flos": 563393604096.0, + "grad_norm": 0.02717343044282308, + "language_loss": 1.04004121, + "learning_rate": 0.000993603062806409, + "loss": 1.05221987, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 1.51855469, + "step": 413, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219172, + "balance_loss_mlp": 1.06844354, + "epoch": 0.07964601769911504, + "flos": 518884478976.0, + "grad_norm": 0.031245789494761384, + "language_loss": 1.07179379, + "learning_rate": 0.0009935532918029298, + "loss": 1.08398533, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 1.51025391, + "step": 414, + "time_per_iteration": 2.668151617050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224604, + "balance_loss_mlp": 1.07387555, + "epoch": 0.0798383993843786, + "flos": 540300109824.0, + "grad_norm": 0.025221671350570463, + "language_loss": 0.99906069, + "learning_rate": 0.0009935033291850694, + "loss": 1.01130676, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 1.51025391, + "step": 415, + "time_per_iteration": 2.64747953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.06547058, + "epoch": 0.08003078106964218, + "flos": 486121850880.0, + "grad_norm": 0.027121462600521052, + "language_loss": 1.02766061, + "learning_rate": 0.0009934531749722247, + "loss": 1.03982067, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 1.50830078, + "step": 416, + "time_per_iteration": 2.5705764293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121625, + "balance_loss_mlp": 1.06576049, + "epoch": 0.08022316275490574, + "flos": 519275246592.0, + "grad_norm": 0.027391361962933233, + "language_loss": 1.00515926, + "learning_rate": 0.0009934028291838672, + "loss": 1.01732171, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 1.5078125, + "step": 417, + "time_per_iteration": 2.7232770919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219761, + "balance_loss_mlp": 1.0695101, + "epoch": 0.0804155444401693, + "flos": 495046379520.0, + "grad_norm": 0.028534904701295792, + "language_loss": 0.95904237, + "learning_rate": 0.0009933522918395433, + "loss": 0.97123998, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 1.50537109, + "step": 418, + "time_per_iteration": 2.670992374420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265297, + "balance_loss_mlp": 1.11595154, + "epoch": 0.08060792612543285, + "flos": 1584853833216.0, + "grad_norm": 0.03473829356439328, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79516399, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 1.49609375, + "step": 419, + "time_per_iteration": 4.9051830768585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222046, + "balance_loss_mlp": 1.07246244, + "epoch": 0.08080030781069643, + "flos": 526358728704.0, + "grad_norm": 0.03232182071246488, + "language_loss": 1.15746891, + "learning_rate": 0.000993250642561551, + "loss": 1.16968942, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 1.49853516, + "step": 420, + "time_per_iteration": 2.596930503845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224313, + "balance_loss_mlp": 1.07487273, + "epoch": 0.08099268949595999, + "flos": 547756895232.0, + "grad_norm": 0.03306568774928502, + "language_loss": 1.00193918, + "learning_rate": 0.0009931995306673466, + "loss": 1.01418233, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 1.49707031, + "step": 421, + "time_per_iteration": 2.704012155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223697, + "balance_loss_mlp": 1.0744468, + "epoch": 0.08118507118122355, + "flos": 511373299200.0, + "grad_norm": 0.026268861479682264, + "language_loss": 1.0597651, + "learning_rate": 0.000993148227296103, + "loss": 1.07200205, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 1.49511719, + "step": 422, + "time_per_iteration": 2.6110117435455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224578, + "balance_loss_mlp": 1.0751853, + "epoch": 0.08137745286648711, + "flos": 722001239040.0, + "grad_norm": 0.024088300997991936, + "language_loss": 0.92380643, + "learning_rate": 0.000993096732467738, + "loss": 0.9360522, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 1.49658203, + "step": 423, + "time_per_iteration": 2.9790220260620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224383, + "balance_loss_mlp": 1.0753237, + "epoch": 0.08156983455175067, + "flos": 680817641472.0, + "grad_norm": 0.029818930066630327, + "language_loss": 1.0177561, + "learning_rate": 0.0009930450462022435, + "loss": 1.02999997, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 1.49316406, + "step": 424, + "time_per_iteration": 2.8023674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223, + "balance_loss_mlp": 1.07518005, + "epoch": 0.08176221623701424, + "flos": 1456588359168.0, + "grad_norm": 0.012435251357338771, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80412811, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.48046875, + "step": 425, + "time_per_iteration": 4.96533989906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219597, + "balance_loss_mlp": 1.0711571, + "epoch": 0.0819545979222778, + "flos": 1558883071488.0, + "grad_norm": 0.04204100969257126, + "language_loss": 1.00605047, + "learning_rate": 0.0009929410994402065, + "loss": 1.01824641, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 1.48681641, + "step": 426, + "time_per_iteration": 3.850475311279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220758, + "balance_loss_mlp": 1.07236588, + "epoch": 0.08214697960754136, + "flos": 513800497152.0, + "grad_norm": 0.03975912273964659, + "language_loss": 1.03955805, + "learning_rate": 0.0009928888389840196, + "loss": 1.05176568, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 1.48632812, + "step": 427, + "time_per_iteration": 2.6892385482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224824, + "balance_loss_mlp": 1.07633698, + "epoch": 0.08233936129280492, + "flos": 596221360128.0, + "grad_norm": 0.02633667259549893, + "language_loss": 1.0604248, + "learning_rate": 0.0009928363871714147, + "loss": 1.07267296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 1.48730469, + "step": 428, + "time_per_iteration": 2.666851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224039, + "balance_loss_mlp": 1.07550442, + "epoch": 0.08253174297806849, + "flos": 573164795904.0, + "grad_norm": 0.03052010415677114, + "language_loss": 0.99677718, + "learning_rate": 0.0009927837440227556, + "loss": 1.00901759, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 1.48779297, + "step": 429, + "time_per_iteration": 2.810197591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228416, + "balance_loss_mlp": 1.07992899, + "epoch": 0.08272412466333205, + "flos": 624642610176.0, + "grad_norm": 0.029909202440675912, + "language_loss": 0.93710327, + "learning_rate": 0.0009927309095584798, + "loss": 0.94938743, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 1.48730469, + "step": 430, + "time_per_iteration": 2.98052978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122165, + "balance_loss_mlp": 1.07316256, + "epoch": 0.08291650634859561, + "flos": 514994267136.0, + "grad_norm": 0.038201439099628094, + "language_loss": 1.07072532, + "learning_rate": 0.0009926778837991, + "loss": 1.08294177, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 1.48730469, + "step": 431, + "time_per_iteration": 2.613912582397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223506, + "balance_loss_mlp": 1.07516193, + "epoch": 0.08310888803385917, + "flos": 668541388800.0, + "grad_norm": 0.02618037233016902, + "language_loss": 1.04762018, + "learning_rate": 0.000992624666765202, + "loss": 1.05985522, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 1.48583984, + "step": 432, + "time_per_iteration": 2.785602331161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224029, + "balance_loss_mlp": 1.07659137, + "epoch": 0.08330126971912274, + "flos": 584490326016.0, + "grad_norm": 0.023129420064945467, + "language_loss": 1.02043724, + "learning_rate": 0.000992571258477447, + "loss": 1.03267753, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 1.4765625, + "step": 433, + "time_per_iteration": 2.7774012088775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225333, + "balance_loss_mlp": 1.07799041, + "epoch": 0.0834936514043863, + "flos": 562497275904.0, + "grad_norm": 0.02412369992445121, + "language_loss": 0.95710295, + "learning_rate": 0.0009925176589565695, + "loss": 0.9693563, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 1.47558594, + "step": 434, + "time_per_iteration": 2.7975149154663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224713, + "balance_loss_mlp": 1.07751381, + "epoch": 0.08368603308964986, + "flos": 495513008640.0, + "grad_norm": 0.023499028814372425, + "language_loss": 1.06310439, + "learning_rate": 0.0009924638682233791, + "loss": 1.07535148, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 1.47412109, + "step": 435, + "time_per_iteration": 2.5623626708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247864, + "balance_loss_mlp": 1.10328674, + "epoch": 0.08387841477491342, + "flos": 1391808983040.0, + "grad_norm": 0.0329185074425942, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80812454, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 1.44726562, + "step": 436, + "time_per_iteration": 4.5364601612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219037, + "balance_loss_mlp": 1.07174218, + "epoch": 0.084070796460177, + "flos": 800353970688.0, + "grad_norm": 0.025226905267595717, + "language_loss": 0.95941472, + "learning_rate": 0.0009923557132036668, + "loss": 0.97160506, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 1.47509766, + "step": 437, + "time_per_iteration": 3.031538963317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219746, + "balance_loss_mlp": 1.07226074, + "epoch": 0.08426317814544056, + "flos": 560096274432.0, + "grad_norm": 0.024291343012928023, + "language_loss": 0.99699497, + "learning_rate": 0.0009923013489591345, + "loss": 1.00919247, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 1.47705078, + "step": 438, + "time_per_iteration": 2.741021156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217749, + "balance_loss_mlp": 1.07073975, + "epoch": 0.08445555983070412, + "flos": 811883616768.0, + "grad_norm": 0.02787309358423107, + "language_loss": 0.97740996, + "learning_rate": 0.0009922467935862681, + "loss": 0.98958743, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 1.47216797, + "step": 439, + "time_per_iteration": 3.0727341175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215984, + "balance_loss_mlp": 1.06907046, + "epoch": 0.08464794151596768, + "flos": 511169183232.0, + "grad_norm": 0.02418736148641671, + "language_loss": 1.01547837, + "learning_rate": 0.0009921920471062478, + "loss": 1.0276382, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 1.47119141, + "step": 440, + "time_per_iteration": 2.5793957710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214442, + "balance_loss_mlp": 1.06805265, + "epoch": 0.08484032320123125, + "flos": 557473692672.0, + "grad_norm": 0.02549300900866748, + "language_loss": 0.99590349, + "learning_rate": 0.0009921371095403281, + "loss": 1.00804806, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 1.46582031, + "step": 441, + "time_per_iteration": 2.633976936340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215261, + "balance_loss_mlp": 1.06887233, + "epoch": 0.08503270488649481, + "flos": 528360230400.0, + "grad_norm": 0.023285649852896013, + "language_loss": 1.02823853, + "learning_rate": 0.0009920819809098379, + "loss": 1.04039121, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 1.46582031, + "step": 442, + "time_per_iteration": 2.5975728034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213611, + "balance_loss_mlp": 1.06722176, + "epoch": 0.08522508657175837, + "flos": 615385711104.0, + "grad_norm": 0.021771679570127336, + "language_loss": 0.97986722, + "learning_rate": 0.0009920266612361798, + "loss": 0.99200332, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 1.46582031, + "step": 443, + "time_per_iteration": 2.7284042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214332, + "balance_loss_mlp": 1.06803846, + "epoch": 0.08541746825702193, + "flos": 620986713600.0, + "grad_norm": 0.024601404202987703, + "language_loss": 0.97963679, + "learning_rate": 0.0009919711505408308, + "loss": 0.9917801, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 1.46484375, + "step": 444, + "time_per_iteration": 2.797030448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216522, + "balance_loss_mlp": 1.07051492, + "epoch": 0.08560984994228549, + "flos": 483888035328.0, + "grad_norm": 0.023417740932750293, + "language_loss": 0.96522343, + "learning_rate": 0.000991915448845342, + "loss": 0.97738856, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 1.46191406, + "step": 445, + "time_per_iteration": 2.544638156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121537, + "balance_loss_mlp": 1.06945765, + "epoch": 0.08580223162754906, + "flos": 518176803840.0, + "grad_norm": 0.025018627604332305, + "language_loss": 1.05275297, + "learning_rate": 0.000991859556171339, + "loss": 1.0649066, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 1.4609375, + "step": 446, + "time_per_iteration": 2.5865097045898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214045, + "balance_loss_mlp": 1.06856191, + "epoch": 0.08599461331281262, + "flos": 532519686144.0, + "grad_norm": 0.025883227843611877, + "language_loss": 1.07190132, + "learning_rate": 0.000991803472540521, + "loss": 1.08404183, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 1.45654297, + "step": 447, + "time_per_iteration": 2.6001055240631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213992, + "balance_loss_mlp": 1.06879497, + "epoch": 0.08618699499807618, + "flos": 791633558016.0, + "grad_norm": 0.022461373320799196, + "language_loss": 1.02303076, + "learning_rate": 0.0009917471979746615, + "loss": 1.03517067, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 1.45361328, + "step": 448, + "time_per_iteration": 2.9621376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218395, + "balance_loss_mlp": 1.07300746, + "epoch": 0.08637937668333974, + "flos": 567114628608.0, + "grad_norm": 0.02449904215267775, + "language_loss": 1.00404847, + "learning_rate": 0.0009916907324956086, + "loss": 1.01623249, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 1.45556641, + "step": 449, + "time_per_iteration": 2.691150188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214944, + "balance_loss_mlp": 1.0697943, + "epoch": 0.08657175836860331, + "flos": 446117286912.0, + "grad_norm": 0.025714213043280993, + "language_loss": 0.97109705, + "learning_rate": 0.0009916340761252837, + "loss": 0.98324645, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 1.453125, + "step": 450, + "time_per_iteration": 2.6118698120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212599, + "balance_loss_mlp": 1.067307, + "epoch": 0.08676414005386687, + "flos": 845588235264.0, + "grad_norm": 0.02612794411743426, + "language_loss": 0.94540501, + "learning_rate": 0.0009915772288856832, + "loss": 0.95753098, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 1.45458984, + "step": 451, + "time_per_iteration": 3.0883219242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213701, + "balance_loss_mlp": 1.06926715, + "epoch": 0.08695652173913043, + "flos": 604483875840.0, + "grad_norm": 0.02003375948944636, + "language_loss": 0.95739877, + "learning_rate": 0.000991520190798877, + "loss": 0.96953583, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 1.44580078, + "step": 452, + "time_per_iteration": 2.8387818336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213572, + "balance_loss_mlp": 1.06928122, + "epoch": 0.08714890342439399, + "flos": 732000015360.0, + "grad_norm": 0.027770143088691506, + "language_loss": 1.06693339, + "learning_rate": 0.0009914629618870089, + "loss": 1.07906914, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 1.44433594, + "step": 453, + "time_per_iteration": 2.9403207302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_mlp": 1.0905838, + "epoch": 0.08734128510965757, + "flos": 1485454044672.0, + "grad_norm": 0.02536208637588336, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79910266, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 1.43945312, + "step": 454, + "time_per_iteration": 4.803662061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121994, + "balance_loss_mlp": 1.07631683, + "epoch": 0.08753366679492113, + "flos": 1526266340352.0, + "grad_norm": 0.01817690946373191, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82647902, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 1.4375, + "step": 455, + "time_per_iteration": 4.812621355056763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213204, + "balance_loss_mlp": 1.06919885, + "epoch": 0.08772604848018468, + "flos": 722524263936.0, + "grad_norm": 0.030160618436618963, + "language_loss": 0.98162878, + "learning_rate": 0.0009912901304235883, + "loss": 0.99376082, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 1.44140625, + "step": 456, + "time_per_iteration": 2.9147355556488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217818, + "balance_loss_mlp": 1.07386112, + "epoch": 0.08791843016544824, + "flos": 709466476032.0, + "grad_norm": 0.03064824893295274, + "language_loss": 0.96399593, + "learning_rate": 0.000991232138434397, + "loss": 0.97617412, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 1.44091797, + "step": 457, + "time_per_iteration": 2.8735082149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121922, + "balance_loss_mlp": 1.07540572, + "epoch": 0.08811081185071182, + "flos": 474021516288.0, + "grad_norm": 0.03193385229896835, + "language_loss": 1.03185177, + "learning_rate": 0.000991173955731976, + "loss": 1.04404402, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 1.43945312, + "step": 458, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220724, + "balance_loss_mlp": 1.07762539, + "epoch": 0.08830319353597538, + "flos": 686314584576.0, + "grad_norm": 0.057581270182385194, + "language_loss": 1.06524456, + "learning_rate": 0.0009911155823389137, + "loss": 1.07745171, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 1.43212891, + "step": 459, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218235, + "balance_loss_mlp": 1.07513571, + "epoch": 0.08849557522123894, + "flos": 574608344064.0, + "grad_norm": 0.027044136096108284, + "language_loss": 1.01923048, + "learning_rate": 0.000991057018277873, + "loss": 1.03141284, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 1.43212891, + "step": 460, + "time_per_iteration": 2.746169090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212445, + "balance_loss_mlp": 1.0693934, + "epoch": 0.0886879569065025, + "flos": 565627419648.0, + "grad_norm": 0.031092379840733354, + "language_loss": 1.03267121, + "learning_rate": 0.0009909982635715898, + "loss": 1.04479575, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 1.43164062, + "step": 461, + "time_per_iteration": 2.6196396350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212854, + "balance_loss_mlp": 1.06956458, + "epoch": 0.08888033859176607, + "flos": 564956674560.0, + "grad_norm": 0.030181357689894217, + "language_loss": 1.02059078, + "learning_rate": 0.0009909393182428751, + "loss": 1.03271937, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 1.43408203, + "step": 462, + "time_per_iteration": 2.679793357849121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216843, + "balance_loss_mlp": 1.07345808, + "epoch": 0.08907272027702963, + "flos": 466742650368.0, + "grad_norm": 0.029240136547664795, + "language_loss": 0.9639132, + "learning_rate": 0.000990880182314614, + "loss": 0.97608161, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 1.43505859, + "step": 463, + "time_per_iteration": 2.712097644805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212421, + "balance_loss_mlp": 1.06922734, + "epoch": 0.08926510196229319, + "flos": 682843338240.0, + "grad_norm": 0.026287763165510035, + "language_loss": 0.96174729, + "learning_rate": 0.0009908208558097643, + "loss": 0.97387147, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 1.43310547, + "step": 464, + "time_per_iteration": 2.906903028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217208, + "balance_loss_mlp": 1.07406175, + "epoch": 0.08945748364755675, + "flos": 597821360640.0, + "grad_norm": 0.024374741633963998, + "language_loss": 0.98668623, + "learning_rate": 0.000990761338751359, + "loss": 0.99885827, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 1.43261719, + "step": 465, + "time_per_iteration": 2.7994933128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225639, + "balance_loss_mlp": 1.08506775, + "epoch": 0.08964986533282032, + "flos": 1589340930048.0, + "grad_norm": 0.02575129149720033, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74885261, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 1.40625, + "step": 466, + "time_per_iteration": 4.9763429164886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221953, + "balance_loss_mlp": 1.07861578, + "epoch": 0.08984224701808388, + "flos": 534549385728.0, + "grad_norm": 0.024628184063577727, + "language_loss": 1.01551545, + "learning_rate": 0.0009906417330663815, + "loss": 1.02773499, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 1.43457031, + "step": 467, + "time_per_iteration": 2.614560842514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232523, + "balance_loss_mlp": 1.08994913, + "epoch": 0.09003462870334744, + "flos": 479850103296.0, + "grad_norm": 0.03230737833956583, + "language_loss": 0.98222148, + "learning_rate": 0.0009905816444862442, + "loss": 0.99454677, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 1.42675781, + "step": 468, + "time_per_iteration": 2.598146438598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223867, + "balance_loss_mlp": 1.08124495, + "epoch": 0.090227010388611, + "flos": 654902178816.0, + "grad_norm": 0.027522185030294237, + "language_loss": 0.95659769, + "learning_rate": 0.0009905213654454216, + "loss": 0.96883637, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 1.42724609, + "step": 469, + "time_per_iteration": 2.8876352310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219852, + "balance_loss_mlp": 1.07737279, + "epoch": 0.09041939207387456, + "flos": 619358515200.0, + "grad_norm": 0.023282407360439072, + "language_loss": 1.03878951, + "learning_rate": 0.0009904608959673158, + "loss": 1.0509882, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 1.42578125, + "step": 470, + "time_per_iteration": 2.7882330417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213781, + "balance_loss_mlp": 1.0718745, + "epoch": 0.09061177375913813, + "flos": 455295596544.0, + "grad_norm": 0.02882877970469751, + "language_loss": 1.04707062, + "learning_rate": 0.000990400236075403, + "loss": 1.05920839, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 1.41992188, + "step": 471, + "time_per_iteration": 2.5016987323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_mlp": 1.07574117, + "epoch": 0.0908041554444017, + "flos": 545308230144.0, + "grad_norm": 0.02444258884202674, + "language_loss": 1.01020849, + "learning_rate": 0.0009903393857932338, + "loss": 1.02238584, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 1.42089844, + "step": 472, + "time_per_iteration": 2.644397497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218613, + "balance_loss_mlp": 1.07732654, + "epoch": 0.09099653712966525, + "flos": 565466964480.0, + "grad_norm": 0.02685769494428931, + "language_loss": 0.99245131, + "learning_rate": 0.0009902783451444317, + "loss": 1.00463748, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 1.41357422, + "step": 473, + "time_per_iteration": 2.7087745666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214499, + "balance_loss_mlp": 1.07292593, + "epoch": 0.09118891881492881, + "flos": 475501994496.0, + "grad_norm": 0.029476649456104027, + "language_loss": 1.02896917, + "learning_rate": 0.0009902171141526956, + "loss": 1.04111421, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 1.41650391, + "step": 474, + "time_per_iteration": 2.5271990299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215154, + "balance_loss_mlp": 1.07410538, + "epoch": 0.09138130050019239, + "flos": 546990822912.0, + "grad_norm": 0.02490932279529465, + "language_loss": 0.89845926, + "learning_rate": 0.000990155692841797, + "loss": 0.9106108, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 1.41113281, + "step": 475, + "time_per_iteration": 2.958740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214039, + "balance_loss_mlp": 1.07303798, + "epoch": 0.09157368218545595, + "flos": 733973319168.0, + "grad_norm": 0.02740759839690251, + "language_loss": 1.01869047, + "learning_rate": 0.0009900940812355818, + "loss": 1.03083086, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 1.41064453, + "step": 476, + "time_per_iteration": 2.891787528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205639, + "balance_loss_mlp": 1.06478107, + "epoch": 0.0917660638707195, + "flos": 612072918528.0, + "grad_norm": 0.029261712768775452, + "language_loss": 0.99624813, + "learning_rate": 0.00099003227935797, + "loss": 1.0083046, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 1.40917969, + "step": 477, + "time_per_iteration": 2.7569031715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207057, + "balance_loss_mlp": 1.06605613, + "epoch": 0.09195844555598306, + "flos": 657018473472.0, + "grad_norm": 0.026965523070242428, + "language_loss": 1.02860427, + "learning_rate": 0.000989970287232955, + "loss": 1.04067481, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 1.41064453, + "step": 478, + "time_per_iteration": 2.7705225944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212938, + "balance_loss_mlp": 1.07212758, + "epoch": 0.09215082724124664, + "flos": 477540426240.0, + "grad_norm": 0.02578247385618595, + "language_loss": 0.99767786, + "learning_rate": 0.0009899081048846043, + "loss": 1.00980723, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 1.40869141, + "step": 479, + "time_per_iteration": 2.5488922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215229, + "balance_loss_mlp": 1.07437098, + "epoch": 0.0923432089265102, + "flos": 525325413888.0, + "grad_norm": 0.029009434883925433, + "language_loss": 1.05276799, + "learning_rate": 0.0009898457323370593, + "loss": 1.06492031, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 1.40917969, + "step": 480, + "time_per_iteration": 2.5628790855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213957, + "balance_loss_mlp": 1.07314658, + "epoch": 0.09253559061177376, + "flos": 546638986752.0, + "grad_norm": 0.030643020391807937, + "language_loss": 1.01694977, + "learning_rate": 0.000989783169614535, + "loss": 1.02908933, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 1.40869141, + "step": 481, + "time_per_iteration": 2.6431851387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206421, + "balance_loss_mlp": 1.06718445, + "epoch": 0.09272797229703732, + "flos": 1541334362112.0, + "grad_norm": 0.00793715508899474, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79959178, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 1.39257812, + "step": 482, + "time_per_iteration": 4.84259295463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211177, + "balance_loss_mlp": 1.07041514, + "epoch": 0.09292035398230089, + "flos": 691064194560.0, + "grad_norm": 0.029391602229229655, + "language_loss": 0.99036419, + "learning_rate": 0.000989657473741779, + "loss": 1.00247598, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 1.40820312, + "step": 483, + "time_per_iteration": 2.8193717002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210505, + "balance_loss_mlp": 1.06964695, + "epoch": 0.09311273566756445, + "flos": 510822076416.0, + "grad_norm": 0.026713621627667553, + "language_loss": 1.0060308, + "learning_rate": 0.0009895943406403465, + "loss": 1.01813591, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 1.40917969, + "step": 484, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210956, + "balance_loss_mlp": 1.07071841, + "epoch": 0.09330511735282801, + "flos": 660583045632.0, + "grad_norm": 0.02538483632370611, + "language_loss": 0.94170594, + "learning_rate": 0.0009895310174615338, + "loss": 0.95381546, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 1.40283203, + "step": 485, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210991, + "balance_loss_mlp": 1.0725174, + "epoch": 0.09349749903809157, + "flos": 1456021673472.0, + "grad_norm": 0.008074315810691821, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.7692951, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 1.38476562, + "step": 486, + "time_per_iteration": 4.652726888656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208546, + "balance_loss_mlp": 1.06868994, + "epoch": 0.09368988072335514, + "flos": 521899829760.0, + "grad_norm": 0.021962490795067104, + "language_loss": 0.97574425, + "learning_rate": 0.0009894038009701782, + "loss": 0.98782969, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 1.39892578, + "step": 487, + "time_per_iteration": 2.647747755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207771, + "balance_loss_mlp": 1.06786692, + "epoch": 0.0938822624086187, + "flos": 498751941120.0, + "grad_norm": 0.02403393711112831, + "language_loss": 1.01297927, + "learning_rate": 0.0009893399077070253, + "loss": 1.02505696, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 1.39941406, + "step": 488, + "time_per_iteration": 2.5559775829315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209251, + "balance_loss_mlp": 1.07006216, + "epoch": 0.09407464409388226, + "flos": 534223746048.0, + "grad_norm": 0.02465812888810929, + "language_loss": 0.94380867, + "learning_rate": 0.0009892758244652718, + "loss": 0.95590127, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 1.39208984, + "step": 489, + "time_per_iteration": 2.6696364879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203933, + "balance_loss_mlp": 1.06398153, + "epoch": 0.09426702577914582, + "flos": 587090714112.0, + "grad_norm": 0.02607881729553482, + "language_loss": 1.01920152, + "learning_rate": 0.0009892115512697968, + "loss": 1.03124094, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 1.39990234, + "step": 490, + "time_per_iteration": 2.645073652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205245, + "balance_loss_mlp": 1.06524527, + "epoch": 0.0944594074644094, + "flos": 504463733760.0, + "grad_norm": 0.02086232355550113, + "language_loss": 1.01703966, + "learning_rate": 0.0009891470881455537, + "loss": 1.02909207, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 1.40039062, + "step": 491, + "time_per_iteration": 2.669978618621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207443, + "balance_loss_mlp": 1.06777763, + "epoch": 0.09465178914967295, + "flos": 572114016768.0, + "grad_norm": 0.026976181820206353, + "language_loss": 1.00743008, + "learning_rate": 0.0009890824351175692, + "loss": 1.01950443, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 1.39697266, + "step": 492, + "time_per_iteration": 2.6572952270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207157, + "balance_loss_mlp": 1.06796801, + "epoch": 0.09484417083493651, + "flos": 550418408448.0, + "grad_norm": 0.023611014675858334, + "language_loss": 1.04079592, + "learning_rate": 0.0009890175922109435, + "loss": 1.05286753, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 1.39208984, + "step": 493, + "time_per_iteration": 2.622361183166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120413, + "balance_loss_mlp": 1.06498933, + "epoch": 0.09503655252020007, + "flos": 825271047168.0, + "grad_norm": 0.02510100112233158, + "language_loss": 1.0275588, + "learning_rate": 0.0009889525594508513, + "loss": 1.03960025, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 1.39160156, + "step": 494, + "time_per_iteration": 3.0307581424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202477, + "balance_loss_mlp": 1.06333554, + "epoch": 0.09522893420546363, + "flos": 405517839360.0, + "grad_norm": 0.02234367718934989, + "language_loss": 0.96151906, + "learning_rate": 0.0009888873368625404, + "loss": 0.97354376, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 1.39160156, + "step": 495, + "time_per_iteration": 2.4793317317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205465, + "balance_loss_mlp": 1.06665742, + "epoch": 0.0954213158907272, + "flos": 692255963136.0, + "grad_norm": 0.025506351191757377, + "language_loss": 1.00908709, + "learning_rate": 0.0009888219244713326, + "loss": 1.02114165, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 1.38818359, + "step": 496, + "time_per_iteration": 2.865914821624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206499, + "balance_loss_mlp": 1.06773937, + "epoch": 0.09561369757599077, + "flos": 520074246144.0, + "grad_norm": 0.030124833611481355, + "language_loss": 1.02319717, + "learning_rate": 0.0009887563223026229, + "loss": 1.03526211, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 1.38671875, + "step": 497, + "time_per_iteration": 2.689708948135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210899, + "balance_loss_mlp": 1.07376099, + "epoch": 0.09580607926125433, + "flos": 1388781623808.0, + "grad_norm": 0.014650036919455408, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80279064, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 1.37109375, + "step": 498, + "time_per_iteration": 4.940208196640015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203477, + "balance_loss_mlp": 1.06476545, + "epoch": 0.09599846094651789, + "flos": 718825433088.0, + "grad_norm": 0.028840614245688557, + "language_loss": 0.98952407, + "learning_rate": 0.0009886245487346482, + "loss": 1.00155878, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 1.38427734, + "step": 499, + "time_per_iteration": 3.023056745529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205479, + "balance_loss_mlp": 1.06690967, + "epoch": 0.09619084263178146, + "flos": 386893977600.0, + "grad_norm": 0.031706482821381415, + "language_loss": 1.0340035, + "learning_rate": 0.0009885583773865422, + "loss": 1.04605842, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 1.38183594, + "step": 500, + "time_per_iteration": 2.422914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202787, + "balance_loss_mlp": 1.06479073, + "epoch": 0.09638322431704502, + "flos": 535172467200.0, + "grad_norm": 0.02878579188863982, + "language_loss": 0.99392897, + "learning_rate": 0.0009884920163632524, + "loss": 1.00595689, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 1.37988281, + "step": 501, + "time_per_iteration": 2.6820154190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203474, + "balance_loss_mlp": 1.0655731, + "epoch": 0.09657560600230858, + "flos": 501656501760.0, + "grad_norm": 0.02635733095705931, + "language_loss": 1.03128934, + "learning_rate": 0.000988425465690543, + "loss": 1.04332411, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 1.37890625, + "step": 502, + "time_per_iteration": 2.605536699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204627, + "balance_loss_mlp": 1.06677341, + "epoch": 0.09676798768757214, + "flos": 530331532800.0, + "grad_norm": 0.023374032620567947, + "language_loss": 1.00861204, + "learning_rate": 0.0009883587253942505, + "loss": 1.02065825, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 1.37841797, + "step": 503, + "time_per_iteration": 2.7548091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204765, + "balance_loss_mlp": 1.06686366, + "epoch": 0.09696036937283571, + "flos": 464556498432.0, + "grad_norm": 0.029206950172382878, + "language_loss": 1.0685035, + "learning_rate": 0.0009882917955002862, + "loss": 1.08055115, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 1.37890625, + "step": 504, + "time_per_iteration": 2.520970344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200777, + "balance_loss_mlp": 1.06297076, + "epoch": 0.09715275105809927, + "flos": 536010398208.0, + "grad_norm": 0.02484338661637091, + "language_loss": 0.9770751, + "learning_rate": 0.0009882246760346343, + "loss": 0.98908287, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 1.37695312, + "step": 505, + "time_per_iteration": 2.6314897537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204578, + "balance_loss_mlp": 1.06672478, + "epoch": 0.09734513274336283, + "flos": 455881747968.0, + "grad_norm": 0.02756591702740651, + "language_loss": 1.04990697, + "learning_rate": 0.0009881573670233533, + "loss": 1.06195283, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 1.37451172, + "step": 506, + "time_per_iteration": 2.492464780807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203948, + "balance_loss_mlp": 1.06619, + "epoch": 0.09753751442862639, + "flos": 509827693056.0, + "grad_norm": 0.02954706972608782, + "language_loss": 0.97619581, + "learning_rate": 0.0009880898684925747, + "loss": 0.98823535, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 1.37353516, + "step": 507, + "time_per_iteration": 2.6402478218078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120195, + "balance_loss_mlp": 1.06438243, + "epoch": 0.09772989611388996, + "flos": 485246989824.0, + "grad_norm": 0.02487380392257162, + "language_loss": 0.96617985, + "learning_rate": 0.0009880221804685037, + "loss": 0.97819936, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 1.37158203, + "step": 508, + "time_per_iteration": 2.5352439880371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209412, + "balance_loss_mlp": 1.0741806, + "epoch": 0.09792227779915352, + "flos": 1569316454400.0, + "grad_norm": 0.016823619827393988, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80553836, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.3515625, + "step": 509, + "time_per_iteration": 4.694217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205455, + "balance_loss_mlp": 1.06831706, + "epoch": 0.09811465948441708, + "flos": 588914296320.0, + "grad_norm": 0.032012577058462416, + "language_loss": 1.03636336, + "learning_rate": 0.0009878862360456733, + "loss": 1.04841793, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 1.37011719, + "step": 510, + "time_per_iteration": 2.73879337310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208431, + "balance_loss_mlp": 1.07148337, + "epoch": 0.09830704116968064, + "flos": 614128814592.0, + "grad_norm": 0.028115444050206044, + "language_loss": 0.94855493, + "learning_rate": 0.0009878179796996922, + "loss": 0.96063924, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 1.36914062, + "step": 511, + "time_per_iteration": 2.6949734687805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207361, + "balance_loss_mlp": 1.07050836, + "epoch": 0.09849942285494422, + "flos": 539935538688.0, + "grad_norm": 0.022608937638108787, + "language_loss": 0.9790619, + "learning_rate": 0.0009877495339659754, + "loss": 0.99113548, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 1.36816406, + "step": 512, + "time_per_iteration": 2.7515861988067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214076, + "balance_loss_mlp": 1.0773195, + "epoch": 0.09869180454020778, + "flos": 621603064320.0, + "grad_norm": 0.029833187637910333, + "language_loss": 0.94261241, + "learning_rate": 0.000987680898871096, + "loss": 0.95475316, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 1.3671875, + "step": 513, + "time_per_iteration": 2.6975760459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120845, + "balance_loss_mlp": 1.07145417, + "epoch": 0.09888418622547133, + "flos": 813059922432.0, + "grad_norm": 0.032512892127392744, + "language_loss": 0.9726817, + "learning_rate": 0.0009876120744417, + "loss": 0.98476619, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 1.36767578, + "step": 514, + "time_per_iteration": 2.9514927864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214576, + "balance_loss_mlp": 1.07762837, + "epoch": 0.0990765679107349, + "flos": 536857061376.0, + "grad_norm": 0.028495408786163776, + "language_loss": 1.0346663, + "learning_rate": 0.0009875430607045078, + "loss": 1.04681206, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 1.36523438, + "step": 515, + "time_per_iteration": 2.669271230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209323, + "balance_loss_mlp": 1.07242322, + "epoch": 0.09926894959599845, + "flos": 588970692096.0, + "grad_norm": 0.026228231589839293, + "language_loss": 0.98752952, + "learning_rate": 0.000987473857686313, + "loss": 0.9996227, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 1.36474609, + "step": 516, + "time_per_iteration": 2.7055716514587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120601, + "balance_loss_mlp": 1.06934881, + "epoch": 0.09946133128126203, + "flos": 642386881536.0, + "grad_norm": 0.0302129460476142, + "language_loss": 1.04248524, + "learning_rate": 0.0009874044654139824, + "loss": 1.05454528, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 1.36230469, + "step": 517, + "time_per_iteration": 2.726618528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200307, + "balance_loss_mlp": 1.06340742, + "epoch": 0.09965371296652559, + "flos": 466725186048.0, + "grad_norm": 0.03251153136411229, + "language_loss": 1.02563679, + "learning_rate": 0.0009873348839144563, + "loss": 1.03763986, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 1.36474609, + "step": 518, + "time_per_iteration": 2.5855953693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200913, + "balance_loss_mlp": 1.06439471, + "epoch": 0.09984609465178915, + "flos": 484558780416.0, + "grad_norm": 0.029627125773621466, + "language_loss": 1.03352094, + "learning_rate": 0.000987265113214749, + "loss": 1.04552996, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 1.36279297, + "step": 519, + "time_per_iteration": 2.5350587368011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201703, + "balance_loss_mlp": 1.06566191, + "epoch": 0.1000384763370527, + "flos": 570095050752.0, + "grad_norm": 0.028931775658430137, + "language_loss": 1.07544637, + "learning_rate": 0.0009871951533419476, + "loss": 1.08746338, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 1.35986328, + "step": 520, + "time_per_iteration": 2.6423709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200484, + "balance_loss_mlp": 1.06439495, + "epoch": 0.10023085802231628, + "flos": 546925694976.0, + "grad_norm": 0.025491893219336172, + "language_loss": 0.95403761, + "learning_rate": 0.0009871250043232132, + "loss": 0.96604246, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 1.36035156, + "step": 521, + "time_per_iteration": 2.7604362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198813, + "balance_loss_mlp": 1.06205583, + "epoch": 0.10042323970757984, + "flos": 504439538688.0, + "grad_norm": 0.029888360913216814, + "language_loss": 0.96113187, + "learning_rate": 0.0009870546661857797, + "loss": 0.97311997, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 1.36328125, + "step": 522, + "time_per_iteration": 2.578458547592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195212, + "balance_loss_mlp": 1.05931365, + "epoch": 0.1006156213928434, + "flos": 771724601856.0, + "grad_norm": 0.029426081780707294, + "language_loss": 1.05752206, + "learning_rate": 0.0009869841389569553, + "loss": 1.0694741, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 1.35839844, + "step": 523, + "time_per_iteration": 2.958531618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.05846703, + "epoch": 0.10080800307810696, + "flos": 491008447488.0, + "grad_norm": 0.024593893632090205, + "language_loss": 0.96497846, + "learning_rate": 0.0009869134226641206, + "loss": 0.97692204, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 1.35839844, + "step": 524, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196113, + "balance_loss_mlp": 1.06030965, + "epoch": 0.10100038476337053, + "flos": 455712560640.0, + "grad_norm": 0.026556514945601337, + "language_loss": 0.98348475, + "learning_rate": 0.0009868425173347303, + "loss": 0.99544585, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 1.35742188, + "step": 525, + "time_per_iteration": 2.6460907459259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196515, + "balance_loss_mlp": 1.06099772, + "epoch": 0.10119276644863409, + "flos": 557573749248.0, + "grad_norm": 0.022458491608374247, + "language_loss": 1.03332829, + "learning_rate": 0.0009867714229963125, + "loss": 1.04529333, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 1.35449219, + "step": 526, + "time_per_iteration": 2.693362236022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119647, + "balance_loss_mlp": 1.0609529, + "epoch": 0.10138514813389765, + "flos": 517219350528.0, + "grad_norm": 0.028969258136437262, + "language_loss": 1.0161202, + "learning_rate": 0.000986700139676468, + "loss": 1.02808487, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 1.35449219, + "step": 527, + "time_per_iteration": 2.5826644897460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202893, + "balance_loss_mlp": 1.06742311, + "epoch": 0.10157752981916121, + "flos": 501563175936.0, + "grad_norm": 0.023004964960346017, + "language_loss": 0.98490077, + "learning_rate": 0.0009866286674028717, + "loss": 0.99692971, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 1.35400391, + "step": 528, + "time_per_iteration": 2.626595973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204326, + "balance_loss_mlp": 1.06876123, + "epoch": 0.10176991150442478, + "flos": 658093447680.0, + "grad_norm": 0.024381421822087013, + "language_loss": 0.95674849, + "learning_rate": 0.0009865570062032717, + "loss": 0.96879184, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 1.35498047, + "step": 529, + "time_per_iteration": 2.916924238204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203456, + "balance_loss_mlp": 1.0680815, + "epoch": 0.10196229318968834, + "flos": 574402226688.0, + "grad_norm": 0.021344584600364362, + "language_loss": 0.99175954, + "learning_rate": 0.0009864851561054893, + "loss": 1.00379407, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 1.35302734, + "step": 530, + "time_per_iteration": 2.750075578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203649, + "balance_loss_mlp": 1.06856096, + "epoch": 0.1021546748749519, + "flos": 519255780864.0, + "grad_norm": 0.027896087186932737, + "language_loss": 0.99157, + "learning_rate": 0.0009864131171374191, + "loss": 1.00360656, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 1.35009766, + "step": 531, + "time_per_iteration": 2.6506359577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202329, + "balance_loss_mlp": 1.06728852, + "epoch": 0.10234705656021546, + "flos": 610953008640.0, + "grad_norm": 0.021304730024267197, + "language_loss": 0.98848057, + "learning_rate": 0.0009863408893270292, + "loss": 1.0005039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 1.34960938, + "step": 532, + "time_per_iteration": 2.827632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202805, + "balance_loss_mlp": 1.06776476, + "epoch": 0.10253943824547904, + "flos": 602912073216.0, + "grad_norm": 0.02650069508154076, + "language_loss": 0.95645475, + "learning_rate": 0.0009862684727023605, + "loss": 0.96848285, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 1.34960938, + "step": 533, + "time_per_iteration": 2.730771541595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206135, + "balance_loss_mlp": 1.07152414, + "epoch": 0.1027318199307426, + "flos": 664156349952.0, + "grad_norm": 0.02579556790717569, + "language_loss": 0.96718729, + "learning_rate": 0.0009861958672915283, + "loss": 0.97924864, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 1.34521484, + "step": 534, + "time_per_iteration": 2.825239419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202189, + "balance_loss_mlp": 1.06776834, + "epoch": 0.10292420161600616, + "flos": 684529933824.0, + "grad_norm": 0.02492376876437301, + "language_loss": 0.95656139, + "learning_rate": 0.0009861230731227201, + "loss": 0.96858335, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 1.34326172, + "step": 535, + "time_per_iteration": 2.858596086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203815, + "balance_loss_mlp": 1.06958508, + "epoch": 0.10311658330126972, + "flos": 491268959232.0, + "grad_norm": 0.02833674325523021, + "language_loss": 0.99709427, + "learning_rate": 0.0009860500902241973, + "loss": 1.00913239, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 1.34130859, + "step": 536, + "time_per_iteration": 2.5780303478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197149, + "balance_loss_mlp": 1.06291902, + "epoch": 0.10330896498653329, + "flos": 432686195712.0, + "grad_norm": 0.024484943889946764, + "language_loss": 1.03652823, + "learning_rate": 0.0009859769186242942, + "loss": 1.0484997, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 1.34130859, + "step": 537, + "time_per_iteration": 2.5104598999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119791, + "balance_loss_mlp": 1.06415713, + "epoch": 0.10350134667179685, + "flos": 550641990144.0, + "grad_norm": 0.0271300181774947, + "language_loss": 0.97886324, + "learning_rate": 0.0009859035583514187, + "loss": 0.99084234, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 1.33642578, + "step": 538, + "time_per_iteration": 2.6156880855560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197994, + "balance_loss_mlp": 1.06395507, + "epoch": 0.10369372835706041, + "flos": 641826926592.0, + "grad_norm": 0.024416305433678544, + "language_loss": 1.00991774, + "learning_rate": 0.0009858300094340517, + "loss": 1.02189767, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 1.33935547, + "step": 539, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198436, + "balance_loss_mlp": 1.06468332, + "epoch": 0.10388611004232397, + "flos": 522765958656.0, + "grad_norm": 0.025798430155835095, + "language_loss": 0.9342165, + "learning_rate": 0.0009857562719007473, + "loss": 0.94620085, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 1.33642578, + "step": 540, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204547, + "balance_loss_mlp": 1.07122386, + "epoch": 0.10407849172758753, + "flos": 703739947008.0, + "grad_norm": 0.023593197084580173, + "language_loss": 0.95331407, + "learning_rate": 0.0009856823457801331, + "loss": 0.96535957, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 1.33203125, + "step": 541, + "time_per_iteration": 2.889531373977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202711, + "balance_loss_mlp": 1.06924474, + "epoch": 0.1042708734128511, + "flos": 503944711680.0, + "grad_norm": 0.023957714626313076, + "language_loss": 1.02856565, + "learning_rate": 0.00098560823110091, + "loss": 1.04059267, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 1.33349609, + "step": 542, + "time_per_iteration": 2.6067047119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205134, + "balance_loss_mlp": 1.07185781, + "epoch": 0.10446325509811466, + "flos": 486640872960.0, + "grad_norm": 0.0231214260398276, + "language_loss": 1.01405394, + "learning_rate": 0.000985533927891851, + "loss": 1.02610517, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 1.33154297, + "step": 543, + "time_per_iteration": 2.6622776985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201388, + "balance_loss_mlp": 1.06820762, + "epoch": 0.10465563678337822, + "flos": 569713015296.0, + "grad_norm": 0.023482705287667723, + "language_loss": 1.01015687, + "learning_rate": 0.0009854594361818044, + "loss": 1.02217078, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 1.33056641, + "step": 544, + "time_per_iteration": 2.7061924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195672, + "balance_loss_mlp": 1.06244385, + "epoch": 0.10484801846864178, + "flos": 627242998272.0, + "grad_norm": 0.023194608242680787, + "language_loss": 0.99799937, + "learning_rate": 0.0009853847559996897, + "loss": 1.00995612, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 1.33105469, + "step": 545, + "time_per_iteration": 2.742445707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192128, + "balance_loss_mlp": 1.05885231, + "epoch": 0.10504040015390535, + "flos": 744812754432.0, + "grad_norm": 0.025865682249952955, + "language_loss": 0.99192667, + "learning_rate": 0.0009853098873745, + "loss": 1.00384796, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 1.33154297, + "step": 546, + "time_per_iteration": 3.0260400772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192867, + "balance_loss_mlp": 1.05997264, + "epoch": 0.10523278183916891, + "flos": 587842050048.0, + "grad_norm": 0.02599355243407578, + "language_loss": 0.98197657, + "learning_rate": 0.0009852348303353027, + "loss": 0.99390525, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 1.32763672, + "step": 547, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191481, + "balance_loss_mlp": 1.05844367, + "epoch": 0.10542516352443247, + "flos": 871145857536.0, + "grad_norm": 0.02495252935664815, + "language_loss": 0.91398883, + "learning_rate": 0.000985159584911237, + "loss": 0.92590368, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 1.32910156, + "step": 548, + "time_per_iteration": 3.1012043952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119193, + "balance_loss_mlp": 1.05913138, + "epoch": 0.10561754520969603, + "flos": 506412842496.0, + "grad_norm": 0.025955858684814606, + "language_loss": 0.9925828, + "learning_rate": 0.0009850841511315162, + "loss": 1.00450206, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 1.32666016, + "step": 549, + "time_per_iteration": 2.626220464706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192876, + "balance_loss_mlp": 1.06022012, + "epoch": 0.1058099268949596, + "flos": 561147053568.0, + "grad_norm": 0.02554357007654854, + "language_loss": 0.98952115, + "learning_rate": 0.0009850085290254256, + "loss": 1.00144982, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 1.32519531, + "step": 550, + "time_per_iteration": 2.7464635372161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194366, + "balance_loss_mlp": 1.06161487, + "epoch": 0.10600230858022316, + "flos": 563159288832.0, + "grad_norm": 0.020736613501838204, + "language_loss": 0.9519307, + "learning_rate": 0.0009849327186223246, + "loss": 0.9638744, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 1.32617188, + "step": 551, + "time_per_iteration": 2.7678163051605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199655, + "balance_loss_mlp": 1.06728542, + "epoch": 0.10619469026548672, + "flos": 495317624832.0, + "grad_norm": 0.02236411826292933, + "language_loss": 1.02411103, + "learning_rate": 0.000984856719951646, + "loss": 1.03610754, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 1.32226562, + "step": 552, + "time_per_iteration": 2.5607285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196226, + "balance_loss_mlp": 1.06404662, + "epoch": 0.10638707195075028, + "flos": 677463916032.0, + "grad_norm": 0.025808282690500464, + "language_loss": 1.00531495, + "learning_rate": 0.0009847805330428943, + "loss": 1.01727724, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 1.3203125, + "step": 553, + "time_per_iteration": 2.8748667240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190787, + "balance_loss_mlp": 1.05860806, + "epoch": 0.10657945363601386, + "flos": 489035143680.0, + "grad_norm": 0.02571681940882287, + "language_loss": 1.04715252, + "learning_rate": 0.0009847041579256481, + "loss": 1.05906045, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 1.3203125, + "step": 554, + "time_per_iteration": 2.56693696975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191519, + "balance_loss_mlp": 1.05948246, + "epoch": 0.10677183532127742, + "flos": 483970627584.0, + "grad_norm": 0.020874824601389917, + "language_loss": 1.01746583, + "learning_rate": 0.0009846275946295592, + "loss": 1.02938092, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 1.31884766, + "step": 555, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195781, + "balance_loss_mlp": 1.06369734, + "epoch": 0.10696421700654098, + "flos": 657581156352.0, + "grad_norm": 0.023085993180182653, + "language_loss": 0.93557143, + "learning_rate": 0.0009845508431843518, + "loss": 0.94752926, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 1.31933594, + "step": 556, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192823, + "balance_loss_mlp": 1.06088233, + "epoch": 0.10715659869180454, + "flos": 568792492032.0, + "grad_norm": 0.026087632201688016, + "language_loss": 0.9692713, + "learning_rate": 0.0009844739036198233, + "loss": 0.9811995, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 1.31787109, + "step": 557, + "time_per_iteration": 2.6583988666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192362, + "balance_loss_mlp": 1.06051683, + "epoch": 0.10734898037706811, + "flos": 541743657984.0, + "grad_norm": 0.02708275038302545, + "language_loss": 1.03564882, + "learning_rate": 0.0009843967759658448, + "loss": 1.04757237, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 1.31689453, + "step": 558, + "time_per_iteration": 2.6571173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209854, + "balance_loss_mlp": 1.07920074, + "epoch": 0.10754136206233167, + "flos": 1479731518464.0, + "grad_norm": 0.021017403581586082, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73977602, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.30664062, + "step": 559, + "time_per_iteration": 4.901749134063721 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191994, + "balance_loss_mlp": 1.06024349, + "epoch": 0.10773374374759523, + "flos": 513411730944.0, + "grad_norm": 0.02623387515623986, + "language_loss": 1.03025067, + "learning_rate": 0.000984241956509384, + "loss": 1.04217052, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 1.31591797, + "step": 560, + "time_per_iteration": 2.642380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011916, + "balance_loss_mlp": 1.06013584, + "epoch": 0.10792612543285879, + "flos": 497477580288.0, + "grad_norm": 0.029111560342126648, + "language_loss": 1.01683569, + "learning_rate": 0.0009841642647670078, + "loss": 1.02875161, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 1.31298828, + "step": 561, + "time_per_iteration": 2.5994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.06027901, + "epoch": 0.10811850711812235, + "flos": 736836946944.0, + "grad_norm": 0.027918527501713815, + "language_loss": 0.94711685, + "learning_rate": 0.0009840863850553944, + "loss": 0.95903373, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 1.3125, + "step": 562, + "time_per_iteration": 2.980377435684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193377, + "balance_loss_mlp": 1.06215191, + "epoch": 0.10831088880338592, + "flos": 612676534272.0, + "grad_norm": 0.025174626098757973, + "language_loss": 0.99795747, + "learning_rate": 0.0009840083174047782, + "loss": 1.00989127, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 1.31054688, + "step": 563, + "time_per_iteration": 2.7209153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194645, + "balance_loss_mlp": 1.0633713, + "epoch": 0.10850327048864948, + "flos": 557497887744.0, + "grad_norm": 0.021851565940339403, + "language_loss": 0.93414235, + "learning_rate": 0.0009839300618454685, + "loss": 0.94608879, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 1.31103516, + "step": 564, + "time_per_iteration": 2.833120584487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194873, + "balance_loss_mlp": 1.06402934, + "epoch": 0.10869565217391304, + "flos": 604436212224.0, + "grad_norm": 0.021697209366751603, + "language_loss": 0.98980927, + "learning_rate": 0.0009838516184078466, + "loss": 1.00175798, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 1.30664062, + "step": 565, + "time_per_iteration": 2.805722236633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193483, + "balance_loss_mlp": 1.06263876, + "epoch": 0.1088880338591766, + "flos": 527205391872.0, + "grad_norm": 0.024778377976546286, + "language_loss": 0.97356248, + "learning_rate": 0.0009837729871223669, + "loss": 0.98549736, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 1.30664062, + "step": 566, + "time_per_iteration": 2.652186155319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119656, + "balance_loss_mlp": 1.0658114, + "epoch": 0.10908041554444017, + "flos": 621416412672.0, + "grad_norm": 0.023487449334803984, + "language_loss": 0.99301046, + "learning_rate": 0.0009836941680195568, + "loss": 1.00497603, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 1.30566406, + "step": 567, + "time_per_iteration": 2.7732484340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.06144011, + "epoch": 0.10927279722970373, + "flos": 899673168384.0, + "grad_norm": 0.026216288845653656, + "language_loss": 0.95416081, + "learning_rate": 0.0009836151611300166, + "loss": 0.96608174, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 1.3046875, + "step": 568, + "time_per_iteration": 3.174981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.06049693, + "epoch": 0.10946517891496729, + "flos": 529699719168.0, + "grad_norm": 0.02336242427092275, + "language_loss": 1.03071296, + "learning_rate": 0.0009835359664844194, + "loss": 1.04262161, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 1.30273438, + "step": 569, + "time_per_iteration": 2.595041513442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190102, + "balance_loss_mlp": 1.06173706, + "epoch": 0.10965756060023085, + "flos": 1563991426560.0, + "grad_norm": 0.006726678932110135, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82226908, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 1.28320312, + "step": 570, + "time_per_iteration": 4.911731719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193915, + "balance_loss_mlp": 1.0634526, + "epoch": 0.10984994228549443, + "flos": 514099940352.0, + "grad_norm": 0.027266515996607284, + "language_loss": 1.00165153, + "learning_rate": 0.0009833770140481118, + "loss": 1.01359057, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 1.30273438, + "step": 571, + "time_per_iteration": 2.6079747676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197777, + "balance_loss_mlp": 1.06741011, + "epoch": 0.11004232397075799, + "flos": 956273895936.0, + "grad_norm": 0.026548665437539986, + "language_loss": 0.90315044, + "learning_rate": 0.000983297256319112, + "loss": 0.91512823, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 1.30175781, + "step": 572, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_mlp": 1.05776477, + "epoch": 0.11023470565602154, + "flos": 489228526080.0, + "grad_norm": 0.026034490292812715, + "language_loss": 0.95817071, + "learning_rate": 0.000983217310957477, + "loss": 0.97005343, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 1.30322266, + "step": 573, + "time_per_iteration": 2.7447898387908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190883, + "balance_loss_mlp": 1.06056309, + "epoch": 0.1104270873412851, + "flos": 656990275584.0, + "grad_norm": 0.026590820610190004, + "language_loss": 1.00224817, + "learning_rate": 0.000983137177994244, + "loss": 1.01415706, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 1.30126953, + "step": 574, + "time_per_iteration": 2.846140146255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185115, + "balance_loss_mlp": 1.0552249, + "epoch": 0.11061946902654868, + "flos": 724747345920.0, + "grad_norm": 0.019709272455133778, + "language_loss": 0.93286896, + "learning_rate": 0.0009830568574605235, + "loss": 0.94472009, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 1.29736328, + "step": 575, + "time_per_iteration": 2.922821044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185727, + "balance_loss_mlp": 1.05569339, + "epoch": 0.11081185071181224, + "flos": 836867822592.0, + "grad_norm": 0.025292755419638515, + "language_loss": 0.97880363, + "learning_rate": 0.0009829763493874992, + "loss": 0.99066085, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 1.29833984, + "step": 576, + "time_per_iteration": 3.022394895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183726, + "balance_loss_mlp": 1.05412149, + "epoch": 0.1110042323970758, + "flos": 610282263552.0, + "grad_norm": 0.023453623229808367, + "language_loss": 1.02838886, + "learning_rate": 0.0009828956538064264, + "loss": 1.04022622, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 1.29541016, + "step": 577, + "time_per_iteration": 2.817147970199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182671, + "balance_loss_mlp": 1.05316234, + "epoch": 0.11119661408233936, + "flos": 597039825408.0, + "grad_norm": 0.025026186935027953, + "language_loss": 0.99076784, + "learning_rate": 0.0009828147707486344, + "loss": 1.00259459, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 1.29492188, + "step": 578, + "time_per_iteration": 2.6778078079223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186939, + "balance_loss_mlp": 1.05752516, + "epoch": 0.11138899576760293, + "flos": 556887541248.0, + "grad_norm": 0.027590262528076937, + "language_loss": 0.96720088, + "learning_rate": 0.0009827337002455245, + "loss": 0.97907031, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 1.29394531, + "step": 579, + "time_per_iteration": 2.6259562969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188781, + "balance_loss_mlp": 1.05951095, + "epoch": 0.11158137745286649, + "flos": 691062193152.0, + "grad_norm": 0.0223692175133054, + "language_loss": 0.94567806, + "learning_rate": 0.0009826524423285712, + "loss": 0.9575659, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 1.29150391, + "step": 580, + "time_per_iteration": 2.9144554138183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118328, + "balance_loss_mlp": 1.05386627, + "epoch": 0.11177375913813005, + "flos": 764306747904.0, + "grad_norm": 0.02877171771660235, + "language_loss": 0.97941083, + "learning_rate": 0.0009825709970293218, + "loss": 0.9912436, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 1.29296875, + "step": 581, + "time_per_iteration": 2.8999927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181128, + "balance_loss_mlp": 1.05223894, + "epoch": 0.11196614082339361, + "flos": 808030334976.0, + "grad_norm": 0.029325346048851512, + "language_loss": 1.03732872, + "learning_rate": 0.0009824893643793956, + "loss": 1.04913998, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 1.28857422, + "step": 582, + "time_per_iteration": 3.0697131156921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.05731773, + "epoch": 0.11215852250865718, + "flos": 559724972544.0, + "grad_norm": 0.028740695003145394, + "language_loss": 0.98446089, + "learning_rate": 0.0009824075444104857, + "loss": 0.99632728, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 1.29150391, + "step": 583, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190407, + "balance_loss_mlp": 1.06147003, + "epoch": 0.11235090419392074, + "flos": 514575301632.0, + "grad_norm": 0.02293328270345756, + "language_loss": 1.02460003, + "learning_rate": 0.000982325537154357, + "loss": 1.03650403, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 1.28808594, + "step": 584, + "time_per_iteration": 2.590156078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188149, + "balance_loss_mlp": 1.05954635, + "epoch": 0.1125432858791843, + "flos": 492432529920.0, + "grad_norm": 0.028214107652977688, + "language_loss": 1.0381788, + "learning_rate": 0.0009822433426428484, + "loss": 1.05006027, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 1.28564453, + "step": 585, + "time_per_iteration": 2.566488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188321, + "balance_loss_mlp": 1.05957532, + "epoch": 0.11273566756444786, + "flos": 511727136768.0, + "grad_norm": 0.027438709113267498, + "language_loss": 0.95940274, + "learning_rate": 0.0009821609609078697, + "loss": 0.971286, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 1.28710938, + "step": 586, + "time_per_iteration": 2.6117701530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189545, + "balance_loss_mlp": 1.06098938, + "epoch": 0.11292804924971142, + "flos": 623639494656.0, + "grad_norm": 0.025949033694362005, + "language_loss": 0.97216725, + "learning_rate": 0.0009820783919814045, + "loss": 0.98406273, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 1.28515625, + "step": 587, + "time_per_iteration": 2.798182249069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181783, + "balance_loss_mlp": 1.05360925, + "epoch": 0.113120430934975, + "flos": 479038368768.0, + "grad_norm": 0.03012596671256698, + "language_loss": 0.94172156, + "learning_rate": 0.0009819956358955095, + "loss": 0.95353937, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 1.28125, + "step": 588, + "time_per_iteration": 2.54179310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197707, + "balance_loss_mlp": 1.06905663, + "epoch": 0.11331281262023855, + "flos": 467990814720.0, + "grad_norm": 0.02502737191739997, + "language_loss": 0.9542653, + "learning_rate": 0.0009819126926823127, + "loss": 0.96624243, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 1.28613281, + "step": 589, + "time_per_iteration": 2.5262975692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191554, + "balance_loss_mlp": 1.06333208, + "epoch": 0.11350519430550211, + "flos": 651610853376.0, + "grad_norm": 0.023462259875113876, + "language_loss": 0.96713853, + "learning_rate": 0.000981829562374016, + "loss": 0.97905409, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 1.28173828, + "step": 590, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192039, + "balance_loss_mlp": 1.06415117, + "epoch": 0.11369757599076567, + "flos": 558860845056.0, + "grad_norm": 0.030341732837715945, + "language_loss": 1.07369685, + "learning_rate": 0.0009817462450028933, + "loss": 1.08561718, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 1.27832031, + "step": 591, + "time_per_iteration": 2.638333559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_mlp": 1.06215453, + "epoch": 0.11388995767602925, + "flos": 572305397760.0, + "grad_norm": 0.0238596111294556, + "language_loss": 0.94198918, + "learning_rate": 0.0009816627406012916, + "loss": 0.9538886, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 1.27734375, + "step": 592, + "time_per_iteration": 2.800842523574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191939, + "balance_loss_mlp": 1.06395626, + "epoch": 0.1140823393612928, + "flos": 741743009280.0, + "grad_norm": 0.025351621893671843, + "language_loss": 0.93787777, + "learning_rate": 0.0009815790492016295, + "loss": 0.94979715, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 1.27929688, + "step": 593, + "time_per_iteration": 2.9331579208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191026, + "balance_loss_mlp": 1.06337643, + "epoch": 0.11427472104655637, + "flos": 700251236352.0, + "grad_norm": 0.02689478502881467, + "language_loss": 0.96601468, + "learning_rate": 0.0009814951708363993, + "loss": 0.97792494, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 1.27587891, + "step": 594, + "time_per_iteration": 2.832094192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200218, + "balance_loss_mlp": 1.07414246, + "epoch": 0.11446710273181993, + "flos": 1480352598528.0, + "grad_norm": 0.020191453180706247, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79191208, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 1.25976562, + "step": 595, + "time_per_iteration": 4.752530574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187485, + "balance_loss_mlp": 1.06026483, + "epoch": 0.1146594844170835, + "flos": 495912508416.0, + "grad_norm": 0.02910362847653251, + "language_loss": 0.97498882, + "learning_rate": 0.0009813268533395648, + "loss": 0.98686367, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 1.27148438, + "step": 596, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187961, + "balance_loss_mlp": 1.06093144, + "epoch": 0.11485186610234706, + "flos": 475790704128.0, + "grad_norm": 0.02927093575191284, + "language_loss": 0.98108673, + "learning_rate": 0.0009812424142733073, + "loss": 0.99296629, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 1.26953125, + "step": 597, + "time_per_iteration": 2.5622098445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187255, + "balance_loss_mlp": 1.06046438, + "epoch": 0.11504424778761062, + "flos": 732619094016.0, + "grad_norm": 0.02047017320895946, + "language_loss": 0.92490959, + "learning_rate": 0.000981157788372175, + "loss": 0.93678212, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 1.26708984, + "step": 598, + "time_per_iteration": 3.017120599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185489, + "balance_loss_mlp": 1.05855536, + "epoch": 0.11523662947287418, + "flos": 546962625024.0, + "grad_norm": 0.02044602685826044, + "language_loss": 0.96609688, + "learning_rate": 0.0009810729756690223, + "loss": 0.97795177, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 1.26855469, + "step": 599, + "time_per_iteration": 2.7182610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190213, + "balance_loss_mlp": 1.06323159, + "epoch": 0.11542901115813775, + "flos": 776387616768.0, + "grad_norm": 0.023703305464208416, + "language_loss": 0.99939269, + "learning_rate": 0.0009809879761967766, + "loss": 1.01129484, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 1.26904297, + "step": 600, + "time_per_iteration": 2.9586148262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189892, + "balance_loss_mlp": 1.06319618, + "epoch": 0.11562139284340131, + "flos": 732212863488.0, + "grad_norm": 0.024193120208057816, + "language_loss": 0.99113685, + "learning_rate": 0.0009809027899884378, + "loss": 1.00303578, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 1.26611328, + "step": 601, + "time_per_iteration": 2.885070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183816, + "balance_loss_mlp": 1.05731082, + "epoch": 0.11581377452866487, + "flos": 537039710208.0, + "grad_norm": 0.022696091128935367, + "language_loss": 0.96568906, + "learning_rate": 0.0009808174170770779, + "loss": 0.97752714, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 1.26416016, + "step": 602, + "time_per_iteration": 2.7809743881225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191742, + "balance_loss_mlp": 1.0662384, + "epoch": 0.11600615621392843, + "flos": 1559211617280.0, + "grad_norm": 0.013792800863456836, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86089987, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 1.25390625, + "step": 603, + "time_per_iteration": 4.860181570053101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187966, + "balance_loss_mlp": 1.06169963, + "epoch": 0.116198537899192, + "flos": 538467795456.0, + "grad_norm": 0.022659628017063727, + "language_loss": 1.02766323, + "learning_rate": 0.0009806461112779462, + "loss": 1.03954291, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 1.26171875, + "step": 604, + "time_per_iteration": 2.614189863204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187324, + "balance_loss_mlp": 1.06091404, + "epoch": 0.11639091958445556, + "flos": 455137142784.0, + "grad_norm": 0.0301649070939891, + "language_loss": 1.00891566, + "learning_rate": 0.0009805601784566814, + "loss": 1.02078903, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 1.26318359, + "step": 605, + "time_per_iteration": 2.470878839492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119223, + "balance_loss_mlp": 1.06658351, + "epoch": 0.11658330126971912, + "flos": 556151668224.0, + "grad_norm": 0.025758302551065336, + "language_loss": 1.05099356, + "learning_rate": 0.0009804740590654089, + "loss": 1.0629158, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 1.25537109, + "step": 606, + "time_per_iteration": 2.631462812423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_mlp": 1.06588733, + "epoch": 0.11677568295498268, + "flos": 717600737280.0, + "grad_norm": 0.02545612001836415, + "language_loss": 0.99629396, + "learning_rate": 0.0009803877531375635, + "loss": 1.00821078, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 1.25683594, + "step": 607, + "time_per_iteration": 2.879645586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191881, + "balance_loss_mlp": 1.06613898, + "epoch": 0.11696806464024626, + "flos": 610898614272.0, + "grad_norm": 0.023619167708177922, + "language_loss": 0.99668628, + "learning_rate": 0.0009803012607066523, + "loss": 1.008605, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 1.25634766, + "step": 608, + "time_per_iteration": 2.717660427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189661, + "balance_loss_mlp": 1.06406212, + "epoch": 0.11716044632550981, + "flos": 521415736320.0, + "grad_norm": 0.023557070356346427, + "language_loss": 0.97414643, + "learning_rate": 0.0009802145818062543, + "loss": 0.98604298, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 1.25488281, + "step": 609, + "time_per_iteration": 2.7209720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190685, + "balance_loss_mlp": 1.064991, + "epoch": 0.11735282801077337, + "flos": 508488204288.0, + "grad_norm": 0.03039581956620226, + "language_loss": 1.01476204, + "learning_rate": 0.0009801277164700212, + "loss": 1.02666891, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 1.25585938, + "step": 610, + "time_per_iteration": 2.5900633335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190447, + "balance_loss_mlp": 1.06489623, + "epoch": 0.11754520969603693, + "flos": 687835995648.0, + "grad_norm": 0.028512829376260446, + "language_loss": 0.97853899, + "learning_rate": 0.0009800406647316776, + "loss": 0.99044347, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 1.25439453, + "step": 611, + "time_per_iteration": 2.8018290996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_mlp": 1.06088257, + "epoch": 0.1177375913813005, + "flos": 1545756331008.0, + "grad_norm": 0.00764509792440145, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78099126, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 1.24023438, + "step": 612, + "time_per_iteration": 4.767510175704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_mlp": 1.05974686, + "epoch": 0.11792997306656407, + "flos": 521537260032.0, + "grad_norm": 0.0290479345737112, + "language_loss": 0.97953087, + "learning_rate": 0.000979866002183916, + "loss": 0.99138713, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 1.2578125, + "step": 613, + "time_per_iteration": 2.6752681732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182111, + "balance_loss_mlp": 1.05632174, + "epoch": 0.11812235475182763, + "flos": 667488608256.0, + "grad_norm": 0.030776001440310688, + "language_loss": 0.9883132, + "learning_rate": 0.0009797783914423082, + "loss": 1.00013435, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 1.25683594, + "step": 614, + "time_per_iteration": 2.8556718826293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182577, + "balance_loss_mlp": 1.05697787, + "epoch": 0.11831473643709119, + "flos": 622504121856.0, + "grad_norm": 0.02739500646081478, + "language_loss": 0.93579996, + "learning_rate": 0.0009796905944342094, + "loss": 0.94762576, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 1.25488281, + "step": 615, + "time_per_iteration": 2.80253267288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187072, + "balance_loss_mlp": 1.06152117, + "epoch": 0.11850711812235475, + "flos": 457694596608.0, + "grad_norm": 0.020858577781052552, + "language_loss": 0.96166766, + "learning_rate": 0.0009796026111937057, + "loss": 0.9735384, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 1.25439453, + "step": 616, + "time_per_iteration": 2.5763044357299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189497, + "balance_loss_mlp": 1.06404102, + "epoch": 0.11869949980761832, + "flos": 514927137792.0, + "grad_norm": 0.022050319992180305, + "language_loss": 0.96050835, + "learning_rate": 0.0009795144417549552, + "loss": 0.97240329, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 1.25341797, + "step": 617, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186044, + "balance_loss_mlp": 1.06092167, + "epoch": 0.11889188149288188, + "flos": 536156116992.0, + "grad_norm": 0.0238791856796517, + "language_loss": 0.97532642, + "learning_rate": 0.0009794260861521883, + "loss": 0.98718691, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 1.25292969, + "step": 618, + "time_per_iteration": 2.784257173538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_mlp": 1.06445491, + "epoch": 0.11908426317814544, + "flos": 499644266496.0, + "grad_norm": 0.024260475486046627, + "language_loss": 0.96495152, + "learning_rate": 0.0009793375444197075, + "loss": 0.97684348, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 1.25, + "step": 619, + "time_per_iteration": 2.6021063327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189567, + "balance_loss_mlp": 1.06482673, + "epoch": 0.119276644863409, + "flos": 661067139072.0, + "grad_norm": 0.023292068214373615, + "language_loss": 0.96012962, + "learning_rate": 0.000979248816591888, + "loss": 0.97202522, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 1.25, + "step": 620, + "time_per_iteration": 2.783372640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184512, + "balance_loss_mlp": 1.06001019, + "epoch": 0.11946902654867257, + "flos": 760152021504.0, + "grad_norm": 0.02911418191745056, + "language_loss": 0.95521206, + "learning_rate": 0.0009791599027031766, + "loss": 0.96705711, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 1.24755859, + "step": 621, + "time_per_iteration": 3.04338002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185972, + "balance_loss_mlp": 1.06156564, + "epoch": 0.11966140823393613, + "flos": 682213526016.0, + "grad_norm": 0.0317276180850791, + "language_loss": 0.96021026, + "learning_rate": 0.0009790708027880932, + "loss": 0.97206998, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 1.24658203, + "step": 622, + "time_per_iteration": 2.8048830032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184547, + "balance_loss_mlp": 1.06171417, + "epoch": 0.11985378991919969, + "flos": 1454298147840.0, + "grad_norm": 0.011779966077399251, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78611839, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 1.23046875, + "step": 623, + "time_per_iteration": 4.88221549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.06291461, + "epoch": 0.12004617160446325, + "flos": 528898718208.0, + "grad_norm": 0.0243802584204396, + "language_loss": 1.01341891, + "learning_rate": 0.0009788920450172487, + "loss": 1.0252955, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 1.25, + "step": 624, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190724, + "balance_loss_mlp": 1.06655562, + "epoch": 0.12023855328972682, + "flos": 475176354816.0, + "grad_norm": 0.025839680970612892, + "language_loss": 0.99598378, + "learning_rate": 0.0009788023872308875, + "loss": 1.00789118, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 1.24414062, + "step": 625, + "time_per_iteration": 2.5168616771698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_mlp": 1.06723785, + "epoch": 0.12043093497499038, + "flos": 1535051880960.0, + "grad_norm": 0.008994278182213968, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76618505, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 1.22460938, + "step": 626, + "time_per_iteration": 4.739393472671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194547, + "balance_loss_mlp": 1.07128501, + "epoch": 0.12062331666025394, + "flos": 540914459136.0, + "grad_norm": 0.025390703641747513, + "language_loss": 1.01758838, + "learning_rate": 0.0009786225140303285, + "loss": 1.02953386, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 1.23486328, + "step": 627, + "time_per_iteration": 2.627995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_mlp": 1.06683803, + "epoch": 0.1208156983455175, + "flos": 512999496192.0, + "grad_norm": 0.027559316114759484, + "language_loss": 1.00245547, + "learning_rate": 0.0009785322986859634, + "loss": 1.0143609, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 1.23925781, + "step": 628, + "time_per_iteration": 2.657465696334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011787, + "balance_loss_mlp": 1.05481803, + "epoch": 0.12100808003078108, + "flos": 597589046784.0, + "grad_norm": 0.024406659961039724, + "language_loss": 1.01031506, + "learning_rate": 0.0009784418975588838, + "loss": 1.02210212, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 1.24121094, + "step": 629, + "time_per_iteration": 2.6953535079956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187008, + "balance_loss_mlp": 1.063555, + "epoch": 0.12120046171604464, + "flos": 524066515968.0, + "grad_norm": 0.02180733694842763, + "language_loss": 0.99517697, + "learning_rate": 0.0009783513106841862, + "loss": 1.00704694, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 1.23681641, + "step": 630, + "time_per_iteration": 2.7234978675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189331, + "balance_loss_mlp": 1.06687927, + "epoch": 0.1213928434013082, + "flos": 1557907057152.0, + "grad_norm": 0.011472153843238986, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77922034, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 1.2265625, + "step": 631, + "time_per_iteration": 4.975109100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184278, + "balance_loss_mlp": 1.06072986, + "epoch": 0.12158522508657175, + "flos": 496387869696.0, + "grad_norm": 0.025959921000511615, + "language_loss": 0.96498066, + "learning_rate": 0.0009781695798326854, + "loss": 0.97682351, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 1.23779297, + "step": 632, + "time_per_iteration": 2.5740485191345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_mlp": 1.0608983, + "epoch": 0.12177760677183531, + "flos": 476589703680.0, + "grad_norm": 0.025554774573744533, + "language_loss": 0.96275663, + "learning_rate": 0.0009780784359264365, + "loss": 0.9746002, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 1.23681641, + "step": 633, + "time_per_iteration": 2.604390859603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_mlp": 1.05543518, + "epoch": 0.12196998845709889, + "flos": 1471784635392.0, + "grad_norm": 0.009598735556444526, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75365245, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 1.21289062, + "step": 634, + "time_per_iteration": 4.757449626922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_mlp": 1.05424869, + "epoch": 0.12216237014236245, + "flos": 587748724224.0, + "grad_norm": 0.021555120902870813, + "language_loss": 0.93822527, + "learning_rate": 0.000977895591329867, + "loss": 0.94999647, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 1.23095703, + "step": 635, + "time_per_iteration": 2.7859792709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_mlp": 1.05851305, + "epoch": 0.12235475182762601, + "flos": 599106455040.0, + "grad_norm": 0.023775729584682537, + "language_loss": 0.96009773, + "learning_rate": 0.000977803890710533, + "loss": 0.97191262, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 1.23193359, + "step": 636, + "time_per_iteration": 2.76069712638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180701, + "balance_loss_mlp": 1.05762947, + "epoch": 0.12254713351288957, + "flos": 498760673280.0, + "grad_norm": 0.024707427516876792, + "language_loss": 1.00440359, + "learning_rate": 0.0009777120045912774, + "loss": 1.01621056, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 1.23291016, + "step": 637, + "time_per_iteration": 2.5980072021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118065, + "balance_loss_mlp": 1.05772126, + "epoch": 0.12273951519815314, + "flos": 606980204544.0, + "grad_norm": 0.02489341207380848, + "language_loss": 0.99891078, + "learning_rate": 0.0009776199330077736, + "loss": 1.01071739, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 1.23144531, + "step": 638, + "time_per_iteration": 2.704040288925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181154, + "balance_loss_mlp": 1.05841601, + "epoch": 0.1229318968834167, + "flos": 598984931328.0, + "grad_norm": 0.02631208797714665, + "language_loss": 1.02141118, + "learning_rate": 0.0009775276759957667, + "loss": 1.03322268, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 1.22949219, + "step": 639, + "time_per_iteration": 2.7442896366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.05700564, + "epoch": 0.12312427856868026, + "flos": 679588942848.0, + "grad_norm": 0.026802425502252814, + "language_loss": 1.01084137, + "learning_rate": 0.0009774352335910745, + "loss": 1.02264071, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 1.23144531, + "step": 640, + "time_per_iteration": 2.8294076919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117918, + "balance_loss_mlp": 1.05625129, + "epoch": 0.12331666025394382, + "flos": 610043218944.0, + "grad_norm": 0.020742791942005383, + "language_loss": 1.02118182, + "learning_rate": 0.000977342605829586, + "loss": 1.03297377, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 1.23144531, + "step": 641, + "time_per_iteration": 2.7078418731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180028, + "balance_loss_mlp": 1.05748129, + "epoch": 0.12350904193920739, + "flos": 763840118784.0, + "grad_norm": 0.025027209312251563, + "language_loss": 0.94737858, + "learning_rate": 0.0009772497927472623, + "loss": 0.95917892, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 1.22753906, + "step": 642, + "time_per_iteration": 3.0655579566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177096, + "balance_loss_mlp": 1.05454898, + "epoch": 0.12370142362447095, + "flos": 542049831936.0, + "grad_norm": 0.02608476880613399, + "language_loss": 0.96273685, + "learning_rate": 0.0009771567943801368, + "loss": 0.97450781, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 1.22753906, + "step": 643, + "time_per_iteration": 2.7343406677246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179725, + "balance_loss_mlp": 1.05727291, + "epoch": 0.12389380530973451, + "flos": 549252836352.0, + "grad_norm": 0.02435000122960196, + "language_loss": 0.99357152, + "learning_rate": 0.0009770636107643152, + "loss": 1.00536871, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 1.2265625, + "step": 644, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_mlp": 1.05516136, + "epoch": 0.12408618699499807, + "flos": 541352890368.0, + "grad_norm": 0.02246298440278387, + "language_loss": 0.95392644, + "learning_rate": 0.0009769702419359738, + "loss": 0.96570063, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 1.22460938, + "step": 645, + "time_per_iteration": 2.674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.05904841, + "epoch": 0.12427856868026164, + "flos": 747159361536.0, + "grad_norm": 0.023095982047370255, + "language_loss": 0.97586024, + "learning_rate": 0.000976876687931362, + "loss": 0.98767477, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 1.22607422, + "step": 646, + "time_per_iteration": 2.9833688735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189298, + "balance_loss_mlp": 1.06703711, + "epoch": 0.1244709503655252, + "flos": 534744769536.0, + "grad_norm": 0.03060863164707411, + "language_loss": 0.94044995, + "learning_rate": 0.0009767829487868005, + "loss": 0.95234299, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 1.22460938, + "step": 647, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182997, + "balance_loss_mlp": 1.06073558, + "epoch": 0.12466333205078876, + "flos": 509111285760.0, + "grad_norm": 0.028982594733012217, + "language_loss": 0.98960567, + "learning_rate": 0.000976689024538682, + "loss": 1.00143564, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 1.22460938, + "step": 648, + "time_per_iteration": 2.5837948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183924, + "balance_loss_mlp": 1.06171107, + "epoch": 0.12485571373605232, + "flos": 682639222272.0, + "grad_norm": 0.03213416167398649, + "language_loss": 0.97804081, + "learning_rate": 0.0009765949152234716, + "loss": 0.98988008, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 1.22412109, + "step": 649, + "time_per_iteration": 2.876009702682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_mlp": 1.07243347, + "epoch": 0.1250480954213159, + "flos": 1333198748160.0, + "grad_norm": 0.014891788740719425, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79879445, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 1.2109375, + "step": 650, + "time_per_iteration": 4.675558805465698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_mlp": 1.06152093, + "epoch": 0.12524047710657946, + "flos": 940196754432.0, + "grad_norm": 0.027794334398077363, + "language_loss": 0.91408408, + "learning_rate": 0.0009764061415379919, + "loss": 0.9259119, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 1.21435547, + "step": 651, + "time_per_iteration": 3.260758399963379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184193, + "balance_loss_mlp": 1.06288576, + "epoch": 0.12543285879184302, + "flos": 514900941312.0, + "grad_norm": 0.027655948956122736, + "language_loss": 0.97430605, + "learning_rate": 0.0009763114772410109, + "loss": 0.986148, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 1.21484375, + "step": 652, + "time_per_iteration": 2.60402512550354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179616, + "balance_loss_mlp": 1.05849957, + "epoch": 0.12562524047710658, + "flos": 719682829824.0, + "grad_norm": 0.022040452281994895, + "language_loss": 0.94100869, + "learning_rate": 0.0009762166280235146, + "loss": 0.95280486, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 1.21289062, + "step": 653, + "time_per_iteration": 2.953866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177042, + "balance_loss_mlp": 1.05592513, + "epoch": 0.12581762216237014, + "flos": 564798220800.0, + "grad_norm": 0.026345633512325176, + "language_loss": 0.96725851, + "learning_rate": 0.0009761215939223267, + "loss": 0.97902894, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 1.21289062, + "step": 654, + "time_per_iteration": 2.6936216354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176243, + "balance_loss_mlp": 1.0553174, + "epoch": 0.1260100038476337, + "flos": 482900382720.0, + "grad_norm": 0.0302310026354778, + "language_loss": 0.97697163, + "learning_rate": 0.0009760263749743428, + "loss": 0.98873413, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 1.2109375, + "step": 655, + "time_per_iteration": 2.5425992012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173716, + "balance_loss_mlp": 1.05302835, + "epoch": 0.12620238553289725, + "flos": 576701170176.0, + "grad_norm": 0.026173940013352312, + "language_loss": 0.96703827, + "learning_rate": 0.0009759309712165299, + "loss": 0.97877538, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 1.20849609, + "step": 656, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182641, + "balance_loss_mlp": 1.06185794, + "epoch": 0.12639476721816084, + "flos": 532185314304.0, + "grad_norm": 0.024272217680215723, + "language_loss": 1.00863099, + "learning_rate": 0.0009758353826859272, + "loss": 1.02045751, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 1.20947266, + "step": 657, + "time_per_iteration": 2.621317148208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183764, + "balance_loss_mlp": 1.06288576, + "epoch": 0.1265871489034244, + "flos": 691231380480.0, + "grad_norm": 0.02639198012969831, + "language_loss": 0.9913975, + "learning_rate": 0.0009757396094196456, + "loss": 1.00323522, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 1.21044922, + "step": 658, + "time_per_iteration": 2.8867759704589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183942, + "balance_loss_mlp": 1.06311166, + "epoch": 0.12677953058868796, + "flos": 538242212352.0, + "grad_norm": 0.02343039495549204, + "language_loss": 0.91435432, + "learning_rate": 0.0009756436514548673, + "loss": 0.92619371, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 1.20996094, + "step": 659, + "time_per_iteration": 2.8055155277252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179962, + "balance_loss_mlp": 1.05903614, + "epoch": 0.12697191227395152, + "flos": 520119908352.0, + "grad_norm": 0.02147737158217614, + "language_loss": 0.94944704, + "learning_rate": 0.0009755475088288466, + "loss": 0.96124667, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 1.2109375, + "step": 660, + "time_per_iteration": 2.713801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179144, + "balance_loss_mlp": 1.05826533, + "epoch": 0.12716429395921508, + "flos": 567665851392.0, + "grad_norm": 0.026687699897107686, + "language_loss": 0.99289566, + "learning_rate": 0.0009754511815789095, + "loss": 1.00468707, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 1.21044922, + "step": 661, + "time_per_iteration": 2.739250898361206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176549, + "balance_loss_mlp": 1.05590951, + "epoch": 0.12735667564447864, + "flos": 515141987328.0, + "grad_norm": 0.028028480179563667, + "language_loss": 0.94950283, + "learning_rate": 0.0009753546697424533, + "loss": 0.96126837, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 1.20800781, + "step": 662, + "time_per_iteration": 2.71746826171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180242, + "balance_loss_mlp": 1.05941188, + "epoch": 0.1275490573297422, + "flos": 542321077248.0, + "grad_norm": 0.02443290319898258, + "language_loss": 0.98755229, + "learning_rate": 0.0009752579733569475, + "loss": 0.99935466, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 1.20996094, + "step": 663, + "time_per_iteration": 2.631284713745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06030273, + "epoch": 0.12774143901500576, + "flos": 1562024853504.0, + "grad_norm": 0.010147906106003043, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.76060903, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 1.19335938, + "step": 664, + "time_per_iteration": 4.941519260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188286, + "balance_loss_mlp": 1.06783676, + "epoch": 0.12793382070026935, + "flos": 614873419776.0, + "grad_norm": 0.028758292375382164, + "language_loss": 1.00255466, + "learning_rate": 0.0009750640270890217, + "loss": 1.01443744, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 1.20605469, + "step": 665, + "time_per_iteration": 2.7382516860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185033, + "balance_loss_mlp": 1.06458378, + "epoch": 0.1281262023855329, + "flos": 709117367808.0, + "grad_norm": 0.02727882395737353, + "language_loss": 1.05972624, + "learning_rate": 0.0009749667772818983, + "loss": 1.0715766, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 1.20605469, + "step": 666, + "time_per_iteration": 2.961103677749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117968, + "balance_loss_mlp": 1.06104279, + "epoch": 0.12831858407079647, + "flos": 1428182572032.0, + "grad_norm": 0.005713660367986308, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78115624, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 1.1875, + "step": 667, + "time_per_iteration": 4.799788475036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180825, + "balance_loss_mlp": 1.06056714, + "epoch": 0.12851096575606002, + "flos": 450018232320.0, + "grad_norm": 0.027450705632443572, + "language_loss": 1.04045725, + "learning_rate": 0.0009747717245101093, + "loss": 1.05226541, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 1.20410156, + "step": 668, + "time_per_iteration": 2.503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181103, + "balance_loss_mlp": 1.0609405, + "epoch": 0.12870334744132358, + "flos": 480909614592.0, + "grad_norm": 0.024743463193645603, + "language_loss": 0.94192064, + "learning_rate": 0.00097467392162117, + "loss": 0.95373166, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 1.203125, + "step": 669, + "time_per_iteration": 2.6341683864593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176215, + "balance_loss_mlp": 1.05609953, + "epoch": 0.12889572912658714, + "flos": 640151064576.0, + "grad_norm": 0.020470833753638586, + "language_loss": 0.98179239, + "learning_rate": 0.0009745759344474708, + "loss": 0.99355447, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 1.20263672, + "step": 670, + "time_per_iteration": 2.8753654956817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175464, + "balance_loss_mlp": 1.05530083, + "epoch": 0.1290881108118507, + "flos": 510954333696.0, + "grad_norm": 0.02496408481001148, + "language_loss": 0.98669916, + "learning_rate": 0.0009744777630270536, + "loss": 0.99845386, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 1.203125, + "step": 671, + "time_per_iteration": 2.601480484008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173739, + "balance_loss_mlp": 1.05381489, + "epoch": 0.12928049249711426, + "flos": 672290611200.0, + "grad_norm": 0.0267777739546368, + "language_loss": 1.0349828, + "learning_rate": 0.000974379407398032, + "loss": 1.04672015, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 1.20068359, + "step": 672, + "time_per_iteration": 2.8746023178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176311, + "balance_loss_mlp": 1.05633891, + "epoch": 0.12947287418237785, + "flos": 794998743552.0, + "grad_norm": 0.021070447178693698, + "language_loss": 0.89884377, + "learning_rate": 0.0009742808675985913, + "loss": 0.91060686, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 1.20117188, + "step": 673, + "time_per_iteration": 3.106855869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178925, + "balance_loss_mlp": 1.05895269, + "epoch": 0.1296652558676414, + "flos": 486447490560.0, + "grad_norm": 0.028552559493613055, + "language_loss": 1.00707459, + "learning_rate": 0.0009741821436669876, + "loss": 1.0188638, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 1.20117188, + "step": 674, + "time_per_iteration": 2.6221611499786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_mlp": 1.06097043, + "epoch": 0.12985763755290497, + "flos": 454392537600.0, + "grad_norm": 0.03163366532216525, + "language_loss": 1.04449701, + "learning_rate": 0.0009740832356415492, + "loss": 1.05630445, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 1.19921875, + "step": 675, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179614, + "balance_loss_mlp": 1.05968916, + "epoch": 0.13005001923816853, + "flos": 826434617856.0, + "grad_norm": 0.02755997498495484, + "language_loss": 0.99148017, + "learning_rate": 0.0009739841435606756, + "loss": 1.00327623, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 1.20068359, + "step": 676, + "time_per_iteration": 3.026420831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180175, + "balance_loss_mlp": 1.06058431, + "epoch": 0.1302424009234321, + "flos": 532480754688.0, + "grad_norm": 0.02275953253130011, + "language_loss": 0.97366607, + "learning_rate": 0.0009738848674628377, + "loss": 0.98546779, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 1.19726562, + "step": 677, + "time_per_iteration": 2.710205554962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179059, + "balance_loss_mlp": 1.05927801, + "epoch": 0.13043478260869565, + "flos": 526916682240.0, + "grad_norm": 0.02441501439452981, + "language_loss": 0.97902691, + "learning_rate": 0.000973785407386578, + "loss": 0.99081755, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 1.19921875, + "step": 678, + "time_per_iteration": 2.7785394191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184892, + "balance_loss_mlp": 1.06553924, + "epoch": 0.1306271642939592, + "flos": 627416914944.0, + "grad_norm": 0.023801085732510874, + "language_loss": 0.94469249, + "learning_rate": 0.0009736857633705103, + "loss": 0.95654142, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 1.19482422, + "step": 679, + "time_per_iteration": 2.8619470596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177483, + "balance_loss_mlp": 1.05827415, + "epoch": 0.13081954597922277, + "flos": 551840489472.0, + "grad_norm": 0.024512943765722366, + "language_loss": 1.01033652, + "learning_rate": 0.0009735859354533196, + "loss": 1.02211142, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 1.19335938, + "step": 680, + "time_per_iteration": 2.6954457759857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176387, + "balance_loss_mlp": 1.05755925, + "epoch": 0.13101192766448633, + "flos": 537955504128.0, + "grad_norm": 0.029188130773433643, + "language_loss": 1.02405858, + "learning_rate": 0.0009734859236737628, + "loss": 1.03582239, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 1.18945312, + "step": 681, + "time_per_iteration": 2.606597661972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172364, + "balance_loss_mlp": 1.05353606, + "epoch": 0.13120430934974991, + "flos": 504513398784.0, + "grad_norm": 0.02625319928532985, + "language_loss": 1.02007055, + "learning_rate": 0.0009733857280706678, + "loss": 1.03179431, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 1.18945312, + "step": 682, + "time_per_iteration": 2.626211404800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_mlp": 1.05010605, + "epoch": 0.13139669103501347, + "flos": 615422641152.0, + "grad_norm": 0.025135553656080285, + "language_loss": 0.9321503, + "learning_rate": 0.000973285348682934, + "loss": 0.94383633, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 1.18603516, + "step": 683, + "time_per_iteration": 2.71779727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190269, + "balance_loss_mlp": 1.07296753, + "epoch": 0.13158907272027703, + "flos": 1488215614464.0, + "grad_norm": 0.025067429703540995, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7908864, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 1.17382812, + "step": 684, + "time_per_iteration": 4.811431169509888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168738, + "balance_loss_mlp": 1.05048192, + "epoch": 0.1317814544055406, + "flos": 987117614592.0, + "grad_norm": 0.026136533405527674, + "language_loss": 0.93269205, + "learning_rate": 0.0009730840387095046, + "loss": 0.94437939, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 1.18359375, + "step": 685, + "time_per_iteration": 3.3154938220977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117288, + "balance_loss_mlp": 1.05443382, + "epoch": 0.13197383609080415, + "flos": 612628870656.0, + "grad_norm": 0.026271684435729213, + "language_loss": 0.99177825, + "learning_rate": 0.0009729831082019642, + "loss": 1.00350702, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 1.18554688, + "step": 686, + "time_per_iteration": 2.79620623588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.06093395, + "epoch": 0.1321662177760677, + "flos": 495554668032.0, + "grad_norm": 0.02508782879826625, + "language_loss": 0.97052312, + "learning_rate": 0.0009728819940660958, + "loss": 0.98231786, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 1.18652344, + "step": 687, + "time_per_iteration": 2.779193162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178983, + "balance_loss_mlp": 1.06067955, + "epoch": 0.13235859946133127, + "flos": 496843765248.0, + "grad_norm": 0.02705130625621755, + "language_loss": 0.97550011, + "learning_rate": 0.0009727806963411557, + "loss": 0.98728997, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 1.18408203, + "step": 688, + "time_per_iteration": 2.5702319145202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.05883551, + "epoch": 0.13255098114659483, + "flos": 512767182336.0, + "grad_norm": 0.022910122085290585, + "language_loss": 0.96022904, + "learning_rate": 0.000972679215066471, + "loss": 0.97200048, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 1.18408203, + "step": 689, + "time_per_iteration": 2.64780592918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178761, + "balance_loss_mlp": 1.06050563, + "epoch": 0.13274336283185842, + "flos": 548399442432.0, + "grad_norm": 0.030606528220640358, + "language_loss": 1.08985806, + "learning_rate": 0.0009725775502814401, + "loss": 1.10164571, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 1.18359375, + "step": 690, + "time_per_iteration": 2.5830535888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179512, + "balance_loss_mlp": 1.06120849, + "epoch": 0.13293574451712198, + "flos": 642002844672.0, + "grad_norm": 0.023439513257655937, + "language_loss": 0.94635952, + "learning_rate": 0.0009724757020255327, + "loss": 0.95815468, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 1.18408203, + "step": 691, + "time_per_iteration": 2.827944278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183334, + "balance_loss_mlp": 1.06517375, + "epoch": 0.13312812620238554, + "flos": 492469459968.0, + "grad_norm": 0.028212898490696088, + "language_loss": 0.96836531, + "learning_rate": 0.0009723736703382902, + "loss": 0.98019874, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 1.18261719, + "step": 692, + "time_per_iteration": 2.6144213676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180114, + "balance_loss_mlp": 1.06200123, + "epoch": 0.1333205078876491, + "flos": 509949216768.0, + "grad_norm": 0.023005533645913036, + "language_loss": 0.90654016, + "learning_rate": 0.0009722714552593244, + "loss": 0.91834128, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 1.18212891, + "step": 693, + "time_per_iteration": 2.600128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180549, + "balance_loss_mlp": 1.06262743, + "epoch": 0.13351288957291266, + "flos": 419591477760.0, + "grad_norm": 0.029950659996273835, + "language_loss": 1.05475199, + "learning_rate": 0.000972169056828319, + "loss": 1.06655741, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 1.18017578, + "step": 694, + "time_per_iteration": 2.466643810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178338, + "balance_loss_mlp": 1.0606066, + "epoch": 0.13370527125817622, + "flos": 617050839552.0, + "grad_norm": 0.021764231653516302, + "language_loss": 0.95444119, + "learning_rate": 0.0009720664750850283, + "loss": 0.96622455, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 1.17822266, + "step": 695, + "time_per_iteration": 2.7776308059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173328, + "balance_loss_mlp": 1.05578816, + "epoch": 0.13389765294343978, + "flos": 627169138176.0, + "grad_norm": 0.026088042391715836, + "language_loss": 1.0165019, + "learning_rate": 0.0009719637100692784, + "loss": 1.0282352, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 1.17626953, + "step": 696, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175294, + "balance_loss_mlp": 1.0578016, + "epoch": 0.13409003462870334, + "flos": 610896612864.0, + "grad_norm": 0.027090913840535472, + "language_loss": 0.92017978, + "learning_rate": 0.0009718607618209661, + "loss": 0.93193275, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 1.17578125, + "step": 697, + "time_per_iteration": 2.8413584232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179845, + "balance_loss_mlp": 1.06235278, + "epoch": 0.13428241631396692, + "flos": 685087887360.0, + "grad_norm": 0.024883061853709334, + "language_loss": 0.95573747, + "learning_rate": 0.0009717576303800595, + "loss": 0.96753585, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 1.17578125, + "step": 698, + "time_per_iteration": 3.047100782394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175386, + "balance_loss_mlp": 1.05794048, + "epoch": 0.13447479799923048, + "flos": 509818960896.0, + "grad_norm": 0.024888049065051182, + "language_loss": 0.95325053, + "learning_rate": 0.0009716543157865975, + "loss": 0.96500432, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 1.17529297, + "step": 699, + "time_per_iteration": 2.7481272220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_mlp": 1.05878782, + "epoch": 0.13466717968449404, + "flos": 899058819072.0, + "grad_norm": 0.023872779385430955, + "language_loss": 0.92076075, + "learning_rate": 0.0009715508180806907, + "loss": 0.93252313, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 1.17529297, + "step": 700, + "time_per_iteration": 3.2107367515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173529, + "balance_loss_mlp": 1.05660856, + "epoch": 0.1348595613697576, + "flos": 991694034432.0, + "grad_norm": 0.023513798430807663, + "language_loss": 1.00262749, + "learning_rate": 0.0009714471373025202, + "loss": 1.01436281, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 1.16992188, + "step": 701, + "time_per_iteration": 3.3966751098632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173715, + "balance_loss_mlp": 1.0566988, + "epoch": 0.13505194305502116, + "flos": 488811561984.0, + "grad_norm": 0.028001983236069502, + "language_loss": 0.99373382, + "learning_rate": 0.0009713432734923386, + "loss": 1.00547099, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 1.17089844, + "step": 702, + "time_per_iteration": 2.615107536315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171408, + "balance_loss_mlp": 1.05439234, + "epoch": 0.13524432474028472, + "flos": 614519582208.0, + "grad_norm": 0.024192478681639117, + "language_loss": 0.96606487, + "learning_rate": 0.0009712392266904696, + "loss": 0.97777891, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 1.17089844, + "step": 703, + "time_per_iteration": 2.7448034286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174325, + "balance_loss_mlp": 1.05740499, + "epoch": 0.13543670642554828, + "flos": 906274558464.0, + "grad_norm": 0.025492480769094515, + "language_loss": 0.96012545, + "learning_rate": 0.0009711349969373076, + "loss": 0.97186869, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 1.16992188, + "step": 704, + "time_per_iteration": 3.1337268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172794, + "balance_loss_mlp": 1.05596876, + "epoch": 0.13562908811081184, + "flos": 551747163648.0, + "grad_norm": 0.026772975251671254, + "language_loss": 0.91034031, + "learning_rate": 0.0009710305842733178, + "loss": 0.9220683, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 1.16894531, + "step": 705, + "time_per_iteration": 2.7571139335632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_mlp": 1.05031061, + "epoch": 0.1358214697960754, + "flos": 509037425664.0, + "grad_norm": 0.024292049069741084, + "language_loss": 0.98220038, + "learning_rate": 0.0009709259887390373, + "loss": 0.99387223, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 1.16943359, + "step": 706, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168004, + "balance_loss_mlp": 1.05141699, + "epoch": 0.136013851481339, + "flos": 529923300864.0, + "grad_norm": 0.025926611739077732, + "language_loss": 1.00068641, + "learning_rate": 0.0009708212103750737, + "loss": 1.01236641, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 1.16650391, + "step": 707, + "time_per_iteration": 2.6197190284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168587, + "balance_loss_mlp": 1.05219126, + "epoch": 0.13620623316660255, + "flos": 660320532480.0, + "grad_norm": 0.02235622943703988, + "language_loss": 0.96270919, + "learning_rate": 0.0009707162492221051, + "loss": 0.97439504, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 1.16455078, + "step": 708, + "time_per_iteration": 2.8917648792266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171818, + "balance_loss_mlp": 1.05542207, + "epoch": 0.1363986148518661, + "flos": 673082880000.0, + "grad_norm": 0.027649047287573853, + "language_loss": 0.98132068, + "learning_rate": 0.0009706111053208815, + "loss": 0.99303889, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 1.16455078, + "step": 709, + "time_per_iteration": 2.7827165126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173191, + "balance_loss_mlp": 1.05669987, + "epoch": 0.13659099653712967, + "flos": 474004051968.0, + "grad_norm": 0.02773643003805471, + "language_loss": 0.94597077, + "learning_rate": 0.0009705057787122232, + "loss": 0.9577027, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 1.16552734, + "step": 710, + "time_per_iteration": 2.542836904525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169067, + "balance_loss_mlp": 1.05286229, + "epoch": 0.13678337822239323, + "flos": 453647932416.0, + "grad_norm": 0.0248615327032158, + "language_loss": 0.9884814, + "learning_rate": 0.0009704002694370216, + "loss": 1.00017214, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 1.16259766, + "step": 711, + "time_per_iteration": 2.550527811050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164533, + "balance_loss_mlp": 1.04842281, + "epoch": 0.13697575990765679, + "flos": 520625468928.0, + "grad_norm": 0.0274811578413112, + "language_loss": 0.97066599, + "learning_rate": 0.0009702945775362388, + "loss": 0.98231125, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 1.16162109, + "step": 712, + "time_per_iteration": 2.56953501701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116862, + "balance_loss_mlp": 1.05246294, + "epoch": 0.13716814159292035, + "flos": 481365510144.0, + "grad_norm": 0.025544817797380492, + "language_loss": 0.98621845, + "learning_rate": 0.0009701887030509086, + "loss": 0.99790466, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 1.16210938, + "step": 713, + "time_per_iteration": 2.6443872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_mlp": 1.05663013, + "epoch": 0.1373605232781839, + "flos": 546749776896.0, + "grad_norm": 0.02672517687154734, + "language_loss": 1.02031791, + "learning_rate": 0.0009700826460221346, + "loss": 1.03204811, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 1.16455078, + "step": 714, + "time_per_iteration": 2.6742734909057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_mlp": 1.05508566, + "epoch": 0.1375529049634475, + "flos": 710070091776.0, + "grad_norm": 0.027473841831572973, + "language_loss": 1.03736091, + "learning_rate": 0.0009699764064910921, + "loss": 1.04907441, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 1.16308594, + "step": 715, + "time_per_iteration": 2.8945000171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_mlp": 1.05281401, + "epoch": 0.13774528664871105, + "flos": 487676189184.0, + "grad_norm": 0.02500038679906112, + "language_loss": 0.96403199, + "learning_rate": 0.0009698699844990268, + "loss": 0.9757241, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 1.16455078, + "step": 716, + "time_per_iteration": 2.638272762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116569, + "balance_loss_mlp": 1.04972363, + "epoch": 0.1379376683339746, + "flos": 681458187264.0, + "grad_norm": 0.024933229917961583, + "language_loss": 0.9565106, + "learning_rate": 0.0009697633800872555, + "loss": 0.96816742, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 1.16015625, + "step": 717, + "time_per_iteration": 2.8989553451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168974, + "balance_loss_mlp": 1.05310297, + "epoch": 0.13813005001923817, + "flos": 612225368064.0, + "grad_norm": 0.02330012063083705, + "language_loss": 1.0130372, + "learning_rate": 0.0009696565932971655, + "loss": 1.02472687, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 1.15917969, + "step": 718, + "time_per_iteration": 2.8472671508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171117, + "balance_loss_mlp": 1.05524576, + "epoch": 0.13832243170450173, + "flos": 589926144000.0, + "grad_norm": 0.027418468702626427, + "language_loss": 0.98498988, + "learning_rate": 0.0009695496241702153, + "loss": 0.99670106, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 1.15917969, + "step": 719, + "time_per_iteration": 2.786895990371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167345, + "balance_loss_mlp": 1.05180764, + "epoch": 0.1385148133897653, + "flos": 701319479808.0, + "grad_norm": 0.026285913371991803, + "language_loss": 0.94868541, + "learning_rate": 0.0009694424727479339, + "loss": 0.96035892, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 1.15576172, + "step": 720, + "time_per_iteration": 2.921644926071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117298, + "balance_loss_mlp": 1.05729949, + "epoch": 0.13870719507502885, + "flos": 599366966784.0, + "grad_norm": 0.024279001882637877, + "language_loss": 0.97845113, + "learning_rate": 0.0009693351390719213, + "loss": 0.99018097, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 1.15722656, + "step": 721, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168632, + "balance_loss_mlp": 1.05304694, + "epoch": 0.1388995767602924, + "flos": 587748724224.0, + "grad_norm": 0.03212240351747381, + "language_loss": 0.98596126, + "learning_rate": 0.000969227623183848, + "loss": 0.99764758, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 1.15625, + "step": 722, + "time_per_iteration": 2.7723541259765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_mlp": 1.05205071, + "epoch": 0.139091958445556, + "flos": 652362189312.0, + "grad_norm": 0.025655198862846312, + "language_loss": 0.99224544, + "learning_rate": 0.0009691199251254554, + "loss": 1.00392079, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 1.15527344, + "step": 723, + "time_per_iteration": 2.8426058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165537, + "balance_loss_mlp": 1.05019021, + "epoch": 0.13928434013081956, + "flos": 576905286144.0, + "grad_norm": 0.022500478429048027, + "language_loss": 0.9243086, + "learning_rate": 0.0009690120449385555, + "loss": 0.93596393, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 1.15380859, + "step": 724, + "time_per_iteration": 2.7558276653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168709, + "balance_loss_mlp": 1.05307627, + "epoch": 0.13947672181608312, + "flos": 564314127360.0, + "grad_norm": 0.02294482348940274, + "language_loss": 1.00981367, + "learning_rate": 0.0009689039826650312, + "loss": 1.02150071, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 1.15673828, + "step": 725, + "time_per_iteration": 2.784708261489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211281, + "balance_loss_mlp": 1.09550476, + "epoch": 0.13966910350134668, + "flos": 1524949045248.0, + "grad_norm": 0.02639881420994122, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77734339, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 1.15820312, + "step": 726, + "time_per_iteration": 4.9523255825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171441, + "balance_loss_mlp": 1.05604661, + "epoch": 0.13986148518661023, + "flos": 500855500800.0, + "grad_norm": 0.0321160389091748, + "language_loss": 0.98954523, + "learning_rate": 0.0009686873120259941, + "loss": 1.00125957, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 1.15429688, + "step": 727, + "time_per_iteration": 2.584141731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173326, + "balance_loss_mlp": 1.05850363, + "epoch": 0.1400538668718738, + "flos": 599849058816.0, + "grad_norm": 0.027531106684590426, + "language_loss": 0.93834305, + "learning_rate": 0.0009685787037446004, + "loss": 0.95007634, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 1.1484375, + "step": 728, + "time_per_iteration": 2.770592451095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_mlp": 1.05520177, + "epoch": 0.14024624855713735, + "flos": 595168579584.0, + "grad_norm": 0.026051179565135866, + "language_loss": 0.98294961, + "learning_rate": 0.0009684699135448201, + "loss": 0.99465179, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 1.15039062, + "step": 729, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_mlp": 1.04985154, + "epoch": 0.1404386302424009, + "flos": 507585145344.0, + "grad_norm": 0.02205061924934426, + "language_loss": 0.98307908, + "learning_rate": 0.0009683609414688895, + "loss": 0.99472773, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 1.15039062, + "step": 730, + "time_per_iteration": 2.700016975402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.05254078, + "epoch": 0.14063101192766447, + "flos": 574515018240.0, + "grad_norm": 0.021243768346974407, + "language_loss": 0.95329058, + "learning_rate": 0.0009682517875591154, + "loss": 0.96496415, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 1.1484375, + "step": 731, + "time_per_iteration": 2.743590831756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.05264843, + "epoch": 0.14082339361292806, + "flos": 565764406272.0, + "grad_norm": 0.02284757167221282, + "language_loss": 0.93998873, + "learning_rate": 0.0009681424518578749, + "loss": 0.95166153, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 1.14648438, + "step": 732, + "time_per_iteration": 2.757690668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166596, + "balance_loss_mlp": 1.05215514, + "epoch": 0.14101577529819162, + "flos": 464582694912.0, + "grad_norm": 0.02112517179619274, + "language_loss": 0.95363593, + "learning_rate": 0.000968032934407616, + "loss": 0.96530199, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 1.14453125, + "step": 733, + "time_per_iteration": 2.6260647773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_mlp": 1.05257201, + "epoch": 0.14120815698345518, + "flos": 597261405696.0, + "grad_norm": 0.02235342076428548, + "language_loss": 0.90822989, + "learning_rate": 0.0009679232352508571, + "loss": 0.91990006, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 1.14453125, + "step": 734, + "time_per_iteration": 2.7677996158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167689, + "balance_loss_mlp": 1.05334342, + "epoch": 0.14140053866871874, + "flos": 536231978496.0, + "grad_norm": 0.023954026934244203, + "language_loss": 0.90350544, + "learning_rate": 0.0009678133544301871, + "loss": 0.91518235, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 1.14355469, + "step": 735, + "time_per_iteration": 2.6668286323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165912, + "balance_loss_mlp": 1.05147135, + "epoch": 0.1415929203539823, + "flos": 521276748288.0, + "grad_norm": 0.01836780541558419, + "language_loss": 0.98091269, + "learning_rate": 0.0009677032919882658, + "loss": 0.99257177, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 1.14453125, + "step": 736, + "time_per_iteration": 2.654975652694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_mlp": 1.0601368, + "epoch": 0.14178530203924586, + "flos": 483301883904.0, + "grad_norm": 0.025248480485652293, + "language_loss": 1.00008237, + "learning_rate": 0.000967593047967823, + "loss": 1.01183295, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 1.14941406, + "step": 737, + "time_per_iteration": 2.529147148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167635, + "balance_loss_mlp": 1.05319452, + "epoch": 0.14197768372450942, + "flos": 677839220736.0, + "grad_norm": 0.02278890168576414, + "language_loss": 0.9561522, + "learning_rate": 0.0009674826224116593, + "loss": 0.96782857, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 1.14453125, + "step": 738, + "time_per_iteration": 2.8032455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.05606639, + "epoch": 0.14217006540977298, + "flos": 446992147968.0, + "grad_norm": 0.026055784762538982, + "language_loss": 0.97800839, + "learning_rate": 0.0009673720153626455, + "loss": 0.989712, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 1.14306641, + "step": 739, + "time_per_iteration": 2.629868984222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172861, + "balance_loss_mlp": 1.05889642, + "epoch": 0.14236244709503657, + "flos": 497477580288.0, + "grad_norm": 0.02475738760241807, + "language_loss": 0.95941108, + "learning_rate": 0.0009672612268637235, + "loss": 0.97113973, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 1.13964844, + "step": 740, + "time_per_iteration": 2.6037824153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170194, + "balance_loss_mlp": 1.05618262, + "epoch": 0.14255482878030012, + "flos": 649479095808.0, + "grad_norm": 0.03387034378547869, + "language_loss": 0.95329261, + "learning_rate": 0.0009671502569579048, + "loss": 0.96499455, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 1.14013672, + "step": 741, + "time_per_iteration": 2.7700846195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.05657792, + "epoch": 0.14274721046556368, + "flos": 537274025472.0, + "grad_norm": 0.02433568326488268, + "language_loss": 0.98081231, + "learning_rate": 0.0009670391056882719, + "loss": 0.99251777, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 1.13964844, + "step": 742, + "time_per_iteration": 2.696019172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174036, + "balance_loss_mlp": 1.06002402, + "epoch": 0.14293959215082724, + "flos": 958583572992.0, + "grad_norm": 0.027423351639808666, + "language_loss": 0.96458268, + "learning_rate": 0.0009669277730979776, + "loss": 0.97632295, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 1.14013672, + "step": 743, + "time_per_iteration": 3.2084367275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174905, + "balance_loss_mlp": 1.06103587, + "epoch": 0.1431319738360908, + "flos": 694385719296.0, + "grad_norm": 0.02304461389980259, + "language_loss": 0.94654781, + "learning_rate": 0.0009668162592302449, + "loss": 0.9582969, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 1.13867188, + "step": 744, + "time_per_iteration": 2.8862292766571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184206, + "balance_loss_mlp": 1.07009852, + "epoch": 0.14332435552135436, + "flos": 566502280704.0, + "grad_norm": 0.024928546312887438, + "language_loss": 0.9473027, + "learning_rate": 0.0009667045641283676, + "loss": 0.95914471, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 1.14111328, + "step": 745, + "time_per_iteration": 2.6714677810668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_mlp": 1.05672932, + "epoch": 0.14351673720661792, + "flos": 739695845376.0, + "grad_norm": 0.027004630074695047, + "language_loss": 1.03854704, + "learning_rate": 0.0009665926878357092, + "loss": 1.05025315, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 1.13867188, + "step": 746, + "time_per_iteration": 2.9414963722229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168037, + "balance_loss_mlp": 1.05416811, + "epoch": 0.14370911889188148, + "flos": 550351279104.0, + "grad_norm": 0.024394803732961844, + "language_loss": 0.99195439, + "learning_rate": 0.0009664806303957043, + "loss": 1.00363481, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 1.13867188, + "step": 747, + "time_per_iteration": 2.6798276901245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175063, + "balance_loss_mlp": 1.06109881, + "epoch": 0.14390150057714507, + "flos": 591589271040.0, + "grad_norm": 0.028912253716933817, + "language_loss": 0.96970344, + "learning_rate": 0.0009663683918518571, + "loss": 0.98145401, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 1.13964844, + "step": 748, + "time_per_iteration": 2.894670248031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172034, + "balance_loss_mlp": 1.05845118, + "epoch": 0.14409388226240863, + "flos": 592144496640.0, + "grad_norm": 0.025560266799661176, + "language_loss": 0.96381319, + "learning_rate": 0.0009662559722477428, + "loss": 0.97553355, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 1.13574219, + "step": 749, + "time_per_iteration": 2.702796220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193848, + "balance_loss_mlp": 1.08131409, + "epoch": 0.1442862639476722, + "flos": 1514654828544.0, + "grad_norm": 0.02305864885865106, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77356815, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 1.125, + "step": 750, + "time_per_iteration": 5.010634660720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_mlp": 1.05287659, + "epoch": 0.14447864563293575, + "flos": 497855612928.0, + "grad_norm": 0.023714468612350204, + "language_loss": 0.97989428, + "learning_rate": 0.0009660305900333632, + "loss": 0.99155927, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 1.13623047, + "step": 751, + "time_per_iteration": 2.7064144611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_mlp": 1.05845106, + "epoch": 0.1446710273181993, + "flos": 590794274304.0, + "grad_norm": 0.03190287595859636, + "language_loss": 0.91963172, + "learning_rate": 0.0009659176275105992, + "loss": 0.93135297, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 1.13671875, + "step": 752, + "time_per_iteration": 2.7171401977539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171619, + "balance_loss_mlp": 1.05803668, + "epoch": 0.14486340900346287, + "flos": 587012851200.0, + "grad_norm": 0.023715921645424867, + "language_loss": 0.93508279, + "learning_rate": 0.0009658044841025701, + "loss": 0.94679892, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 1.13574219, + "step": 753, + "time_per_iteration": 2.77504563331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172686, + "balance_loss_mlp": 1.05900788, + "epoch": 0.14505579068872643, + "flos": 505740096000.0, + "grad_norm": 0.025730958483317315, + "language_loss": 0.9055903, + "learning_rate": 0.0009656911598532021, + "loss": 0.91731715, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 1.13671875, + "step": 754, + "time_per_iteration": 2.642886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172881, + "balance_loss_mlp": 1.05925071, + "epoch": 0.14524817237399, + "flos": 487815177216.0, + "grad_norm": 0.025261406861214447, + "language_loss": 0.98625988, + "learning_rate": 0.0009655776548064917, + "loss": 0.9979887, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 1.13623047, + "step": 755, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169342, + "balance_loss_mlp": 1.05571139, + "epoch": 0.14544055405925355, + "flos": 729449292288.0, + "grad_norm": 0.025093779151575485, + "language_loss": 0.97407329, + "learning_rate": 0.0009654639690065054, + "loss": 0.98576677, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 1.13623047, + "step": 756, + "time_per_iteration": 2.867976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173831, + "balance_loss_mlp": 1.06024873, + "epoch": 0.14563293574451713, + "flos": 594786544128.0, + "grad_norm": 0.02769433731610086, + "language_loss": 0.96328217, + "learning_rate": 0.00096535010249738, + "loss": 0.97502041, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 1.13574219, + "step": 757, + "time_per_iteration": 2.718595266342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171947, + "balance_loss_mlp": 1.05879402, + "epoch": 0.1458253174297807, + "flos": 561622414848.0, + "grad_norm": 0.027253539371253223, + "language_loss": 0.93671888, + "learning_rate": 0.0009652360553233224, + "loss": 0.94843829, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 1.13134766, + "step": 758, + "time_per_iteration": 2.732665538787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_mlp": 1.06835938, + "epoch": 0.14601769911504425, + "flos": 1561186922496.0, + "grad_norm": 0.016548141494889222, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74954832, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 1.12695312, + "step": 759, + "time_per_iteration": 4.9278404712677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_mlp": 1.04840457, + "epoch": 0.1462100808003078, + "flos": 867822331392.0, + "grad_norm": 0.024551380524627048, + "language_loss": 0.89752859, + "learning_rate": 0.0009650074191575883, + "loss": 0.90914273, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 1.12988281, + "step": 760, + "time_per_iteration": 3.18084716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011658, + "balance_loss_mlp": 1.05302811, + "epoch": 0.14640246248557137, + "flos": 524029585920.0, + "grad_norm": 0.025729752682943422, + "language_loss": 0.95023656, + "learning_rate": 0.0009648928302546766, + "loss": 0.96189463, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 1.12744141, + "step": 761, + "time_per_iteration": 2.707385301589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161728, + "balance_loss_mlp": 1.04895639, + "epoch": 0.14659484417083493, + "flos": 1032241089024.0, + "grad_norm": 0.022974522077421757, + "language_loss": 0.94352418, + "learning_rate": 0.0009647780608643613, + "loss": 0.95514143, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 1.12744141, + "step": 762, + "time_per_iteration": 3.357776165008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116078, + "balance_loss_mlp": 1.04848516, + "epoch": 0.1467872258560985, + "flos": 501656501760.0, + "grad_norm": 0.027279773355913427, + "language_loss": 0.99627388, + "learning_rate": 0.0009646631110312001, + "loss": 1.00788176, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 1.12255859, + "step": 763, + "time_per_iteration": 2.629650115966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159049, + "balance_loss_mlp": 1.04665887, + "epoch": 0.14697960754136205, + "flos": 548935928832.0, + "grad_norm": 0.020644179018096606, + "language_loss": 0.95446718, + "learning_rate": 0.0009645479807998203, + "loss": 0.96605766, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 1.12353516, + "step": 764, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_mlp": 1.04510117, + "epoch": 0.14717198922662564, + "flos": 518901943296.0, + "grad_norm": 0.021535065255329562, + "language_loss": 0.99812603, + "learning_rate": 0.0009644326702149196, + "loss": 1.00970435, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 1.12695312, + "step": 765, + "time_per_iteration": 2.711500406265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158907, + "balance_loss_mlp": 1.04618227, + "epoch": 0.1473643709118892, + "flos": 733483221504.0, + "grad_norm": 0.02504361772442387, + "language_loss": 0.95452881, + "learning_rate": 0.0009643171793212653, + "loss": 0.96611786, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 1.12695312, + "step": 766, + "time_per_iteration": 3.130798578262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163931, + "balance_loss_mlp": 1.05115891, + "epoch": 0.14755675259715276, + "flos": 621668192256.0, + "grad_norm": 0.027740201354691706, + "language_loss": 0.99870968, + "learning_rate": 0.0009642015081636952, + "loss": 1.01034904, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 1.12744141, + "step": 767, + "time_per_iteration": 2.701939344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160055, + "balance_loss_mlp": 1.04761696, + "epoch": 0.14774913428241632, + "flos": 453172571136.0, + "grad_norm": 0.025159341457135456, + "language_loss": 0.98449206, + "learning_rate": 0.0009640856567871166, + "loss": 0.99609256, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 1.12402344, + "step": 768, + "time_per_iteration": 2.516721725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_mlp": 1.05262613, + "epoch": 0.14794151596767988, + "flos": 838654474752.0, + "grad_norm": 0.02612823197324643, + "language_loss": 0.99416363, + "learning_rate": 0.0009639696252365072, + "loss": 1.00581241, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 1.12207031, + "step": 769, + "time_per_iteration": 3.06074857711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167068, + "balance_loss_mlp": 1.05472481, + "epoch": 0.14813389765294344, + "flos": 687404295168.0, + "grad_norm": 0.02602975967937929, + "language_loss": 0.89651555, + "learning_rate": 0.0009638534135569144, + "loss": 0.90818626, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 1.12304688, + "step": 770, + "time_per_iteration": 2.9440436363220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169876, + "balance_loss_mlp": 1.05753326, + "epoch": 0.148326279338207, + "flos": 510943600128.0, + "grad_norm": 0.028093178265757666, + "language_loss": 1.01150489, + "learning_rate": 0.0009637370217934554, + "loss": 1.02320373, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 1.12304688, + "step": 771, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166681, + "balance_loss_mlp": 1.05443311, + "epoch": 0.14851866102347056, + "flos": 589331260416.0, + "grad_norm": 0.028336871459981, + "language_loss": 0.90924722, + "learning_rate": 0.0009636204499913175, + "loss": 0.92091405, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 1.12207031, + "step": 772, + "time_per_iteration": 2.8592941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157961, + "balance_loss_mlp": 1.04609525, + "epoch": 0.14871104270873411, + "flos": 692247230976.0, + "grad_norm": 0.030313888046816524, + "language_loss": 0.95830965, + "learning_rate": 0.0009635036981957581, + "loss": 0.96988928, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 1.11816406, + "step": 773, + "time_per_iteration": 2.8690600395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160765, + "balance_loss_mlp": 1.04904246, + "epoch": 0.1489034243939977, + "flos": 656282600448.0, + "grad_norm": 0.02808100337337059, + "language_loss": 0.98035401, + "learning_rate": 0.0009633867664521043, + "loss": 0.99196172, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 1.11669922, + "step": 774, + "time_per_iteration": 2.812833070755005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159463, + "balance_loss_mlp": 1.04788363, + "epoch": 0.14909580607926126, + "flos": 476795821056.0, + "grad_norm": 0.030787585825694654, + "language_loss": 0.97385693, + "learning_rate": 0.0009632696548057527, + "loss": 0.98545158, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 1.11523438, + "step": 775, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_mlp": 1.04910243, + "epoch": 0.14928818776452482, + "flos": 612283765248.0, + "grad_norm": 0.030552265213122824, + "language_loss": 0.94746792, + "learning_rate": 0.0009631523633021704, + "loss": 0.95907569, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 1.11621094, + "step": 776, + "time_per_iteration": 2.789336919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.04408133, + "epoch": 0.14948056944978838, + "flos": 562916241408.0, + "grad_norm": 0.02653866309736765, + "language_loss": 0.98006344, + "learning_rate": 0.0009630348919868936, + "loss": 0.99161637, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 1.11132812, + "step": 777, + "time_per_iteration": 2.708918571472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115506, + "balance_loss_mlp": 1.04395676, + "epoch": 0.14967295113505194, + "flos": 450111558144.0, + "grad_norm": 0.02761804701826243, + "language_loss": 0.92444694, + "learning_rate": 0.0009629172409055293, + "loss": 0.93599755, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 1.11035156, + "step": 778, + "time_per_iteration": 2.522322177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_mlp": 1.0435462, + "epoch": 0.1498653328203155, + "flos": 572428922880.0, + "grad_norm": 0.02112796064723151, + "language_loss": 0.9446094, + "learning_rate": 0.0009627994101037531, + "loss": 0.9561559, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 1.11035156, + "step": 779, + "time_per_iteration": 2.7606184482574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154399, + "balance_loss_mlp": 1.0433439, + "epoch": 0.15005771450557906, + "flos": 632407570944.0, + "grad_norm": 0.02232887996041627, + "language_loss": 0.98232067, + "learning_rate": 0.0009626813996273114, + "loss": 0.99386466, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 1.10986328, + "step": 780, + "time_per_iteration": 2.8442463874816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_mlp": 1.04553461, + "epoch": 0.15025009619084262, + "flos": 579165298176.0, + "grad_norm": 0.021576328362923832, + "language_loss": 0.96611506, + "learning_rate": 0.0009625632095220198, + "loss": 0.97768044, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 1.109375, + "step": 781, + "time_per_iteration": 2.823941469192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156174, + "balance_loss_mlp": 1.04492784, + "epoch": 0.1504424778761062, + "flos": 484856222208.0, + "grad_norm": 0.023769174200548453, + "language_loss": 0.96595448, + "learning_rate": 0.0009624448398337637, + "loss": 0.97751617, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 1.11181641, + "step": 782, + "time_per_iteration": 2.517115354537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153917, + "balance_loss_mlp": 1.04286146, + "epoch": 0.15063485956136977, + "flos": 763894513152.0, + "grad_norm": 0.022118467112767815, + "language_loss": 0.97773027, + "learning_rate": 0.0009623262906084984, + "loss": 0.98926944, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 1.10986328, + "step": 783, + "time_per_iteration": 2.9971072673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156171, + "balance_loss_mlp": 1.04554462, + "epoch": 0.15082724124663333, + "flos": 498676079616.0, + "grad_norm": 0.021733375764601555, + "language_loss": 0.99047554, + "learning_rate": 0.0009622075618922486, + "loss": 1.00203729, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 1.10546875, + "step": 784, + "time_per_iteration": 2.7209272384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161923, + "balance_loss_mlp": 1.05110586, + "epoch": 0.15101962293189689, + "flos": 510722019840.0, + "grad_norm": 0.02414763506099098, + "language_loss": 0.95223093, + "learning_rate": 0.0009620886537311091, + "loss": 0.96385014, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 1.10742188, + "step": 785, + "time_per_iteration": 2.668501138687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154688, + "balance_loss_mlp": 1.04406226, + "epoch": 0.15121200461716044, + "flos": 458701714944.0, + "grad_norm": 0.026890312379790088, + "language_loss": 0.97208995, + "learning_rate": 0.000961969566171244, + "loss": 0.98363686, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 1.10546875, + "step": 786, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153217, + "balance_loss_mlp": 1.04278123, + "epoch": 0.151404386302424, + "flos": 539017016832.0, + "grad_norm": 0.02528800532756524, + "language_loss": 1.00058115, + "learning_rate": 0.0009618502992588873, + "loss": 1.01211333, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 1.10351562, + "step": 787, + "time_per_iteration": 2.6463584899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154208, + "balance_loss_mlp": 1.04358232, + "epoch": 0.15159676798768756, + "flos": 689616643584.0, + "grad_norm": 0.023869082053813537, + "language_loss": 0.98612797, + "learning_rate": 0.0009617308530403424, + "loss": 0.99766994, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 1.10546875, + "step": 788, + "time_per_iteration": 3.065110921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158206, + "balance_loss_mlp": 1.04758012, + "epoch": 0.15178914967295112, + "flos": 546432869376.0, + "grad_norm": 0.025092696297707027, + "language_loss": 0.95288265, + "learning_rate": 0.0009616112275619825, + "loss": 0.96446472, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 1.10546875, + "step": 789, + "time_per_iteration": 2.7197253704071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_mlp": 1.0478847, + "epoch": 0.1519815313582147, + "flos": 512814845952.0, + "grad_norm": 0.020890571468345706, + "language_loss": 0.90545368, + "learning_rate": 0.0009614914228702503, + "loss": 0.91703737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 1.10400391, + "step": 790, + "time_per_iteration": 2.6894142627716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158071, + "balance_loss_mlp": 1.04782641, + "epoch": 0.15217391304347827, + "flos": 685457187840.0, + "grad_norm": 0.02448742031060442, + "language_loss": 0.96480352, + "learning_rate": 0.0009613714390116581, + "loss": 0.97638422, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 1.1015625, + "step": 791, + "time_per_iteration": 2.9898860454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155788, + "balance_loss_mlp": 1.04568636, + "epoch": 0.15236629472874183, + "flos": 645445893120.0, + "grad_norm": 0.023088199171654812, + "language_loss": 0.93995309, + "learning_rate": 0.0009612512760327879, + "loss": 0.95151103, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 1.10009766, + "step": 792, + "time_per_iteration": 2.855648994445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154532, + "balance_loss_mlp": 1.0444783, + "epoch": 0.1525586764140054, + "flos": 413764892160.0, + "grad_norm": 0.024948238648346503, + "language_loss": 0.97790802, + "learning_rate": 0.0009611309339802909, + "loss": 0.98945332, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 1.09960938, + "step": 793, + "time_per_iteration": 2.4684345722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153777, + "balance_loss_mlp": 1.04372334, + "epoch": 0.15275105809926895, + "flos": 804233448960.0, + "grad_norm": 0.02131820977076166, + "language_loss": 0.93039513, + "learning_rate": 0.0009610104129008881, + "loss": 0.94193292, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 1.09960938, + "step": 794, + "time_per_iteration": 3.1013269424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155691, + "balance_loss_mlp": 1.04554129, + "epoch": 0.1529434397845325, + "flos": 613542663168.0, + "grad_norm": 0.024012716250022468, + "language_loss": 0.97966266, + "learning_rate": 0.0009608897128413701, + "loss": 0.99121952, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 1.10058594, + "step": 795, + "time_per_iteration": 2.729837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154149, + "balance_loss_mlp": 1.04419053, + "epoch": 0.15313582146979607, + "flos": 616471418880.0, + "grad_norm": 0.02134077894827986, + "language_loss": 0.93399352, + "learning_rate": 0.0009607688338485965, + "loss": 0.945535, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 1.09863281, + "step": 796, + "time_per_iteration": 2.8517422676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04409015, + "epoch": 0.15332820315505963, + "flos": 794992012800.0, + "grad_norm": 0.02204541106277596, + "language_loss": 0.98951191, + "learning_rate": 0.0009606477759694969, + "loss": 1.00104761, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 1.09375, + "step": 797, + "time_per_iteration": 3.0313384532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153537, + "balance_loss_mlp": 1.0440551, + "epoch": 0.1535205848403232, + "flos": 551256339456.0, + "grad_norm": 0.028291975879130113, + "language_loss": 0.99155664, + "learning_rate": 0.0009605265392510703, + "loss": 1.00309205, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 1.09375, + "step": 798, + "time_per_iteration": 2.6558592319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150991, + "balance_loss_mlp": 1.04122281, + "epoch": 0.15371296652558677, + "flos": 536978585088.0, + "grad_norm": 0.02676367025649214, + "language_loss": 1.00762391, + "learning_rate": 0.0009604051237403846, + "loss": 1.01913381, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 1.09667969, + "step": 799, + "time_per_iteration": 2.6129424571990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151653, + "balance_loss_mlp": 1.04198015, + "epoch": 0.15390534821085033, + "flos": 396089751552.0, + "grad_norm": 0.02759928767191203, + "language_loss": 0.9523741, + "learning_rate": 0.0009602835294845776, + "loss": 0.96389061, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 1.09570312, + "step": 800, + "time_per_iteration": 2.4865612983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152453, + "balance_loss_mlp": 1.04297161, + "epoch": 0.1540977298961139, + "flos": 536885259264.0, + "grad_norm": 0.0240348205061721, + "language_loss": 0.99338514, + "learning_rate": 0.0009601617565308565, + "loss": 1.00490952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 1.09375, + "step": 801, + "time_per_iteration": 2.646925449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155144, + "balance_loss_mlp": 1.04551864, + "epoch": 0.15429011158137745, + "flos": 725090449920.0, + "grad_norm": 0.022214532903779557, + "language_loss": 0.94821054, + "learning_rate": 0.0009600398049264977, + "loss": 0.95976186, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 1.09521484, + "step": 802, + "time_per_iteration": 3.0287652015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011558, + "balance_loss_mlp": 1.04627085, + "epoch": 0.154482493266641, + "flos": 621748783104.0, + "grad_norm": 0.025430739734688717, + "language_loss": 1.02679133, + "learning_rate": 0.0009599176747188469, + "loss": 1.03834927, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 1.09423828, + "step": 803, + "time_per_iteration": 2.8240089416503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156206, + "balance_loss_mlp": 1.0467242, + "epoch": 0.15467487495190457, + "flos": 526719297024.0, + "grad_norm": 0.024483654101252486, + "language_loss": 0.90705526, + "learning_rate": 0.0009597953659553196, + "loss": 0.91861731, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 1.09375, + "step": 804, + "time_per_iteration": 2.745878219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153494, + "balance_loss_mlp": 1.04386926, + "epoch": 0.15486725663716813, + "flos": 528759730176.0, + "grad_norm": 0.02516296775651391, + "language_loss": 0.97286022, + "learning_rate": 0.0009596728786833997, + "loss": 0.98439509, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 1.09521484, + "step": 805, + "time_per_iteration": 2.6471030712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_mlp": 1.04244983, + "epoch": 0.1550596383224317, + "flos": 1050278799360.0, + "grad_norm": 0.026563720364072098, + "language_loss": 0.9858942, + "learning_rate": 0.0009595502129506415, + "loss": 0.99741489, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 1.09521484, + "step": 806, + "time_per_iteration": 3.3734352588653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115037, + "balance_loss_mlp": 1.04088783, + "epoch": 0.15525202000769528, + "flos": 614836489728.0, + "grad_norm": 0.02624405223250092, + "language_loss": 0.91745955, + "learning_rate": 0.0009594273688046678, + "loss": 0.92896324, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 1.09375, + "step": 807, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153708, + "balance_loss_mlp": 1.04441667, + "epoch": 0.15544440169295884, + "flos": 534102222336.0, + "grad_norm": 0.028049278390969077, + "language_loss": 0.97350299, + "learning_rate": 0.000959304346293171, + "loss": 0.98504007, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 1.09179688, + "step": 808, + "time_per_iteration": 2.7285830974578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164275, + "balance_loss_mlp": 1.05464995, + "epoch": 0.1556367833782224, + "flos": 645886325760.0, + "grad_norm": 0.033021349518653896, + "language_loss": 0.99046445, + "learning_rate": 0.0009591811454639125, + "loss": 1.00210714, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 1.09521484, + "step": 809, + "time_per_iteration": 2.842867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155411, + "balance_loss_mlp": 1.04612005, + "epoch": 0.15582916506348596, + "flos": 544952391168.0, + "grad_norm": 0.02421082053858415, + "language_loss": 0.95793635, + "learning_rate": 0.0009590577663647234, + "loss": 0.96949041, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 1.09179688, + "step": 810, + "time_per_iteration": 2.8207406997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158015, + "balance_loss_mlp": 1.04877126, + "epoch": 0.15602154674874952, + "flos": 581214463488.0, + "grad_norm": 0.022734781081273227, + "language_loss": 0.95110512, + "learning_rate": 0.0009589342090435036, + "loss": 0.96268523, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 1.09130859, + "step": 811, + "time_per_iteration": 2.8413872718811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170356, + "balance_loss_mlp": 1.06068361, + "epoch": 0.15621392843401308, + "flos": 536316572160.0, + "grad_norm": 0.026628933906638022, + "language_loss": 0.97807872, + "learning_rate": 0.0009588104735482223, + "loss": 0.98978221, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 1.09570312, + "step": 812, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164587, + "balance_loss_mlp": 1.05524826, + "epoch": 0.15640631011927664, + "flos": 551981478912.0, + "grad_norm": 0.027865461759282353, + "language_loss": 0.94247007, + "learning_rate": 0.0009586865599269177, + "loss": 0.95411587, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 1.09228516, + "step": 813, + "time_per_iteration": 2.655217409133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159004, + "balance_loss_mlp": 1.04985571, + "epoch": 0.1565986918045402, + "flos": 638635657728.0, + "grad_norm": 0.024501009698068087, + "language_loss": 0.98888743, + "learning_rate": 0.0009585624682276977, + "loss": 1.00047755, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 1.09033203, + "step": 814, + "time_per_iteration": 2.7572293281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160029, + "balance_loss_mlp": 1.05073786, + "epoch": 0.15679107348980378, + "flos": 491781250560.0, + "grad_norm": 0.02545428800843787, + "language_loss": 0.97158241, + "learning_rate": 0.0009584381984987386, + "loss": 0.98318267, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 1.09179688, + "step": 815, + "time_per_iteration": 2.554208517074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160766, + "balance_loss_mlp": 1.05185616, + "epoch": 0.15698345517506734, + "flos": 531002277888.0, + "grad_norm": 0.022736041606184667, + "language_loss": 0.98151159, + "learning_rate": 0.0009583137507882864, + "loss": 0.99311924, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 1.08789062, + "step": 816, + "time_per_iteration": 2.6635444164276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158696, + "balance_loss_mlp": 1.04978669, + "epoch": 0.1571758368603309, + "flos": 547077417984.0, + "grad_norm": 0.024009976747476527, + "language_loss": 0.90921289, + "learning_rate": 0.000958189125144656, + "loss": 0.92079985, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 1.08789062, + "step": 817, + "time_per_iteration": 2.635559558868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156061, + "balance_loss_mlp": 1.04719925, + "epoch": 0.15736821854559446, + "flos": 566743326720.0, + "grad_norm": 0.021547949482456395, + "language_loss": 0.97883654, + "learning_rate": 0.0009580643216162313, + "loss": 0.99039721, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 1.08740234, + "step": 818, + "time_per_iteration": 2.673997640609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157698, + "balance_loss_mlp": 1.04888415, + "epoch": 0.15756060023085802, + "flos": 501953943552.0, + "grad_norm": 0.023826624353146583, + "language_loss": 0.90112716, + "learning_rate": 0.0009579393402514652, + "loss": 0.91270417, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 1.08691406, + "step": 819, + "time_per_iteration": 2.61460542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156999, + "balance_loss_mlp": 1.04823244, + "epoch": 0.15775298191612158, + "flos": 520271631360.0, + "grad_norm": 0.023927295219635936, + "language_loss": 0.99075627, + "learning_rate": 0.0009578141810988801, + "loss": 1.00232625, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 1.08642578, + "step": 820, + "time_per_iteration": 2.591036558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.04111433, + "epoch": 0.15794536360138514, + "flos": 467087755776.0, + "grad_norm": 0.026283029611425073, + "language_loss": 1.00067806, + "learning_rate": 0.0009576888442070668, + "loss": 1.01217794, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 1.08740234, + "step": 821, + "time_per_iteration": 2.5960564613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151894, + "balance_loss_mlp": 1.04279363, + "epoch": 0.1581377452866487, + "flos": 518168071680.0, + "grad_norm": 0.02399653039287492, + "language_loss": 1.01290274, + "learning_rate": 0.0009575633296246854, + "loss": 1.02442169, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 1.08984375, + "step": 822, + "time_per_iteration": 2.579575300216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.04312956, + "epoch": 0.15833012697191226, + "flos": 550837373952.0, + "grad_norm": 0.02407632334340799, + "language_loss": 0.91124117, + "learning_rate": 0.0009574376374004652, + "loss": 0.92275965, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 1.0859375, + "step": 823, + "time_per_iteration": 2.661754608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162901, + "balance_loss_mlp": 1.05446815, + "epoch": 0.15852250865717585, + "flos": 488466456576.0, + "grad_norm": 0.026327967105985502, + "language_loss": 0.90841949, + "learning_rate": 0.000957311767583204, + "loss": 0.92004848, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 1.08300781, + "step": 824, + "time_per_iteration": 2.7887372970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156753, + "balance_loss_mlp": 1.04956055, + "epoch": 0.1587148903424394, + "flos": 1312696909824.0, + "grad_norm": 0.010620587901871582, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.8322835, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 1.0703125, + "step": 825, + "time_per_iteration": 4.766167640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151145, + "balance_loss_mlp": 1.04304576, + "epoch": 0.15890727202770297, + "flos": 467832360960.0, + "grad_norm": 0.02959471781097451, + "language_loss": 1.0376749, + "learning_rate": 0.0009570594953650961, + "loss": 1.04918623, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 1.07958984, + "step": 826, + "time_per_iteration": 2.6334874629974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_mlp": 1.04354417, + "epoch": 0.15909965371296653, + "flos": 778606695936.0, + "grad_norm": 0.024366848241159877, + "language_loss": 0.8923949, + "learning_rate": 0.00095693309306219, + "loss": 0.90391278, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 1.08105469, + "step": 827, + "time_per_iteration": 3.1078274250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_mlp": 1.04449332, + "epoch": 0.1592920353982301, + "flos": 1079962950144.0, + "grad_norm": 0.02547465125103231, + "language_loss": 0.98567259, + "learning_rate": 0.0009568065133621244, + "loss": 0.99719906, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 1.08007812, + "step": 828, + "time_per_iteration": 3.3287436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147109, + "balance_loss_mlp": 1.03872418, + "epoch": 0.15948441708349365, + "flos": 726889837056.0, + "grad_norm": 0.026992334830630314, + "language_loss": 0.93815649, + "learning_rate": 0.0009566797563140422, + "loss": 0.94962764, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 1.08251953, + "step": 829, + "time_per_iteration": 2.8641507625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_mlp": 1.03788006, + "epoch": 0.1596767987687572, + "flos": 580075087872.0, + "grad_norm": 0.026140449767567974, + "language_loss": 0.96191794, + "learning_rate": 0.0009565528219671547, + "loss": 0.97337818, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 1.08007812, + "step": 830, + "time_per_iteration": 2.9082329273223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147169, + "balance_loss_mlp": 1.03902268, + "epoch": 0.15986918045402077, + "flos": 530025358848.0, + "grad_norm": 0.02186736495212519, + "language_loss": 0.93771887, + "learning_rate": 0.0009564257103707418, + "loss": 0.94919056, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 1.08007812, + "step": 831, + "time_per_iteration": 4.109540700912476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153246, + "balance_loss_mlp": 1.04519463, + "epoch": 0.16006156213928435, + "flos": 575669856768.0, + "grad_norm": 0.025156765484562034, + "language_loss": 1.01463771, + "learning_rate": 0.0009562984215741533, + "loss": 1.02617025, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 1.07910156, + "step": 832, + "time_per_iteration": 2.634381055831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148637, + "balance_loss_mlp": 1.0408721, + "epoch": 0.1602539438245479, + "flos": 516674858496.0, + "grad_norm": 0.023022886756030446, + "language_loss": 0.90665066, + "learning_rate": 0.0009561709556268065, + "loss": 0.91813707, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 1.07617188, + "step": 833, + "time_per_iteration": 2.7094552516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115539, + "balance_loss_mlp": 1.04752922, + "epoch": 0.16044632550981147, + "flos": 622161017856.0, + "grad_norm": 0.02456985500743924, + "language_loss": 1.0306673, + "learning_rate": 0.0009560433125781884, + "loss": 1.04222107, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 1.07714844, + "step": 834, + "time_per_iteration": 2.7217955589294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_mlp": 1.04794765, + "epoch": 0.16063870719507503, + "flos": 562127975424.0, + "grad_norm": 0.02550250825542428, + "language_loss": 1.02622008, + "learning_rate": 0.0009559154924778544, + "loss": 1.03778291, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 1.08203125, + "step": 835, + "time_per_iteration": 4.0438151359558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153381, + "balance_loss_mlp": 1.04509139, + "epoch": 0.1608310888803386, + "flos": 806560590336.0, + "grad_norm": 0.023331498233936678, + "language_loss": 0.93980491, + "learning_rate": 0.0009557874953754284, + "loss": 0.95133871, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 1.08154297, + "step": 836, + "time_per_iteration": 3.0253541469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155161, + "balance_loss_mlp": 1.04739583, + "epoch": 0.16102347056560215, + "flos": 601694108160.0, + "grad_norm": 0.024039154316001603, + "language_loss": 0.9449209, + "learning_rate": 0.0009556593213206038, + "loss": 0.95647246, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 1.07617188, + "step": 837, + "time_per_iteration": 2.815293788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148071, + "balance_loss_mlp": 1.04049647, + "epoch": 0.1612158522508657, + "flos": 554614794240.0, + "grad_norm": 0.024490980939479982, + "language_loss": 0.96443379, + "learning_rate": 0.0009555309703631414, + "loss": 0.9759146, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 1.07421875, + "step": 838, + "time_per_iteration": 2.7353601455688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148397, + "balance_loss_mlp": 1.0406791, + "epoch": 0.16140823393612927, + "flos": 557017797120.0, + "grad_norm": 0.026558461299776022, + "language_loss": 0.98485982, + "learning_rate": 0.0009554024425528722, + "loss": 0.99634379, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 1.07568359, + "step": 839, + "time_per_iteration": 2.801539182662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146915, + "balance_loss_mlp": 1.03924477, + "epoch": 0.16160061562139286, + "flos": 544908730368.0, + "grad_norm": 0.023933605454050468, + "language_loss": 0.96992832, + "learning_rate": 0.0009552737379396948, + "loss": 0.98139745, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 1.07519531, + "step": 840, + "time_per_iteration": 2.613037586212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148515, + "balance_loss_mlp": 1.04122651, + "epoch": 0.16179299730665642, + "flos": 605006900736.0, + "grad_norm": 0.020652206840645122, + "language_loss": 0.95695615, + "learning_rate": 0.0009551448565735767, + "loss": 0.96844131, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 1.07128906, + "step": 841, + "time_per_iteration": 2.779979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149052, + "balance_loss_mlp": 1.04128659, + "epoch": 0.16198537899191998, + "flos": 788551077888.0, + "grad_norm": 0.02358864683094414, + "language_loss": 0.96423578, + "learning_rate": 0.0009550157985045543, + "loss": 0.97572625, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 1.07617188, + "step": 842, + "time_per_iteration": 3.0352344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148245, + "balance_loss_mlp": 1.04086173, + "epoch": 0.16217776067718354, + "flos": 520829584896.0, + "grad_norm": 0.02127918945612936, + "language_loss": 0.95624614, + "learning_rate": 0.0009548865637827321, + "loss": 0.96772861, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 1.07226562, + "step": 843, + "time_per_iteration": 2.695211172103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.04027128, + "epoch": 0.1623701423624471, + "flos": 506254388736.0, + "grad_norm": 0.02427958482397641, + "language_loss": 0.99469078, + "learning_rate": 0.0009547571524582838, + "loss": 1.00617111, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 1.07617188, + "step": 844, + "time_per_iteration": 2.586859941482544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_mlp": 1.03842914, + "epoch": 0.16256252404771065, + "flos": 498157057536.0, + "grad_norm": 0.025657026114593633, + "language_loss": 1.02873135, + "learning_rate": 0.0009546275645814512, + "loss": 1.04018748, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 1.0703125, + "step": 845, + "time_per_iteration": 2.735323190689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147597, + "balance_loss_mlp": 1.04040384, + "epoch": 0.16275490573297421, + "flos": 503286701568.0, + "grad_norm": 0.024743383464961046, + "language_loss": 1.00377154, + "learning_rate": 0.0009544978002025446, + "loss": 1.01524746, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 1.0703125, + "step": 846, + "time_per_iteration": 2.5876121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_mlp": 1.04189885, + "epoch": 0.16294728741823777, + "flos": 508353945600.0, + "grad_norm": 0.020876938588178177, + "language_loss": 0.94877481, + "learning_rate": 0.0009543678593719434, + "loss": 0.9602648, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 1.06933594, + "step": 847, + "time_per_iteration": 2.69250750541687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159847, + "balance_loss_mlp": 1.05274892, + "epoch": 0.16313966910350133, + "flos": 510756948480.0, + "grad_norm": 0.020936629725758764, + "language_loss": 0.95534647, + "learning_rate": 0.0009542377421400945, + "loss": 0.96694493, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 1.06933594, + "step": 848, + "time_per_iteration": 2.7832183837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146796, + "balance_loss_mlp": 1.03965068, + "epoch": 0.16333205078876492, + "flos": 545056450560.0, + "grad_norm": 0.023544058946573278, + "language_loss": 0.94486761, + "learning_rate": 0.0009541074485575145, + "loss": 0.95633554, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 1.06982422, + "step": 849, + "time_per_iteration": 2.7163026332855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147161, + "balance_loss_mlp": 1.03996801, + "epoch": 0.16352443247402848, + "flos": 508711785984.0, + "grad_norm": 0.023080110816121054, + "language_loss": 1.00550437, + "learning_rate": 0.0009539769786747874, + "loss": 1.01697588, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 1.0703125, + "step": 850, + "time_per_iteration": 2.5918350219726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152854, + "balance_loss_mlp": 1.04547, + "epoch": 0.16371681415929204, + "flos": 543222134784.0, + "grad_norm": 0.022593715242085626, + "language_loss": 0.90895152, + "learning_rate": 0.0009538463325425665, + "loss": 0.92048007, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 1.07226562, + "step": 851, + "time_per_iteration": 2.701662063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146583, + "balance_loss_mlp": 1.03939056, + "epoch": 0.1639091958445556, + "flos": 521760841728.0, + "grad_norm": 0.025319624949764974, + "language_loss": 0.95562863, + "learning_rate": 0.0009537155102115728, + "loss": 0.96709442, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 1.0703125, + "step": 852, + "time_per_iteration": 2.577416181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.03871727, + "epoch": 0.16410157752981916, + "flos": 548482034688.0, + "grad_norm": 0.022217218078565786, + "language_loss": 0.92332971, + "learning_rate": 0.0009535845117325961, + "loss": 0.93478549, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 1.06689453, + "step": 853, + "time_per_iteration": 2.643528699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148166, + "balance_loss_mlp": 1.04135406, + "epoch": 0.16429395921508272, + "flos": 584025698304.0, + "grad_norm": 0.02024018106959617, + "language_loss": 1.00128078, + "learning_rate": 0.0009534533371564946, + "loss": 1.01276231, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 1.06640625, + "step": 854, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150543, + "balance_loss_mlp": 1.04377949, + "epoch": 0.16448634090034628, + "flos": 531961732608.0, + "grad_norm": 0.02843561601072028, + "language_loss": 1.00094676, + "learning_rate": 0.0009533219865341949, + "loss": 1.01245213, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 1.06591797, + "step": 855, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156957, + "balance_loss_mlp": 1.05014503, + "epoch": 0.16467872258560984, + "flos": 492960284160.0, + "grad_norm": 0.026495144396752456, + "language_loss": 0.95923662, + "learning_rate": 0.0009531904599166916, + "loss": 0.97080612, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 1.06640625, + "step": 856, + "time_per_iteration": 2.638528823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.04101861, + "epoch": 0.16487110427087343, + "flos": 507259505664.0, + "grad_norm": 0.02303677132947941, + "language_loss": 0.95950538, + "learning_rate": 0.0009530587573550478, + "loss": 0.97098505, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 1.06787109, + "step": 857, + "time_per_iteration": 2.5788354873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.04592896, + "epoch": 0.16506348595613698, + "flos": 1436108714496.0, + "grad_norm": 0.011861304780107247, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75470984, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 1.0546875, + "step": 858, + "time_per_iteration": 5.003005027770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153597, + "balance_loss_mlp": 1.04673755, + "epoch": 0.16525586764140054, + "flos": 478089647616.0, + "grad_norm": 0.02595402254221991, + "language_loss": 0.98057735, + "learning_rate": 0.0009527948246039337, + "loss": 0.99211335, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 1.06689453, + "step": 859, + "time_per_iteration": 2.541255474090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152748, + "balance_loss_mlp": 1.04622293, + "epoch": 0.1654482493266641, + "flos": 882540518400.0, + "grad_norm": 0.024187417777422206, + "language_loss": 0.96476752, + "learning_rate": 0.000952662594516931, + "loss": 0.97629499, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 1.06347656, + "step": 860, + "time_per_iteration": 3.102233409881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154678, + "balance_loss_mlp": 1.04791439, + "epoch": 0.16564063101192766, + "flos": 628105124352.0, + "grad_norm": 0.02242324391324738, + "language_loss": 0.93166292, + "learning_rate": 0.0009525301886907234, + "loss": 0.94320977, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 1.06591797, + "step": 861, + "time_per_iteration": 2.871971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151309, + "balance_loss_mlp": 1.04487896, + "epoch": 0.16583301269719122, + "flos": 562592603136.0, + "grad_norm": 0.02248996903194516, + "language_loss": 0.97140592, + "learning_rate": 0.0009523976071767155, + "loss": 0.98291898, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 1.0625, + "step": 862, + "time_per_iteration": 2.653031349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146763, + "balance_loss_mlp": 1.04038036, + "epoch": 0.16602539438245478, + "flos": 568983873024.0, + "grad_norm": 0.020794335354585358, + "language_loss": 0.9646408, + "learning_rate": 0.00095226485002638, + "loss": 0.97610843, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 1.06201172, + "step": 863, + "time_per_iteration": 2.7685163021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147042, + "balance_loss_mlp": 1.04075551, + "epoch": 0.16621777606771834, + "flos": 576021692928.0, + "grad_norm": 0.021581021962121343, + "language_loss": 0.96560466, + "learning_rate": 0.0009521319172912576, + "loss": 0.9770751, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 1.06103516, + "step": 864, + "time_per_iteration": 2.762233257293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149511, + "balance_loss_mlp": 1.0432713, + "epoch": 0.16641015775298193, + "flos": 515597882880.0, + "grad_norm": 0.029880870913045234, + "language_loss": 1.0375855, + "learning_rate": 0.0009519988090229579, + "loss": 1.04908061, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 1.06054688, + "step": 865, + "time_per_iteration": 2.7156929969787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148426, + "balance_loss_mlp": 1.04199588, + "epoch": 0.1666025394382455, + "flos": 622849227264.0, + "grad_norm": 0.023088954173990716, + "language_loss": 0.96669209, + "learning_rate": 0.0009518655252731576, + "loss": 0.9781763, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 1.0625, + "step": 866, + "time_per_iteration": 2.76474928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147261, + "balance_loss_mlp": 1.04102135, + "epoch": 0.16679492112350905, + "flos": 549932313600.0, + "grad_norm": 0.021458749489738967, + "language_loss": 0.98467255, + "learning_rate": 0.0009517320660936022, + "loss": 0.99614513, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 1.06054688, + "step": 867, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151692, + "balance_loss_mlp": 1.04545259, + "epoch": 0.1669873028087726, + "flos": 666865526784.0, + "grad_norm": 0.02209258354681387, + "language_loss": 0.92114806, + "learning_rate": 0.0009515984315361051, + "loss": 0.93266487, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 1.06054688, + "step": 868, + "time_per_iteration": 2.845388412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.04563451, + "epoch": 0.16717968449403617, + "flos": 539603168256.0, + "grad_norm": 0.02501334283432316, + "language_loss": 0.95751995, + "learning_rate": 0.000951464621652548, + "loss": 0.96903574, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 1.05761719, + "step": 869, + "time_per_iteration": 2.623375415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148167, + "balance_loss_mlp": 1.04216599, + "epoch": 0.16737206617929973, + "flos": 531278252544.0, + "grad_norm": 0.02062860382438808, + "language_loss": 0.87610328, + "learning_rate": 0.0009513306364948804, + "loss": 0.88758498, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 1.05810547, + "step": 870, + "time_per_iteration": 2.792346239089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148065, + "balance_loss_mlp": 1.04206407, + "epoch": 0.1675644478645633, + "flos": 481756277760.0, + "grad_norm": 0.023236257285911367, + "language_loss": 0.98118269, + "learning_rate": 0.0009511964761151197, + "loss": 0.99266338, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 1.05810547, + "step": 871, + "time_per_iteration": 2.572923183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152601, + "balance_loss_mlp": 1.04669595, + "epoch": 0.16775682954982685, + "flos": 495541206528.0, + "grad_norm": 0.026661505796453877, + "language_loss": 0.99311042, + "learning_rate": 0.0009510621405653521, + "loss": 1.00463641, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 1.05712891, + "step": 872, + "time_per_iteration": 2.6296472549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_mlp": 1.04484987, + "epoch": 0.1679492112350904, + "flos": 753404912640.0, + "grad_norm": 0.029291148216183213, + "language_loss": 0.93300939, + "learning_rate": 0.0009509276298977309, + "loss": 0.94451261, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 1.05273438, + "step": 873, + "time_per_iteration": 3.0177366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150817, + "balance_loss_mlp": 1.04543638, + "epoch": 0.168141592920354, + "flos": 1137731977728.0, + "grad_norm": 0.021155110884158303, + "language_loss": 0.9134444, + "learning_rate": 0.0009507929441644778, + "loss": 0.92495263, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 1.05175781, + "step": 874, + "time_per_iteration": 3.53277325630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160399, + "balance_loss_mlp": 1.05501771, + "epoch": 0.16833397460561755, + "flos": 633553677312.0, + "grad_norm": 0.025508723945600786, + "language_loss": 0.94342184, + "learning_rate": 0.0009506580834178826, + "loss": 0.95502585, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 1.05175781, + "step": 875, + "time_per_iteration": 2.763296365737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151031, + "balance_loss_mlp": 1.04560196, + "epoch": 0.1685263562908811, + "flos": 542542657536.0, + "grad_norm": 0.0234395143242784, + "language_loss": 1.00066125, + "learning_rate": 0.0009505230477103028, + "loss": 1.01217151, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 1.05224609, + "step": 876, + "time_per_iteration": 2.7256453037261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143495, + "balance_loss_mlp": 1.03801847, + "epoch": 0.16871873797614467, + "flos": 620485155840.0, + "grad_norm": 0.02951425183806971, + "language_loss": 0.91949958, + "learning_rate": 0.0009503878370941641, + "loss": 0.93093449, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 1.05273438, + "step": 877, + "time_per_iteration": 2.75011944770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143733, + "balance_loss_mlp": 1.038257, + "epoch": 0.16891111966140823, + "flos": 607455565824.0, + "grad_norm": 0.02526909046796152, + "language_loss": 0.99137431, + "learning_rate": 0.0009502524516219595, + "loss": 1.00281167, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 1.05273438, + "step": 878, + "time_per_iteration": 2.7107326984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145725, + "balance_loss_mlp": 1.04005778, + "epoch": 0.1691035013466718, + "flos": 553405561344.0, + "grad_norm": 0.023246247090994255, + "language_loss": 0.99022686, + "learning_rate": 0.0009501168913462506, + "loss": 1.00168419, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 1.0546875, + "step": 879, + "time_per_iteration": 2.654356002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153572, + "balance_loss_mlp": 1.04866791, + "epoch": 0.16929588303193535, + "flos": 1479305822208.0, + "grad_norm": 0.014844444469597292, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.802755, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 1.046875, + "step": 880, + "time_per_iteration": 4.877387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114571, + "balance_loss_mlp": 1.04042399, + "epoch": 0.1694882647171989, + "flos": 927846641664.0, + "grad_norm": 0.023879743421000837, + "language_loss": 0.93963408, + "learning_rate": 0.0009498452465949042, + "loss": 0.95109117, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 1.05078125, + "step": 881, + "time_per_iteration": 3.241151809692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0447762, + "epoch": 0.1696806464024625, + "flos": 547151278080.0, + "grad_norm": 0.02293023114251512, + "language_loss": 0.98854458, + "learning_rate": 0.0009497091622247285, + "loss": 1.0000447, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 1.05029297, + "step": 882, + "time_per_iteration": 2.720453977584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145786, + "balance_loss_mlp": 1.0406431, + "epoch": 0.16987302808772606, + "flos": 530294602752.0, + "grad_norm": 0.02459483675822623, + "language_loss": 1.0302248, + "learning_rate": 0.0009495729032619723, + "loss": 1.04168272, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 1.04931641, + "step": 883, + "time_per_iteration": 2.717176675796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151842, + "balance_loss_mlp": 1.04731977, + "epoch": 0.17006540977298962, + "flos": 756478660608.0, + "grad_norm": 0.02507713686866634, + "language_loss": 0.9295364, + "learning_rate": 0.0009494364697595354, + "loss": 0.94105482, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 1.04589844, + "step": 884, + "time_per_iteration": 2.924898147583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157567, + "balance_loss_mlp": 1.05271089, + "epoch": 0.17025779145825318, + "flos": 559874694144.0, + "grad_norm": 0.025110060032482954, + "language_loss": 0.98774076, + "learning_rate": 0.0009492998617703867, + "loss": 0.99931645, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 1.04833984, + "step": 885, + "time_per_iteration": 2.6759417057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_mlp": 1.05104423, + "epoch": 0.17045017314351674, + "flos": 513216347136.0, + "grad_norm": 0.0280627140127875, + "language_loss": 0.96898842, + "learning_rate": 0.0009491630793475619, + "loss": 0.98054218, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 1.04492188, + "step": 886, + "time_per_iteration": 2.5910444259643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149096, + "balance_loss_mlp": 1.04452574, + "epoch": 0.1706425548287803, + "flos": 510012343296.0, + "grad_norm": 0.023090423796267925, + "language_loss": 0.94873035, + "learning_rate": 0.0009490261225441643, + "loss": 0.96022129, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 1.04638672, + "step": 887, + "time_per_iteration": 2.960139513015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_mlp": 1.04508829, + "epoch": 0.17083493651404386, + "flos": 718714642944.0, + "grad_norm": 0.024954435208077393, + "language_loss": 0.98478651, + "learning_rate": 0.0009488889914133656, + "loss": 0.99628592, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 1.04833984, + "step": 888, + "time_per_iteration": 3.0498712062835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_mlp": 1.04649353, + "epoch": 0.17102731819930742, + "flos": 560200333824.0, + "grad_norm": 0.020862133880352407, + "language_loss": 0.97394216, + "learning_rate": 0.0009487516860084047, + "loss": 0.98545229, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 1.046875, + "step": 889, + "time_per_iteration": 2.799579381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115955, + "balance_loss_mlp": 1.0542171, + "epoch": 0.17121969988457098, + "flos": 495764788224.0, + "grad_norm": 0.030159167385703775, + "language_loss": 0.99659365, + "learning_rate": 0.0009486142063825884, + "loss": 1.0081892, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 1.05126953, + "step": 890, + "time_per_iteration": 2.5897767543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05718231, + "epoch": 0.17141208156983456, + "flos": 1552105941504.0, + "grad_norm": 0.012289453069715352, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73586774, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 1.03515625, + "step": 891, + "time_per_iteration": 4.971697807312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160561, + "balance_loss_mlp": 1.05527556, + "epoch": 0.17160446325509812, + "flos": 620700005376.0, + "grad_norm": 0.02677753623279009, + "language_loss": 1.00227833, + "learning_rate": 0.0009483387246819542, + "loss": 1.01388383, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 1.05078125, + "step": 892, + "time_per_iteration": 2.7142419815063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153152, + "balance_loss_mlp": 1.04977417, + "epoch": 0.17179684494036168, + "flos": 1384693300224.0, + "grad_norm": 0.011012484205567044, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.8343873, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 1.03515625, + "step": 893, + "time_per_iteration": 4.678752183914185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159751, + "balance_loss_mlp": 1.05446541, + "epoch": 0.17198922662562524, + "flos": 493641762816.0, + "grad_norm": 0.02464509578240857, + "language_loss": 0.9638195, + "learning_rate": 0.0009480625467392688, + "loss": 0.97541702, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 1.05175781, + "step": 894, + "time_per_iteration": 2.6579103469848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158279, + "balance_loss_mlp": 1.05490112, + "epoch": 0.1721816083108888, + "flos": 1461485689344.0, + "grad_norm": 0.014844728137103481, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79152954, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 1.03515625, + "step": 895, + "time_per_iteration": 4.754615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157074, + "balance_loss_mlp": 1.0523603, + "epoch": 0.17237398999615236, + "flos": 529204892160.0, + "grad_norm": 0.024157534092911288, + "language_loss": 0.95005947, + "learning_rate": 0.0009477856729834196, + "loss": 0.96163023, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 1.046875, + "step": 896, + "time_per_iteration": 2.7640984058380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.05742288, + "epoch": 0.17256637168141592, + "flos": 605026366464.0, + "grad_norm": 0.02447501108745492, + "language_loss": 0.9782356, + "learning_rate": 0.0009476469753098809, + "loss": 0.98985219, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 1.04394531, + "step": 897, + "time_per_iteration": 2.7016282081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153769, + "balance_loss_mlp": 1.04957986, + "epoch": 0.17275875336667948, + "flos": 510693821952.0, + "grad_norm": 0.025419887327313116, + "language_loss": 0.94868481, + "learning_rate": 0.0009475081038443738, + "loss": 0.96022242, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 1.04345703, + "step": 898, + "time_per_iteration": 2.5731348991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148609, + "balance_loss_mlp": 1.0446589, + "epoch": 0.17295113505194307, + "flos": 666500955648.0, + "grad_norm": 0.02623291269769982, + "language_loss": 0.95752573, + "learning_rate": 0.0009473690586408124, + "loss": 0.96901178, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 1.04101562, + "step": 899, + "time_per_iteration": 2.8549156188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146417, + "balance_loss_mlp": 1.04227531, + "epoch": 0.17314351673720663, + "flos": 556431645696.0, + "grad_norm": 0.022300666942289, + "language_loss": 0.94826102, + "learning_rate": 0.0009472298397531792, + "loss": 0.9597252, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 1.04296875, + "step": 900, + "time_per_iteration": 2.7165167331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145486, + "balance_loss_mlp": 1.04124928, + "epoch": 0.17333589842247019, + "flos": 504606724608.0, + "grad_norm": 0.023477361471443404, + "language_loss": 0.95443118, + "learning_rate": 0.0009470904472355235, + "loss": 0.96588612, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 1.04394531, + "step": 901, + "time_per_iteration": 2.668320655822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_mlp": 1.03967023, + "epoch": 0.17352828010773375, + "flos": 557350167552.0, + "grad_norm": 0.02470997420275152, + "language_loss": 0.90534914, + "learning_rate": 0.0009469508811419626, + "loss": 0.91678727, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 1.04296875, + "step": 902, + "time_per_iteration": 2.714174747467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_mlp": 1.05331421, + "epoch": 0.1737206617929973, + "flos": 1557791537664.0, + "grad_norm": 0.011695515468407039, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.7276957, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 1.02539062, + "step": 903, + "time_per_iteration": 4.783574104309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146888, + "balance_loss_mlp": 1.04308009, + "epoch": 0.17391304347826086, + "flos": 517755836928.0, + "grad_norm": 0.027522671456014093, + "language_loss": 0.94518518, + "learning_rate": 0.0009466712284439292, + "loss": 0.95665407, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 1.03955078, + "step": 904, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011486, + "balance_loss_mlp": 1.04503071, + "epoch": 0.17410542516352442, + "flos": 542160622080.0, + "grad_norm": 0.027186859166075866, + "language_loss": 0.99262786, + "learning_rate": 0.0009465311419480276, + "loss": 1.00411391, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 1.03710938, + "step": 905, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153491, + "balance_loss_mlp": 1.05011249, + "epoch": 0.17429780684878798, + "flos": 625081041408.0, + "grad_norm": 0.028950662808853365, + "language_loss": 0.96674442, + "learning_rate": 0.0009463908820933622, + "loss": 0.97827929, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 1.03515625, + "step": 906, + "time_per_iteration": 2.8291828632354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151914, + "balance_loss_mlp": 1.04844034, + "epoch": 0.17449018853405157, + "flos": 576848890368.0, + "grad_norm": 0.03002954803612974, + "language_loss": 0.90420532, + "learning_rate": 0.0009462504489343868, + "loss": 0.91572446, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 1.03613281, + "step": 907, + "time_per_iteration": 2.8554108142852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_mlp": 1.04341269, + "epoch": 0.17468257021931513, + "flos": 534772967424.0, + "grad_norm": 0.024073731406752365, + "language_loss": 1.01002121, + "learning_rate": 0.0009461098425256222, + "loss": 1.02149189, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 1.03808594, + "step": 908, + "time_per_iteration": 2.5968382358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114306, + "balance_loss_mlp": 1.03930068, + "epoch": 0.1748749519045787, + "flos": 541808785920.0, + "grad_norm": 0.02493910110608304, + "language_loss": 0.93412566, + "learning_rate": 0.0009459690629216567, + "loss": 0.94555628, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 1.0390625, + "step": 909, + "time_per_iteration": 2.670389413833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150977, + "balance_loss_mlp": 1.04688334, + "epoch": 0.17506733358984225, + "flos": 499626802176.0, + "grad_norm": 0.02402970341263653, + "language_loss": 0.96272469, + "learning_rate": 0.0009458281101771457, + "loss": 0.97423446, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 1.04248047, + "step": 910, + "time_per_iteration": 2.6256320476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153015, + "balance_loss_mlp": 1.04906452, + "epoch": 0.1752597152751058, + "flos": 624132320256.0, + "grad_norm": 0.023679811966199643, + "language_loss": 0.91450173, + "learning_rate": 0.0009456869843468122, + "loss": 0.92603183, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 1.04101562, + "step": 911, + "time_per_iteration": 2.863004207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158204, + "balance_loss_mlp": 1.05434883, + "epoch": 0.17545209696036937, + "flos": 521993155584.0, + "grad_norm": 0.029813530713564303, + "language_loss": 0.92364156, + "learning_rate": 0.0009455456854854459, + "loss": 0.93522358, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 1.04003906, + "step": 912, + "time_per_iteration": 2.616231918334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_mlp": 1.04612815, + "epoch": 0.17564447864563293, + "flos": 462945764352.0, + "grad_norm": 0.02810445184103091, + "language_loss": 0.92624664, + "learning_rate": 0.0009454042136479039, + "loss": 0.93774742, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 1.04101562, + "step": 913, + "time_per_iteration": 2.5944247245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.05199766, + "epoch": 0.1758368603308965, + "flos": 481617289728.0, + "grad_norm": 0.02706355326928303, + "language_loss": 0.91841793, + "learning_rate": 0.0009452625688891103, + "loss": 0.92997456, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 1.03808594, + "step": 914, + "time_per_iteration": 2.580941915512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144051, + "balance_loss_mlp": 1.04200745, + "epoch": 0.17602924201616005, + "flos": 1482084856320.0, + "grad_norm": 0.009713749524187035, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79878789, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 1.02148438, + "step": 915, + "time_per_iteration": 4.592097997665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148996, + "balance_loss_mlp": 1.04523647, + "epoch": 0.17622162370142364, + "flos": 603470026752.0, + "grad_norm": 0.02797967110469985, + "language_loss": 1.03421283, + "learning_rate": 0.0009449787608278015, + "loss": 1.0457027, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 1.0390625, + "step": 916, + "time_per_iteration": 2.755580425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_mlp": 1.04677713, + "epoch": 0.1764140053866872, + "flos": 443605495296.0, + "grad_norm": 0.024189441248888145, + "language_loss": 1.00777316, + "learning_rate": 0.0009448365976354704, + "loss": 1.01927423, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 1.03466797, + "step": 917, + "time_per_iteration": 2.4922571182250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_mlp": 1.04567707, + "epoch": 0.17660638707195075, + "flos": 501591373824.0, + "grad_norm": 0.028333637349232343, + "language_loss": 1.01507974, + "learning_rate": 0.0009446942617422558, + "loss": 1.02657032, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 1.03515625, + "step": 918, + "time_per_iteration": 2.574998378753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148191, + "balance_loss_mlp": 1.0448128, + "epoch": 0.17679876875721431, + "flos": 539983202304.0, + "grad_norm": 0.02432410226762854, + "language_loss": 0.94564992, + "learning_rate": 0.0009445517532034176, + "loss": 0.9571318, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 1.03515625, + "step": 919, + "time_per_iteration": 2.7170355319976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153425, + "balance_loss_mlp": 1.05009484, + "epoch": 0.17699115044247787, + "flos": 498715011072.0, + "grad_norm": 0.026165935935680888, + "language_loss": 0.99032271, + "learning_rate": 0.0009444090720742824, + "loss": 1.00185692, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 1.03466797, + "step": 920, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149157, + "balance_loss_mlp": 1.04587448, + "epoch": 0.17718353212774143, + "flos": 663915303936.0, + "grad_norm": 0.025722324934358026, + "language_loss": 0.98290348, + "learning_rate": 0.0009442662184102439, + "loss": 0.99439508, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 1.03417969, + "step": 921, + "time_per_iteration": 2.7612035274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145605, + "balance_loss_mlp": 1.04251313, + "epoch": 0.177375913813005, + "flos": 583847778816.0, + "grad_norm": 0.021564117555322487, + "language_loss": 0.93569565, + "learning_rate": 0.000944123192266763, + "loss": 0.94715166, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 1.03222656, + "step": 922, + "time_per_iteration": 2.8110268115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141792, + "balance_loss_mlp": 1.03865182, + "epoch": 0.17756829549826855, + "flos": 553683537408.0, + "grad_norm": 0.021487036209533367, + "language_loss": 0.92858881, + "learning_rate": 0.0009439799936993671, + "loss": 0.94000673, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 1.03271484, + "step": 923, + "time_per_iteration": 2.7440245151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142202, + "balance_loss_mlp": 1.03901482, + "epoch": 0.17776067718353214, + "flos": 557371634688.0, + "grad_norm": 0.02463154633112553, + "language_loss": 0.97990632, + "learning_rate": 0.0009438366227636511, + "loss": 0.99132836, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 1.03320312, + "step": 924, + "time_per_iteration": 2.7032759189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140208, + "balance_loss_mlp": 1.03721154, + "epoch": 0.1779530588687957, + "flos": 659651788800.0, + "grad_norm": 0.022941473179093813, + "language_loss": 0.94988692, + "learning_rate": 0.0009436930795152763, + "loss": 0.96128899, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 1.03125, + "step": 925, + "time_per_iteration": 2.854522943496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143555, + "balance_loss_mlp": 1.04084456, + "epoch": 0.17814544055405926, + "flos": 645671476224.0, + "grad_norm": 0.02421412975678805, + "language_loss": 0.95479, + "learning_rate": 0.0009435493640099713, + "loss": 0.9662255, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 1.02832031, + "step": 926, + "time_per_iteration": 2.8268251419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143389, + "balance_loss_mlp": 1.04077399, + "epoch": 0.17833782223932282, + "flos": 461884251648.0, + "grad_norm": 0.0252062590806445, + "language_loss": 0.94177145, + "learning_rate": 0.0009434054763035314, + "loss": 0.95320535, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 1.02734375, + "step": 927, + "time_per_iteration": 2.629499673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139685, + "balance_loss_mlp": 1.03706956, + "epoch": 0.17853020392458638, + "flos": 760852965888.0, + "grad_norm": 0.02122720378042075, + "language_loss": 0.93181551, + "learning_rate": 0.0009432614164518185, + "loss": 0.94321233, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 1.02734375, + "step": 928, + "time_per_iteration": 2.9364700317382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140803, + "balance_loss_mlp": 1.03818727, + "epoch": 0.17872258560984994, + "flos": 784055248896.0, + "grad_norm": 0.023477252169520995, + "language_loss": 0.93520033, + "learning_rate": 0.000943117184510762, + "loss": 0.94660836, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 1.02734375, + "step": 929, + "time_per_iteration": 3.07600474357605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150169, + "balance_loss_mlp": 1.04831696, + "epoch": 0.1789149672951135, + "flos": 1463031295488.0, + "grad_norm": 0.013755703560815407, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7994014, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 1.01953125, + "step": 930, + "time_per_iteration": 5.029282808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153323, + "balance_loss_mlp": 1.05099344, + "epoch": 0.17910734898037706, + "flos": 504930362880.0, + "grad_norm": 0.023999213273897636, + "language_loss": 0.96652937, + "learning_rate": 0.0009428282045846674, + "loss": 0.97806263, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 1.02441406, + "step": 931, + "time_per_iteration": 2.7112410068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145421, + "balance_loss_mlp": 1.04275823, + "epoch": 0.17929973066564064, + "flos": 747669651456.0, + "grad_norm": 0.02006943819739268, + "language_loss": 0.96385491, + "learning_rate": 0.0009426834567118214, + "loss": 0.97530913, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 1.02783203, + "step": 932, + "time_per_iteration": 3.0711913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143693, + "balance_loss_mlp": 1.04098177, + "epoch": 0.1794921123509042, + "flos": 714572651520.0, + "grad_norm": 0.021210123960592832, + "language_loss": 0.89608383, + "learning_rate": 0.0009425385369740155, + "loss": 0.90752071, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 1.02832031, + "step": 933, + "time_per_iteration": 3.059857130050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114727, + "balance_loss_mlp": 1.0451318, + "epoch": 0.17968449403616776, + "flos": 634361409024.0, + "grad_norm": 0.02299955090486112, + "language_loss": 0.96636283, + "learning_rate": 0.0009423934454275125, + "loss": 0.97783554, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 1.02246094, + "step": 934, + "time_per_iteration": 2.85917592048645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146917, + "balance_loss_mlp": 1.04477859, + "epoch": 0.17987687572143132, + "flos": 537378084864.0, + "grad_norm": 0.02461268142415081, + "language_loss": 1.01075852, + "learning_rate": 0.0009422481821286418, + "loss": 1.02222764, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 1.02246094, + "step": 935, + "time_per_iteration": 2.7314486503601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150005, + "balance_loss_mlp": 1.04777098, + "epoch": 0.18006925740669488, + "flos": 539119074816.0, + "grad_norm": 0.026258801194945027, + "language_loss": 0.98970592, + "learning_rate": 0.0009421027471337998, + "loss": 1.00120604, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 1.0234375, + "step": 936, + "time_per_iteration": 2.6354496479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151337, + "balance_loss_mlp": 1.04891205, + "epoch": 0.18026163909195844, + "flos": 540534425088.0, + "grad_norm": 0.029056123283387615, + "language_loss": 0.94782555, + "learning_rate": 0.0009419571404994493, + "loss": 0.9593389, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 1.02539062, + "step": 937, + "time_per_iteration": 2.6368348598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_mlp": 1.04649317, + "epoch": 0.180454020777222, + "flos": 501682698240.0, + "grad_norm": 0.026973093946582868, + "language_loss": 1.00715971, + "learning_rate": 0.00094181136228212, + "loss": 1.01864934, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 1.02587891, + "step": 938, + "time_per_iteration": 2.710451602935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145832, + "balance_loss_mlp": 1.043455, + "epoch": 0.18064640246248556, + "flos": 500006836224.0, + "grad_norm": 0.02510488837562242, + "language_loss": 0.93535352, + "learning_rate": 0.0009416654125384077, + "loss": 0.9468118, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 1.02490234, + "step": 939, + "time_per_iteration": 2.728480577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145905, + "balance_loss_mlp": 1.04424286, + "epoch": 0.18083878414774912, + "flos": 1522290808320.0, + "grad_norm": 0.01070150853185005, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80918276, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 1.01757812, + "step": 940, + "time_per_iteration": 4.915560007095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145419, + "balance_loss_mlp": 1.04318535, + "epoch": 0.1810311658330127, + "flos": 728665755648.0, + "grad_norm": 0.023936590350452012, + "language_loss": 0.92724693, + "learning_rate": 0.000941372998698552, + "loss": 0.93870103, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 1.0234375, + "step": 941, + "time_per_iteration": 2.993441343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140689, + "balance_loss_mlp": 1.0385505, + "epoch": 0.18122354751827627, + "flos": 566044383744.0, + "grad_norm": 0.025062658148163358, + "language_loss": 0.94270039, + "learning_rate": 0.0009412265347159336, + "loss": 0.95410728, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 1.02246094, + "step": 942, + "time_per_iteration": 2.731416702270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140669, + "balance_loss_mlp": 1.03848326, + "epoch": 0.18141592920353983, + "flos": 520317293568.0, + "grad_norm": 0.024682729806918415, + "language_loss": 0.94559634, + "learning_rate": 0.0009410798994339829, + "loss": 0.95700312, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 1.02294922, + "step": 943, + "time_per_iteration": 2.6001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138644, + "balance_loss_mlp": 1.03650522, + "epoch": 0.1816083108888034, + "flos": 513476858880.0, + "grad_norm": 0.022579221317186333, + "language_loss": 0.95589852, + "learning_rate": 0.000940933092909628, + "loss": 0.96728498, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 1.02246094, + "step": 944, + "time_per_iteration": 2.6360957622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_mlp": 1.04550409, + "epoch": 0.18180069257406695, + "flos": 493372518912.0, + "grad_norm": 0.02569410792888805, + "language_loss": 0.9276287, + "learning_rate": 0.0009407861151998649, + "loss": 0.93910229, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 1.01953125, + "step": 945, + "time_per_iteration": 2.6910903453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147749, + "balance_loss_mlp": 1.04608703, + "epoch": 0.1819930742593305, + "flos": 571230423552.0, + "grad_norm": 0.024877151530798884, + "language_loss": 0.95025092, + "learning_rate": 0.0009406389663617552, + "loss": 0.96172833, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 1.01757812, + "step": 946, + "time_per_iteration": 2.689232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_mlp": 1.03669131, + "epoch": 0.18218545594459407, + "flos": 607110460416.0, + "grad_norm": 0.026141117268158143, + "language_loss": 0.96229172, + "learning_rate": 0.000940491646452427, + "loss": 0.97367907, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 1.02148438, + "step": 947, + "time_per_iteration": 2.720996618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136776, + "balance_loss_mlp": 1.03473294, + "epoch": 0.18237783762985763, + "flos": 549738931200.0, + "grad_norm": 0.02114848591843324, + "language_loss": 0.99382234, + "learning_rate": 0.000940344155529075, + "loss": 1.00519001, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 1.02148438, + "step": 948, + "time_per_iteration": 2.655764102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136656, + "balance_loss_mlp": 1.03489935, + "epoch": 0.1825702193151212, + "flos": 451674628608.0, + "grad_norm": 0.027816765537183038, + "language_loss": 0.98392528, + "learning_rate": 0.0009401964936489605, + "loss": 0.99529195, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 1.01855469, + "step": 949, + "time_per_iteration": 2.5372273921966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_mlp": 1.03615081, + "epoch": 0.18276260100038477, + "flos": 590384040960.0, + "grad_norm": 0.023066854335363023, + "language_loss": 0.93237805, + "learning_rate": 0.0009400486608694108, + "loss": 0.94375616, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 1.01757812, + "step": 950, + "time_per_iteration": 2.7370681762695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139002, + "balance_loss_mlp": 1.03719783, + "epoch": 0.18295498268564833, + "flos": 788709531648.0, + "grad_norm": 0.02337801281240106, + "language_loss": 0.97100747, + "learning_rate": 0.0009399006572478195, + "loss": 0.98239744, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 1.01904297, + "step": 951, + "time_per_iteration": 3.1136744022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144566, + "balance_loss_mlp": 1.04276168, + "epoch": 0.1831473643709119, + "flos": 579225696768.0, + "grad_norm": 0.024500893588447415, + "language_loss": 0.99522519, + "learning_rate": 0.0009397524828416468, + "loss": 1.00667083, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 1.01904297, + "step": 952, + "time_per_iteration": 2.680551767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.03664696, + "epoch": 0.18333974605617545, + "flos": 567963293184.0, + "grad_norm": 0.023361368133084506, + "language_loss": 1.04812968, + "learning_rate": 0.0009396041377084192, + "loss": 1.05951309, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 1.01806641, + "step": 953, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136097, + "balance_loss_mlp": 1.03443527, + "epoch": 0.183532127741439, + "flos": 528069519360.0, + "grad_norm": 0.02324700647994909, + "language_loss": 0.98137838, + "learning_rate": 0.0009394556219057295, + "loss": 0.99273932, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 1.01757812, + "step": 954, + "time_per_iteration": 2.6928489208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147999, + "balance_loss_mlp": 1.04671907, + "epoch": 0.18372450942670257, + "flos": 595643940864.0, + "grad_norm": 0.02338261009959255, + "language_loss": 0.93879586, + "learning_rate": 0.0009393069354912362, + "loss": 0.95027584, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 1.01367188, + "step": 955, + "time_per_iteration": 2.7496042251586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.05067647, + "epoch": 0.18391689111196613, + "flos": 646283824128.0, + "grad_norm": 0.029421035614033756, + "language_loss": 0.90626895, + "learning_rate": 0.0009391580785226649, + "loss": 0.91778857, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 1.01367188, + "step": 956, + "time_per_iteration": 2.9440600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.05253601, + "epoch": 0.18410927279722972, + "flos": 1460391975936.0, + "grad_norm": 0.020211591247266292, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80492932, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 1.0, + "step": 957, + "time_per_iteration": 4.738964796066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138037, + "balance_loss_mlp": 1.03623211, + "epoch": 0.18430165448249328, + "flos": 660003624960.0, + "grad_norm": 0.026926680065899915, + "language_loss": 0.95339954, + "learning_rate": 0.0009388598531545196, + "loss": 0.96477991, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 1.01904297, + "step": 958, + "time_per_iteration": 2.859509229660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138629, + "balance_loss_mlp": 1.03687191, + "epoch": 0.18449403616775684, + "flos": 518949606912.0, + "grad_norm": 0.029778126611616895, + "language_loss": 0.94583583, + "learning_rate": 0.000938710484870727, + "loss": 0.9572221, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 1.01855469, + "step": 959, + "time_per_iteration": 2.565548896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137101, + "balance_loss_mlp": 1.03543901, + "epoch": 0.1846864178530204, + "flos": 553824526848.0, + "grad_norm": 0.027283874554685776, + "language_loss": 0.94945395, + "learning_rate": 0.0009385609462644189, + "loss": 0.96082497, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 1.01757812, + "step": 960, + "time_per_iteration": 2.676379919052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138569, + "balance_loss_mlp": 1.03709817, + "epoch": 0.18487879953828396, + "flos": 467115953664.0, + "grad_norm": 0.025693285519799033, + "language_loss": 0.96468461, + "learning_rate": 0.0009384112373936514, + "loss": 0.97607034, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 1.015625, + "step": 961, + "time_per_iteration": 2.7575974464416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154728, + "balance_loss_mlp": 1.05325735, + "epoch": 0.18507118122354752, + "flos": 649683211776.0, + "grad_norm": 0.02725538915325764, + "language_loss": 1.0098747, + "learning_rate": 0.0009382613583165467, + "loss": 1.02142203, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 1.015625, + "step": 962, + "time_per_iteration": 2.8268754482269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116263, + "balance_loss_mlp": 1.06125438, + "epoch": 0.18526356290881107, + "flos": 627922475520.0, + "grad_norm": 0.027998512126097927, + "language_loss": 0.99849832, + "learning_rate": 0.0009381113090912928, + "loss": 1.01012468, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 1.01464844, + "step": 963, + "time_per_iteration": 2.7762861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147698, + "balance_loss_mlp": 1.04679894, + "epoch": 0.18545594459407463, + "flos": 433645650432.0, + "grad_norm": 0.027008272304904758, + "language_loss": 0.98634118, + "learning_rate": 0.000937961089776144, + "loss": 0.99781811, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 1.00976562, + "step": 964, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149635, + "balance_loss_mlp": 1.04844999, + "epoch": 0.1856483262793382, + "flos": 750426491904.0, + "grad_norm": 0.028502333826765886, + "language_loss": 0.91998804, + "learning_rate": 0.0009378107004294208, + "loss": 0.93148446, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 1.01269531, + "step": 965, + "time_per_iteration": 2.964561939239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_mlp": 1.05057883, + "epoch": 0.18584070796460178, + "flos": 531401777664.0, + "grad_norm": 0.02451376704559663, + "language_loss": 1.00210857, + "learning_rate": 0.0009376601411095096, + "loss": 1.01362348, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 1.00976562, + "step": 966, + "time_per_iteration": 2.6664164066314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150482, + "balance_loss_mlp": 1.04953575, + "epoch": 0.18603308964986534, + "flos": 484083419136.0, + "grad_norm": 0.02282308899195351, + "language_loss": 0.93174511, + "learning_rate": 0.0009375094118748622, + "loss": 0.94324994, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 1.01025391, + "step": 967, + "time_per_iteration": 2.544952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142823, + "balance_loss_mlp": 1.041924, + "epoch": 0.1862254713351289, + "flos": 802681112064.0, + "grad_norm": 0.02495680742184495, + "language_loss": 1.00251484, + "learning_rate": 0.0009373585127839976, + "loss": 1.01394308, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 1.00976562, + "step": 968, + "time_per_iteration": 2.973095417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142113, + "balance_loss_mlp": 1.0413574, + "epoch": 0.18641785302039246, + "flos": 479290148352.0, + "grad_norm": 0.02509872783632802, + "language_loss": 0.9944787, + "learning_rate": 0.0009372074438954994, + "loss": 1.00589979, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 1.00830078, + "step": 969, + "time_per_iteration": 2.5303025245666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142663, + "balance_loss_mlp": 1.04181159, + "epoch": 0.18661023470565602, + "flos": 389779072512.0, + "grad_norm": 0.02439046514561532, + "language_loss": 1.00939226, + "learning_rate": 0.0009370562052680181, + "loss": 1.02081895, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 1.00927734, + "step": 970, + "time_per_iteration": 2.5023443698883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.04929316, + "epoch": 0.18680261639091958, + "flos": 565775139840.0, + "grad_norm": 0.02213336285369191, + "language_loss": 0.95379293, + "learning_rate": 0.0009369047969602695, + "loss": 0.96529102, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 1.00585938, + "step": 971, + "time_per_iteration": 2.722823143005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154865, + "balance_loss_mlp": 1.05420506, + "epoch": 0.18699499807618314, + "flos": 480230137344.0, + "grad_norm": 0.029574405329312194, + "language_loss": 0.9913702, + "learning_rate": 0.0009367532190310357, + "loss": 1.00291884, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 1.00732422, + "step": 972, + "time_per_iteration": 2.633387327194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.0490092, + "epoch": 0.1871873797614467, + "flos": 554328086016.0, + "grad_norm": 0.02905569815438633, + "language_loss": 0.99535728, + "learning_rate": 0.0009366014715391644, + "loss": 1.00685072, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 1.00390625, + "step": 973, + "time_per_iteration": 2.6549065113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153264, + "balance_loss_mlp": 1.05293763, + "epoch": 0.18737976144671029, + "flos": 553952781312.0, + "grad_norm": 0.023481989115367276, + "language_loss": 0.9123525, + "learning_rate": 0.0009364495545435693, + "loss": 0.92388517, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 1.00390625, + "step": 974, + "time_per_iteration": 4.409714221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_mlp": 1.05479944, + "epoch": 0.18757214313197385, + "flos": 503247770112.0, + "grad_norm": 0.022955013749569684, + "language_loss": 0.97297812, + "learning_rate": 0.0009362974681032297, + "loss": 0.98452938, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 1.00390625, + "step": 975, + "time_per_iteration": 2.61857533454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153706, + "balance_loss_mlp": 1.05352271, + "epoch": 0.1877645248172374, + "flos": 676291613184.0, + "grad_norm": 0.028784531937469084, + "language_loss": 0.98011422, + "learning_rate": 0.0009361452122771907, + "loss": 0.9916513, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 1.00244141, + "step": 976, + "time_per_iteration": 2.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.04923177, + "epoch": 0.18795690650250096, + "flos": 405862944768.0, + "grad_norm": 0.029616845561456457, + "language_loss": 0.95658362, + "learning_rate": 0.0009359927871245635, + "loss": 0.9680773, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 1.00195312, + "step": 977, + "time_per_iteration": 2.563232183456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149302, + "balance_loss_mlp": 1.04916573, + "epoch": 0.18814928818776452, + "flos": 639063355392.0, + "grad_norm": 0.027239481801034963, + "language_loss": 0.98439831, + "learning_rate": 0.0009358401927045246, + "loss": 0.99589127, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 1.00195312, + "step": 978, + "time_per_iteration": 2.8147568702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144978, + "balance_loss_mlp": 1.04498518, + "epoch": 0.18834166987302808, + "flos": 1140115514880.0, + "grad_norm": 0.022094320674951175, + "language_loss": 0.96123868, + "learning_rate": 0.0009356874290763166, + "loss": 0.9726885, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 1.00048828, + "step": 979, + "time_per_iteration": 3.4719691276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149894, + "balance_loss_mlp": 1.04971051, + "epoch": 0.18853405155829164, + "flos": 505815957504.0, + "grad_norm": 0.02560863383472628, + "language_loss": 0.98637187, + "learning_rate": 0.0009355344962992474, + "loss": 0.99787074, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 1.00244141, + "step": 980, + "time_per_iteration": 2.6199324131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139646, + "balance_loss_mlp": 1.03931963, + "epoch": 0.1887264332435552, + "flos": 609370472448.0, + "grad_norm": 0.02150131271194909, + "language_loss": 0.97900265, + "learning_rate": 0.0009353813944326908, + "loss": 0.99039912, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 1.00390625, + "step": 981, + "time_per_iteration": 2.8862478733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143203, + "balance_loss_mlp": 1.04287672, + "epoch": 0.1889188149288188, + "flos": 553592212992.0, + "grad_norm": 0.027403519760576756, + "language_loss": 0.92598587, + "learning_rate": 0.0009352281235360863, + "loss": 0.93741786, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 1.00390625, + "step": 982, + "time_per_iteration": 2.680797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142003, + "balance_loss_mlp": 1.04167616, + "epoch": 0.18911119661408235, + "flos": 419469954048.0, + "grad_norm": 0.02481781093748577, + "language_loss": 0.92531025, + "learning_rate": 0.0009350746836689389, + "loss": 0.93673027, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 1.00390625, + "step": 983, + "time_per_iteration": 2.5687928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152649, + "balance_loss_mlp": 1.05289459, + "epoch": 0.1893035782993459, + "flos": 1485317784576.0, + "grad_norm": 0.01747927461324531, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82591867, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.99804688, + "step": 984, + "time_per_iteration": 4.978898048400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115218, + "balance_loss_mlp": 1.05237782, + "epoch": 0.18949595998460947, + "flos": 509456391168.0, + "grad_norm": 0.033971943902626214, + "language_loss": 0.94133711, + "learning_rate": 0.0009347672972613634, + "loss": 0.95285892, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.99853516, + "step": 985, + "time_per_iteration": 2.5850014686584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153583, + "balance_loss_mlp": 1.05382824, + "epoch": 0.18968834166987303, + "flos": 532192045056.0, + "grad_norm": 0.027626772825507382, + "language_loss": 0.93152702, + "learning_rate": 0.0009346133508402735, + "loss": 0.9430629, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.99804688, + "step": 986, + "time_per_iteration": 2.7262227535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.04658782, + "epoch": 0.1898807233551366, + "flos": 500753442816.0, + "grad_norm": 0.02768975875157221, + "language_loss": 0.95335174, + "learning_rate": 0.0009344592356873166, + "loss": 0.96481234, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.99511719, + "step": 987, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149829, + "balance_loss_mlp": 1.05002666, + "epoch": 0.19007310504040015, + "flos": 603359236608.0, + "grad_norm": 0.02899497531058058, + "language_loss": 0.87347138, + "learning_rate": 0.0009343049518623255, + "loss": 0.88496965, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.99853516, + "step": 988, + "time_per_iteration": 2.726668119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143975, + "balance_loss_mlp": 1.04407787, + "epoch": 0.1902654867256637, + "flos": 602764353024.0, + "grad_norm": 0.022945627178248204, + "language_loss": 0.90576518, + "learning_rate": 0.0009341504994251985, + "loss": 0.91720492, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.99951172, + "step": 989, + "time_per_iteration": 2.8518989086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151848, + "balance_loss_mlp": 1.05247498, + "epoch": 0.19045786841092727, + "flos": 1579231363584.0, + "grad_norm": 0.011944448483625032, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74672347, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.99414062, + "step": 990, + "time_per_iteration": 5.084089517593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144398, + "balance_loss_mlp": 1.04445326, + "epoch": 0.19065025009619085, + "flos": 683054184960.0, + "grad_norm": 0.025253455013724026, + "language_loss": 0.88680583, + "learning_rate": 0.0009338410889544574, + "loss": 0.8982498, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 1.0, + "step": 991, + "time_per_iteration": 3.007277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_mlp": 1.03949153, + "epoch": 0.1908426317814544, + "flos": 603441828864.0, + "grad_norm": 0.02514183514150974, + "language_loss": 0.96243769, + "learning_rate": 0.000933686131040967, + "loss": 0.97383535, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 1.00341797, + "step": 992, + "time_per_iteration": 2.7673017978668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_mlp": 1.04441845, + "epoch": 0.19103501346671797, + "flos": 587433818112.0, + "grad_norm": 0.025095383977303525, + "language_loss": 0.99126339, + "learning_rate": 0.0009335310047555883, + "loss": 1.00270796, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 1.00097656, + "step": 993, + "time_per_iteration": 2.782841920852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145602, + "balance_loss_mlp": 1.04565716, + "epoch": 0.19122739515198153, + "flos": 546834370560.0, + "grad_norm": 0.0365250692916995, + "language_loss": 0.97246122, + "learning_rate": 0.0009333757101585467, + "loss": 0.98391724, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 1.0, + "step": 994, + "time_per_iteration": 2.6937174797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142239, + "balance_loss_mlp": 1.04229414, + "epoch": 0.1914197768372451, + "flos": 522549107712.0, + "grad_norm": 0.02399514581888075, + "language_loss": 1.00362575, + "learning_rate": 0.0009332202473101329, + "loss": 1.01504803, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 1.0, + "step": 995, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137582, + "balance_loss_mlp": 1.03763652, + "epoch": 0.19161215852250865, + "flos": 612387824640.0, + "grad_norm": 0.024864495797513732, + "language_loss": 0.91319168, + "learning_rate": 0.0009330646162707028, + "loss": 0.92456746, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 1.0, + "step": 996, + "time_per_iteration": 2.7450180053710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113947, + "balance_loss_mlp": 1.03962064, + "epoch": 0.1918045402077722, + "flos": 848182619136.0, + "grad_norm": 0.02592603597590215, + "language_loss": 0.92579019, + "learning_rate": 0.0009329088171006779, + "loss": 0.93718487, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.99902344, + "step": 997, + "time_per_iteration": 3.1890194416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_mlp": 1.04457617, + "epoch": 0.19199692189303577, + "flos": 466892371968.0, + "grad_norm": 0.027577096255712943, + "language_loss": 0.95194477, + "learning_rate": 0.0009327528498605446, + "loss": 0.96338999, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 1.0, + "step": 998, + "time_per_iteration": 2.6845622062683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141712, + "balance_loss_mlp": 1.04143262, + "epoch": 0.19218930357829936, + "flos": 532613011968.0, + "grad_norm": 0.026795980657526523, + "language_loss": 0.98209792, + "learning_rate": 0.0009325967146108548, + "loss": 0.99351501, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 1.00341797, + "step": 999, + "time_per_iteration": 2.690363883972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145933, + "balance_loss_mlp": 1.04589295, + "epoch": 0.19238168526356292, + "flos": 602727422976.0, + "grad_norm": 0.025877996038880184, + "language_loss": 0.97816348, + "learning_rate": 0.0009324404114122258, + "loss": 0.98962283, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 1.00097656, + "step": 1000, + "time_per_iteration": 2.717535972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139683, + "balance_loss_mlp": 1.03969073, + "epoch": 0.19257406694882648, + "flos": 573154062336.0, + "grad_norm": 0.0251308575536182, + "language_loss": 0.95425117, + "learning_rate": 0.0009322839403253397, + "loss": 0.96564806, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 1.00048828, + "step": 1001, + "time_per_iteration": 2.8128621578216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147687, + "balance_loss_mlp": 1.04793251, + "epoch": 0.19276644863409004, + "flos": 803156473344.0, + "grad_norm": 0.02827819499351052, + "language_loss": 0.93752921, + "learning_rate": 0.0009321273014109439, + "loss": 0.94900608, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.99804688, + "step": 1002, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115103, + "balance_loss_mlp": 1.05127609, + "epoch": 0.1929588303193536, + "flos": 564479311872.0, + "grad_norm": 0.02425681225612504, + "language_loss": 0.92063946, + "learning_rate": 0.0009319704947298513, + "loss": 0.93214977, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.99804688, + "step": 1003, + "time_per_iteration": 2.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148634, + "balance_loss_mlp": 1.04887998, + "epoch": 0.19315121200461716, + "flos": 627987603456.0, + "grad_norm": 0.023688885680104285, + "language_loss": 0.95116329, + "learning_rate": 0.0009318135203429393, + "loss": 0.96264958, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.99804688, + "step": 1004, + "time_per_iteration": 2.7953245639801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146221, + "balance_loss_mlp": 1.04646707, + "epoch": 0.19334359368988072, + "flos": 518583034368.0, + "grad_norm": 0.02448547542723696, + "language_loss": 0.95706153, + "learning_rate": 0.0009316563783111511, + "loss": 0.9685238, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.99804688, + "step": 1005, + "time_per_iteration": 2.7417562007904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141812, + "balance_loss_mlp": 1.04224837, + "epoch": 0.19353597537514428, + "flos": 695399568384.0, + "grad_norm": 0.022656832097962477, + "language_loss": 0.91614294, + "learning_rate": 0.0009314990686954943, + "loss": 0.9275611, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.99609375, + "step": 1006, + "time_per_iteration": 2.921147584915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143701, + "balance_loss_mlp": 1.04413795, + "epoch": 0.19372835706040784, + "flos": 1212199226880.0, + "grad_norm": 0.0213605480211332, + "language_loss": 0.89449364, + "learning_rate": 0.000931341591557042, + "loss": 0.90593064, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.99609375, + "step": 1007, + "time_per_iteration": 3.6934237480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142723, + "balance_loss_mlp": 1.04292154, + "epoch": 0.19392073874567142, + "flos": 521684980224.0, + "grad_norm": 0.02492230683936131, + "language_loss": 0.9970367, + "learning_rate": 0.0009311839469569325, + "loss": 1.00846386, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.99853516, + "step": 1008, + "time_per_iteration": 2.66283917427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141437, + "balance_loss_mlp": 1.04187346, + "epoch": 0.19411312043093498, + "flos": 589910681088.0, + "grad_norm": 0.028572464719479444, + "language_loss": 0.9835515, + "learning_rate": 0.0009310261349563687, + "loss": 0.99496591, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.99609375, + "step": 1009, + "time_per_iteration": 2.6913864612579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139912, + "balance_loss_mlp": 1.04034853, + "epoch": 0.19430550211619854, + "flos": 580571916288.0, + "grad_norm": 0.022224830980977262, + "language_loss": 0.9288035, + "learning_rate": 0.0009308681556166186, + "loss": 0.94020259, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.99609375, + "step": 1010, + "time_per_iteration": 2.8937342166900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_mlp": 1.04234338, + "epoch": 0.1944978838014621, + "flos": 622245611520.0, + "grad_norm": 0.028831874511777204, + "language_loss": 1.01060331, + "learning_rate": 0.0009307100089990152, + "loss": 1.02202237, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.99609375, + "step": 1011, + "time_per_iteration": 2.7086822986602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114452, + "balance_loss_mlp": 1.04495597, + "epoch": 0.19469026548672566, + "flos": 599814130176.0, + "grad_norm": 0.02434118582542042, + "language_loss": 0.95591187, + "learning_rate": 0.0009305516951649568, + "loss": 0.96735704, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.99609375, + "step": 1012, + "time_per_iteration": 2.7046425342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114359, + "balance_loss_mlp": 1.04402685, + "epoch": 0.19488264717198922, + "flos": 553247107584.0, + "grad_norm": 0.020712874248618226, + "language_loss": 0.93779677, + "learning_rate": 0.0009303932141759057, + "loss": 0.94923264, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.7684950828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145994, + "balance_loss_mlp": 1.0468123, + "epoch": 0.19507502885725278, + "flos": 667312690176.0, + "grad_norm": 0.029421944235057496, + "language_loss": 0.94045115, + "learning_rate": 0.0009302345660933902, + "loss": 0.95191121, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.9921875, + "step": 1014, + "time_per_iteration": 2.8242082595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_mlp": 1.04442382, + "epoch": 0.19526741054251634, + "flos": 672327541248.0, + "grad_norm": 0.024449615989116238, + "language_loss": 0.93477654, + "learning_rate": 0.0009300757509790026, + "loss": 0.94621253, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.9921875, + "step": 1015, + "time_per_iteration": 2.840658664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144964, + "balance_loss_mlp": 1.04578233, + "epoch": 0.19545979222777993, + "flos": 448146986496.0, + "grad_norm": 0.028637929544829934, + "language_loss": 1.02226353, + "learning_rate": 0.0009299167688944005, + "loss": 1.0337131, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.9921875, + "step": 1016, + "time_per_iteration": 2.505427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114266, + "balance_loss_mlp": 1.04376352, + "epoch": 0.1956521739130435, + "flos": 570168910848.0, + "grad_norm": 0.02609870742448671, + "language_loss": 0.93148959, + "learning_rate": 0.0009297576199013063, + "loss": 0.94291621, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.98925781, + "step": 1017, + "time_per_iteration": 2.7357168197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155182, + "balance_loss_mlp": 1.05752563, + "epoch": 0.19584455559830705, + "flos": 1458880571904.0, + "grad_norm": 0.02028337436206496, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74157315, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.9765625, + "step": 1018, + "time_per_iteration": 5.09963059425354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.04962921, + "epoch": 0.1960369372835706, + "flos": 1594481307648.0, + "grad_norm": 0.015251553743586253, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80573392, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.97460938, + "step": 1019, + "time_per_iteration": 6.03454852104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146546, + "balance_loss_mlp": 1.0477457, + "epoch": 0.19622931896883417, + "flos": 617252954112.0, + "grad_norm": 0.02445318741287071, + "language_loss": 0.94190967, + "learning_rate": 0.0009292791720892659, + "loss": 0.9533751, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.98828125, + "step": 1020, + "time_per_iteration": 2.8369834423065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147421, + "balance_loss_mlp": 1.0486201, + "epoch": 0.19642170065409773, + "flos": 467207278080.0, + "grad_norm": 0.027280190942869837, + "language_loss": 0.98824823, + "learning_rate": 0.0009291193560807218, + "loss": 0.99972242, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.98828125, + "step": 1021, + "time_per_iteration": 2.5833048820495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.0458802, + "epoch": 0.19661408233936128, + "flos": 516288093696.0, + "grad_norm": 0.025303886608753337, + "language_loss": 0.95740455, + "learning_rate": 0.0009289593734732688, + "loss": 0.96885145, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.98828125, + "step": 1022, + "time_per_iteration": 2.5913774967193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149525, + "balance_loss_mlp": 1.05058122, + "epoch": 0.19680646402462484, + "flos": 393493366272.0, + "grad_norm": 0.0253763529676381, + "language_loss": 1.01103711, + "learning_rate": 0.0009287992243290175, + "loss": 1.02253246, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.98974609, + "step": 1023, + "time_per_iteration": 2.4793736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115501, + "balance_loss_mlp": 1.05635238, + "epoch": 0.19699884570988843, + "flos": 627623032320.0, + "grad_norm": 0.02508480994731895, + "language_loss": 0.99886519, + "learning_rate": 0.0009286389087101435, + "loss": 1.01041532, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.98681641, + "step": 1024, + "time_per_iteration": 2.7772202491760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153446, + "balance_loss_mlp": 1.05483615, + "epoch": 0.197191227395152, + "flos": 559073693184.0, + "grad_norm": 0.02445444816711275, + "language_loss": 0.98426372, + "learning_rate": 0.0009284784266788864, + "loss": 0.99579823, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.98632812, + "step": 1025, + "time_per_iteration": 2.6955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150264, + "balance_loss_mlp": 1.05165374, + "epoch": 0.19738360908041555, + "flos": 666249176064.0, + "grad_norm": 0.021666801749132464, + "language_loss": 0.99231869, + "learning_rate": 0.0009283177782975512, + "loss": 1.00382137, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.98632812, + "step": 1026, + "time_per_iteration": 2.9886229038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05529749, + "epoch": 0.1975759907656791, + "flos": 523510563840.0, + "grad_norm": 0.025961932589349316, + "language_loss": 0.98509014, + "learning_rate": 0.000928156963628507, + "loss": 0.99662918, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.98632812, + "step": 1027, + "time_per_iteration": 2.586740493774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149439, + "balance_loss_mlp": 1.05097175, + "epoch": 0.19776837245094267, + "flos": 463484252160.0, + "grad_norm": 0.02550253779434718, + "language_loss": 0.96135926, + "learning_rate": 0.0009279959827341877, + "loss": 0.97285366, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.98486328, + "step": 1028, + "time_per_iteration": 2.723517894744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146754, + "balance_loss_mlp": 1.04852605, + "epoch": 0.19796075413620623, + "flos": 504057503232.0, + "grad_norm": 0.02160335630411572, + "language_loss": 0.96627682, + "learning_rate": 0.0009278348356770915, + "loss": 0.97774434, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.98242188, + "step": 1029, + "time_per_iteration": 2.566802501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144801, + "balance_loss_mlp": 1.04666746, + "epoch": 0.1981531358214698, + "flos": 508570796544.0, + "grad_norm": 0.024261507948164947, + "language_loss": 0.9528529, + "learning_rate": 0.0009276735225197814, + "loss": 0.96430099, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.98144531, + "step": 1030, + "time_per_iteration": 2.6009340286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145205, + "balance_loss_mlp": 1.04702377, + "epoch": 0.19834551750673335, + "flos": 532639208448.0, + "grad_norm": 0.023062563394134136, + "language_loss": 0.95906407, + "learning_rate": 0.0009275120433248847, + "loss": 0.97051609, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.98193359, + "step": 1031, + "time_per_iteration": 2.684858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145757, + "balance_loss_mlp": 1.0477196, + "epoch": 0.1985378991919969, + "flos": 776969765376.0, + "grad_norm": 0.02469129884935611, + "language_loss": 0.94986421, + "learning_rate": 0.0009273503981550931, + "loss": 0.96132183, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.98046875, + "step": 1032, + "time_per_iteration": 3.058094024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.04737103, + "epoch": 0.1987302808772605, + "flos": 435191256576.0, + "grad_norm": 0.025952536265860523, + "language_loss": 0.96777844, + "learning_rate": 0.0009271885870731626, + "loss": 0.9792316, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.97949219, + "step": 1033, + "time_per_iteration": 2.493664503097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153962, + "balance_loss_mlp": 1.05592442, + "epoch": 0.19892266256252406, + "flos": 554653725696.0, + "grad_norm": 0.029222795446194067, + "language_loss": 1.0035603, + "learning_rate": 0.0009270266101419143, + "loss": 1.01509976, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.98046875, + "step": 1034, + "time_per_iteration": 2.626612901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145521, + "balance_loss_mlp": 1.04748368, + "epoch": 0.19911504424778761, + "flos": 550948164096.0, + "grad_norm": 0.02425528851980561, + "language_loss": 0.92802572, + "learning_rate": 0.0009268644674242328, + "loss": 0.9394809, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.98046875, + "step": 1035, + "time_per_iteration": 2.683253288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148174, + "balance_loss_mlp": 1.04994512, + "epoch": 0.19930742593305117, + "flos": 519312176640.0, + "grad_norm": 0.02646778626346152, + "language_loss": 0.91577774, + "learning_rate": 0.0009267021589830678, + "loss": 0.9272595, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.98242188, + "step": 1036, + "time_per_iteration": 2.7614338397979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218948, + "balance_loss_mlp": 1.11824036, + "epoch": 0.19949980761831473, + "flos": 1512637863936.0, + "grad_norm": 0.02467753292442409, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78846025, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 1.0078125, + "step": 1037, + "time_per_iteration": 4.962339878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114988, + "balance_loss_mlp": 1.05184233, + "epoch": 0.1996921893035783, + "flos": 699439501824.0, + "grad_norm": 0.02757683731024766, + "language_loss": 1.02362621, + "learning_rate": 0.000926377045182406, + "loss": 1.03512502, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.98046875, + "step": 1038, + "time_per_iteration": 2.916594982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155504, + "balance_loss_mlp": 1.05727601, + "epoch": 0.19988457098884185, + "flos": 728394510336.0, + "grad_norm": 0.024851830352508646, + "language_loss": 0.97729039, + "learning_rate": 0.0009262142399491296, + "loss": 0.98884547, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.98242188, + "step": 1039, + "time_per_iteration": 3.0976781845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156606, + "balance_loss_mlp": 1.05837739, + "epoch": 0.2000769526741054, + "flos": 561624416256.0, + "grad_norm": 0.025662568358030838, + "language_loss": 0.98388815, + "learning_rate": 0.0009260512692448105, + "loss": 0.99545419, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.98242188, + "step": 1040, + "time_per_iteration": 2.715479850769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.05308211, + "epoch": 0.200269334359369, + "flos": 573164795904.0, + "grad_norm": 0.022253887646478135, + "language_loss": 0.93097693, + "learning_rate": 0.000925888133132719, + "loss": 0.9424901, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.98242188, + "step": 1041, + "time_per_iteration": 2.7987864017486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011912, + "balance_loss_mlp": 1.0923996, + "epoch": 0.20046171604463256, + "flos": 1489152875520.0, + "grad_norm": 0.020655335232781416, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80801636, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.98828125, + "step": 1042, + "time_per_iteration": 4.944507360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154531, + "balance_loss_mlp": 1.05644536, + "epoch": 0.20065409772989612, + "flos": 497577636864.0, + "grad_norm": 0.02609736880654102, + "language_loss": 0.92129564, + "learning_rate": 0.0009255613649386244, + "loss": 0.932841, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.98095703, + "step": 1043, + "time_per_iteration": 2.6478612422943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157191, + "balance_loss_mlp": 1.05915368, + "epoch": 0.20084647941515968, + "flos": 580463127552.0, + "grad_norm": 0.02650777474930283, + "language_loss": 0.87469566, + "learning_rate": 0.0009253977329834838, + "loss": 0.88626754, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.98046875, + "step": 1044, + "time_per_iteration": 2.7641594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161195, + "balance_loss_mlp": 1.06315744, + "epoch": 0.20103886110042324, + "flos": 643287939072.0, + "grad_norm": 0.030624079602620518, + "language_loss": 0.9713465, + "learning_rate": 0.0009252339358742965, + "loss": 0.98295844, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.98046875, + "step": 1045, + "time_per_iteration": 2.811687707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.0594964, + "epoch": 0.2012312427856868, + "flos": 442969678848.0, + "grad_norm": 0.023268596270985206, + "language_loss": 0.93283701, + "learning_rate": 0.000925069973674654, + "loss": 0.94440854, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.9765625, + "step": 1046, + "time_per_iteration": 2.6709671020507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157527, + "balance_loss_mlp": 1.05948889, + "epoch": 0.20142362447095036, + "flos": 555472190976.0, + "grad_norm": 0.022730221646095148, + "language_loss": 0.96496689, + "learning_rate": 0.000924905846448212, + "loss": 0.97654217, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.98046875, + "step": 1047, + "time_per_iteration": 2.7338547706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115317, + "balance_loss_mlp": 1.05522716, + "epoch": 0.20161600615621392, + "flos": 671554738176.0, + "grad_norm": 0.026697286803692055, + "language_loss": 0.96143991, + "learning_rate": 0.0009247415542586906, + "loss": 0.97297156, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.97949219, + "step": 1048, + "time_per_iteration": 2.849416494369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149865, + "balance_loss_mlp": 1.05216146, + "epoch": 0.2018083878414775, + "flos": 574306899456.0, + "grad_norm": 0.021371049275305663, + "language_loss": 0.91504782, + "learning_rate": 0.0009245770971698735, + "loss": 0.92654645, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.97705078, + "step": 1049, + "time_per_iteration": 2.8751590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151512, + "balance_loss_mlp": 1.05376041, + "epoch": 0.20200076952674106, + "flos": 426794482176.0, + "grad_norm": 0.027360075371486055, + "language_loss": 0.97835737, + "learning_rate": 0.0009244124752456087, + "loss": 0.98987252, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.97753906, + "step": 1050, + "time_per_iteration": 2.4985499382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153257, + "balance_loss_mlp": 1.05531442, + "epoch": 0.20219315121200462, + "flos": 537684258816.0, + "grad_norm": 0.025856302906645603, + "language_loss": 0.95370412, + "learning_rate": 0.0009242476885498081, + "loss": 0.96523666, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.97949219, + "step": 1051, + "time_per_iteration": 2.7127723693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150827, + "balance_loss_mlp": 1.05297983, + "epoch": 0.20238553289726818, + "flos": 478834252800.0, + "grad_norm": 0.02631802181941096, + "language_loss": 0.90995431, + "learning_rate": 0.0009240827371464474, + "loss": 0.92146254, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.97851562, + "step": 1052, + "time_per_iteration": 2.527918577194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144335, + "balance_loss_mlp": 1.04667878, + "epoch": 0.20257791458253174, + "flos": 1153846049280.0, + "grad_norm": 0.025276400477213575, + "language_loss": 0.92167991, + "learning_rate": 0.0009239176210995666, + "loss": 0.93312329, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.9765625, + "step": 1053, + "time_per_iteration": 3.4556469917297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144677, + "balance_loss_mlp": 1.04682982, + "epoch": 0.2027702962677953, + "flos": 668148619776.0, + "grad_norm": 0.025342755763179396, + "language_loss": 1.04358864, + "learning_rate": 0.0009237523404732695, + "loss": 1.05503547, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.97851562, + "step": 1054, + "time_per_iteration": 2.894198417663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144665, + "balance_loss_mlp": 1.04676986, + "epoch": 0.20296267795305886, + "flos": 642452009472.0, + "grad_norm": 0.02468028394334187, + "language_loss": 0.94787639, + "learning_rate": 0.0009235868953317235, + "loss": 0.95932305, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.97900391, + "step": 1055, + "time_per_iteration": 2.812633991241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148717, + "balance_loss_mlp": 1.05082273, + "epoch": 0.20315505963832242, + "flos": 932129622528.0, + "grad_norm": 0.02533903757078053, + "language_loss": 0.93907225, + "learning_rate": 0.0009234212857391602, + "loss": 0.95055938, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.97900391, + "step": 1056, + "time_per_iteration": 3.2061142921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147493, + "balance_loss_mlp": 1.0496459, + "epoch": 0.20334744132358598, + "flos": 563287543296.0, + "grad_norm": 0.019686870604104637, + "language_loss": 0.97330248, + "learning_rate": 0.000923255511759875, + "loss": 0.98477745, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.97851562, + "step": 1057, + "time_per_iteration": 2.7639002799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150039, + "balance_loss_mlp": 1.05219197, + "epoch": 0.20353982300884957, + "flos": 645428428800.0, + "grad_norm": 0.023252811049323967, + "language_loss": 0.95256209, + "learning_rate": 0.000923089573458227, + "loss": 0.96406245, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.97851562, + "step": 1058, + "time_per_iteration": 2.857612133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114962, + "balance_loss_mlp": 1.05177307, + "epoch": 0.20373220469411313, + "flos": 652705293312.0, + "grad_norm": 0.02395962669603635, + "language_loss": 0.93332446, + "learning_rate": 0.0009229234708986392, + "loss": 0.94482064, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.97851562, + "step": 1059, + "time_per_iteration": 2.877995729446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150688, + "balance_loss_mlp": 1.05436707, + "epoch": 0.2039245863793767, + "flos": 1440396973056.0, + "grad_norm": 0.013896761524226428, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82817578, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.96289062, + "step": 1060, + "time_per_iteration": 4.659267902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142187, + "balance_loss_mlp": 1.04434025, + "epoch": 0.20411696806464025, + "flos": 598127534592.0, + "grad_norm": 0.026599581611848343, + "language_loss": 0.93894625, + "learning_rate": 0.0009225907732636548, + "loss": 0.95036817, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.97851562, + "step": 1061, + "time_per_iteration": 2.7480902671813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115027, + "balance_loss_mlp": 1.05242312, + "epoch": 0.2043093497499038, + "flos": 574897053696.0, + "grad_norm": 0.026136319737411078, + "language_loss": 0.96460152, + "learning_rate": 0.0009224241783174227, + "loss": 0.97610414, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.97851562, + "step": 1062, + "time_per_iteration": 2.676877021789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146738, + "balance_loss_mlp": 1.04874802, + "epoch": 0.20450173143516737, + "flos": 631523977728.0, + "grad_norm": 0.02709710709634581, + "language_loss": 0.94472104, + "learning_rate": 0.0009222574193715802, + "loss": 0.95618844, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.97998047, + "step": 1063, + "time_per_iteration": 2.7604472637176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141026, + "balance_loss_mlp": 1.04298854, + "epoch": 0.20469411312043093, + "flos": 575146831872.0, + "grad_norm": 0.022769515120839894, + "language_loss": 0.95189404, + "learning_rate": 0.000922090496490869, + "loss": 0.96330428, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.98046875, + "step": 1064, + "time_per_iteration": 2.728154182434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141583, + "balance_loss_mlp": 1.04383183, + "epoch": 0.20488649480569449, + "flos": 638279818752.0, + "grad_norm": 0.022393105289594414, + "language_loss": 0.97629392, + "learning_rate": 0.0009219234097400937, + "loss": 0.9877097, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.97753906, + "step": 1065, + "time_per_iteration": 2.889946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.03989744, + "epoch": 0.20507887649095807, + "flos": 977437747200.0, + "grad_norm": 0.024872828726298618, + "language_loss": 0.9305777, + "learning_rate": 0.0009217561591841237, + "loss": 0.94195515, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.97851562, + "step": 1066, + "time_per_iteration": 3.296248435974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144611, + "balance_loss_mlp": 1.04681206, + "epoch": 0.20527125817622163, + "flos": 487155165696.0, + "grad_norm": 0.024567371957878288, + "language_loss": 0.90358436, + "learning_rate": 0.0009215887448878913, + "loss": 0.91503048, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.97802734, + "step": 1067, + "time_per_iteration": 2.5662190914154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137303, + "balance_loss_mlp": 1.03945625, + "epoch": 0.2054636398614852, + "flos": 528210508800.0, + "grad_norm": 0.02249486638659544, + "language_loss": 0.94470721, + "learning_rate": 0.0009214211669163922, + "loss": 0.9560802, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.97851562, + "step": 1068, + "time_per_iteration": 2.6912589073181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139468, + "balance_loss_mlp": 1.04162145, + "epoch": 0.20565602154674875, + "flos": 559323471360.0, + "grad_norm": 0.022635174506508055, + "language_loss": 1.02501464, + "learning_rate": 0.0009212534253346862, + "loss": 1.03640926, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.97851562, + "step": 1069, + "time_per_iteration": 2.708683490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135123, + "balance_loss_mlp": 1.03746641, + "epoch": 0.2058484032320123, + "flos": 505221073920.0, + "grad_norm": 0.02479403914192968, + "language_loss": 0.95383358, + "learning_rate": 0.0009210855202078964, + "loss": 0.96518481, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.9765625, + "step": 1070, + "time_per_iteration": 2.6434948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132203, + "balance_loss_mlp": 1.03478527, + "epoch": 0.20604078491727587, + "flos": 434047151616.0, + "grad_norm": 0.024632817960327506, + "language_loss": 0.96572351, + "learning_rate": 0.0009209174516012091, + "loss": 0.97704554, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.97412109, + "step": 1071, + "time_per_iteration": 2.4891347885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148822, + "balance_loss_mlp": 1.05130851, + "epoch": 0.20623316660253943, + "flos": 609874031616.0, + "grad_norm": 0.024395492192686875, + "language_loss": 0.97482872, + "learning_rate": 0.0009207492195798747, + "loss": 0.98631692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.97509766, + "step": 1072, + "time_per_iteration": 2.758575201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152495, + "balance_loss_mlp": 1.05502975, + "epoch": 0.206425548287803, + "flos": 481393708032.0, + "grad_norm": 0.027205333287948934, + "language_loss": 0.9402262, + "learning_rate": 0.0009205808242092061, + "loss": 0.95175123, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.97460938, + "step": 1073, + "time_per_iteration": 2.6534366607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152896, + "balance_loss_mlp": 1.05562115, + "epoch": 0.20661792997306658, + "flos": 951122784768.0, + "grad_norm": 0.02943422736446298, + "language_loss": 0.93147469, + "learning_rate": 0.0009204122655545808, + "loss": 0.94300359, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.97265625, + "step": 1074, + "time_per_iteration": 3.317518949508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149199, + "balance_loss_mlp": 1.05201948, + "epoch": 0.20681031165833014, + "flos": 604616133120.0, + "grad_norm": 0.024855118115069977, + "language_loss": 0.88961834, + "learning_rate": 0.0009202435436814388, + "loss": 0.90111029, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.97167969, + "step": 1075, + "time_per_iteration": 2.6815345287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142912, + "balance_loss_mlp": 1.04563749, + "epoch": 0.2070026933435937, + "flos": 710265475584.0, + "grad_norm": 0.027130222852878607, + "language_loss": 0.99239773, + "learning_rate": 0.0009200746586552836, + "loss": 1.00382686, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.97265625, + "step": 1076, + "time_per_iteration": 2.9578917026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141976, + "balance_loss_mlp": 1.04451025, + "epoch": 0.20719507502885726, + "flos": 831254085120.0, + "grad_norm": 0.023090334700176834, + "language_loss": 0.92780054, + "learning_rate": 0.0009199056105416825, + "loss": 0.93922031, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.97460938, + "step": 1077, + "time_per_iteration": 3.0944156646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140475, + "balance_loss_mlp": 1.04324794, + "epoch": 0.20738745671412082, + "flos": 639499785216.0, + "grad_norm": 0.023914471883828003, + "language_loss": 0.96186948, + "learning_rate": 0.0009197363994062654, + "loss": 0.97327423, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.97216797, + "step": 1078, + "time_per_iteration": 2.8147799968719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142489, + "balance_loss_mlp": 1.04521394, + "epoch": 0.20757983839938438, + "flos": 686983328256.0, + "grad_norm": 0.02237329029547868, + "language_loss": 0.90686679, + "learning_rate": 0.0009195670253147262, + "loss": 0.91829169, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.97265625, + "step": 1079, + "time_per_iteration": 2.994058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141016, + "balance_loss_mlp": 1.04383624, + "epoch": 0.20777222008464794, + "flos": 520317293568.0, + "grad_norm": 0.026634413874044322, + "language_loss": 0.92195654, + "learning_rate": 0.0009193974883328216, + "loss": 0.93336666, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.97167969, + "step": 1080, + "time_per_iteration": 2.6506502628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140462, + "balance_loss_mlp": 1.04333031, + "epoch": 0.2079646017699115, + "flos": 512469740544.0, + "grad_norm": 0.025261028079588584, + "language_loss": 0.97185814, + "learning_rate": 0.0009192277885263718, + "loss": 0.98326278, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.97119141, + "step": 1081, + "time_per_iteration": 2.646629810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143678, + "balance_loss_mlp": 1.04640269, + "epoch": 0.20815698345517505, + "flos": 933467109888.0, + "grad_norm": 0.02363260569338726, + "language_loss": 0.9496327, + "learning_rate": 0.0009190579259612602, + "loss": 0.96106946, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.97265625, + "step": 1082, + "time_per_iteration": 3.2829811573028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150642, + "balance_loss_mlp": 1.05336761, + "epoch": 0.20834936514043864, + "flos": 633553677312.0, + "grad_norm": 0.02436625118168465, + "language_loss": 0.97094011, + "learning_rate": 0.000918887900703433, + "loss": 0.98244655, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.97265625, + "step": 1083, + "time_per_iteration": 2.779474973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147642, + "balance_loss_mlp": 1.05079603, + "epoch": 0.2085417468257022, + "flos": 395243088384.0, + "grad_norm": 0.027448171988374206, + "language_loss": 0.98109657, + "learning_rate": 0.0009187177128188999, + "loss": 0.99257296, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.96826172, + "step": 1084, + "time_per_iteration": 2.487755298614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156746, + "balance_loss_mlp": 1.06118774, + "epoch": 0.20873412851096576, + "flos": 1405195138560.0, + "grad_norm": 0.014888537960634525, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78313285, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.95507812, + "step": 1085, + "time_per_iteration": 4.917901515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146704, + "balance_loss_mlp": 1.04981041, + "epoch": 0.20892651019622932, + "flos": 448761335808.0, + "grad_norm": 0.0275038267286557, + "language_loss": 0.93389261, + "learning_rate": 0.000918376849434071, + "loss": 0.94535965, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.96875, + "step": 1086, + "time_per_iteration": 2.5117850303649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153188, + "balance_loss_mlp": 1.05629456, + "epoch": 0.20911889188149288, + "flos": 494080194048.0, + "grad_norm": 0.034273062806107445, + "language_loss": 1.02428699, + "learning_rate": 0.0009182061740661098, + "loss": 1.03581882, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.96875, + "step": 1087, + "time_per_iteration": 2.5270984172821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154258, + "balance_loss_mlp": 1.05736482, + "epoch": 0.20931127356675644, + "flos": 842748802560.0, + "grad_norm": 0.02361505883443172, + "language_loss": 0.92997056, + "learning_rate": 0.0009180353363361127, + "loss": 0.94151306, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.96875, + "step": 1088, + "time_per_iteration": 3.1549112796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154015, + "balance_loss_mlp": 1.05688298, + "epoch": 0.20950365525202, + "flos": 758523823104.0, + "grad_norm": 0.028384526527587387, + "language_loss": 0.93851304, + "learning_rate": 0.0009178643363104044, + "loss": 0.95005322, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.97119141, + "step": 1089, + "time_per_iteration": 4.693684339523315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148681, + "balance_loss_mlp": 1.05159688, + "epoch": 0.20969603693728356, + "flos": 473491760640.0, + "grad_norm": 0.03411348227976855, + "language_loss": 1.04663801, + "learning_rate": 0.0009176931740553735, + "loss": 1.05812478, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.97070312, + "step": 1090, + "time_per_iteration": 2.5203866958618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146066, + "balance_loss_mlp": 1.04917288, + "epoch": 0.20988841862254715, + "flos": 978627514368.0, + "grad_norm": 0.027482857176328385, + "language_loss": 0.92998403, + "learning_rate": 0.0009175218496374708, + "loss": 0.94144469, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.96875, + "step": 1091, + "time_per_iteration": 3.362614870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.05544364, + "epoch": 0.2100808003078107, + "flos": 1094818123776.0, + "grad_norm": 0.028049590852478556, + "language_loss": 0.96363866, + "learning_rate": 0.0009173503631232103, + "loss": 0.97516203, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.96875, + "step": 1092, + "time_per_iteration": 3.359970808029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150696, + "balance_loss_mlp": 1.05399334, + "epoch": 0.21027318199307427, + "flos": 1014559217664.0, + "grad_norm": 0.03210489869185377, + "language_loss": 0.94109344, + "learning_rate": 0.0009171787145791691, + "loss": 0.95260036, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.96679688, + "step": 1093, + "time_per_iteration": 3.2180042266845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150028, + "balance_loss_mlp": 1.05323017, + "epoch": 0.21046556367833782, + "flos": 522412121088.0, + "grad_norm": 0.02762257246471406, + "language_loss": 0.92679179, + "learning_rate": 0.000917006904071987, + "loss": 0.93829209, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.96777344, + "step": 1094, + "time_per_iteration": 2.5961859226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152841, + "balance_loss_mlp": 1.0559479, + "epoch": 0.21065794536360138, + "flos": 604839714816.0, + "grad_norm": 0.02570597393175465, + "language_loss": 0.97250223, + "learning_rate": 0.0009168349316683669, + "loss": 0.98403066, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.96875, + "step": 1095, + "time_per_iteration": 2.7164759635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153125, + "balance_loss_mlp": 1.05642295, + "epoch": 0.21085032704886494, + "flos": 604557735936.0, + "grad_norm": 0.022711755724658188, + "language_loss": 0.91088736, + "learning_rate": 0.0009166627974350741, + "loss": 0.92241859, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.96679688, + "step": 1096, + "time_per_iteration": 2.8912341594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_mlp": 1.05739498, + "epoch": 0.2110427087341285, + "flos": 638831041536.0, + "grad_norm": 0.027939519002465243, + "language_loss": 1.01164758, + "learning_rate": 0.0009164905014389373, + "loss": 1.02318668, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.96484375, + "step": 1097, + "time_per_iteration": 2.758725881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115008, + "balance_loss_mlp": 1.05356789, + "epoch": 0.21123509041939206, + "flos": 523929529344.0, + "grad_norm": 0.027217895626849283, + "language_loss": 0.96537346, + "learning_rate": 0.0009163180437468476, + "loss": 0.97687429, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.96484375, + "step": 1098, + "time_per_iteration": 2.6157684326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011531, + "balance_loss_mlp": 1.05658853, + "epoch": 0.21142747210465565, + "flos": 452193650688.0, + "grad_norm": 0.025540912808389868, + "language_loss": 0.94842321, + "learning_rate": 0.000916145424425759, + "loss": 0.9599542, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.96484375, + "step": 1099, + "time_per_iteration": 2.6368908882141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157233, + "balance_loss_mlp": 1.06081605, + "epoch": 0.2116198537899192, + "flos": 877625723904.0, + "grad_norm": 0.02885196772961066, + "language_loss": 1.02573156, + "learning_rate": 0.0009159726435426885, + "loss": 1.03730392, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.96386719, + "step": 1100, + "time_per_iteration": 3.0916907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011557, + "balance_loss_mlp": 1.05909276, + "epoch": 0.21181223547518277, + "flos": 524674134528.0, + "grad_norm": 0.025603473018395394, + "language_loss": 0.99936807, + "learning_rate": 0.0009157997011647154, + "loss": 1.01092505, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.96582031, + "step": 1101, + "time_per_iteration": 2.5971169471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152722, + "balance_loss_mlp": 1.05630529, + "epoch": 0.21200461716044633, + "flos": 573425307648.0, + "grad_norm": 0.02306433427515447, + "language_loss": 0.93708789, + "learning_rate": 0.0009156265973589817, + "loss": 0.94861513, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.96386719, + "step": 1102, + "time_per_iteration": 2.786557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148247, + "balance_loss_mlp": 1.05187845, + "epoch": 0.2121969988457099, + "flos": 546174359040.0, + "grad_norm": 0.023119673851329285, + "language_loss": 0.9826746, + "learning_rate": 0.0009154533321926926, + "loss": 0.99415696, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.96337891, + "step": 1103, + "time_per_iteration": 2.6500911712646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150448, + "balance_loss_mlp": 1.05393636, + "epoch": 0.21238938053097345, + "flos": 845353920000.0, + "grad_norm": 0.02523726215492747, + "language_loss": 0.96587884, + "learning_rate": 0.0009152799057331156, + "loss": 0.97738338, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.96484375, + "step": 1104, + "time_per_iteration": 3.1080517768859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148697, + "balance_loss_mlp": 1.05213737, + "epoch": 0.212581762216237, + "flos": 447141869568.0, + "grad_norm": 0.026678256955328494, + "language_loss": 1.00256824, + "learning_rate": 0.0009151063180475805, + "loss": 1.01405525, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.96533203, + "step": 1105, + "time_per_iteration": 2.530207633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153737, + "balance_loss_mlp": 1.05703473, + "epoch": 0.21277414390150057, + "flos": 515385034752.0, + "grad_norm": 0.026680614248996183, + "language_loss": 0.9432478, + "learning_rate": 0.0009149325692034803, + "loss": 0.95478517, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.96679688, + "step": 1106, + "time_per_iteration": 2.576834201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159119, + "balance_loss_mlp": 1.06413269, + "epoch": 0.21296652558676413, + "flos": 1488512329728.0, + "grad_norm": 0.01358013302766655, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80362546, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.94921875, + "step": 1107, + "time_per_iteration": 4.821696996688843 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156765, + "balance_loss_mlp": 1.06006265, + "epoch": 0.21315890727202771, + "flos": 847450748928.0, + "grad_norm": 0.031460519319247274, + "language_loss": 0.96369046, + "learning_rate": 0.0009145845883094678, + "loss": 0.97525811, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.96679688, + "step": 1108, + "time_per_iteration": 3.029548168182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159379, + "balance_loss_mlp": 1.06267655, + "epoch": 0.21335128895729127, + "flos": 630555790848.0, + "grad_norm": 0.028067626854192333, + "language_loss": 0.95182431, + "learning_rate": 0.000914410356394654, + "loss": 0.96341801, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.96679688, + "step": 1109, + "time_per_iteration": 2.737241268157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.06352139, + "epoch": 0.21354367064255483, + "flos": 712284441600.0, + "grad_norm": 0.023599510024272945, + "language_loss": 0.92540836, + "learning_rate": 0.0009142359635914709, + "loss": 0.93701446, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.97070312, + "step": 1110, + "time_per_iteration": 3.0267913341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.0645479, + "epoch": 0.2137360523278184, + "flos": 457210503168.0, + "grad_norm": 0.02473497568188501, + "language_loss": 0.9156003, + "learning_rate": 0.0009140614099676245, + "loss": 0.92721474, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.96875, + "step": 1111, + "time_per_iteration": 2.5756866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164266, + "balance_loss_mlp": 1.06727743, + "epoch": 0.21392843401308195, + "flos": 667265026560.0, + "grad_norm": 0.025344438139363285, + "language_loss": 0.90291333, + "learning_rate": 0.0009138866955908821, + "loss": 0.91455603, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.96972656, + "step": 1112, + "time_per_iteration": 2.9406254291534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_mlp": 1.06319368, + "epoch": 0.2141208156983455, + "flos": 750361363968.0, + "grad_norm": 0.02581510235299489, + "language_loss": 0.89949894, + "learning_rate": 0.0009137118205290738, + "loss": 0.91109931, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.96826172, + "step": 1113, + "time_per_iteration": 2.966989278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162558, + "balance_loss_mlp": 1.06547356, + "epoch": 0.21431319738360907, + "flos": 420010443264.0, + "grad_norm": 0.024953242249854055, + "language_loss": 1.00419319, + "learning_rate": 0.0009135367848500924, + "loss": 1.01581883, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.97070312, + "step": 1114, + "time_per_iteration": 2.4954934120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161216, + "balance_loss_mlp": 1.06456113, + "epoch": 0.21450557906887263, + "flos": 610238602752.0, + "grad_norm": 0.030213425802119154, + "language_loss": 0.9839642, + "learning_rate": 0.0009133615886218927, + "loss": 0.99557638, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.96630859, + "step": 1115, + "time_per_iteration": 2.71352219581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152764, + "balance_loss_mlp": 1.05625272, + "epoch": 0.21469796075413622, + "flos": 562974638592.0, + "grad_norm": 0.027635545182738433, + "language_loss": 0.99806535, + "learning_rate": 0.0009131862319124917, + "loss": 1.00959289, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.96484375, + "step": 1116, + "time_per_iteration": 2.630807876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153717, + "balance_loss_mlp": 1.05720496, + "epoch": 0.21489034243939978, + "flos": 595737266688.0, + "grad_norm": 0.024806539819872384, + "language_loss": 0.94489264, + "learning_rate": 0.0009130107147899691, + "loss": 0.95642984, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.96484375, + "step": 1117, + "time_per_iteration": 2.7123875617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154765, + "balance_loss_mlp": 1.05825305, + "epoch": 0.21508272412466334, + "flos": 442850156544.0, + "grad_norm": 0.024517194331867692, + "language_loss": 0.93784142, + "learning_rate": 0.0009128350373224665, + "loss": 0.9493891, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.96484375, + "step": 1118, + "time_per_iteration": 2.5384151935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169045, + "balance_loss_mlp": 1.07348633, + "epoch": 0.2152751058099269, + "flos": 1499232242688.0, + "grad_norm": 0.019396990855708212, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82625473, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.95507812, + "step": 1119, + "time_per_iteration": 4.644891262054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156688, + "balance_loss_mlp": 1.05989027, + "epoch": 0.21546748749519046, + "flos": 494991985152.0, + "grad_norm": 0.030440112014221473, + "language_loss": 0.9407053, + "learning_rate": 0.0009124832016254005, + "loss": 0.95227218, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.96777344, + "step": 1120, + "time_per_iteration": 2.588834285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163526, + "balance_loss_mlp": 1.06691861, + "epoch": 0.21565986918045402, + "flos": 635694167040.0, + "grad_norm": 0.030206495794058562, + "language_loss": 0.96966755, + "learning_rate": 0.0009123070435324316, + "loss": 0.98130286, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.96582031, + "step": 1121, + "time_per_iteration": 2.786072015762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170601, + "balance_loss_mlp": 1.07542419, + "epoch": 0.21585225086571758, + "flos": 1586798939136.0, + "grad_norm": 0.013013152417503263, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.79046386, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.95117188, + "step": 1122, + "time_per_iteration": 4.946362733840942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.0685885, + "epoch": 0.21604463255098114, + "flos": 685322202624.0, + "grad_norm": 0.027822137906457534, + "language_loss": 0.94040322, + "learning_rate": 0.0009119542471995752, + "loss": 0.95205426, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.96484375, + "step": 1123, + "time_per_iteration": 2.8613343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162625, + "balance_loss_mlp": 1.0660181, + "epoch": 0.2162370142362447, + "flos": 782307528192.0, + "grad_norm": 0.029561600436113455, + "language_loss": 0.90709835, + "learning_rate": 0.0009117776090966554, + "loss": 0.9187246, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.96582031, + "step": 1124, + "time_per_iteration": 2.9557414054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170148, + "balance_loss_mlp": 1.07344532, + "epoch": 0.21642939592150828, + "flos": 1003761441792.0, + "grad_norm": 0.032145354222626064, + "language_loss": 0.98171163, + "learning_rate": 0.0009116008111274899, + "loss": 0.99341309, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.96679688, + "step": 1125, + "time_per_iteration": 3.253286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175423, + "balance_loss_mlp": 1.0798645, + "epoch": 0.21662177760677184, + "flos": 1485762220032.0, + "grad_norm": 0.016361962696647775, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80282342, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.95507812, + "step": 1126, + "time_per_iteration": 4.832986831665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168208, + "balance_loss_mlp": 1.07150567, + "epoch": 0.2168141592920354, + "flos": 888859929600.0, + "grad_norm": 0.027606671666099106, + "language_loss": 0.94760346, + "learning_rate": 0.0009112467358650396, + "loss": 0.9592855, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.96679688, + "step": 1127, + "time_per_iteration": 3.1373836994171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164208, + "balance_loss_mlp": 1.06741047, + "epoch": 0.21700654097729896, + "flos": 547084148736.0, + "grad_norm": 0.025712027239217825, + "language_loss": 0.95734817, + "learning_rate": 0.0009110694587092192, + "loss": 0.96899021, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.96777344, + "step": 1128, + "time_per_iteration": 2.752166986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162506, + "balance_loss_mlp": 1.06580317, + "epoch": 0.21719892266256252, + "flos": 510535368192.0, + "grad_norm": 0.02739880514200537, + "language_loss": 0.95310479, + "learning_rate": 0.0009108920219620815, + "loss": 0.96472991, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.96679688, + "step": 1129, + "time_per_iteration": 2.639409065246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164125, + "balance_loss_mlp": 1.06742299, + "epoch": 0.21739130434782608, + "flos": 544461566976.0, + "grad_norm": 0.023064586598143682, + "language_loss": 0.97784394, + "learning_rate": 0.0009107144256925133, + "loss": 0.9894852, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.96679688, + "step": 1130, + "time_per_iteration": 2.73559308052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165938, + "balance_loss_mlp": 1.06923568, + "epoch": 0.21758368603308964, + "flos": 617982096384.0, + "grad_norm": 0.027176951765382908, + "language_loss": 0.9233678, + "learning_rate": 0.0009105366699694638, + "loss": 0.93502718, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.96679688, + "step": 1131, + "time_per_iteration": 2.7653839588165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166281, + "balance_loss_mlp": 1.06957853, + "epoch": 0.2177760677183532, + "flos": 636334712832.0, + "grad_norm": 0.021107298895209785, + "language_loss": 0.91459304, + "learning_rate": 0.0009103587548619439, + "loss": 0.92625588, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.96679688, + "step": 1132, + "time_per_iteration": 2.8519365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160184, + "balance_loss_mlp": 1.06367195, + "epoch": 0.2179684494036168, + "flos": 533596661760.0, + "grad_norm": 0.022551614427290693, + "language_loss": 0.95995569, + "learning_rate": 0.0009101806804390261, + "loss": 0.97155756, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.96484375, + "step": 1133, + "time_per_iteration": 2.8218026161193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163206, + "balance_loss_mlp": 1.06664658, + "epoch": 0.21816083108888035, + "flos": 476181471744.0, + "grad_norm": 0.0250418684782295, + "language_loss": 1.00355339, + "learning_rate": 0.0009100024467698453, + "loss": 1.01518536, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.96533203, + "step": 1134, + "time_per_iteration": 2.5639142990112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167151, + "balance_loss_mlp": 1.07059181, + "epoch": 0.2183532127741439, + "flos": 578546219520.0, + "grad_norm": 0.029194142239697657, + "language_loss": 0.95151818, + "learning_rate": 0.0009098240539235981, + "loss": 0.96318972, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.96533203, + "step": 1135, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162362, + "balance_loss_mlp": 1.06565976, + "epoch": 0.21854559445940747, + "flos": 595279369728.0, + "grad_norm": 0.022714398939090653, + "language_loss": 0.96190184, + "learning_rate": 0.0009096455019695423, + "loss": 0.9735254, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.96679688, + "step": 1136, + "time_per_iteration": 2.829479217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166866, + "balance_loss_mlp": 1.06997275, + "epoch": 0.21873797614467103, + "flos": 409549040640.0, + "grad_norm": 0.027737994351600712, + "language_loss": 1.01424551, + "learning_rate": 0.000909466790976998, + "loss": 1.02591419, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.96875, + "step": 1137, + "time_per_iteration": 2.4491164684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165251, + "balance_loss_mlp": 1.06869149, + "epoch": 0.21893035782993459, + "flos": 895654702080.0, + "grad_norm": 0.022710058353260835, + "language_loss": 0.90594929, + "learning_rate": 0.0009092879210153473, + "loss": 0.91760182, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.96533203, + "step": 1138, + "time_per_iteration": 3.155076503753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168627, + "balance_loss_mlp": 1.07192433, + "epoch": 0.21912273951519814, + "flos": 468568233984.0, + "grad_norm": 0.024281064631586205, + "language_loss": 0.97427768, + "learning_rate": 0.0009091088921540333, + "loss": 0.98596388, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.96679688, + "step": 1139, + "time_per_iteration": 2.5309600830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172859, + "balance_loss_mlp": 1.07711029, + "epoch": 0.2193151212004617, + "flos": 1535177407488.0, + "grad_norm": 0.009496329971255709, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76681536, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.95703125, + "step": 1140, + "time_per_iteration": 4.911335229873657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172401, + "balance_loss_mlp": 1.07569873, + "epoch": 0.2195075028857253, + "flos": 592274752512.0, + "grad_norm": 0.033335232647672346, + "language_loss": 0.95078719, + "learning_rate": 0.0009087503580104985, + "loss": 0.96251118, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.96679688, + "step": 1141, + "time_per_iteration": 2.7083888053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169701, + "balance_loss_mlp": 1.07295096, + "epoch": 0.21969988457098885, + "flos": 637517749248.0, + "grad_norm": 0.02859165000671714, + "language_loss": 0.90439236, + "learning_rate": 0.0009085708528674728, + "loss": 0.91608942, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.96728516, + "step": 1142, + "time_per_iteration": 2.786891222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162201, + "balance_loss_mlp": 1.06549823, + "epoch": 0.2198922662562524, + "flos": 913859598336.0, + "grad_norm": 0.0328462843269242, + "language_loss": 0.98848528, + "learning_rate": 0.0009083911891031745, + "loss": 1.00010729, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.96679688, + "step": 1143, + "time_per_iteration": 3.1019930839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116483, + "balance_loss_mlp": 1.06793654, + "epoch": 0.22008464794151597, + "flos": 824494241280.0, + "grad_norm": 0.023913565571636344, + "language_loss": 1.01496291, + "learning_rate": 0.0009082113667873553, + "loss": 1.02661121, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.96875, + "step": 1144, + "time_per_iteration": 3.104292869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170855, + "balance_loss_mlp": 1.07405746, + "epoch": 0.22027702962677953, + "flos": 460618622976.0, + "grad_norm": 0.029355186834356364, + "language_loss": 1.00543249, + "learning_rate": 0.0009080313859898283, + "loss": 1.0171411, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.96777344, + "step": 1145, + "time_per_iteration": 2.552457332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170139, + "balance_loss_mlp": 1.07343698, + "epoch": 0.2204694113120431, + "flos": 532287372288.0, + "grad_norm": 0.025362278251747628, + "language_loss": 1.01871562, + "learning_rate": 0.0009078512467804684, + "loss": 1.03041708, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.96679688, + "step": 1146, + "time_per_iteration": 2.6138763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170493, + "balance_loss_mlp": 1.07379043, + "epoch": 0.22066179299730665, + "flos": 523686481920.0, + "grad_norm": 0.02553067563602684, + "language_loss": 1.00136042, + "learning_rate": 0.0009076709492292119, + "loss": 1.01306534, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.96679688, + "step": 1147, + "time_per_iteration": 2.6107985973358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163104, + "balance_loss_mlp": 1.0664016, + "epoch": 0.2208541746825702, + "flos": 547505115648.0, + "grad_norm": 0.02505349531569444, + "language_loss": 0.99364072, + "learning_rate": 0.0009074904934060562, + "loss": 1.00527167, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.96679688, + "step": 1148, + "time_per_iteration": 2.680250644683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166008, + "balance_loss_mlp": 1.06873322, + "epoch": 0.22104655636783377, + "flos": 710059358208.0, + "grad_norm": 0.023468083856487864, + "language_loss": 0.93112767, + "learning_rate": 0.0009073098793810607, + "loss": 0.94278765, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.97265625, + "step": 1149, + "time_per_iteration": 2.9064676761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165673, + "balance_loss_mlp": 1.06882739, + "epoch": 0.22123893805309736, + "flos": 585964073472.0, + "grad_norm": 0.028202445852463846, + "language_loss": 0.98436809, + "learning_rate": 0.000907129107224346, + "loss": 0.99602491, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.96826172, + "step": 1150, + "time_per_iteration": 2.670436382293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165906, + "balance_loss_mlp": 1.06901312, + "epoch": 0.22143131973836092, + "flos": 493250995200.0, + "grad_norm": 0.02267098136900654, + "language_loss": 0.95673937, + "learning_rate": 0.0009069481770060939, + "loss": 0.96839839, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.96875, + "step": 1151, + "time_per_iteration": 2.650136947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167632, + "balance_loss_mlp": 1.07092977, + "epoch": 0.22162370142362448, + "flos": 1081467623424.0, + "grad_norm": 0.023887201965423828, + "language_loss": 0.92357147, + "learning_rate": 0.000906767088796548, + "loss": 0.93524778, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.96679688, + "step": 1152, + "time_per_iteration": 3.4331767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174048, + "balance_loss_mlp": 1.07734585, + "epoch": 0.22181608310888803, + "flos": 493511506944.0, + "grad_norm": 0.021211000774135545, + "language_loss": 0.94297695, + "learning_rate": 0.0009065858426660127, + "loss": 0.9547174, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.96679688, + "step": 1153, + "time_per_iteration": 2.6492207050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171336, + "balance_loss_mlp": 1.07458591, + "epoch": 0.2220084647941516, + "flos": 725324765184.0, + "grad_norm": 0.02806046891368227, + "language_loss": 0.95655924, + "learning_rate": 0.0009064044386848543, + "loss": 0.96827257, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.96728516, + "step": 1154, + "time_per_iteration": 2.9135258197784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116775, + "balance_loss_mlp": 1.07090425, + "epoch": 0.22220084647941515, + "flos": 490244376576.0, + "grad_norm": 0.029776005734579798, + "language_loss": 1.00600004, + "learning_rate": 0.0009062228769234997, + "loss": 1.01767755, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.96826172, + "step": 1155, + "time_per_iteration": 2.597781181335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171214, + "balance_loss_mlp": 1.07451141, + "epoch": 0.2223932281646787, + "flos": 537295492608.0, + "grad_norm": 0.030445586519746, + "language_loss": 0.93354964, + "learning_rate": 0.0009060411574524376, + "loss": 0.94526184, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.96679688, + "step": 1156, + "time_per_iteration": 2.7325634956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168314, + "balance_loss_mlp": 1.07151604, + "epoch": 0.22258560984994227, + "flos": 932967553536.0, + "grad_norm": 0.0275078677514356, + "language_loss": 0.98614538, + "learning_rate": 0.0009058592803422178, + "loss": 0.99782854, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.96777344, + "step": 1157, + "time_per_iteration": 3.156981945037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169861, + "balance_loss_mlp": 1.0739212, + "epoch": 0.22277799153520586, + "flos": 1202395286016.0, + "grad_norm": 0.00950920896526599, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79880148, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.95898438, + "step": 1158, + "time_per_iteration": 4.7935662269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.07421494, + "epoch": 0.22297037322046942, + "flos": 502316513280.0, + "grad_norm": 0.05502374006765337, + "language_loss": 0.97024429, + "learning_rate": 0.00090549505348681, + "loss": 0.98195159, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.96484375, + "step": 1159, + "time_per_iteration": 2.579418659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167768, + "balance_loss_mlp": 1.07135153, + "epoch": 0.22316275490573298, + "flos": 754112587776.0, + "grad_norm": 0.025312842068973822, + "language_loss": 0.9244132, + "learning_rate": 0.0009053127038830275, + "loss": 0.93609083, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.96386719, + "step": 1160, + "time_per_iteration": 2.970240592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169788, + "balance_loss_mlp": 1.07346714, + "epoch": 0.22335513659099654, + "flos": 515804000256.0, + "grad_norm": 0.02702757021011719, + "language_loss": 0.97474223, + "learning_rate": 0.000905130196922898, + "loss": 0.98644012, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.96289062, + "step": 1161, + "time_per_iteration": 2.558567762374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175493, + "balance_loss_mlp": 1.07917213, + "epoch": 0.2235475182762601, + "flos": 485507501568.0, + "grad_norm": 0.024760780359754056, + "language_loss": 0.947945, + "learning_rate": 0.0009049475326772769, + "loss": 0.95969993, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.96289062, + "step": 1162, + "time_per_iteration": 2.5948867797851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168008, + "balance_loss_mlp": 1.0716871, + "epoch": 0.22373989996152366, + "flos": 471067290624.0, + "grad_norm": 0.0243609738761747, + "language_loss": 0.92091036, + "learning_rate": 0.0009047647112170811, + "loss": 0.93259048, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.96289062, + "step": 1163, + "time_per_iteration": 2.7958250045776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165002, + "balance_loss_mlp": 1.06868088, + "epoch": 0.22393228164678722, + "flos": 1273017807360.0, + "grad_norm": 0.0269563070164892, + "language_loss": 0.98098505, + "learning_rate": 0.0009045817326132876, + "loss": 0.99263507, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.96289062, + "step": 1164, + "time_per_iteration": 3.64853835105896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165462, + "balance_loss_mlp": 1.06914091, + "epoch": 0.22412466333205078, + "flos": 597467523072.0, + "grad_norm": 0.02771003139242203, + "language_loss": 0.94602239, + "learning_rate": 0.0009043985969369357, + "loss": 0.95767695, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.96289062, + "step": 1165, + "time_per_iteration": 2.8231425285339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175209, + "balance_loss_mlp": 1.07860184, + "epoch": 0.22431704501731436, + "flos": 609630984192.0, + "grad_norm": 0.02516811505749033, + "language_loss": 0.93514198, + "learning_rate": 0.0009042153042591245, + "loss": 0.94689411, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.96582031, + "step": 1166, + "time_per_iteration": 2.755671501159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174563, + "balance_loss_mlp": 1.07819414, + "epoch": 0.22450942670257792, + "flos": 908106872832.0, + "grad_norm": 0.024247493396408124, + "language_loss": 0.93277276, + "learning_rate": 0.0009040318546510146, + "loss": 0.94451833, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.96337891, + "step": 1167, + "time_per_iteration": 3.126707077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174214, + "balance_loss_mlp": 1.07770181, + "epoch": 0.22470180838784148, + "flos": 566380756992.0, + "grad_norm": 0.02335770706345326, + "language_loss": 0.94522464, + "learning_rate": 0.0009038482481838275, + "loss": 0.95696682, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.96484375, + "step": 1168, + "time_per_iteration": 2.6482362747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171371, + "balance_loss_mlp": 1.07485878, + "epoch": 0.22489419007310504, + "flos": 835917100032.0, + "grad_norm": 0.021740410096357694, + "language_loss": 0.9467479, + "learning_rate": 0.0009036644849288455, + "loss": 0.95846164, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.96484375, + "step": 1169, + "time_per_iteration": 3.0959203243255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168691, + "balance_loss_mlp": 1.07217908, + "epoch": 0.2250865717583686, + "flos": 582138989568.0, + "grad_norm": 0.028400846177611044, + "language_loss": 0.95971251, + "learning_rate": 0.0009034805649574118, + "loss": 0.97139943, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.96484375, + "step": 1170, + "time_per_iteration": 2.65209698677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171761, + "balance_loss_mlp": 1.07515407, + "epoch": 0.22527895344363216, + "flos": 601670639616.0, + "grad_norm": 0.021879369323455276, + "language_loss": 0.92857611, + "learning_rate": 0.0009032964883409308, + "loss": 0.94029367, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.96582031, + "step": 1171, + "time_per_iteration": 2.8586626052856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175461, + "balance_loss_mlp": 1.07990265, + "epoch": 0.22547133512889572, + "flos": 1443731959296.0, + "grad_norm": 0.011387534292379292, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74225998, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.95507812, + "step": 1172, + "time_per_iteration": 4.9882895946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171774, + "balance_loss_mlp": 1.07526255, + "epoch": 0.22566371681415928, + "flos": 491585866752.0, + "grad_norm": 0.025801800464723818, + "language_loss": 0.97062689, + "learning_rate": 0.0009029278654587462, + "loss": 0.98234463, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.96484375, + "step": 1173, + "time_per_iteration": 2.595419406890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171429, + "balance_loss_mlp": 1.07491696, + "epoch": 0.22585609849942284, + "flos": 605751505920.0, + "grad_norm": 0.02576863859493135, + "language_loss": 0.92400688, + "learning_rate": 0.0009027433193361548, + "loss": 0.93572116, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.96484375, + "step": 1174, + "time_per_iteration": 2.738267183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117476, + "balance_loss_mlp": 1.07824779, + "epoch": 0.22604848018468643, + "flos": 636727481856.0, + "grad_norm": 0.028952390928102957, + "language_loss": 0.97668821, + "learning_rate": 0.00090255861685474, + "loss": 0.98843575, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.96484375, + "step": 1175, + "time_per_iteration": 2.7286014556884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117152, + "balance_loss_mlp": 1.07481766, + "epoch": 0.22624086186995, + "flos": 480844486656.0, + "grad_norm": 0.027877026454804697, + "language_loss": 1.02366519, + "learning_rate": 0.0009023737580862095, + "loss": 1.03538048, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.96679688, + "step": 1176, + "time_per_iteration": 2.553281307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170774, + "balance_loss_mlp": 1.07388091, + "epoch": 0.22643324355521355, + "flos": 496806835200.0, + "grad_norm": 0.02249634447584531, + "language_loss": 0.90840948, + "learning_rate": 0.0009021887431023321, + "loss": 0.92011726, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.96875, + "step": 1177, + "time_per_iteration": 2.5862364768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172189, + "balance_loss_mlp": 1.07539093, + "epoch": 0.2266256252404771, + "flos": 562683927552.0, + "grad_norm": 0.02041789434880362, + "language_loss": 0.95725513, + "learning_rate": 0.0009020035719749369, + "loss": 0.96897697, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.96777344, + "step": 1178, + "time_per_iteration": 2.7553560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176357, + "balance_loss_mlp": 1.0796541, + "epoch": 0.22681800692574067, + "flos": 581032541184.0, + "grad_norm": 0.026733278329428435, + "language_loss": 0.89533567, + "learning_rate": 0.0009018182447759136, + "loss": 0.90709925, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.96679688, + "step": 1179, + "time_per_iteration": 3.012024402618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175508, + "balance_loss_mlp": 1.07904434, + "epoch": 0.22701038861100423, + "flos": 741465033216.0, + "grad_norm": 0.025064804828048133, + "language_loss": 0.90941453, + "learning_rate": 0.0009016327615772126, + "loss": 0.92116958, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.96435547, + "step": 1180, + "time_per_iteration": 2.969684600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172378, + "balance_loss_mlp": 1.07577109, + "epoch": 0.2272027702962678, + "flos": 578305173504.0, + "grad_norm": 0.036813558231106436, + "language_loss": 1.00164366, + "learning_rate": 0.0009014471224508451, + "loss": 1.01336741, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.96582031, + "step": 1181, + "time_per_iteration": 2.664487361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173976, + "balance_loss_mlp": 1.0774641, + "epoch": 0.22739515198153135, + "flos": 545290765824.0, + "grad_norm": 0.028585613124224512, + "language_loss": 0.95647848, + "learning_rate": 0.0009012613274688823, + "loss": 0.96821827, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.96484375, + "step": 1182, + "time_per_iteration": 2.647608518600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177492, + "balance_loss_mlp": 1.08078945, + "epoch": 0.22758753366679493, + "flos": 441091702272.0, + "grad_norm": 0.02755397132508441, + "language_loss": 1.00651419, + "learning_rate": 0.0009010753767034565, + "loss": 1.01828909, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.96679688, + "step": 1183, + "time_per_iteration": 2.528580904006958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176952, + "balance_loss_mlp": 1.08053601, + "epoch": 0.2277799153520585, + "flos": 730823709696.0, + "grad_norm": 0.024484618665474616, + "language_loss": 0.90051508, + "learning_rate": 0.0009008892702267599, + "loss": 0.91228461, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.96386719, + "step": 1184, + "time_per_iteration": 2.990344285964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_mlp": 1.08100891, + "epoch": 0.22797229703732205, + "flos": 527913067008.0, + "grad_norm": 0.030622621699729128, + "language_loss": 1.01022232, + "learning_rate": 0.0009007030081110457, + "loss": 1.02199566, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.96289062, + "step": 1185, + "time_per_iteration": 2.5795140266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172726, + "balance_loss_mlp": 1.07592821, + "epoch": 0.2281646787225856, + "flos": 536520688128.0, + "grad_norm": 0.026616575931436976, + "language_loss": 0.93079567, + "learning_rate": 0.000900516590428627, + "loss": 0.942523, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.96777344, + "step": 1186, + "time_per_iteration": 2.6647558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.07628405, + "epoch": 0.22835706040784917, + "flos": 542477529600.0, + "grad_norm": 0.02522496809839962, + "language_loss": 0.99033505, + "learning_rate": 0.0009003300172518778, + "loss": 1.00206637, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.96826172, + "step": 1187, + "time_per_iteration": 2.7046303749084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177624, + "balance_loss_mlp": 1.08073056, + "epoch": 0.22854944209311273, + "flos": 792004859904.0, + "grad_norm": 0.026332453075710083, + "language_loss": 0.94325852, + "learning_rate": 0.0009001432886532321, + "loss": 0.95503473, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.96875, + "step": 1188, + "time_per_iteration": 2.9583094120025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179036, + "balance_loss_mlp": 1.08233392, + "epoch": 0.2287418237783763, + "flos": 470215898112.0, + "grad_norm": 0.025775869396212594, + "language_loss": 0.97465944, + "learning_rate": 0.0008999564047051843, + "loss": 0.98644984, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.96679688, + "step": 1189, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178388, + "balance_loss_mlp": 1.08154237, + "epoch": 0.22893420546363985, + "flos": 469004663808.0, + "grad_norm": 0.023763579929190374, + "language_loss": 0.94691694, + "learning_rate": 0.0008997693654802894, + "loss": 0.95870078, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.96826172, + "step": 1190, + "time_per_iteration": 2.6276731491088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178257, + "balance_loss_mlp": 1.08145857, + "epoch": 0.22912658714890344, + "flos": 627401452032.0, + "grad_norm": 0.023724149848154047, + "language_loss": 0.95182133, + "learning_rate": 0.0008995821710511625, + "loss": 0.96360391, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.96777344, + "step": 1191, + "time_per_iteration": 2.756840705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117993, + "balance_loss_mlp": 1.08308399, + "epoch": 0.229318968834167, + "flos": 504020573184.0, + "grad_norm": 0.024708694220473774, + "language_loss": 0.93247074, + "learning_rate": 0.0008993948214904786, + "loss": 0.94427001, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.96826172, + "step": 1192, + "time_per_iteration": 2.577340602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190514, + "balance_loss_mlp": 1.09533691, + "epoch": 0.22951135051943056, + "flos": 1377713877504.0, + "grad_norm": 0.021264094300491608, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79612726, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.95117188, + "step": 1193, + "time_per_iteration": 4.850237607955933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179442, + "balance_loss_mlp": 1.08316851, + "epoch": 0.22970373220469412, + "flos": 645549952512.0, + "grad_norm": 0.02667568465905087, + "language_loss": 0.92540175, + "learning_rate": 0.0008990196572654427, + "loss": 0.93719625, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.96240234, + "step": 1194, + "time_per_iteration": 2.8638381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180858, + "balance_loss_mlp": 1.08453715, + "epoch": 0.22989611388995768, + "flos": 501272464896.0, + "grad_norm": 0.02416134539694475, + "language_loss": 0.95937514, + "learning_rate": 0.0008988318427467426, + "loss": 0.97118378, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.96289062, + "step": 1195, + "time_per_iteration": 2.7063868045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182741, + "balance_loss_mlp": 1.08589542, + "epoch": 0.23008849557522124, + "flos": 1098333030912.0, + "grad_norm": 0.02922856270819412, + "language_loss": 0.9667449, + "learning_rate": 0.0008986438733877887, + "loss": 0.97857237, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.96826172, + "step": 1196, + "time_per_iteration": 3.4508113861083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176758, + "balance_loss_mlp": 1.08043683, + "epoch": 0.2302808772604848, + "flos": 684992560128.0, + "grad_norm": 0.022228440588834414, + "language_loss": 0.91545051, + "learning_rate": 0.0008984557492615576, + "loss": 0.92721808, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.96289062, + "step": 1197, + "time_per_iteration": 2.93611741065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08269298, + "epoch": 0.23047325894574835, + "flos": 529960230912.0, + "grad_norm": 0.026499525382426087, + "language_loss": 0.99148774, + "learning_rate": 0.0008982674704410854, + "loss": 1.0032779, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.96289062, + "step": 1198, + "time_per_iteration": 2.7032008171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180823, + "balance_loss_mlp": 1.08450174, + "epoch": 0.23066564063101191, + "flos": 684126431232.0, + "grad_norm": 0.025326379221325218, + "language_loss": 0.86113322, + "learning_rate": 0.0008980790369994682, + "loss": 0.87294143, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.96289062, + "step": 1199, + "time_per_iteration": 2.9629056453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173246, + "balance_loss_mlp": 1.07673466, + "epoch": 0.2308580223162755, + "flos": 559631646720.0, + "grad_norm": 0.02469990042405053, + "language_loss": 0.95889735, + "learning_rate": 0.000897890449009863, + "loss": 0.97062981, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.96484375, + "step": 1200, + "time_per_iteration": 2.6673126220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178191, + "balance_loss_mlp": 1.08167911, + "epoch": 0.23105040400153906, + "flos": 556729087488.0, + "grad_norm": 0.021551459012756572, + "language_loss": 0.97633696, + "learning_rate": 0.0008977017065454853, + "loss": 0.98811877, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.96484375, + "step": 1201, + "time_per_iteration": 2.6586263179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176954, + "balance_loss_mlp": 1.08048964, + "epoch": 0.23124278568680262, + "flos": 706049624064.0, + "grad_norm": 0.025666519973580538, + "language_loss": 0.89963996, + "learning_rate": 0.0008975128096796121, + "loss": 0.9114095, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.96435547, + "step": 1202, + "time_per_iteration": 2.8599958419799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175929, + "balance_loss_mlp": 1.07989419, + "epoch": 0.23143516737206618, + "flos": 613968359424.0, + "grad_norm": 0.02791489713026627, + "language_loss": 0.96485001, + "learning_rate": 0.0008973237584855794, + "loss": 0.97660929, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.95996094, + "step": 1203, + "time_per_iteration": 2.8814125061035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117493, + "balance_loss_mlp": 1.07903779, + "epoch": 0.23162754905732974, + "flos": 390095980032.0, + "grad_norm": 0.02381480195735972, + "language_loss": 0.91340852, + "learning_rate": 0.0008971345530367832, + "loss": 0.92515785, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.95849609, + "step": 1204, + "time_per_iteration": 2.513951301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176133, + "balance_loss_mlp": 1.08024144, + "epoch": 0.2318199307425933, + "flos": 668969086464.0, + "grad_norm": 0.024943516104182908, + "language_loss": 0.94778013, + "learning_rate": 0.0008969451934066799, + "loss": 0.95954144, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.95849609, + "step": 1205, + "time_per_iteration": 2.80454421043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173068, + "balance_loss_mlp": 1.07712853, + "epoch": 0.23201231242785686, + "flos": 667627596288.0, + "grad_norm": 0.029617322009159303, + "language_loss": 0.92493355, + "learning_rate": 0.0008967556796687854, + "loss": 0.93666422, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.95898438, + "step": 1206, + "time_per_iteration": 2.89932918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173146, + "balance_loss_mlp": 1.07720602, + "epoch": 0.23220469411312042, + "flos": 750094121472.0, + "grad_norm": 0.024264467100448908, + "language_loss": 0.94343531, + "learning_rate": 0.0008965660118966752, + "loss": 0.95516682, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.95898438, + "step": 1207, + "time_per_iteration": 2.9768385887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179014, + "balance_loss_mlp": 1.08307481, + "epoch": 0.232397075798384, + "flos": 668261411328.0, + "grad_norm": 0.02512248807118796, + "language_loss": 0.97498, + "learning_rate": 0.0008963761901639851, + "loss": 0.98677015, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.95898438, + "step": 1208, + "time_per_iteration": 2.8175342082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177539, + "balance_loss_mlp": 1.081599, + "epoch": 0.23258945748364757, + "flos": 611345777664.0, + "grad_norm": 0.025244332610569246, + "language_loss": 0.93465042, + "learning_rate": 0.0008961862145444103, + "loss": 0.9464258, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.95898438, + "step": 1209, + "time_per_iteration": 2.707583427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117491, + "balance_loss_mlp": 1.07901847, + "epoch": 0.23278183916891113, + "flos": 490672074240.0, + "grad_norm": 0.025133767455437463, + "language_loss": 0.96175104, + "learning_rate": 0.0008959960851117059, + "loss": 0.97350019, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.95849609, + "step": 1210, + "time_per_iteration": 2.5783777236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174943, + "balance_loss_mlp": 1.07895589, + "epoch": 0.23297422085417469, + "flos": 512673856512.0, + "grad_norm": 0.027877077505007057, + "language_loss": 0.94183683, + "learning_rate": 0.0008958058019396868, + "loss": 0.95358628, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.95947266, + "step": 1211, + "time_per_iteration": 2.7695388793945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118178, + "balance_loss_mlp": 1.08560216, + "epoch": 0.23316660253943824, + "flos": 547531312128.0, + "grad_norm": 0.0259067341075638, + "language_loss": 0.95459378, + "learning_rate": 0.0008956153651022274, + "loss": 0.96641153, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.96142578, + "step": 1212, + "time_per_iteration": 2.7088377475738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.08181643, + "epoch": 0.2333589842247018, + "flos": 511288705536.0, + "grad_norm": 0.023917692799316066, + "language_loss": 0.93208623, + "learning_rate": 0.0008954247746732618, + "loss": 0.94386959, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.96484375, + "step": 1213, + "time_per_iteration": 2.6319668292999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172909, + "balance_loss_mlp": 1.0766834, + "epoch": 0.23355136590996536, + "flos": 664406128128.0, + "grad_norm": 0.02356648487739955, + "language_loss": 0.98858505, + "learning_rate": 0.0008952340307267837, + "loss": 1.00031424, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.96191406, + "step": 1214, + "time_per_iteration": 2.891026735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172334, + "balance_loss_mlp": 1.07629859, + "epoch": 0.23374374759522892, + "flos": 509465123328.0, + "grad_norm": 0.027978905734491046, + "language_loss": 0.94424212, + "learning_rate": 0.0008950431333368468, + "loss": 0.95596552, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.95996094, + "step": 1215, + "time_per_iteration": 2.5823488235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173288, + "balance_loss_mlp": 1.07730114, + "epoch": 0.2339361292804925, + "flos": 1296428209152.0, + "grad_norm": 0.026145796218117214, + "language_loss": 0.94705772, + "learning_rate": 0.0008948520825775634, + "loss": 0.95879066, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.95947266, + "step": 1216, + "time_per_iteration": 3.6343605518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174216, + "balance_loss_mlp": 1.07808566, + "epoch": 0.23412851096575607, + "flos": 707176264704.0, + "grad_norm": 0.02578801546488365, + "language_loss": 0.93516719, + "learning_rate": 0.0008946608785231067, + "loss": 0.94690937, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.9609375, + "step": 1217, + "time_per_iteration": 2.8923676013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174557, + "balance_loss_mlp": 1.07842624, + "epoch": 0.23432089265101963, + "flos": 439174794240.0, + "grad_norm": 0.024987781095147748, + "language_loss": 0.94467312, + "learning_rate": 0.0008944695212477084, + "loss": 0.95641869, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.9609375, + "step": 1218, + "time_per_iteration": 2.47641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176273, + "balance_loss_mlp": 1.08028615, + "epoch": 0.2345132743362832, + "flos": 481914731520.0, + "grad_norm": 0.02187031641141441, + "language_loss": 0.9320662, + "learning_rate": 0.0008942780108256599, + "loss": 0.94382894, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.95947266, + "step": 1219, + "time_per_iteration": 2.585204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176397, + "balance_loss_mlp": 1.07993269, + "epoch": 0.23470565602154675, + "flos": 412340809728.0, + "grad_norm": 0.02314471919225668, + "language_loss": 0.95930934, + "learning_rate": 0.0008940863473313121, + "loss": 0.97107327, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.96435547, + "step": 1220, + "time_per_iteration": 2.461904764175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174627, + "balance_loss_mlp": 1.07811534, + "epoch": 0.2348980377068103, + "flos": 546499998720.0, + "grad_norm": 0.029389735884218435, + "language_loss": 0.99771547, + "learning_rate": 0.0008938945308390756, + "loss": 1.00946164, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.96484375, + "step": 1221, + "time_per_iteration": 2.6403567790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179462, + "balance_loss_mlp": 1.08295047, + "epoch": 0.23509041939207387, + "flos": 576842159616.0, + "grad_norm": 0.023502241620232074, + "language_loss": 0.96374851, + "learning_rate": 0.00089370256142342, + "loss": 0.97554314, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.96484375, + "step": 1222, + "time_per_iteration": 2.7148585319519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178637, + "balance_loss_mlp": 1.08198178, + "epoch": 0.23528280107733743, + "flos": 589947611136.0, + "grad_norm": 0.022852016666186668, + "language_loss": 0.93682569, + "learning_rate": 0.0008935104391588746, + "loss": 0.94861209, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.96630859, + "step": 1223, + "time_per_iteration": 2.7302677631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179799, + "balance_loss_mlp": 1.08338237, + "epoch": 0.235475182762601, + "flos": 824856811008.0, + "grad_norm": 0.02091323276417278, + "language_loss": 0.91087663, + "learning_rate": 0.0008933181641200276, + "loss": 0.9226746, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.96386719, + "step": 1224, + "time_per_iteration": 3.120337724685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183017, + "balance_loss_mlp": 1.08650565, + "epoch": 0.23566756444786457, + "flos": 681366862848.0, + "grad_norm": 0.027323039985709546, + "language_loss": 0.94355077, + "learning_rate": 0.0008931257363815271, + "loss": 0.95538092, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.96484375, + "step": 1225, + "time_per_iteration": 2.893202543258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178928, + "balance_loss_mlp": 1.08251154, + "epoch": 0.23585994613312813, + "flos": 703134329856.0, + "grad_norm": 0.022860929740297704, + "language_loss": 0.96590424, + "learning_rate": 0.0008929331560180798, + "loss": 0.97769356, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.96386719, + "step": 1226, + "time_per_iteration": 2.913858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176734, + "balance_loss_mlp": 1.08017468, + "epoch": 0.2360523278183917, + "flos": 525195158016.0, + "grad_norm": 0.02227272458953822, + "language_loss": 0.99194574, + "learning_rate": 0.0008927404231044525, + "loss": 1.00371313, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.96533203, + "step": 1227, + "time_per_iteration": 2.7194507122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175869, + "balance_loss_mlp": 1.07921374, + "epoch": 0.23624470950365525, + "flos": 525442934784.0, + "grad_norm": 0.02071878597098496, + "language_loss": 0.89412713, + "learning_rate": 0.0008925475377154703, + "loss": 0.90588582, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.96630859, + "step": 1228, + "time_per_iteration": 2.742506265640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175669, + "balance_loss_mlp": 1.07896686, + "epoch": 0.2364370911889188, + "flos": 597960348672.0, + "grad_norm": 0.023166098266421232, + "language_loss": 0.90900964, + "learning_rate": 0.0008923544999260183, + "loss": 0.92076635, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.96679688, + "step": 1229, + "time_per_iteration": 2.809842109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177841, + "balance_loss_mlp": 1.08113885, + "epoch": 0.23662947287418237, + "flos": 758171986944.0, + "grad_norm": 0.02725464196132968, + "language_loss": 1.00227833, + "learning_rate": 0.00089216130981104, + "loss": 1.0140568, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.96679688, + "step": 1230, + "time_per_iteration": 3.0096282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178297, + "balance_loss_mlp": 1.08159423, + "epoch": 0.23682185455944593, + "flos": 547207673856.0, + "grad_norm": 0.024713012089740163, + "language_loss": 0.91807795, + "learning_rate": 0.000891967967445539, + "loss": 0.92986089, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.96679688, + "step": 1231, + "time_per_iteration": 2.7001702785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185987, + "balance_loss_mlp": 1.08928442, + "epoch": 0.2370142362447095, + "flos": 663522534912.0, + "grad_norm": 0.02265672956199411, + "language_loss": 0.96654546, + "learning_rate": 0.0008917744729045772, + "loss": 0.97840536, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.96679688, + "step": 1232, + "time_per_iteration": 2.8703036308288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184505, + "balance_loss_mlp": 1.08789778, + "epoch": 0.23720661792997308, + "flos": 684911969280.0, + "grad_norm": 0.02632145570598456, + "language_loss": 0.93737417, + "learning_rate": 0.0008915808262632757, + "loss": 0.94921923, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.96582031, + "step": 1233, + "time_per_iteration": 2.839534044265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185605, + "balance_loss_mlp": 1.08928347, + "epoch": 0.23739899961523664, + "flos": 560022414336.0, + "grad_norm": 0.027552675935845497, + "language_loss": 1.01508975, + "learning_rate": 0.0008913870275968148, + "loss": 1.02694583, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.96289062, + "step": 1234, + "time_per_iteration": 2.7176129817962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182161, + "balance_loss_mlp": 1.08545852, + "epoch": 0.2375913813005002, + "flos": 891163602432.0, + "grad_norm": 0.02404650352203449, + "language_loss": 0.9583261, + "learning_rate": 0.0008911930769804342, + "loss": 0.97014773, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.96679688, + "step": 1235, + "time_per_iteration": 3.244257688522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179697, + "balance_loss_mlp": 1.08289862, + "epoch": 0.23778376298576376, + "flos": 642365414400.0, + "grad_norm": 0.020226791074773265, + "language_loss": 0.99461335, + "learning_rate": 0.0008909989744894318, + "loss": 1.00641024, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.96777344, + "step": 1236, + "time_per_iteration": 2.8618855476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179012, + "balance_loss_mlp": 1.08230948, + "epoch": 0.23797614467102732, + "flos": 617945166336.0, + "grad_norm": 0.025060145140963254, + "language_loss": 0.91887248, + "learning_rate": 0.0008908047201991649, + "loss": 0.93066257, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.96679688, + "step": 1237, + "time_per_iteration": 2.7335665225982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177715, + "balance_loss_mlp": 1.08120298, + "epoch": 0.23816852635629088, + "flos": 625463076864.0, + "grad_norm": 0.02188809519195417, + "language_loss": 0.92642158, + "learning_rate": 0.0008906103141850502, + "loss": 0.93819869, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.96484375, + "step": 1238, + "time_per_iteration": 2.9244723320007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178141, + "balance_loss_mlp": 1.0816294, + "epoch": 0.23836090804155444, + "flos": 522440318976.0, + "grad_norm": 0.025638098136730073, + "language_loss": 0.97356987, + "learning_rate": 0.0008904157565225621, + "loss": 0.98535126, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.96484375, + "step": 1239, + "time_per_iteration": 2.6046018600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186867, + "balance_loss_mlp": 1.09059334, + "epoch": 0.238553289726818, + "flos": 1155854281728.0, + "grad_norm": 0.0279922632366243, + "language_loss": 0.91224372, + "learning_rate": 0.000890221047287235, + "loss": 0.92411238, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.96240234, + "step": 1240, + "time_per_iteration": 3.503387928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.09512079, + "epoch": 0.23874567141208156, + "flos": 500909895168.0, + "grad_norm": 0.02294407067471098, + "language_loss": 0.98687088, + "learning_rate": 0.0008900261865546615, + "loss": 0.99878532, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.96289062, + "step": 1241, + "time_per_iteration": 2.6329948902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188291, + "balance_loss_mlp": 1.09197009, + "epoch": 0.23893805309734514, + "flos": 558049110528.0, + "grad_norm": 0.02727719764566138, + "language_loss": 0.96105886, + "learning_rate": 0.0008898311744004936, + "loss": 0.97294176, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.96289062, + "step": 1242, + "time_per_iteration": 2.6852729320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011866, + "balance_loss_mlp": 1.0902791, + "epoch": 0.2391304347826087, + "flos": 550316350464.0, + "grad_norm": 0.023767912183342704, + "language_loss": 0.95555472, + "learning_rate": 0.0008896360109004414, + "loss": 0.9674207, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.96289062, + "step": 1243, + "time_per_iteration": 2.6607675552368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181953, + "balance_loss_mlp": 1.08558464, + "epoch": 0.23932281646787226, + "flos": 517078361088.0, + "grad_norm": 0.022492500831292953, + "language_loss": 0.92156398, + "learning_rate": 0.0008894406961302742, + "loss": 0.93338358, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.96337891, + "step": 1244, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180796, + "balance_loss_mlp": 1.0844276, + "epoch": 0.23951519815313582, + "flos": 745001407488.0, + "grad_norm": 0.0220414301985699, + "language_loss": 0.9171226, + "learning_rate": 0.0008892452301658201, + "loss": 0.92893052, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.96337891, + "step": 1245, + "time_per_iteration": 2.987859010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189345, + "balance_loss_mlp": 1.09302354, + "epoch": 0.23970757983839938, + "flos": 555174749184.0, + "grad_norm": 0.02624868476300941, + "language_loss": 0.92775297, + "learning_rate": 0.0008890496130829653, + "loss": 0.93964636, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.96289062, + "step": 1246, + "time_per_iteration": 2.7285211086273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011891, + "balance_loss_mlp": 1.09287417, + "epoch": 0.23989996152366294, + "flos": 481617289728.0, + "grad_norm": 0.024405638758005322, + "language_loss": 0.93939734, + "learning_rate": 0.0008888538449576555, + "loss": 0.95128834, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.96191406, + "step": 1247, + "time_per_iteration": 2.603447675704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181648, + "balance_loss_mlp": 1.08532703, + "epoch": 0.2400923432089265, + "flos": 486280304640.0, + "grad_norm": 0.02551404288502155, + "language_loss": 0.9456799, + "learning_rate": 0.0008886579258658944, + "loss": 0.9574964, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.96289062, + "step": 1248, + "time_per_iteration": 2.6195995807647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183672, + "balance_loss_mlp": 1.08735096, + "epoch": 0.24028472489419006, + "flos": 624792331776.0, + "grad_norm": 0.02192042043345247, + "language_loss": 0.93244678, + "learning_rate": 0.0008884618558837446, + "loss": 0.94428349, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.96289062, + "step": 1249, + "time_per_iteration": 2.830350399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187022, + "balance_loss_mlp": 1.09113026, + "epoch": 0.24047710657945365, + "flos": 602808013824.0, + "grad_norm": 0.023766863499936387, + "language_loss": 0.96457344, + "learning_rate": 0.0008882656350873273, + "loss": 0.97644365, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.95849609, + "step": 1250, + "time_per_iteration": 2.8691956996917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119127, + "balance_loss_mlp": 1.09547377, + "epoch": 0.2406694882647172, + "flos": 843000582144.0, + "grad_norm": 0.03001641023469985, + "language_loss": 1.00300837, + "learning_rate": 0.0008880692635528219, + "loss": 1.01492119, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.95751953, + "step": 1251, + "time_per_iteration": 3.066152572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187351, + "balance_loss_mlp": 1.09155416, + "epoch": 0.24086186994998077, + "flos": 528134647296.0, + "grad_norm": 0.026461260661865858, + "language_loss": 0.98557454, + "learning_rate": 0.0008878727413564669, + "loss": 0.99744809, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.95751953, + "step": 1252, + "time_per_iteration": 2.7665653228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.11519623, + "epoch": 0.24105425163524433, + "flos": 1341459262464.0, + "grad_norm": 0.018061169603452644, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81344825, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.93945312, + "step": 1253, + "time_per_iteration": 4.899695634841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182732, + "balance_loss_mlp": 1.08679259, + "epoch": 0.24124663332050789, + "flos": 615227257344.0, + "grad_norm": 0.02599071752574661, + "language_loss": 0.90657973, + "learning_rate": 0.0008874792452834528, + "loss": 0.91840708, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.95898438, + "step": 1254, + "time_per_iteration": 2.7407760620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179855, + "balance_loss_mlp": 1.08401072, + "epoch": 0.24143901500577145, + "flos": 576592381440.0, + "grad_norm": 0.0285281411485809, + "language_loss": 0.99380314, + "learning_rate": 0.0008872822715595626, + "loss": 1.00560164, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.95800781, + "step": 1255, + "time_per_iteration": 2.7094287872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176059, + "balance_loss_mlp": 1.08007157, + "epoch": 0.241631396691035, + "flos": 496146823680.0, + "grad_norm": 0.026934202036951318, + "language_loss": 0.98012596, + "learning_rate": 0.0008870851474793598, + "loss": 0.9918865, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.95947266, + "step": 1256, + "time_per_iteration": 2.5717930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180992, + "balance_loss_mlp": 1.08500445, + "epoch": 0.24182377837629856, + "flos": 637396225536.0, + "grad_norm": 0.02721147411023071, + "language_loss": 0.97604549, + "learning_rate": 0.0008868878731193752, + "loss": 0.98785543, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.95947266, + "step": 1257, + "time_per_iteration": 2.835613965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180736, + "balance_loss_mlp": 1.08460534, + "epoch": 0.24201616006156215, + "flos": 516349218816.0, + "grad_norm": 0.023847715865297152, + "language_loss": 0.9613235, + "learning_rate": 0.0008866904485561973, + "loss": 0.97313088, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.9609375, + "step": 1258, + "time_per_iteration": 2.697693347930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182815, + "balance_loss_mlp": 1.08682752, + "epoch": 0.2422085417468257, + "flos": 616378093056.0, + "grad_norm": 0.023106527532664196, + "language_loss": 0.92363685, + "learning_rate": 0.000886492873866473, + "loss": 0.93546498, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.95947266, + "step": 1259, + "time_per_iteration": 2.8120577335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118033, + "balance_loss_mlp": 1.08424771, + "epoch": 0.24240092343208927, + "flos": 586912794624.0, + "grad_norm": 0.025402415625288076, + "language_loss": 0.9586736, + "learning_rate": 0.000886295149126908, + "loss": 0.97047698, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.96044922, + "step": 1260, + "time_per_iteration": 2.7276840209960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184073, + "balance_loss_mlp": 1.08813286, + "epoch": 0.24259330511735283, + "flos": 763570874880.0, + "grad_norm": 0.0207328591517146, + "language_loss": 0.94417751, + "learning_rate": 0.0008860972744142655, + "loss": 0.95601827, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.95898438, + "step": 1261, + "time_per_iteration": 2.898794412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184052, + "balance_loss_mlp": 1.08816016, + "epoch": 0.2427856868026164, + "flos": 628133322240.0, + "grad_norm": 0.02409331705070074, + "language_loss": 0.89591467, + "learning_rate": 0.0008858992498053671, + "loss": 0.90775526, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.95849609, + "step": 1262, + "time_per_iteration": 2.8477351665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183746, + "balance_loss_mlp": 1.08952332, + "epoch": 0.24297806848787995, + "flos": 1514919343104.0, + "grad_norm": 0.012580587939111834, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77772498, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.94140625, + "step": 1263, + "time_per_iteration": 4.826787710189819 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180613, + "balance_loss_mlp": 1.0848639, + "epoch": 0.2431704501731435, + "flos": 543072413184.0, + "grad_norm": 0.025826560533695943, + "language_loss": 0.92586392, + "learning_rate": 0.0008855027512063817, + "loss": 0.93767005, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.95703125, + "step": 1264, + "time_per_iteration": 2.722557306289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179682, + "balance_loss_mlp": 1.08364689, + "epoch": 0.24336283185840707, + "flos": 524878250496.0, + "grad_norm": 0.025894380889017608, + "language_loss": 0.95614499, + "learning_rate": 0.0008853042773702292, + "loss": 0.96794176, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.95996094, + "step": 1265, + "time_per_iteration": 2.7258307933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118145, + "balance_loss_mlp": 1.0855577, + "epoch": 0.24355521354367063, + "flos": 538205282304.0, + "grad_norm": 0.022817154468993458, + "language_loss": 0.98287719, + "learning_rate": 0.0008851056539456896, + "loss": 0.99469173, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.95849609, + "step": 1266, + "time_per_iteration": 2.6970114707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182961, + "balance_loss_mlp": 1.08692622, + "epoch": 0.24374759522893422, + "flos": 932108155392.0, + "grad_norm": 0.024066297062525326, + "language_loss": 0.9148944, + "learning_rate": 0.0008849068810098755, + "loss": 0.92672402, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.95996094, + "step": 1267, + "time_per_iteration": 3.326692819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118368, + "balance_loss_mlp": 1.08764458, + "epoch": 0.24393997691419778, + "flos": 428685193728.0, + "grad_norm": 0.027357648838687767, + "language_loss": 0.94001949, + "learning_rate": 0.0008847079586399575, + "loss": 0.95185632, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.95996094, + "step": 1268, + "time_per_iteration": 2.466787099838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180763, + "balance_loss_mlp": 1.08482289, + "epoch": 0.24413235859946134, + "flos": 579942104064.0, + "grad_norm": 0.026150492080556795, + "language_loss": 0.95411992, + "learning_rate": 0.0008845088869131641, + "loss": 0.96592754, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.95898438, + "step": 1269, + "time_per_iteration": 2.7016899585723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175832, + "balance_loss_mlp": 1.07989287, + "epoch": 0.2443247402847249, + "flos": 530900219904.0, + "grad_norm": 0.025309414349457434, + "language_loss": 0.98951483, + "learning_rate": 0.0008843096659067818, + "loss": 1.00127316, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.95898438, + "step": 1270, + "time_per_iteration": 2.6240859031677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179701, + "balance_loss_mlp": 1.08366621, + "epoch": 0.24451712196998845, + "flos": 697624651776.0, + "grad_norm": 0.020400222299851913, + "language_loss": 0.92813951, + "learning_rate": 0.000884110295698155, + "loss": 0.93993652, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.95996094, + "step": 1271, + "time_per_iteration": 2.945749044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180344, + "balance_loss_mlp": 1.08435643, + "epoch": 0.24470950365525201, + "flos": 530863289856.0, + "grad_norm": 0.02434814436965663, + "language_loss": 0.97428346, + "learning_rate": 0.0008839107763646861, + "loss": 0.98608696, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.95947266, + "step": 1272, + "time_per_iteration": 2.5816495418548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182389, + "balance_loss_mlp": 1.08630657, + "epoch": 0.24490188534051557, + "flos": 492347936256.0, + "grad_norm": 0.027277570267404832, + "language_loss": 1.00778949, + "learning_rate": 0.0008837111079838353, + "loss": 1.0196135, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.96044922, + "step": 1273, + "time_per_iteration": 2.675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182001, + "balance_loss_mlp": 1.08587062, + "epoch": 0.24509426702577913, + "flos": 475111226880.0, + "grad_norm": 0.024851656777491255, + "language_loss": 0.98025054, + "learning_rate": 0.000883511290633121, + "loss": 0.99207056, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.9609375, + "step": 1274, + "time_per_iteration": 2.5230517387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183988, + "balance_loss_mlp": 1.08747613, + "epoch": 0.24528664871104272, + "flos": 551647107072.0, + "grad_norm": 0.02070792437524093, + "language_loss": 1.00507927, + "learning_rate": 0.000883311324390119, + "loss": 1.01691914, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.96484375, + "step": 1275, + "time_per_iteration": 2.690488338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.08887982, + "epoch": 0.24547903039630628, + "flos": 827335675392.0, + "grad_norm": 0.02978995697497926, + "language_loss": 0.95172417, + "learning_rate": 0.0008831112093324629, + "loss": 0.96357232, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.95898438, + "step": 1276, + "time_per_iteration": 3.0883522033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184816, + "balance_loss_mlp": 1.08839917, + "epoch": 0.24567141208156984, + "flos": 592693718016.0, + "grad_norm": 0.026400385967418116, + "language_loss": 0.99731994, + "learning_rate": 0.0008829109455378444, + "loss": 1.00916803, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.96386719, + "step": 1277, + "time_per_iteration": 2.670658588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184585, + "balance_loss_mlp": 1.08812118, + "epoch": 0.2458637937668334, + "flos": 548929198080.0, + "grad_norm": 0.022333419000210953, + "language_loss": 0.95654261, + "learning_rate": 0.000882710533084013, + "loss": 0.96838844, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.96435547, + "step": 1278, + "time_per_iteration": 2.641019344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189057, + "balance_loss_mlp": 1.09244978, + "epoch": 0.24605617545209696, + "flos": 516911175168.0, + "grad_norm": 0.022487969609205835, + "language_loss": 0.97332817, + "learning_rate": 0.0008825099720487755, + "loss": 0.98521876, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.96582031, + "step": 1279, + "time_per_iteration": 2.626079559326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193596, + "balance_loss_mlp": 1.09880066, + "epoch": 0.24624855713736052, + "flos": 1515058331136.0, + "grad_norm": 0.0162275920205478, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76454735, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.94726562, + "step": 1280, + "time_per_iteration": 4.846211671829224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118811, + "balance_loss_mlp": 1.09350586, + "epoch": 0.24644093882262408, + "flos": 1530746706432.0, + "grad_norm": 0.013716798372908724, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79132223, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.9453125, + "step": 1281, + "time_per_iteration": 4.781409025192261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189694, + "balance_loss_mlp": 1.09351575, + "epoch": 0.24663332050788764, + "flos": 660348730368.0, + "grad_norm": 0.028995521048395968, + "language_loss": 0.998649, + "learning_rate": 0.0008819073982335619, + "loss": 1.01054597, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.96142578, + "step": 1282, + "time_per_iteration": 2.873255729675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187163, + "balance_loss_mlp": 1.09098482, + "epoch": 0.24682570219315123, + "flos": 542805170688.0, + "grad_norm": 0.0289675073475646, + "language_loss": 0.92590028, + "learning_rate": 0.0008817062436519235, + "loss": 0.93777192, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.96142578, + "step": 1283, + "time_per_iteration": 2.6918435096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08852112, + "epoch": 0.24701808387841478, + "flos": 441658387968.0, + "grad_norm": 0.027350099061339322, + "language_loss": 1.00939846, + "learning_rate": 0.0008815049408787788, + "loss": 1.02124548, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.96142578, + "step": 1284, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190183, + "balance_loss_mlp": 1.09443462, + "epoch": 0.24721046556367834, + "flos": 469032861696.0, + "grad_norm": 0.028209143321693456, + "language_loss": 0.95635927, + "learning_rate": 0.0008813034899922805, + "loss": 0.96826112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.95703125, + "step": 1285, + "time_per_iteration": 2.5152530670166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193087, + "balance_loss_mlp": 1.09729075, + "epoch": 0.2474028472489419, + "flos": 505407725568.0, + "grad_norm": 0.027111907557838905, + "language_loss": 1.01196301, + "learning_rate": 0.0008811018910706387, + "loss": 1.02389383, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.95751953, + "step": 1286, + "time_per_iteration": 2.5593316555023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_mlp": 1.09255612, + "epoch": 0.24759522893420546, + "flos": 480955276800.0, + "grad_norm": 0.03276846828627927, + "language_loss": 0.9498859, + "learning_rate": 0.0008809001441921211, + "loss": 0.96176893, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.95703125, + "step": 1287, + "time_per_iteration": 2.7347421646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181619, + "balance_loss_mlp": 1.08567917, + "epoch": 0.24778761061946902, + "flos": 534753501696.0, + "grad_norm": 0.025262665654883373, + "language_loss": 0.97019696, + "learning_rate": 0.0008806982494350528, + "loss": 0.98201311, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.95898438, + "step": 1288, + "time_per_iteration": 2.6499245166778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181206, + "balance_loss_mlp": 1.08526671, + "epoch": 0.24797999230473258, + "flos": 560942937600.0, + "grad_norm": 0.021558514258727474, + "language_loss": 0.9849534, + "learning_rate": 0.0008804962068778161, + "loss": 0.99676538, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.95898438, + "step": 1289, + "time_per_iteration": 2.852257490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186476, + "balance_loss_mlp": 1.09053683, + "epoch": 0.24817237398999614, + "flos": 625480541184.0, + "grad_norm": 0.024913990838324927, + "language_loss": 0.90269625, + "learning_rate": 0.0008802940165988511, + "loss": 0.91456103, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.95898438, + "step": 1290, + "time_per_iteration": 2.846277952194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181135, + "balance_loss_mlp": 1.08471859, + "epoch": 0.2483647556752597, + "flos": 613484265984.0, + "grad_norm": 0.02310813532639645, + "language_loss": 0.96774852, + "learning_rate": 0.000880091678676655, + "loss": 0.97955984, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.96386719, + "step": 1291, + "time_per_iteration": 2.8085777759552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180122, + "balance_loss_mlp": 1.0837059, + "epoch": 0.2485571373605233, + "flos": 584687711232.0, + "grad_norm": 0.021422688776258386, + "language_loss": 0.9855839, + "learning_rate": 0.0008798891931897821, + "loss": 0.99738514, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.96386719, + "step": 1292, + "time_per_iteration": 2.7361133098602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183371, + "balance_loss_mlp": 1.08704984, + "epoch": 0.24874951904578685, + "flos": 495736590336.0, + "grad_norm": 0.02424073807687162, + "language_loss": 0.92916596, + "learning_rate": 0.0008796865602168447, + "loss": 0.94099975, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.96289062, + "step": 1293, + "time_per_iteration": 2.5220131874084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186197, + "balance_loss_mlp": 1.09025729, + "epoch": 0.2489419007310504, + "flos": 457173573120.0, + "grad_norm": 0.023099031146870112, + "language_loss": 0.94818902, + "learning_rate": 0.0008794837798365115, + "loss": 0.96005094, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.95898438, + "step": 1294, + "time_per_iteration": 2.6338109970092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187707, + "balance_loss_mlp": 1.09191012, + "epoch": 0.24913428241631397, + "flos": 486565011456.0, + "grad_norm": 0.02215078033303108, + "language_loss": 0.96107936, + "learning_rate": 0.0008792808521275089, + "loss": 0.97295642, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.95751953, + "step": 1295, + "time_per_iteration": 2.7354605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182687, + "balance_loss_mlp": 1.0869385, + "epoch": 0.24932666410157753, + "flos": 519917793792.0, + "grad_norm": 0.022601932216391857, + "language_loss": 0.96075213, + "learning_rate": 0.0008790777771686206, + "loss": 0.972579, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.95703125, + "step": 1296, + "time_per_iteration": 2.5746819972991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.08610308, + "epoch": 0.2495190457868411, + "flos": 473556888576.0, + "grad_norm": 0.022656020732285023, + "language_loss": 0.93397439, + "learning_rate": 0.0008788745550386872, + "loss": 0.94579285, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.95703125, + "step": 1297, + "time_per_iteration": 2.55985689163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177725, + "balance_loss_mlp": 1.0820719, + "epoch": 0.24971142747210465, + "flos": 747198292992.0, + "grad_norm": 0.023996141347128058, + "language_loss": 0.88372529, + "learning_rate": 0.0008786711858166063, + "loss": 0.89550251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.95605469, + "step": 1298, + "time_per_iteration": 2.9357082843780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179743, + "balance_loss_mlp": 1.08399367, + "epoch": 0.2499038091573682, + "flos": 750901853184.0, + "grad_norm": 0.025666304870509565, + "language_loss": 0.93355387, + "learning_rate": 0.0008784676695813332, + "loss": 0.9453513, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.95703125, + "step": 1299, + "time_per_iteration": 2.939739942550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187708, + "balance_loss_mlp": 1.09186363, + "epoch": 0.2500961908426318, + "flos": 746342897664.0, + "grad_norm": 0.02448521774653795, + "language_loss": 0.94308037, + "learning_rate": 0.0008782640064118796, + "loss": 0.95495749, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.95800781, + "step": 1300, + "time_per_iteration": 2.882838249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223068, + "balance_loss_mlp": 1.12808228, + "epoch": 0.2502885725278953, + "flos": 1420523672064.0, + "grad_norm": 0.019515623701574104, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77407825, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.94921875, + "step": 1301, + "time_per_iteration": 5.002445220947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180814, + "balance_loss_mlp": 1.08520806, + "epoch": 0.2504809542131589, + "flos": 516231697920.0, + "grad_norm": 0.028413107884204602, + "language_loss": 0.96116567, + "learning_rate": 0.0008778562395867648, + "loss": 0.97297382, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.95556641, + "step": 1302, + "time_per_iteration": 2.6463139057159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183586, + "balance_loss_mlp": 1.08783746, + "epoch": 0.25067333589842244, + "flos": 526851554304.0, + "grad_norm": 0.024791221234372676, + "language_loss": 0.9191972, + "learning_rate": 0.0008776521360894127, + "loss": 0.93103302, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.95703125, + "step": 1303, + "time_per_iteration": 2.60622239112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203766, + "balance_loss_mlp": 1.10897064, + "epoch": 0.25086571758368603, + "flos": 1477157326848.0, + "grad_norm": 0.014632010139538269, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80165827, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.94726562, + "step": 1304, + "time_per_iteration": 4.810328006744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188508, + "balance_loss_mlp": 1.09285462, + "epoch": 0.2510580992689496, + "flos": 529402277376.0, + "grad_norm": 0.027485922989720333, + "language_loss": 0.99458921, + "learning_rate": 0.0008772434893213186, + "loss": 1.00647426, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.95605469, + "step": 1305, + "time_per_iteration": 2.6031458377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.09155023, + "epoch": 0.25125048095421315, + "flos": 518465513472.0, + "grad_norm": 0.0302061265456268, + "language_loss": 0.93206942, + "learning_rate": 0.0008770389462092276, + "loss": 0.94393957, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.95410156, + "step": 1306, + "time_per_iteration": 2.636845827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118174, + "balance_loss_mlp": 1.0858953, + "epoch": 0.25144286263947674, + "flos": 621674923008.0, + "grad_norm": 0.026354631998576704, + "language_loss": 0.96568018, + "learning_rate": 0.0008768342567176357, + "loss": 0.97749758, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.95800781, + "step": 1307, + "time_per_iteration": 2.797346591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187952, + "balance_loss_mlp": 1.09220326, + "epoch": 0.25163524432474027, + "flos": 504865234944.0, + "grad_norm": 0.024318536510777332, + "language_loss": 0.99895847, + "learning_rate": 0.0008766294209260107, + "loss": 1.01083803, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.95703125, + "step": 1308, + "time_per_iteration": 2.648099184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180717, + "balance_loss_mlp": 1.0850637, + "epoch": 0.25182762601000386, + "flos": 510079472640.0, + "grad_norm": 0.027727924866539442, + "language_loss": 1.0231359, + "learning_rate": 0.0008764244389138767, + "loss": 1.0349431, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.95605469, + "step": 1309, + "time_per_iteration": 2.575963258743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.08396196, + "epoch": 0.2520200076952674, + "flos": 635097282048.0, + "grad_norm": 0.028356059247082867, + "language_loss": 0.93336231, + "learning_rate": 0.000876219310760815, + "loss": 0.94515896, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.95654297, + "step": 1310, + "time_per_iteration": 2.8647706508636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189244, + "balance_loss_mlp": 1.09330475, + "epoch": 0.252212389380531, + "flos": 495651996672.0, + "grad_norm": 0.024396868749396446, + "language_loss": 0.91954494, + "learning_rate": 0.0008760140365464631, + "loss": 0.93143737, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.95898438, + "step": 1311, + "time_per_iteration": 2.592453718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180261, + "balance_loss_mlp": 1.08451247, + "epoch": 0.2524047710657945, + "flos": 491529470976.0, + "grad_norm": 0.026197758988141227, + "language_loss": 0.97483641, + "learning_rate": 0.0008758086163505156, + "loss": 0.98663902, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.95703125, + "step": 1312, + "time_per_iteration": 2.56319260597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181231, + "balance_loss_mlp": 1.08548176, + "epoch": 0.2525971527510581, + "flos": 648612966912.0, + "grad_norm": 0.0242630752619845, + "language_loss": 0.98733318, + "learning_rate": 0.0008756030502527239, + "loss": 0.99914545, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.95703125, + "step": 1313, + "time_per_iteration": 2.858691930770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180546, + "balance_loss_mlp": 1.08455837, + "epoch": 0.2527895344363217, + "flos": 570373026816.0, + "grad_norm": 0.025539383487616106, + "language_loss": 0.99746555, + "learning_rate": 0.0008753973383328954, + "loss": 1.00927103, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.95947266, + "step": 1314, + "time_per_iteration": 2.6683549880981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180137, + "balance_loss_mlp": 1.0841974, + "epoch": 0.2529819161215852, + "flos": 515068127232.0, + "grad_norm": 0.027266475314614652, + "language_loss": 0.95154297, + "learning_rate": 0.0008751914806708952, + "loss": 0.96334434, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.95898438, + "step": 1315, + "time_per_iteration": 2.6008012294769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178852, + "balance_loss_mlp": 1.08310342, + "epoch": 0.2531742978068488, + "flos": 532350498816.0, + "grad_norm": 0.02508848621911812, + "language_loss": 0.91122246, + "learning_rate": 0.0008749854773466439, + "loss": 0.92301095, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.95703125, + "step": 1316, + "time_per_iteration": 2.6595401763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193828, + "balance_loss_mlp": 1.09822178, + "epoch": 0.25336667949211233, + "flos": 597747500544.0, + "grad_norm": 0.027675397486347803, + "language_loss": 0.92894816, + "learning_rate": 0.0008747793284401192, + "loss": 0.9408865, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.95556641, + "step": 1317, + "time_per_iteration": 2.6975109577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187696, + "balance_loss_mlp": 1.09175622, + "epoch": 0.2535590611773759, + "flos": 603255177216.0, + "grad_norm": 0.02603186041930466, + "language_loss": 0.95462376, + "learning_rate": 0.0008745730340313551, + "loss": 0.96650076, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.95898438, + "step": 1318, + "time_per_iteration": 2.805327892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187328, + "balance_loss_mlp": 1.0915786, + "epoch": 0.25375144286263945, + "flos": 496322741760.0, + "grad_norm": 0.027049333310240738, + "language_loss": 0.95645851, + "learning_rate": 0.0008743665942004422, + "loss": 0.96833169, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.95703125, + "step": 1319, + "time_per_iteration": 2.6340737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185781, + "balance_loss_mlp": 1.0896982, + "epoch": 0.25394382454790304, + "flos": 513476858880.0, + "grad_norm": 0.02784781206620994, + "language_loss": 1.02473438, + "learning_rate": 0.0008741600090275277, + "loss": 1.03659225, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.96044922, + "step": 1320, + "time_per_iteration": 2.573155641555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183427, + "balance_loss_mlp": 1.08763099, + "epoch": 0.25413620623316663, + "flos": 960855045120.0, + "grad_norm": 0.03323105604734599, + "language_loss": 0.94160318, + "learning_rate": 0.0008739532785928151, + "loss": 0.95343745, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.95751953, + "step": 1321, + "time_per_iteration": 3.470245122909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190819, + "balance_loss_mlp": 1.09659576, + "epoch": 0.25432858791843016, + "flos": 1580648715264.0, + "grad_norm": 0.017424496497570757, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76084399, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.94140625, + "step": 1322, + "time_per_iteration": 4.8549723625183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184096, + "balance_loss_mlp": 1.08806074, + "epoch": 0.25452096960369375, + "flos": 584893828608.0, + "grad_norm": 0.025099574916072127, + "language_loss": 0.94150972, + "learning_rate": 0.0008735393822590908, + "loss": 0.95335066, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.95996094, + "step": 1323, + "time_per_iteration": 2.6771461963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187145, + "balance_loss_mlp": 1.0910151, + "epoch": 0.2547133512889573, + "flos": 509641041408.0, + "grad_norm": 0.024104352127734364, + "language_loss": 0.95373654, + "learning_rate": 0.0008733322165207681, + "loss": 0.965608, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.9609375, + "step": 1324, + "time_per_iteration": 2.671187400817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191608, + "balance_loss_mlp": 1.09590697, + "epoch": 0.25490573297422087, + "flos": 784035783168.0, + "grad_norm": 0.02719192919889817, + "language_loss": 0.93181324, + "learning_rate": 0.0008731249058420247, + "loss": 0.94372928, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.95654297, + "step": 1325, + "time_per_iteration": 3.0272371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189078, + "balance_loss_mlp": 1.09332883, + "epoch": 0.2550981146594844, + "flos": 510952332288.0, + "grad_norm": 0.024872253546531747, + "language_loss": 1.00651383, + "learning_rate": 0.0008729174503033459, + "loss": 1.0184046, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.95703125, + "step": 1326, + "time_per_iteration": 2.6320900917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187412, + "balance_loss_mlp": 1.09166288, + "epoch": 0.255290496344748, + "flos": 677930545152.0, + "grad_norm": 0.02807770436691079, + "language_loss": 0.93655276, + "learning_rate": 0.0008727098499852728, + "loss": 0.9484269, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.95703125, + "step": 1327, + "time_per_iteration": 2.8246335983276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187202, + "balance_loss_mlp": 1.09116733, + "epoch": 0.2554828780300115, + "flos": 538984816128.0, + "grad_norm": 0.02304152562423393, + "language_loss": 0.97811985, + "learning_rate": 0.0008725021049684034, + "loss": 0.9899919, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.95996094, + "step": 1328, + "time_per_iteration": 2.783276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.08924699, + "epoch": 0.2556752597152751, + "flos": 825622883328.0, + "grad_norm": 0.024322773499976656, + "language_loss": 0.90949428, + "learning_rate": 0.000872294215333391, + "loss": 0.92134333, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.95605469, + "step": 1329, + "time_per_iteration": 3.1658623218536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184378, + "balance_loss_mlp": 1.08867729, + "epoch": 0.2558676414005387, + "flos": 571890435072.0, + "grad_norm": 0.026114012927401953, + "language_loss": 0.91800833, + "learning_rate": 0.0008720861811609457, + "loss": 0.92985213, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.95654297, + "step": 1330, + "time_per_iteration": 2.725680112838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185897, + "balance_loss_mlp": 1.09024334, + "epoch": 0.2560600230858022, + "flos": 487748047872.0, + "grad_norm": 0.02457760145285043, + "language_loss": 0.93800515, + "learning_rate": 0.0008718780025318338, + "loss": 0.94986409, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.95605469, + "step": 1331, + "time_per_iteration": 2.730424404144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184699, + "balance_loss_mlp": 1.08904529, + "epoch": 0.2562524047710658, + "flos": 514119406080.0, + "grad_norm": 0.027688932662206074, + "language_loss": 0.94349414, + "learning_rate": 0.0008716696795268771, + "loss": 0.9553411, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.95605469, + "step": 1332, + "time_per_iteration": 2.6572844982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183855, + "balance_loss_mlp": 1.0881542, + "epoch": 0.25644478645632934, + "flos": 636109129728.0, + "grad_norm": 0.025705757243887913, + "language_loss": 0.96553451, + "learning_rate": 0.0008714612122269538, + "loss": 0.97737306, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.95654297, + "step": 1333, + "time_per_iteration": 2.867598295211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184376, + "balance_loss_mlp": 1.0888176, + "epoch": 0.25663716814159293, + "flos": 437544594432.0, + "grad_norm": 0.025955971973603553, + "language_loss": 1.00358891, + "learning_rate": 0.0008712526007129982, + "loss": 1.01543272, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.95507812, + "step": 1334, + "time_per_iteration": 2.516052484512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186528, + "balance_loss_mlp": 1.0908742, + "epoch": 0.25682954982685646, + "flos": 499242765312.0, + "grad_norm": 0.021880143416013124, + "language_loss": 0.98599482, + "learning_rate": 0.0008710438450660003, + "loss": 0.99786019, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.95605469, + "step": 1335, + "time_per_iteration": 2.659489870071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184319, + "balance_loss_mlp": 1.08861768, + "epoch": 0.25702193151212005, + "flos": 458627854848.0, + "grad_norm": 0.028869593177541276, + "language_loss": 0.98979777, + "learning_rate": 0.0008708349453670064, + "loss": 1.00164104, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.95654297, + "step": 1336, + "time_per_iteration": 2.5267841815948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185282, + "balance_loss_mlp": 1.08953345, + "epoch": 0.2572143131973836, + "flos": 599403896832.0, + "grad_norm": 0.021342480544698176, + "language_loss": 0.99445975, + "learning_rate": 0.0008706259016971185, + "loss": 1.00631261, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.95703125, + "step": 1337, + "time_per_iteration": 2.7561397552490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118469, + "balance_loss_mlp": 1.08884537, + "epoch": 0.25740669488264717, + "flos": 699526096896.0, + "grad_norm": 0.032203199948080075, + "language_loss": 0.96320713, + "learning_rate": 0.0008704167141374944, + "loss": 0.97505397, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.95800781, + "step": 1338, + "time_per_iteration": 2.7987895011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118993, + "balance_loss_mlp": 1.09432399, + "epoch": 0.25759907656791076, + "flos": 503378025984.0, + "grad_norm": 0.024717846020590344, + "language_loss": 0.97755861, + "learning_rate": 0.0008702073827693482, + "loss": 0.98945785, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.95556641, + "step": 1339, + "time_per_iteration": 2.694470167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186155, + "balance_loss_mlp": 1.0904057, + "epoch": 0.2577914582531743, + "flos": 775241510400.0, + "grad_norm": 0.025036220674882887, + "language_loss": 0.97113985, + "learning_rate": 0.0008699979076739494, + "loss": 0.98300135, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.95703125, + "step": 1340, + "time_per_iteration": 2.962740421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184068, + "balance_loss_mlp": 1.08836627, + "epoch": 0.2579838399384379, + "flos": 460609890816.0, + "grad_norm": 0.026880962232798965, + "language_loss": 0.99139833, + "learning_rate": 0.0008697882889326234, + "loss": 1.00323892, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.95654297, + "step": 1341, + "time_per_iteration": 2.517382860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185483, + "balance_loss_mlp": 1.08987677, + "epoch": 0.2581762216237014, + "flos": 570262236672.0, + "grad_norm": 0.0242955377416103, + "language_loss": 0.96170259, + "learning_rate": 0.0008695785266267515, + "loss": 0.97355735, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.95556641, + "step": 1342, + "time_per_iteration": 2.6961281299591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118536, + "balance_loss_mlp": 1.08961082, + "epoch": 0.258368603308965, + "flos": 605386934784.0, + "grad_norm": 0.023671890991135848, + "language_loss": 0.9337616, + "learning_rate": 0.0008693686208377704, + "loss": 0.94561517, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.95703125, + "step": 1343, + "time_per_iteration": 2.8561604022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184784, + "balance_loss_mlp": 1.08908272, + "epoch": 0.2585609849942285, + "flos": 492486924288.0, + "grad_norm": 0.022133881226187983, + "language_loss": 0.96849036, + "learning_rate": 0.0008691585716471733, + "loss": 0.98033822, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.95654297, + "step": 1344, + "time_per_iteration": 2.6443324089050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185279, + "balance_loss_mlp": 1.08952987, + "epoch": 0.2587533666794921, + "flos": 641957182464.0, + "grad_norm": 0.02305984249039353, + "language_loss": 0.94482636, + "learning_rate": 0.0008689483791365079, + "loss": 0.95667922, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.95703125, + "step": 1345, + "time_per_iteration": 2.8541483879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185515, + "balance_loss_mlp": 1.08976638, + "epoch": 0.2589457483647557, + "flos": 577994996736.0, + "grad_norm": 0.022382124417400225, + "language_loss": 0.97831523, + "learning_rate": 0.0008687380433873786, + "loss": 0.99017042, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.95703125, + "step": 1346, + "time_per_iteration": 2.8148868083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186141, + "balance_loss_mlp": 1.09048796, + "epoch": 0.25913813005001923, + "flos": 536466293760.0, + "grad_norm": 0.024690786073415343, + "language_loss": 0.93800229, + "learning_rate": 0.0008685275644814448, + "loss": 0.94986367, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.95605469, + "step": 1347, + "time_per_iteration": 2.6872267723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188569, + "balance_loss_mlp": 1.0930109, + "epoch": 0.2593305117352828, + "flos": 722346344448.0, + "grad_norm": 0.028015192621825148, + "language_loss": 0.944291, + "learning_rate": 0.0008683169425004216, + "loss": 0.95617664, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.95507812, + "step": 1348, + "time_per_iteration": 2.9036293029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187318, + "balance_loss_mlp": 1.09171176, + "epoch": 0.25952289342054635, + "flos": 711355186176.0, + "grad_norm": 0.028695706473352366, + "language_loss": 0.9867608, + "learning_rate": 0.0008681061775260799, + "loss": 0.99863392, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.95556641, + "step": 1349, + "time_per_iteration": 2.8635356426239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185365, + "balance_loss_mlp": 1.08942509, + "epoch": 0.25971527510580994, + "flos": 456849934848.0, + "grad_norm": 0.028158951385379896, + "language_loss": 1.01652539, + "learning_rate": 0.0008678952696402458, + "loss": 1.02837896, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.95898438, + "step": 1350, + "time_per_iteration": 2.4997899532318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184224, + "balance_loss_mlp": 1.08847523, + "epoch": 0.25990765679107347, + "flos": 613753509888.0, + "grad_norm": 0.022929201317296435, + "language_loss": 0.944794, + "learning_rate": 0.000867684218924801, + "loss": 0.95663619, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.95703125, + "step": 1351, + "time_per_iteration": 2.8553221225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190399, + "balance_loss_mlp": 1.09655762, + "epoch": 0.26010003847633706, + "flos": 1541404219392.0, + "grad_norm": 0.011373150433568688, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80137491, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.9375, + "step": 1352, + "time_per_iteration": 4.894901752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185829, + "balance_loss_mlp": 1.0900805, + "epoch": 0.2602924201616006, + "flos": 717544341504.0, + "grad_norm": 0.021521520095987904, + "language_loss": 0.9327749, + "learning_rate": 0.0008672616893328834, + "loss": 0.94463313, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.95703125, + "step": 1353, + "time_per_iteration": 2.9336133003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181557, + "balance_loss_mlp": 1.08571243, + "epoch": 0.2604848018468642, + "flos": 644685825024.0, + "grad_norm": 0.026147354827328006, + "language_loss": 0.99375951, + "learning_rate": 0.0008670502106204512, + "loss": 1.00557506, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.95800781, + "step": 1354, + "time_per_iteration": 2.828476667404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182712, + "balance_loss_mlp": 1.08677256, + "epoch": 0.26067718353212777, + "flos": 518037815808.0, + "grad_norm": 0.024264679119450936, + "language_loss": 0.92830276, + "learning_rate": 0.0008668385894064892, + "loss": 0.94012988, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.95898438, + "step": 1355, + "time_per_iteration": 2.627603054046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183025, + "balance_loss_mlp": 1.08708537, + "epoch": 0.2608695652173913, + "flos": 824224997376.0, + "grad_norm": 0.021603697394371835, + "language_loss": 0.98353279, + "learning_rate": 0.0008666268257731562, + "loss": 0.995363, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.95898438, + "step": 1356, + "time_per_iteration": 3.104410409927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185288, + "balance_loss_mlp": 1.0894438, + "epoch": 0.2610619469026549, + "flos": 1009449039360.0, + "grad_norm": 0.029063247039842262, + "language_loss": 0.98633218, + "learning_rate": 0.0008664149198026662, + "loss": 0.99818504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.95800781, + "step": 1357, + "time_per_iteration": 3.2552602291107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184981, + "balance_loss_mlp": 1.08932745, + "epoch": 0.2612543285879184, + "flos": 537825248256.0, + "grad_norm": 0.02677910773484977, + "language_loss": 0.99748302, + "learning_rate": 0.0008662028715772883, + "loss": 1.00933278, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.95605469, + "step": 1358, + "time_per_iteration": 2.6044809818267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186466, + "balance_loss_mlp": 1.09095597, + "epoch": 0.261446710273182, + "flos": 520438817280.0, + "grad_norm": 0.024887857022763207, + "language_loss": 0.95091379, + "learning_rate": 0.0008659906811793467, + "loss": 0.96277845, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.95458984, + "step": 1359, + "time_per_iteration": 2.660039186477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118844, + "balance_loss_mlp": 1.09297669, + "epoch": 0.26163909195844554, + "flos": 584399001600.0, + "grad_norm": 0.02478490455868915, + "language_loss": 0.99414921, + "learning_rate": 0.0008657783486912215, + "loss": 1.00603366, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.95410156, + "step": 1360, + "time_per_iteration": 2.710707187652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189735, + "balance_loss_mlp": 1.09412944, + "epoch": 0.2618314736437091, + "flos": 960368223744.0, + "grad_norm": 0.025390417969386195, + "language_loss": 0.99146813, + "learning_rate": 0.0008655658741953472, + "loss": 1.00336552, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.95556641, + "step": 1361, + "time_per_iteration": 3.2610023021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187461, + "balance_loss_mlp": 1.0919987, + "epoch": 0.26202385532897265, + "flos": 575902170624.0, + "grad_norm": 0.01965876060868175, + "language_loss": 0.95685869, + "learning_rate": 0.0008653532577742136, + "loss": 0.96873331, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.95410156, + "step": 1362, + "time_per_iteration": 2.753920793533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190509, + "balance_loss_mlp": 1.09509337, + "epoch": 0.26221623701423624, + "flos": 446397264384.0, + "grad_norm": 0.024702919408059576, + "language_loss": 0.95440364, + "learning_rate": 0.0008651404995103659, + "loss": 0.96630871, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.95361328, + "step": 1363, + "time_per_iteration": 2.532839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184254, + "balance_loss_mlp": 1.088696, + "epoch": 0.26240861869949983, + "flos": 536755003392.0, + "grad_norm": 0.021936659097783043, + "language_loss": 0.95658946, + "learning_rate": 0.0008649275994864041, + "loss": 0.96843195, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.95507812, + "step": 1364, + "time_per_iteration": 2.6723499298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182727, + "balance_loss_mlp": 1.08735919, + "epoch": 0.26260100038476336, + "flos": 566487544320.0, + "grad_norm": 0.02057443182875544, + "language_loss": 0.93747735, + "learning_rate": 0.0008647145577849834, + "loss": 0.94930464, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.953125, + "step": 1365, + "time_per_iteration": 2.817335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184888, + "balance_loss_mlp": 1.089378, + "epoch": 0.26279338207002695, + "flos": 614320195584.0, + "grad_norm": 0.02000370099851243, + "language_loss": 0.90110707, + "learning_rate": 0.0008645013744888139, + "loss": 0.912956, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.95458984, + "step": 1366, + "time_per_iteration": 2.889956474304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190369, + "balance_loss_mlp": 1.09452498, + "epoch": 0.2629857637552905, + "flos": 523944992256.0, + "grad_norm": 0.02433762343961203, + "language_loss": 0.96272296, + "learning_rate": 0.0008642880496806607, + "loss": 0.97462666, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.95800781, + "step": 1367, + "time_per_iteration": 2.7868857383728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186128, + "balance_loss_mlp": 1.09028387, + "epoch": 0.26317814544055407, + "flos": 535654559232.0, + "grad_norm": 0.022945771924384736, + "language_loss": 0.9318915, + "learning_rate": 0.0008640745834433437, + "loss": 0.94375277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.95800781, + "step": 1368, + "time_per_iteration": 2.7556509971618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182695, + "balance_loss_mlp": 1.08718467, + "epoch": 0.2633705271258176, + "flos": 556779479040.0, + "grad_norm": 0.024336346931206027, + "language_loss": 0.96858466, + "learning_rate": 0.000863860975859738, + "loss": 0.98041165, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.95458984, + "step": 1369, + "time_per_iteration": 2.9069716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184914, + "balance_loss_mlp": 1.08945167, + "epoch": 0.2635629088110812, + "flos": 553461957120.0, + "grad_norm": 0.02843668952404612, + "language_loss": 1.00276971, + "learning_rate": 0.0008636472270127733, + "loss": 1.01461875, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.95410156, + "step": 1370, + "time_per_iteration": 2.626201868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185086, + "balance_loss_mlp": 1.08952749, + "epoch": 0.2637552904963448, + "flos": 456915062784.0, + "grad_norm": 0.02826867423240315, + "language_loss": 1.01819849, + "learning_rate": 0.0008634333369854345, + "loss": 1.03004944, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.95507812, + "step": 1371, + "time_per_iteration": 2.5906460285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183664, + "balance_loss_mlp": 1.08820105, + "epoch": 0.2639476721816083, + "flos": 614259070464.0, + "grad_norm": 0.024066040008067748, + "language_loss": 0.95210433, + "learning_rate": 0.0008632193058607608, + "loss": 0.96394098, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.95410156, + "step": 1372, + "time_per_iteration": 2.7260935306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180244, + "balance_loss_mlp": 1.08487642, + "epoch": 0.2641400538668719, + "flos": 573025807872.0, + "grad_norm": 0.02730663798923432, + "language_loss": 0.93146777, + "learning_rate": 0.0008630051337218466, + "loss": 0.94327021, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.953125, + "step": 1373, + "time_per_iteration": 2.7155323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193282, + "balance_loss_mlp": 1.09777129, + "epoch": 0.2643324355521354, + "flos": 583339490304.0, + "grad_norm": 0.02802871933703498, + "language_loss": 0.91373825, + "learning_rate": 0.0008627908206518409, + "loss": 0.9256711, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.95458984, + "step": 1374, + "time_per_iteration": 2.7118475437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189674, + "balance_loss_mlp": 1.09621429, + "epoch": 0.264524817237399, + "flos": 1548025075200.0, + "grad_norm": 0.008601814223210932, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76340932, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.93359375, + "step": 1375, + "time_per_iteration": 4.9838175773620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192464, + "balance_loss_mlp": 1.09709656, + "epoch": 0.26471719892266254, + "flos": 519042932736.0, + "grad_norm": 0.024634755338573868, + "language_loss": 0.99606347, + "learning_rate": 0.0008623617720514241, + "loss": 1.0079881, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.953125, + "step": 1376, + "time_per_iteration": 2.5836029052734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191563, + "balance_loss_mlp": 1.09586143, + "epoch": 0.26490958060792613, + "flos": 518205001728.0, + "grad_norm": 0.02740625444526412, + "language_loss": 0.95827538, + "learning_rate": 0.0008621470366875848, + "loss": 0.97019094, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.95654297, + "step": 1377, + "time_per_iteration": 2.574557304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190438, + "balance_loss_mlp": 1.09507096, + "epoch": 0.26510196229318966, + "flos": 597682372608.0, + "grad_norm": 0.02552910213335578, + "language_loss": 0.96441573, + "learning_rate": 0.0008619321607257966, + "loss": 0.97632015, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.953125, + "step": 1378, + "time_per_iteration": 2.680574655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187734, + "balance_loss_mlp": 1.09227157, + "epoch": 0.26529434397845325, + "flos": 687052459008.0, + "grad_norm": 0.024630390251990656, + "language_loss": 0.90670931, + "learning_rate": 0.000861717144249482, + "loss": 0.91858661, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.95410156, + "step": 1379, + "time_per_iteration": 2.8311944007873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181951, + "balance_loss_mlp": 1.08672631, + "epoch": 0.26548672566371684, + "flos": 425259609600.0, + "grad_norm": 0.02240925569996582, + "language_loss": 0.98143864, + "learning_rate": 0.0008615019873421175, + "loss": 0.99325812, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.95166016, + "step": 1380, + "time_per_iteration": 2.472280263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182344, + "balance_loss_mlp": 1.08716714, + "epoch": 0.26567910734898037, + "flos": 490849993728.0, + "grad_norm": 0.024166031959674275, + "language_loss": 0.9586165, + "learning_rate": 0.0008612866900872349, + "loss": 0.97043991, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.95117188, + "step": 1381, + "time_per_iteration": 2.5671043395996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181037, + "balance_loss_mlp": 1.08586013, + "epoch": 0.26587148903424396, + "flos": 535228862976.0, + "grad_norm": 0.024625622440273682, + "language_loss": 0.97316492, + "learning_rate": 0.0008610712525684197, + "loss": 0.98497522, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.95117188, + "step": 1382, + "time_per_iteration": 2.6394782066345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179642, + "balance_loss_mlp": 1.08446515, + "epoch": 0.2660638707195075, + "flos": 1019055046656.0, + "grad_norm": 0.02944222863828147, + "language_loss": 0.96464765, + "learning_rate": 0.0008608556748693121, + "loss": 0.97644401, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.95117188, + "step": 1383, + "time_per_iteration": 3.2514846324920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184353, + "balance_loss_mlp": 1.08941519, + "epoch": 0.2662562524047711, + "flos": 525062900736.0, + "grad_norm": 0.024003921212174706, + "language_loss": 0.95956504, + "learning_rate": 0.000860639957073607, + "loss": 0.97140861, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.94873047, + "step": 1384, + "time_per_iteration": 2.6759448051452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190743, + "balance_loss_mlp": 1.09594798, + "epoch": 0.2664486340900346, + "flos": 553479421440.0, + "grad_norm": 0.02584009515603871, + "language_loss": 0.97059226, + "learning_rate": 0.0008604240992650534, + "loss": 0.98249966, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.94726562, + "step": 1385, + "time_per_iteration": 2.6880476474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187786, + "balance_loss_mlp": 1.09260905, + "epoch": 0.2666410157752982, + "flos": 471208280064.0, + "grad_norm": 0.023709316387392747, + "language_loss": 0.98021734, + "learning_rate": 0.0008602081015274545, + "loss": 0.99209523, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.95117188, + "step": 1386, + "time_per_iteration": 2.71233868598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187602, + "balance_loss_mlp": 1.0924257, + "epoch": 0.2668333974605617, + "flos": 571015574016.0, + "grad_norm": 0.021121239598078063, + "language_loss": 0.90840185, + "learning_rate": 0.0008599919639446684, + "loss": 0.92027789, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.95117188, + "step": 1387, + "time_per_iteration": 2.6656363010406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183674, + "balance_loss_mlp": 1.08840239, + "epoch": 0.2670257791458253, + "flos": 399895369728.0, + "grad_norm": 0.029257146370583235, + "language_loss": 0.92911923, + "learning_rate": 0.000859775686600607, + "loss": 0.940956, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.95214844, + "step": 1388, + "time_per_iteration": 2.5366902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186225, + "balance_loss_mlp": 1.09104884, + "epoch": 0.2672181608310889, + "flos": 516891709440.0, + "grad_norm": 0.02488439836403737, + "language_loss": 0.94369394, + "learning_rate": 0.0008595592695792367, + "loss": 0.95555621, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.95117188, + "step": 1389, + "time_per_iteration": 2.6710469722747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184466, + "balance_loss_mlp": 1.08928883, + "epoch": 0.26741054251635243, + "flos": 508525134336.0, + "grad_norm": 0.024055725628873734, + "language_loss": 0.99442971, + "learning_rate": 0.0008593427129645778, + "loss": 1.00627434, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.95117188, + "step": 1390, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184607, + "balance_loss_mlp": 1.08919191, + "epoch": 0.267602924201616, + "flos": 577808345088.0, + "grad_norm": 0.025635319637122064, + "language_loss": 0.93523198, + "learning_rate": 0.0008591260168407052, + "loss": 0.94707805, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.95361328, + "step": 1391, + "time_per_iteration": 2.766150712966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118642, + "balance_loss_mlp": 1.09095728, + "epoch": 0.26779530588687955, + "flos": 524999774208.0, + "grad_norm": 0.02196829508666122, + "language_loss": 0.92168128, + "learning_rate": 0.0008589091812917479, + "loss": 0.93354547, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.95410156, + "step": 1392, + "time_per_iteration": 2.6208953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119079, + "balance_loss_mlp": 1.09580445, + "epoch": 0.26798768757214314, + "flos": 557827530240.0, + "grad_norm": 0.02442636530887492, + "language_loss": 0.95854455, + "learning_rate": 0.0008586922064018887, + "loss": 0.97045243, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.94921875, + "step": 1393, + "time_per_iteration": 2.6643927097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190751, + "balance_loss_mlp": 1.09581244, + "epoch": 0.2681800692574067, + "flos": 932094693888.0, + "grad_norm": 0.0254733622090453, + "language_loss": 0.99184585, + "learning_rate": 0.0008584750922553651, + "loss": 1.00375342, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.94873047, + "step": 1394, + "time_per_iteration": 3.1305503845214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192347, + "balance_loss_mlp": 1.09712303, + "epoch": 0.26837245094267026, + "flos": 702317865984.0, + "grad_norm": 0.023340973249423663, + "language_loss": 0.92753315, + "learning_rate": 0.0008582578389364677, + "loss": 0.93945664, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.95166016, + "step": 1395, + "time_per_iteration": 2.8527095317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184756, + "balance_loss_mlp": 1.08953142, + "epoch": 0.26856483262793385, + "flos": 594393775104.0, + "grad_norm": 0.020526468408011762, + "language_loss": 1.00206113, + "learning_rate": 0.0008580404465295422, + "loss": 1.01390874, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.95166016, + "step": 1396, + "time_per_iteration": 2.784592866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184595, + "balance_loss_mlp": 1.08922791, + "epoch": 0.2687572143131974, + "flos": 715588502016.0, + "grad_norm": 0.024818089102904728, + "language_loss": 0.9790895, + "learning_rate": 0.0008578229151189876, + "loss": 0.99093544, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.953125, + "step": 1397, + "time_per_iteration": 2.901818037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185216, + "balance_loss_mlp": 1.0896579, + "epoch": 0.26894959599846097, + "flos": 468670291968.0, + "grad_norm": 0.028086023154021946, + "language_loss": 0.91012216, + "learning_rate": 0.0008576052447892573, + "loss": 0.92197436, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.95507812, + "step": 1398, + "time_per_iteration": 2.5849812030792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09082139, + "epoch": 0.2691419776837245, + "flos": 469629746688.0, + "grad_norm": 0.022530608820729603, + "language_loss": 0.95147502, + "learning_rate": 0.000857387435624858, + "loss": 0.96333838, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.95458984, + "step": 1399, + "time_per_iteration": 2.5274569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011908, + "balance_loss_mlp": 1.09567106, + "epoch": 0.2693343593689881, + "flos": 939284963328.0, + "grad_norm": 0.02095039568010189, + "language_loss": 0.95472848, + "learning_rate": 0.0008571694877103513, + "loss": 0.96663648, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.95068359, + "step": 1400, + "time_per_iteration": 3.2558727264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190554, + "balance_loss_mlp": 1.09542465, + "epoch": 0.2695267410542516, + "flos": 578793996288.0, + "grad_norm": 0.0241215692671091, + "language_loss": 0.95762217, + "learning_rate": 0.0008569514011303515, + "loss": 0.96952766, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.95068359, + "step": 1401, + "time_per_iteration": 2.8175997734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193641, + "balance_loss_mlp": 1.09846401, + "epoch": 0.2697191227395152, + "flos": 557964516864.0, + "grad_norm": 0.02413892998134183, + "language_loss": 0.96554017, + "learning_rate": 0.0008567331759695277, + "loss": 0.97747654, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.95117188, + "step": 1402, + "time_per_iteration": 2.7052927017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192424, + "balance_loss_mlp": 1.09729552, + "epoch": 0.26991150442477874, + "flos": 530314068480.0, + "grad_norm": 0.024237100625486396, + "language_loss": 0.97319567, + "learning_rate": 0.0008565148123126023, + "loss": 0.98511994, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.95068359, + "step": 1403, + "time_per_iteration": 2.6399028301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187922, + "balance_loss_mlp": 1.09274554, + "epoch": 0.2701038861100423, + "flos": 533086371840.0, + "grad_norm": 0.021620674049761555, + "language_loss": 0.93398714, + "learning_rate": 0.0008562963102443516, + "loss": 0.94586635, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.95117188, + "step": 1404, + "time_per_iteration": 2.6793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185578, + "balance_loss_mlp": 1.09035325, + "epoch": 0.2702962677953059, + "flos": 736504576512.0, + "grad_norm": 0.026106257639691363, + "language_loss": 0.94497591, + "learning_rate": 0.0008560776698496056, + "loss": 0.95683169, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.95166016, + "step": 1405, + "time_per_iteration": 2.8884029388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186883, + "balance_loss_mlp": 1.09170628, + "epoch": 0.27048864948056944, + "flos": 576000225792.0, + "grad_norm": 0.025611862530653208, + "language_loss": 0.95929742, + "learning_rate": 0.0008558588912132481, + "loss": 0.97116625, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.95117188, + "step": 1406, + "time_per_iteration": 2.8396451473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190124, + "balance_loss_mlp": 1.09666443, + "epoch": 0.27068103116583303, + "flos": 1426910212608.0, + "grad_norm": 0.014531874927713828, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77649117, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.93359375, + "step": 1407, + "time_per_iteration": 4.898139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119097, + "balance_loss_mlp": 1.09603214, + "epoch": 0.27087341285109656, + "flos": 533031977472.0, + "grad_norm": 0.024689522623330563, + "language_loss": 0.90804136, + "learning_rate": 0.0008554209195555016, + "loss": 0.91995108, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.94873047, + "step": 1408, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189645, + "balance_loss_mlp": 1.09446859, + "epoch": 0.27106579453636015, + "flos": 582464629248.0, + "grad_norm": 0.0247795195650599, + "language_loss": 0.98232609, + "learning_rate": 0.0008552017267041483, + "loss": 0.99422252, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.95117188, + "step": 1409, + "time_per_iteration": 2.6904594898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118886, + "balance_loss_mlp": 1.09368336, + "epoch": 0.2712581762216237, + "flos": 507880585728.0, + "grad_norm": 0.024309295256612126, + "language_loss": 0.90687084, + "learning_rate": 0.0008549823959512549, + "loss": 0.91875941, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.95117188, + "step": 1410, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189943, + "balance_loss_mlp": 1.09481394, + "epoch": 0.27145055790688727, + "flos": 999142087680.0, + "grad_norm": 0.023895808714677214, + "language_loss": 0.95848304, + "learning_rate": 0.0008547629273819728, + "loss": 0.97038245, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.95068359, + "step": 1411, + "time_per_iteration": 3.36985182762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186274, + "balance_loss_mlp": 1.09109735, + "epoch": 0.2716429395921508, + "flos": 547728697344.0, + "grad_norm": 0.02712613780862537, + "language_loss": 0.93229926, + "learning_rate": 0.0008545433210815074, + "loss": 0.94416201, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.95117188, + "step": 1412, + "time_per_iteration": 2.601452350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182035, + "balance_loss_mlp": 1.08685839, + "epoch": 0.2718353212774144, + "flos": 574310902272.0, + "grad_norm": 0.02439507328911507, + "language_loss": 0.95137858, + "learning_rate": 0.0008543235771351176, + "loss": 0.96319902, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.95117188, + "step": 1413, + "time_per_iteration": 2.7132034301757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197126, + "balance_loss_mlp": 1.10209203, + "epoch": 0.272027702962678, + "flos": 645584881152.0, + "grad_norm": 0.02257567173785872, + "language_loss": 0.91220462, + "learning_rate": 0.0008541036956281154, + "loss": 0.92417586, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.94970703, + "step": 1414, + "time_per_iteration": 2.871951103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187874, + "balance_loss_mlp": 1.09284067, + "epoch": 0.2722200846479415, + "flos": 654995504640.0, + "grad_norm": 0.026411231013774135, + "language_loss": 0.93374348, + "learning_rate": 0.0008538836766458665, + "loss": 0.94562221, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.94970703, + "step": 1415, + "time_per_iteration": 2.8673384189605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183666, + "balance_loss_mlp": 1.08868039, + "epoch": 0.2724124663332051, + "flos": 580778033664.0, + "grad_norm": 0.027862690716265133, + "language_loss": 0.96171892, + "learning_rate": 0.0008536635202737897, + "loss": 0.97355556, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.94921875, + "step": 1416, + "time_per_iteration": 2.7829935550689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183251, + "balance_loss_mlp": 1.08831298, + "epoch": 0.2726048480184686, + "flos": 538467795456.0, + "grad_norm": 0.025077003090708358, + "language_loss": 0.93469489, + "learning_rate": 0.0008534432265973573, + "loss": 0.94652736, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.94873047, + "step": 1417, + "time_per_iteration": 2.593364715576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183107, + "balance_loss_mlp": 1.08793056, + "epoch": 0.2727972297037322, + "flos": 997548817920.0, + "grad_norm": 0.025553987949566613, + "language_loss": 0.99255168, + "learning_rate": 0.000853222795702095, + "loss": 1.00438273, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.95117188, + "step": 1418, + "time_per_iteration": 3.387162685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119173, + "balance_loss_mlp": 1.09712589, + "epoch": 0.27298961138899575, + "flos": 607334042112.0, + "grad_norm": 0.02541700118612174, + "language_loss": 0.93465757, + "learning_rate": 0.0008530022276735813, + "loss": 0.94657481, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.9453125, + "step": 1419, + "time_per_iteration": 2.7426016330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.0965513, + "epoch": 0.27318199307425933, + "flos": 530396660736.0, + "grad_norm": 0.025702548257077976, + "language_loss": 0.9374572, + "learning_rate": 0.0008527815225974489, + "loss": 0.94937015, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.94677734, + "step": 1420, + "time_per_iteration": 2.6544342041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118326, + "balance_loss_mlp": 1.08865511, + "epoch": 0.2733743747595229, + "flos": 409911610368.0, + "grad_norm": 0.028874111022423956, + "language_loss": 0.99327809, + "learning_rate": 0.0008525606805593829, + "loss": 1.00511074, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.9453125, + "step": 1421, + "time_per_iteration": 2.4215376377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182106, + "balance_loss_mlp": 1.08721578, + "epoch": 0.27356675644478645, + "flos": 517228082688.0, + "grad_norm": 0.026406413504372096, + "language_loss": 0.92442018, + "learning_rate": 0.0008523397016451213, + "loss": 0.93624127, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.94824219, + "step": 1422, + "time_per_iteration": 2.5680603981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184812, + "balance_loss_mlp": 1.09011269, + "epoch": 0.27375913813005004, + "flos": 1054058221056.0, + "grad_norm": 0.02228341429952914, + "language_loss": 0.94973963, + "learning_rate": 0.0008521185859404564, + "loss": 0.96158779, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.94628906, + "step": 1423, + "time_per_iteration": 3.37345814704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179884, + "balance_loss_mlp": 1.08485043, + "epoch": 0.27395151981531357, + "flos": 626003566080.0, + "grad_norm": 0.02387683630357993, + "language_loss": 0.97909242, + "learning_rate": 0.0008518973335312326, + "loss": 0.99089128, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.94970703, + "step": 1424, + "time_per_iteration": 2.8314859867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184763, + "balance_loss_mlp": 1.08982456, + "epoch": 0.27414390150057716, + "flos": 551414793216.0, + "grad_norm": 0.028545098094769822, + "language_loss": 0.95577884, + "learning_rate": 0.0008516759445033477, + "loss": 0.96762645, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.94873047, + "step": 1425, + "time_per_iteration": 2.6086578369140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.08705389, + "epoch": 0.2743362831858407, + "flos": 540951389184.0, + "grad_norm": 0.02677358847245462, + "language_loss": 0.96958816, + "learning_rate": 0.0008514544189427526, + "loss": 0.9814086, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.94921875, + "step": 1426, + "time_per_iteration": 2.6927483081817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191695, + "balance_loss_mlp": 1.09713852, + "epoch": 0.2745286648711043, + "flos": 469545153024.0, + "grad_norm": 0.025998263163597202, + "language_loss": 0.95807564, + "learning_rate": 0.0008512327569354511, + "loss": 0.96999258, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.94482422, + "step": 1427, + "time_per_iteration": 2.5617682933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119268, + "balance_loss_mlp": 1.09764659, + "epoch": 0.2747210465563678, + "flos": 473871794688.0, + "grad_norm": 0.02733358796633043, + "language_loss": 0.93333006, + "learning_rate": 0.0008510109585675001, + "loss": 0.94525683, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.94970703, + "step": 1428, + "time_per_iteration": 2.7269434928894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205208, + "balance_loss_mlp": 1.11193848, + "epoch": 0.2749134282416314, + "flos": 1318056866304.0, + "grad_norm": 0.019809968329655446, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82358551, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.93164062, + "step": 1429, + "time_per_iteration": 4.731899738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190948, + "balance_loss_mlp": 1.0958662, + "epoch": 0.275105809926895, + "flos": 972531684864.0, + "grad_norm": 0.03147414200634365, + "language_loss": 0.91184711, + "learning_rate": 0.0008505669530941415, + "loss": 0.92375666, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.95019531, + "step": 1430, + "time_per_iteration": 3.3260724544525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189047, + "balance_loss_mlp": 1.09387004, + "epoch": 0.2752981916121585, + "flos": 528368962560.0, + "grad_norm": 0.025580193945061114, + "language_loss": 0.95012403, + "learning_rate": 0.000850344746161112, + "loss": 0.96201456, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.95117188, + "step": 1431, + "time_per_iteration": 2.5820231437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186021, + "balance_loss_mlp": 1.09093964, + "epoch": 0.2754905732974221, + "flos": 454598654976.0, + "grad_norm": 0.024219881250434897, + "language_loss": 0.962569, + "learning_rate": 0.0008501224032121894, + "loss": 0.97442919, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.95019531, + "step": 1432, + "time_per_iteration": 2.501572847366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188894, + "balance_loss_mlp": 1.09362173, + "epoch": 0.27568295498268564, + "flos": 498508893696.0, + "grad_norm": 0.02427263624604226, + "language_loss": 0.90960014, + "learning_rate": 0.0008498999243336946, + "loss": 0.921489, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.95214844, + "step": 1433, + "time_per_iteration": 2.6212003231048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192375, + "balance_loss_mlp": 1.09715116, + "epoch": 0.2758753366679492, + "flos": 609416134656.0, + "grad_norm": 0.024278981864862804, + "language_loss": 0.95570171, + "learning_rate": 0.0008496773096120021, + "loss": 0.9676255, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.95166016, + "step": 1434, + "time_per_iteration": 2.804689407348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118926, + "balance_loss_mlp": 1.09370184, + "epoch": 0.27606771835321275, + "flos": 741436835328.0, + "grad_norm": 0.025697024392157108, + "language_loss": 0.95037985, + "learning_rate": 0.0008494545591335381, + "loss": 0.96227252, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.95507812, + "step": 1435, + "time_per_iteration": 2.9329347610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195816, + "balance_loss_mlp": 1.10068655, + "epoch": 0.27626010003847634, + "flos": 555748165632.0, + "grad_norm": 0.0206290639721941, + "language_loss": 0.927001, + "learning_rate": 0.0008492316729847823, + "loss": 0.93895912, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.95068359, + "step": 1436, + "time_per_iteration": 2.820913553237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09245288, + "epoch": 0.2764524817237399, + "flos": 543695494656.0, + "grad_norm": 0.02424730092158954, + "language_loss": 0.88914406, + "learning_rate": 0.0008490086512522664, + "loss": 0.90102232, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.953125, + "step": 1437, + "time_per_iteration": 2.7454309463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186593, + "balance_loss_mlp": 1.09127319, + "epoch": 0.27664486340900346, + "flos": 407128573440.0, + "grad_norm": 0.024912305575595636, + "language_loss": 0.99286187, + "learning_rate": 0.0008487854940225755, + "loss": 1.00472784, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.95263672, + "step": 1438, + "time_per_iteration": 2.4809510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183239, + "balance_loss_mlp": 1.08834839, + "epoch": 0.27683724509426705, + "flos": 523156726272.0, + "grad_norm": 0.025259333782437998, + "language_loss": 0.98154646, + "learning_rate": 0.0008485622013823466, + "loss": 0.99337876, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.94824219, + "step": 1439, + "time_per_iteration": 2.65401554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183688, + "balance_loss_mlp": 1.08865404, + "epoch": 0.2770296267795306, + "flos": 536409897984.0, + "grad_norm": 0.02898674716386243, + "language_loss": 0.9318651, + "learning_rate": 0.00084833877341827, + "loss": 0.94370198, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.94970703, + "step": 1440, + "time_per_iteration": 2.6294455528259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192537, + "balance_loss_mlp": 1.09755075, + "epoch": 0.27722200846479417, + "flos": 488970015744.0, + "grad_norm": 0.027244615130064133, + "language_loss": 0.90653217, + "learning_rate": 0.000848115210217088, + "loss": 0.91845751, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.94921875, + "step": 1441, + "time_per_iteration": 2.5394957065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118987, + "balance_loss_mlp": 1.09493196, + "epoch": 0.2774143901500577, + "flos": 619443108864.0, + "grad_norm": 0.024388639686817183, + "language_loss": 0.9228884, + "learning_rate": 0.0008478915118655952, + "loss": 0.93478709, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.94873047, + "step": 1442, + "time_per_iteration": 2.7634968757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119119, + "balance_loss_mlp": 1.0962522, + "epoch": 0.2776067718353213, + "flos": 514844545536.0, + "grad_norm": 0.021441164984372, + "language_loss": 0.94525409, + "learning_rate": 0.0008476676784506393, + "loss": 0.95716596, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.94873047, + "step": 1443, + "time_per_iteration": 2.6474499702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.09678042, + "epoch": 0.2777991535205848, + "flos": 1006040919552.0, + "grad_norm": 0.026818715625153876, + "language_loss": 0.93016809, + "learning_rate": 0.0008474437100591201, + "loss": 0.94208288, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.94628906, + "step": 1444, + "time_per_iteration": 3.311842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189789, + "balance_loss_mlp": 1.09494591, + "epoch": 0.2779915352058484, + "flos": 551375861760.0, + "grad_norm": 0.021641305677188864, + "language_loss": 0.95129728, + "learning_rate": 0.0008472196067779898, + "loss": 0.96319526, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.94775391, + "step": 1445, + "time_per_iteration": 2.667910575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186263, + "balance_loss_mlp": 1.091277, + "epoch": 0.278183916891112, + "flos": 875215990272.0, + "grad_norm": 0.030449834007814664, + "language_loss": 0.98351109, + "learning_rate": 0.0008469953686942531, + "loss": 0.99537361, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.94921875, + "step": 1446, + "time_per_iteration": 3.100473403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187264, + "balance_loss_mlp": 1.09246826, + "epoch": 0.2783762985763755, + "flos": 625195834368.0, + "grad_norm": 0.025904191205549917, + "language_loss": 0.93646944, + "learning_rate": 0.0008467709958949668, + "loss": 0.94834208, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.94726562, + "step": 1447, + "time_per_iteration": 2.7201731204986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09333074, + "epoch": 0.2785686802616391, + "flos": 582911792640.0, + "grad_norm": 0.026760771702797625, + "language_loss": 0.94447374, + "learning_rate": 0.0008465464884672403, + "loss": 0.9563536, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.94580078, + "step": 1448, + "time_per_iteration": 2.7300403118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118631, + "balance_loss_mlp": 1.09180129, + "epoch": 0.27876106194690264, + "flos": 588538991616.0, + "grad_norm": 0.0212290178255441, + "language_loss": 0.93077391, + "learning_rate": 0.0008463218464982348, + "loss": 0.94263697, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.94433594, + "step": 1449, + "time_per_iteration": 2.86130952835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190148, + "balance_loss_mlp": 1.09520972, + "epoch": 0.27895344363216623, + "flos": 877430340096.0, + "grad_norm": 0.02756647509109648, + "language_loss": 0.96903402, + "learning_rate": 0.0008460970700751645, + "loss": 0.98093557, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.94873047, + "step": 1450, + "time_per_iteration": 3.069391965866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188227, + "balance_loss_mlp": 1.0932883, + "epoch": 0.27914582531742976, + "flos": 605035098624.0, + "grad_norm": 0.025261876769304706, + "language_loss": 0.97766632, + "learning_rate": 0.000845872159285295, + "loss": 0.98954856, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.94873047, + "step": 1451, + "time_per_iteration": 2.748164653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197098, + "balance_loss_mlp": 1.10325623, + "epoch": 0.27933820700269335, + "flos": 1501130411520.0, + "grad_norm": 0.012982305827020523, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78963947, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.9375, + "step": 1452, + "time_per_iteration": 4.906180143356323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198876, + "balance_loss_mlp": 1.10408044, + "epoch": 0.2795305886879569, + "flos": 1033517451264.0, + "grad_norm": 0.027093914793319178, + "language_loss": 0.95323974, + "learning_rate": 0.0008454219349544836, + "loss": 0.9652285, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.94726562, + "step": 1453, + "time_per_iteration": 3.333178758621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194793, + "balance_loss_mlp": 1.10014069, + "epoch": 0.27972297037322047, + "flos": 608226367488.0, + "grad_norm": 0.025225525542022995, + "language_loss": 0.8972255, + "learning_rate": 0.000845196621588334, + "loss": 0.90917349, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.94580078, + "step": 1454, + "time_per_iteration": 2.7425026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191631, + "balance_loss_mlp": 1.09697926, + "epoch": 0.27991535205848406, + "flos": 631560907776.0, + "grad_norm": 0.023908777965609074, + "language_loss": 0.86623406, + "learning_rate": 0.0008449711742049706, + "loss": 0.87815034, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.94580078, + "step": 1455, + "time_per_iteration": 2.8148674964904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188728, + "balance_loss_mlp": 1.09369469, + "epoch": 0.2801077337437476, + "flos": 550353280512.0, + "grad_norm": 0.02989232443782136, + "language_loss": 0.94001353, + "learning_rate": 0.0008447455928919196, + "loss": 0.95190072, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.94970703, + "step": 1456, + "time_per_iteration": 2.6030025482177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186748, + "balance_loss_mlp": 1.09166706, + "epoch": 0.2803001154290112, + "flos": 487741317120.0, + "grad_norm": 0.023726139763527557, + "language_loss": 0.95883709, + "learning_rate": 0.0008445198777367595, + "loss": 0.97070462, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.95019531, + "step": 1457, + "time_per_iteration": 2.598212718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188426, + "balance_loss_mlp": 1.09344053, + "epoch": 0.2804924971142747, + "flos": 523091598336.0, + "grad_norm": 0.027291046925092925, + "language_loss": 0.9210875, + "learning_rate": 0.0008442940288271208, + "loss": 0.93297172, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.94921875, + "step": 1458, + "time_per_iteration": 2.617572069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189438, + "balance_loss_mlp": 1.09473801, + "epoch": 0.2806848787995383, + "flos": 528849053184.0, + "grad_norm": 0.02378106137707509, + "language_loss": 0.95258486, + "learning_rate": 0.0008440680462506856, + "loss": 0.96447927, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.94628906, + "step": 1459, + "time_per_iteration": 2.7465641498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191591, + "balance_loss_mlp": 1.09660506, + "epoch": 0.2808772604848018, + "flos": 486484420608.0, + "grad_norm": 0.02248739277997059, + "language_loss": 0.9351486, + "learning_rate": 0.0008438419300951883, + "loss": 0.94706452, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.94921875, + "step": 1460, + "time_per_iteration": 2.6331160068511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188162, + "balance_loss_mlp": 1.09303284, + "epoch": 0.2810696421700654, + "flos": 619339049472.0, + "grad_norm": 0.024684272432392865, + "language_loss": 0.96464884, + "learning_rate": 0.0008436156804484148, + "loss": 0.97653049, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.95068359, + "step": 1461, + "time_per_iteration": 2.7740418910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188616, + "balance_loss_mlp": 1.09358263, + "epoch": 0.28126202385532895, + "flos": 455686364160.0, + "grad_norm": 0.026728942288464865, + "language_loss": 0.99464989, + "learning_rate": 0.0008433892973982031, + "loss": 1.00653601, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.94970703, + "step": 1462, + "time_per_iteration": 2.5151000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188441, + "balance_loss_mlp": 1.09345496, + "epoch": 0.28145440554059253, + "flos": 531738150912.0, + "grad_norm": 0.02863032020985732, + "language_loss": 0.95777607, + "learning_rate": 0.0008431627810324431, + "loss": 0.96966046, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.94921875, + "step": 1463, + "time_per_iteration": 2.64477801322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.09298646, + "epoch": 0.2816467872258561, + "flos": 453163838976.0, + "grad_norm": 0.025052425157320847, + "language_loss": 0.90961307, + "learning_rate": 0.000842936131439076, + "loss": 0.92149282, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.94921875, + "step": 1464, + "time_per_iteration": 2.5910096168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186267, + "balance_loss_mlp": 1.09147155, + "epoch": 0.28183916891111965, + "flos": 473704608768.0, + "grad_norm": 0.02627501463847235, + "language_loss": 0.97073281, + "learning_rate": 0.0008427093487060951, + "loss": 0.98259544, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.94726562, + "step": 1465, + "time_per_iteration": 2.6250505447387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187944, + "balance_loss_mlp": 1.09300542, + "epoch": 0.28203155059638324, + "flos": 558188098560.0, + "grad_norm": 0.02108937585301408, + "language_loss": 0.91709232, + "learning_rate": 0.000842482432921545, + "loss": 0.92897177, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.94873047, + "step": 1466, + "time_per_iteration": 2.809101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.09139562, + "epoch": 0.28222393228164677, + "flos": 417878685696.0, + "grad_norm": 0.025824876793605126, + "language_loss": 0.96517414, + "learning_rate": 0.0008422553841735225, + "loss": 0.97703695, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.94824219, + "step": 1467, + "time_per_iteration": 2.468773365020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184331, + "balance_loss_mlp": 1.08963072, + "epoch": 0.28241631396691036, + "flos": 606040215552.0, + "grad_norm": 0.02479925640814435, + "language_loss": 0.92490911, + "learning_rate": 0.0008420282025501757, + "loss": 0.93675244, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.94628906, + "step": 1468, + "time_per_iteration": 2.7617123126983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184258, + "balance_loss_mlp": 1.08960581, + "epoch": 0.2826086956521739, + "flos": 574050390528.0, + "grad_norm": 0.023359152371130017, + "language_loss": 0.93868291, + "learning_rate": 0.0008418008881397043, + "loss": 0.95052546, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.94580078, + "step": 1469, + "time_per_iteration": 2.681727886199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185359, + "balance_loss_mlp": 1.09056342, + "epoch": 0.2828010773374375, + "flos": 844318603776.0, + "grad_norm": 0.02469333041166596, + "language_loss": 0.92646587, + "learning_rate": 0.0008415734410303595, + "loss": 0.93831944, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.94726562, + "step": 1470, + "time_per_iteration": 3.1949617862701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186089, + "balance_loss_mlp": 1.09124613, + "epoch": 0.28299345902270107, + "flos": 543771356160.0, + "grad_norm": 0.022743934694793657, + "language_loss": 0.98454034, + "learning_rate": 0.0008413458613104444, + "loss": 0.99640119, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.94775391, + "step": 1471, + "time_per_iteration": 2.679994583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184615, + "balance_loss_mlp": 1.08972394, + "epoch": 0.2831858407079646, + "flos": 572754562560.0, + "grad_norm": 0.02381851847695354, + "language_loss": 0.91435039, + "learning_rate": 0.0008411181490683129, + "loss": 0.92619658, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.94824219, + "step": 1472, + "time_per_iteration": 2.7178077697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186226, + "balance_loss_mlp": 1.09152639, + "epoch": 0.2833782223932282, + "flos": 765170875392.0, + "grad_norm": 0.023393787071714342, + "language_loss": 0.92628008, + "learning_rate": 0.0008408903043923707, + "loss": 0.9381423, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.94628906, + "step": 1473, + "time_per_iteration": 3.0261785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184462, + "balance_loss_mlp": 1.0899055, + "epoch": 0.2835706040784917, + "flos": 540087261696.0, + "grad_norm": 0.026141956799832673, + "language_loss": 0.93214488, + "learning_rate": 0.0008406623273710754, + "loss": 0.94398952, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.94482422, + "step": 1474, + "time_per_iteration": 2.62430739402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118759, + "balance_loss_mlp": 1.09312844, + "epoch": 0.2837629857637553, + "flos": 531653557248.0, + "grad_norm": 0.026627011980012938, + "language_loss": 0.91140723, + "learning_rate": 0.0008404342180929351, + "loss": 0.9232831, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.94384766, + "step": 1475, + "time_per_iteration": 2.6201882362365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191029, + "balance_loss_mlp": 1.09666264, + "epoch": 0.28395536744901884, + "flos": 541109842944.0, + "grad_norm": 0.026942213566754976, + "language_loss": 0.91036892, + "learning_rate": 0.00084020597664651, + "loss": 0.92227924, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.94287109, + "step": 1476, + "time_per_iteration": 2.792515516281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191806, + "balance_loss_mlp": 1.09743977, + "epoch": 0.2841477491342824, + "flos": 574801726464.0, + "grad_norm": 0.0281069748307863, + "language_loss": 0.94561875, + "learning_rate": 0.0008399776031204111, + "loss": 0.95753682, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.94287109, + "step": 1477, + "time_per_iteration": 2.7592930793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189206, + "balance_loss_mlp": 1.09479237, + "epoch": 0.28434013081954596, + "flos": 573138599424.0, + "grad_norm": 0.025578880464706598, + "language_loss": 0.90985346, + "learning_rate": 0.0008397490976033009, + "loss": 0.92174542, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.94335938, + "step": 1478, + "time_per_iteration": 2.72312331199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193047, + "balance_loss_mlp": 1.10015869, + "epoch": 0.28453251250480954, + "flos": 1556673629184.0, + "grad_norm": 0.009281527310597816, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.7907269, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.92773438, + "step": 1479, + "time_per_iteration": 4.714428901672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188304, + "balance_loss_mlp": 1.0943675, + "epoch": 0.28472489419007313, + "flos": 750426491904.0, + "grad_norm": 0.023822673694276757, + "language_loss": 0.93367732, + "learning_rate": 0.0008392916909509525, + "loss": 0.94556034, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.93847656, + "step": 1480, + "time_per_iteration": 3.0365796089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183623, + "balance_loss_mlp": 1.08930516, + "epoch": 0.28491727587533666, + "flos": 491138703360.0, + "grad_norm": 0.028675048847138535, + "language_loss": 0.94468164, + "learning_rate": 0.0008390627899932954, + "loss": 0.95651788, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.94238281, + "step": 1481, + "time_per_iteration": 2.562316656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187714, + "balance_loss_mlp": 1.09353888, + "epoch": 0.28510965756060025, + "flos": 730359081984.0, + "grad_norm": 0.028797322451775676, + "language_loss": 0.96514452, + "learning_rate": 0.000838833757399789, + "loss": 0.97702163, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.94091797, + "step": 1482, + "time_per_iteration": 2.955920696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189825, + "balance_loss_mlp": 1.09593546, + "epoch": 0.2853020392458638, + "flos": 552669688320.0, + "grad_norm": 0.027781834693451857, + "language_loss": 0.92148101, + "learning_rate": 0.0008386045932593515, + "loss": 0.93337923, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.93798828, + "step": 1483, + "time_per_iteration": 2.6609442234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185409, + "balance_loss_mlp": 1.09151959, + "epoch": 0.28549442093112737, + "flos": 756096625152.0, + "grad_norm": 0.023489805753692042, + "language_loss": 0.9365592, + "learning_rate": 0.0008383752976609525, + "loss": 0.94841331, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.93798828, + "step": 1484, + "time_per_iteration": 2.914872646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.09480286, + "epoch": 0.2856868026163909, + "flos": 539703224832.0, + "grad_norm": 0.026354969281760218, + "language_loss": 0.9020288, + "learning_rate": 0.0008381458706936123, + "loss": 0.91391522, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.9375, + "step": 1485, + "time_per_iteration": 2.7100982666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190691, + "balance_loss_mlp": 1.09675431, + "epoch": 0.2858791843016545, + "flos": 584920025088.0, + "grad_norm": 0.026556247425645045, + "language_loss": 0.97539783, + "learning_rate": 0.0008379163124464025, + "loss": 0.98730469, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.93847656, + "step": 1486, + "time_per_iteration": 2.7065536975860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192022, + "balance_loss_mlp": 1.0979898, + "epoch": 0.286071565986918, + "flos": 646051510272.0, + "grad_norm": 0.03147840332437955, + "language_loss": 0.84533966, + "learning_rate": 0.0008376866230084452, + "loss": 0.85725987, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.93945312, + "step": 1487, + "time_per_iteration": 2.818673849105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186798, + "balance_loss_mlp": 1.09295619, + "epoch": 0.2862639476721816, + "flos": 492330471936.0, + "grad_norm": 0.02612625436823832, + "language_loss": 0.963471, + "learning_rate": 0.000837456802468914, + "loss": 0.975339, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.9375, + "step": 1488, + "time_per_iteration": 2.5766210556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185712, + "balance_loss_mlp": 1.09187043, + "epoch": 0.2864563293574452, + "flos": 522744491520.0, + "grad_norm": 0.023875595461199783, + "language_loss": 0.96454561, + "learning_rate": 0.0008372268509170331, + "loss": 0.9764027, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.9375, + "step": 1489, + "time_per_iteration": 2.7241337299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117946, + "balance_loss_mlp": 1.08537972, + "epoch": 0.2866487110427087, + "flos": 548256451584.0, + "grad_norm": 0.022999113981848278, + "language_loss": 0.93815279, + "learning_rate": 0.0008369967684420779, + "loss": 0.94994742, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.93994141, + "step": 1490, + "time_per_iteration": 2.7358930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180309, + "balance_loss_mlp": 1.08656251, + "epoch": 0.2868410927279723, + "flos": 483217290240.0, + "grad_norm": 0.024118055050044187, + "language_loss": 0.93676293, + "learning_rate": 0.0008367665551333736, + "loss": 0.94856608, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.93652344, + "step": 1491, + "time_per_iteration": 2.6094913482666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181201, + "balance_loss_mlp": 1.08731139, + "epoch": 0.28703347441323585, + "flos": 726136499712.0, + "grad_norm": 0.03204326630579906, + "language_loss": 0.96034807, + "learning_rate": 0.0008365362110802977, + "loss": 0.9721601, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.93798828, + "step": 1492, + "time_per_iteration": 2.862281322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180339, + "balance_loss_mlp": 1.08630645, + "epoch": 0.28722585609849943, + "flos": 636213189120.0, + "grad_norm": 0.024948941988181064, + "language_loss": 0.92257547, + "learning_rate": 0.0008363057363722773, + "loss": 0.93437886, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.93945312, + "step": 1493, + "time_per_iteration": 2.8364765644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.08695745, + "epoch": 0.28741823778376296, + "flos": 511251775488.0, + "grad_norm": 0.026788978355157977, + "language_loss": 0.94388151, + "learning_rate": 0.0008360751310987906, + "loss": 0.9556905, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.93847656, + "step": 1494, + "time_per_iteration": 2.5825915336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_mlp": 1.09244919, + "epoch": 0.28761061946902655, + "flos": 604931039232.0, + "grad_norm": 0.023099591474152015, + "language_loss": 0.92881125, + "learning_rate": 0.0008358443953493666, + "loss": 0.94067132, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.93457031, + "step": 1495, + "time_per_iteration": 2.8426852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190116, + "balance_loss_mlp": 1.09617913, + "epoch": 0.28780300115429014, + "flos": 408059830272.0, + "grad_norm": 0.026469370193436835, + "language_loss": 0.97524667, + "learning_rate": 0.0008356135292135851, + "loss": 0.98714793, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.93847656, + "step": 1496, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187979, + "balance_loss_mlp": 1.09356499, + "epoch": 0.28799538283955367, + "flos": 375744365568.0, + "grad_norm": 0.028081335314896084, + "language_loss": 1.02447343, + "learning_rate": 0.0008353825327810758, + "loss": 1.03635335, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.94335938, + "step": 1497, + "time_per_iteration": 2.4137980937957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188393, + "balance_loss_mlp": 1.09416974, + "epoch": 0.28818776452481726, + "flos": 593019357696.0, + "grad_norm": 0.027570910872340922, + "language_loss": 0.91214752, + "learning_rate": 0.00083515140614152, + "loss": 0.9240315, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.94140625, + "step": 1498, + "time_per_iteration": 2.7084319591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188877, + "balance_loss_mlp": 1.0943675, + "epoch": 0.2883801462100808, + "flos": 536103724032.0, + "grad_norm": 0.024692508476740448, + "language_loss": 0.97239816, + "learning_rate": 0.0008349201493846485, + "loss": 0.9842869, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.94433594, + "step": 1499, + "time_per_iteration": 2.6401236057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190398, + "balance_loss_mlp": 1.09617448, + "epoch": 0.2885725278953444, + "flos": 481076800512.0, + "grad_norm": 0.026282906035864008, + "language_loss": 0.98523659, + "learning_rate": 0.0008346887626002432, + "loss": 0.99714065, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.94140625, + "step": 1500, + "time_per_iteration": 2.52458119392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.09863722, + "epoch": 0.2887649095806079, + "flos": 465029858304.0, + "grad_norm": 0.024051725112114657, + "language_loss": 0.95880306, + "learning_rate": 0.000834457245878137, + "loss": 0.970734, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.94384766, + "step": 1501, + "time_per_iteration": 2.629535436630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192018, + "balance_loss_mlp": 1.09765196, + "epoch": 0.2889572912658715, + "flos": 932639912448.0, + "grad_norm": 0.02596355901590014, + "language_loss": 0.90450358, + "learning_rate": 0.000834225599308212, + "loss": 0.9164238, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.94287109, + "step": 1502, + "time_per_iteration": 3.2340567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189718, + "balance_loss_mlp": 1.09568572, + "epoch": 0.28914967295113503, + "flos": 571256620032.0, + "grad_norm": 0.02412179831144176, + "language_loss": 0.9487462, + "learning_rate": 0.0008339938229804016, + "loss": 0.96064335, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.93945312, + "step": 1503, + "time_per_iteration": 2.710339069366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193321, + "balance_loss_mlp": 1.10081482, + "epoch": 0.2893420546363986, + "flos": 1489872010752.0, + "grad_norm": 0.01509287591883609, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76628143, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.92382812, + "step": 1504, + "time_per_iteration": 4.937675714492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189832, + "balance_loss_mlp": 1.09579968, + "epoch": 0.2895344363216622, + "flos": 471182083584.0, + "grad_norm": 0.02978733186062401, + "language_loss": 0.95586789, + "learning_rate": 0.0008335298814111094, + "loss": 0.96776623, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.93945312, + "step": 1505, + "time_per_iteration": 2.5757808685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119483, + "balance_loss_mlp": 1.10075009, + "epoch": 0.28972681800692573, + "flos": 649340107776.0, + "grad_norm": 0.024998045510076724, + "language_loss": 0.95390272, + "learning_rate": 0.0008332977163497455, + "loss": 0.96585107, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.93994141, + "step": 1506, + "time_per_iteration": 2.8062288761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190367, + "balance_loss_mlp": 1.09638238, + "epoch": 0.2899191996921893, + "flos": 573305785344.0, + "grad_norm": 0.023440576211443395, + "language_loss": 0.92864263, + "learning_rate": 0.0008330654218907325, + "loss": 0.94054627, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.93896484, + "step": 1507, + "time_per_iteration": 2.6871397495269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195663, + "balance_loss_mlp": 1.10158336, + "epoch": 0.29011158137745285, + "flos": 662636940288.0, + "grad_norm": 0.026311762315396375, + "language_loss": 0.90949756, + "learning_rate": 0.0008328329981242548, + "loss": 0.92145419, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.93994141, + "step": 1508, + "time_per_iteration": 2.870436906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189885, + "balance_loss_mlp": 1.09585261, + "epoch": 0.29030396306271644, + "flos": 537402279936.0, + "grad_norm": 0.02293974263799261, + "language_loss": 0.95641714, + "learning_rate": 0.0008326004451405475, + "loss": 0.96831596, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.93945312, + "step": 1509, + "time_per_iteration": 2.7639336585998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191857, + "balance_loss_mlp": 1.09815872, + "epoch": 0.29049634474798, + "flos": 512955835392.0, + "grad_norm": 0.025710607890434264, + "language_loss": 0.93112034, + "learning_rate": 0.0008323677630298957, + "loss": 0.94303894, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.93603516, + "step": 1510, + "time_per_iteration": 2.561455726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118953, + "balance_loss_mlp": 1.09592652, + "epoch": 0.29068872643324356, + "flos": 614982208512.0, + "grad_norm": 0.023671610956976636, + "language_loss": 0.92362118, + "learning_rate": 0.0008321349518826345, + "loss": 0.93551642, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.93505859, + "step": 1511, + "time_per_iteration": 2.807711362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191488, + "balance_loss_mlp": 1.09736073, + "epoch": 0.2908811081185071, + "flos": 547468185600.0, + "grad_norm": 0.029262624151918007, + "language_loss": 1.03824317, + "learning_rate": 0.0008319020117891491, + "loss": 1.05015802, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.94042969, + "step": 1512, + "time_per_iteration": 2.626357316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192195, + "balance_loss_mlp": 1.09840155, + "epoch": 0.2910734898037707, + "flos": 605901227520.0, + "grad_norm": 0.026098769068304807, + "language_loss": 0.96355087, + "learning_rate": 0.0008316689428398751, + "loss": 0.97547281, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.93701172, + "step": 1513, + "time_per_iteration": 2.6982998847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190959, + "balance_loss_mlp": 1.09721279, + "epoch": 0.29126587148903427, + "flos": 575835041280.0, + "grad_norm": 0.02240755749123148, + "language_loss": 0.95587385, + "learning_rate": 0.0008314357451252979, + "loss": 0.96778345, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.93652344, + "step": 1514, + "time_per_iteration": 2.7506277561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185358, + "balance_loss_mlp": 1.09170711, + "epoch": 0.2914582531742978, + "flos": 572133482496.0, + "grad_norm": 0.030106635879309524, + "language_loss": 0.98758858, + "learning_rate": 0.0008312024187359527, + "loss": 0.99944222, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.93554688, + "step": 1515, + "time_per_iteration": 2.6389546394348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186161, + "balance_loss_mlp": 1.09265339, + "epoch": 0.2916506348595614, + "flos": 732302186496.0, + "grad_norm": 0.023105382424412787, + "language_loss": 0.95643955, + "learning_rate": 0.000830968963762425, + "loss": 0.96830118, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.93408203, + "step": 1516, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183995, + "balance_loss_mlp": 1.09048688, + "epoch": 0.2918430165448249, + "flos": 511466625024.0, + "grad_norm": 0.027481799845478876, + "language_loss": 0.92072952, + "learning_rate": 0.0008307353802953497, + "loss": 0.93256938, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.93408203, + "step": 1517, + "time_per_iteration": 2.6852073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188929, + "balance_loss_mlp": 1.09546912, + "epoch": 0.2920353982300885, + "flos": 631606569984.0, + "grad_norm": 0.024841994736450757, + "language_loss": 0.95207542, + "learning_rate": 0.0008305016684254125, + "loss": 0.9639647, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.93359375, + "step": 1518, + "time_per_iteration": 2.78326678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185623, + "balance_loss_mlp": 1.0920676, + "epoch": 0.29222777991535204, + "flos": 502670350848.0, + "grad_norm": 0.02442081482663903, + "language_loss": 0.96402657, + "learning_rate": 0.0008302678282433479, + "loss": 0.97588277, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.93457031, + "step": 1519, + "time_per_iteration": 2.580885887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186077, + "balance_loss_mlp": 1.09261727, + "epoch": 0.2924201616006156, + "flos": 487841373696.0, + "grad_norm": 0.025531334181834578, + "language_loss": 0.92434102, + "learning_rate": 0.0008300338598399411, + "loss": 0.93620181, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.93359375, + "step": 1520, + "time_per_iteration": 2.60040020942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182574, + "balance_loss_mlp": 1.08911419, + "epoch": 0.2926125432858792, + "flos": 477410170368.0, + "grad_norm": 0.025034871095789283, + "language_loss": 1.04410791, + "learning_rate": 0.0008297997633060263, + "loss": 1.05593348, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.93359375, + "step": 1521, + "time_per_iteration": 2.5479507446289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184296, + "balance_loss_mlp": 1.09083581, + "epoch": 0.29280492497114274, + "flos": 677867418624.0, + "grad_norm": 0.023158831925944874, + "language_loss": 0.93757105, + "learning_rate": 0.0008295655387324883, + "loss": 0.94941401, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.93359375, + "step": 1522, + "time_per_iteration": 2.80924916267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184597, + "balance_loss_mlp": 1.09113646, + "epoch": 0.29299730665640633, + "flos": 459344262144.0, + "grad_norm": 0.024881330364852117, + "language_loss": 0.95369709, + "learning_rate": 0.0008293311862102609, + "loss": 0.96554303, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.93359375, + "step": 1523, + "time_per_iteration": 2.5006909370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183918, + "balance_loss_mlp": 1.09055364, + "epoch": 0.29318968834166986, + "flos": 447495707136.0, + "grad_norm": 0.027757525537519354, + "language_loss": 0.99242002, + "learning_rate": 0.0008290967058303275, + "loss": 1.00425935, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.93261719, + "step": 1524, + "time_per_iteration": 2.472071409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.09098816, + "epoch": 0.29338207002693345, + "flos": 451255663104.0, + "grad_norm": 0.024483324027042522, + "language_loss": 0.93697757, + "learning_rate": 0.0008288620976837219, + "loss": 0.9488225, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.93408203, + "step": 1525, + "time_per_iteration": 2.486726760864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183678, + "balance_loss_mlp": 1.08997941, + "epoch": 0.293574451712197, + "flos": 503284700160.0, + "grad_norm": 0.025672010983446535, + "language_loss": 0.92014909, + "learning_rate": 0.000828627361861527, + "loss": 0.93198591, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.93603516, + "step": 1526, + "time_per_iteration": 2.557725429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183155, + "balance_loss_mlp": 1.089504, + "epoch": 0.29376683339746057, + "flos": 697683048960.0, + "grad_norm": 0.028193197708561973, + "language_loss": 0.94158876, + "learning_rate": 0.0008283924984548752, + "loss": 0.95342028, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.93554688, + "step": 1527, + "time_per_iteration": 2.866138219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182528, + "balance_loss_mlp": 1.08882964, + "epoch": 0.2939592150827241, + "flos": 479541927936.0, + "grad_norm": 0.024215116577050826, + "language_loss": 0.92182994, + "learning_rate": 0.0008281575075549485, + "loss": 0.93365526, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.93603516, + "step": 1528, + "time_per_iteration": 2.5585758686065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202408, + "balance_loss_mlp": 1.1108551, + "epoch": 0.2941515967679877, + "flos": 1488386803200.0, + "grad_norm": 0.02007823063587109, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78555101, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.9140625, + "step": 1529, + "time_per_iteration": 4.658870697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186281, + "balance_loss_mlp": 1.09267783, + "epoch": 0.2943439784532513, + "flos": 675399287808.0, + "grad_norm": 0.027761434636537758, + "language_loss": 0.99164081, + "learning_rate": 0.0008276871436402469, + "loss": 1.00350356, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.93505859, + "step": 1530, + "time_per_iteration": 2.897517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182983, + "balance_loss_mlp": 1.08909357, + "epoch": 0.2945363601385148, + "flos": 577382648832.0, + "grad_norm": 0.025208295044921922, + "language_loss": 0.95561033, + "learning_rate": 0.000827451770808083, + "loss": 0.96744013, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.93798828, + "step": 1531, + "time_per_iteration": 2.667419910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183127, + "balance_loss_mlp": 1.08923733, + "epoch": 0.2947287418237784, + "flos": 481617289728.0, + "grad_norm": 0.0238323033403859, + "language_loss": 0.92856085, + "learning_rate": 0.0008272162708478674, + "loss": 0.94039214, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.93798828, + "step": 1532, + "time_per_iteration": 2.532593250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190087, + "balance_loss_mlp": 1.09638822, + "epoch": 0.2949211235090419, + "flos": 559260344832.0, + "grad_norm": 0.023856250691152107, + "language_loss": 0.9573307, + "learning_rate": 0.000826980643851029, + "loss": 0.96923155, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.93603516, + "step": 1533, + "time_per_iteration": 2.648393154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190115, + "balance_loss_mlp": 1.09665465, + "epoch": 0.2951135051943055, + "flos": 484856222208.0, + "grad_norm": 0.02761517479674983, + "language_loss": 0.9290787, + "learning_rate": 0.0008267448899090464, + "loss": 0.94097984, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.93359375, + "step": 1534, + "time_per_iteration": 2.5158579349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185677, + "balance_loss_mlp": 1.09226477, + "epoch": 0.29530588687956905, + "flos": 551421523968.0, + "grad_norm": 0.024001584155810263, + "language_loss": 0.90244222, + "learning_rate": 0.0008265090091134473, + "loss": 0.91429895, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.93310547, + "step": 1535, + "time_per_iteration": 2.8246946334838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185762, + "balance_loss_mlp": 1.09234965, + "epoch": 0.29549826856483263, + "flos": 674309577216.0, + "grad_norm": 0.021562014940098434, + "language_loss": 0.8727591, + "learning_rate": 0.0008262730015558088, + "loss": 0.88461667, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.93310547, + "step": 1536, + "time_per_iteration": 2.8568825721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189062, + "balance_loss_mlp": 1.09560144, + "epoch": 0.29569065025009617, + "flos": 766135059456.0, + "grad_norm": 0.0253531059084562, + "language_loss": 0.89567208, + "learning_rate": 0.0008260368673277574, + "loss": 0.90756267, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.93359375, + "step": 1537, + "time_per_iteration": 3.1248908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181656, + "balance_loss_mlp": 1.08781409, + "epoch": 0.29588303193535975, + "flos": 544830867456.0, + "grad_norm": 0.02589470547450269, + "language_loss": 0.93808746, + "learning_rate": 0.0008258006065209682, + "loss": 0.94990402, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.9375, + "step": 1538, + "time_per_iteration": 2.7405824661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.0892235, + "epoch": 0.29607541362062334, + "flos": 598144998912.0, + "grad_norm": 0.02499469713889481, + "language_loss": 0.9045589, + "learning_rate": 0.0008255642192271657, + "loss": 0.91638815, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.93603516, + "step": 1539, + "time_per_iteration": 2.7654454708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183976, + "balance_loss_mlp": 1.09032559, + "epoch": 0.29626779530588687, + "flos": 611037602304.0, + "grad_norm": 0.024707919738005703, + "language_loss": 0.92616487, + "learning_rate": 0.0008253277055381241, + "loss": 0.93800461, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.93554688, + "step": 1540, + "time_per_iteration": 2.803755760192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186228, + "balance_loss_mlp": 1.09252918, + "epoch": 0.29646017699115046, + "flos": 868957704192.0, + "grad_norm": 0.02707124240628881, + "language_loss": 0.95315254, + "learning_rate": 0.0008250910655456658, + "loss": 0.96501482, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.93603516, + "step": 1541, + "time_per_iteration": 3.11143159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181572, + "balance_loss_mlp": 1.08787382, + "epoch": 0.296652558676414, + "flos": 496880695296.0, + "grad_norm": 0.02670504880571787, + "language_loss": 0.9343757, + "learning_rate": 0.0008248542993416625, + "loss": 0.94619143, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.93603516, + "step": 1542, + "time_per_iteration": 2.5893712043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181697, + "balance_loss_mlp": 1.08790362, + "epoch": 0.2968449403616776, + "flos": 572626308096.0, + "grad_norm": 0.02711797813063544, + "language_loss": 0.9310621, + "learning_rate": 0.0008246174070180352, + "loss": 0.94287908, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.93701172, + "step": 1543, + "time_per_iteration": 2.677011489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189648, + "balance_loss_mlp": 1.09614003, + "epoch": 0.2970373220469411, + "flos": 795650022912.0, + "grad_norm": 0.029629985597633038, + "language_loss": 0.9263432, + "learning_rate": 0.0008243803886667537, + "loss": 0.93823969, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.93408203, + "step": 1544, + "time_per_iteration": 3.1022729873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188285, + "balance_loss_mlp": 1.09472907, + "epoch": 0.2972297037322047, + "flos": 662248174080.0, + "grad_norm": 0.0271995559284498, + "language_loss": 0.89610922, + "learning_rate": 0.0008241432443798364, + "loss": 0.90799212, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.93457031, + "step": 1545, + "time_per_iteration": 2.8079423904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181998, + "balance_loss_mlp": 1.08868086, + "epoch": 0.29742208541746823, + "flos": 598231593984.0, + "grad_norm": 0.02196679377417612, + "language_loss": 0.91743886, + "learning_rate": 0.0008239059742493512, + "loss": 0.92925882, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.93212891, + "step": 1546, + "time_per_iteration": 2.703385353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182095, + "balance_loss_mlp": 1.08868301, + "epoch": 0.2976144671027318, + "flos": 771338563584.0, + "grad_norm": 0.02555387631372138, + "language_loss": 0.94145298, + "learning_rate": 0.0008236685783674142, + "loss": 0.95327395, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.93310547, + "step": 1547, + "time_per_iteration": 3.0583412647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221115, + "balance_loss_mlp": 1.12822723, + "epoch": 0.2978068487879954, + "flos": 1487911441920.0, + "grad_norm": 0.023679675459363107, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77442312, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.92773438, + "step": 1548, + "time_per_iteration": 4.846614360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192812, + "balance_loss_mlp": 1.09925652, + "epoch": 0.29799923047325894, + "flos": 476329191936.0, + "grad_norm": 0.02691026692614136, + "language_loss": 0.91868371, + "learning_rate": 0.0008231934097178955, + "loss": 0.93061185, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.93457031, + "step": 1549, + "time_per_iteration": 2.600588798522949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189437, + "balance_loss_mlp": 1.09573877, + "epoch": 0.2981916121585225, + "flos": 761167872000.0, + "grad_norm": 0.02304182660847759, + "language_loss": 0.93441629, + "learning_rate": 0.0008229556371347903, + "loss": 0.94631064, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.93603516, + "step": 1550, + "time_per_iteration": 2.9500393867492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196641, + "balance_loss_mlp": 1.10256064, + "epoch": 0.29838399384378606, + "flos": 876516547584.0, + "grad_norm": 0.029531977965095095, + "language_loss": 0.90478379, + "learning_rate": 0.0008227177391691874, + "loss": 0.91675019, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.93994141, + "step": 1551, + "time_per_iteration": 3.117060422897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192501, + "balance_loss_mlp": 1.09870708, + "epoch": 0.29857637552904964, + "flos": 580751837184.0, + "grad_norm": 0.026349497602305087, + "language_loss": 0.9813534, + "learning_rate": 0.0008224797159134463, + "loss": 0.99327838, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.93701172, + "step": 1552, + "time_per_iteration": 2.694382429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185823, + "balance_loss_mlp": 1.09212494, + "epoch": 0.2987687572143132, + "flos": 837807811584.0, + "grad_norm": 0.022207279660822626, + "language_loss": 0.8985877, + "learning_rate": 0.0008222415674599765, + "loss": 0.91044593, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.93603516, + "step": 1553, + "time_per_iteration": 3.074347972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186024, + "balance_loss_mlp": 1.09203923, + "epoch": 0.29896113889957676, + "flos": 568167409152.0, + "grad_norm": 0.026892838709900748, + "language_loss": 0.93768913, + "learning_rate": 0.0008220032939012349, + "loss": 0.94954944, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.93896484, + "step": 1554, + "time_per_iteration": 2.6793601512908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190641, + "balance_loss_mlp": 1.0965606, + "epoch": 0.29915352058484035, + "flos": 499835647488.0, + "grad_norm": 0.021647779244158522, + "language_loss": 0.95223451, + "learning_rate": 0.0008217648953297277, + "loss": 0.96414095, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.93994141, + "step": 1555, + "time_per_iteration": 2.836775779724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189405, + "balance_loss_mlp": 1.09546852, + "epoch": 0.2993459022701039, + "flos": 593214741504.0, + "grad_norm": 0.03843372955580003, + "language_loss": 0.88026905, + "learning_rate": 0.0008215263718380095, + "loss": 0.89216304, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.93847656, + "step": 1556, + "time_per_iteration": 2.6840782165527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192028, + "balance_loss_mlp": 1.09790027, + "epoch": 0.29953828395536747, + "flos": 573472971264.0, + "grad_norm": 0.02697506762846426, + "language_loss": 0.95771539, + "learning_rate": 0.0008212877235186833, + "loss": 0.96963573, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.94042969, + "step": 1557, + "time_per_iteration": 2.649303674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216583, + "balance_loss_mlp": 1.12350464, + "epoch": 0.299730665640631, + "flos": 1508083637760.0, + "grad_norm": 0.01733611069553414, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78954148, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.9296875, + "step": 1558, + "time_per_iteration": 4.920740365982056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191809, + "balance_loss_mlp": 1.09772909, + "epoch": 0.2999230473258946, + "flos": 514807615488.0, + "grad_norm": 0.03091345134541536, + "language_loss": 0.92723, + "learning_rate": 0.0008208100527678611, + "loss": 0.93914807, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.93994141, + "step": 1559, + "time_per_iteration": 2.628755807876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191055, + "balance_loss_mlp": 1.09692788, + "epoch": 0.3001154290111581, + "flos": 835853973504.0, + "grad_norm": 0.03027255896835194, + "language_loss": 0.86836946, + "learning_rate": 0.0008205710305218135, + "loss": 0.88028002, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.94042969, + "step": 1560, + "time_per_iteration": 3.0076475143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188346, + "balance_loss_mlp": 1.09431422, + "epoch": 0.3003078106964217, + "flos": 557945051136.0, + "grad_norm": 0.023845762720508586, + "language_loss": 0.96495396, + "learning_rate": 0.0008203318838190541, + "loss": 0.9768374, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.93945312, + "step": 1561, + "time_per_iteration": 2.7329952716827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118952, + "balance_loss_mlp": 1.09548759, + "epoch": 0.30050019238168524, + "flos": 527168461824.0, + "grad_norm": 0.030147848994798797, + "language_loss": 0.95915771, + "learning_rate": 0.0008200926127524281, + "loss": 0.97105289, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.93945312, + "step": 1562, + "time_per_iteration": 2.625941753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186113, + "balance_loss_mlp": 1.09217656, + "epoch": 0.3006925740669488, + "flos": 578936987136.0, + "grad_norm": 0.02860364820877459, + "language_loss": 0.92538679, + "learning_rate": 0.0008198532174148289, + "loss": 0.93724799, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.93847656, + "step": 1563, + "time_per_iteration": 2.725884199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207901, + "balance_loss_mlp": 1.11539459, + "epoch": 0.3008849557522124, + "flos": 1493610499584.0, + "grad_norm": 0.014785027254047896, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8189407, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.92382812, + "step": 1564, + "time_per_iteration": 4.830730438232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198016, + "balance_loss_mlp": 1.10398376, + "epoch": 0.30107733743747594, + "flos": 510824077824.0, + "grad_norm": 0.03423038852538926, + "language_loss": 0.994165, + "learning_rate": 0.0008193740542985244, + "loss": 1.00614524, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.93945312, + "step": 1565, + "time_per_iteration": 2.578756809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194051, + "balance_loss_mlp": 1.10020983, + "epoch": 0.30126971912273953, + "flos": 588820970496.0, + "grad_norm": 0.027351016206119898, + "language_loss": 0.95914042, + "learning_rate": 0.0008191342867058467, + "loss": 0.97108096, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.9375, + "step": 1566, + "time_per_iteration": 2.7046890258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192822, + "balance_loss_mlp": 1.09898102, + "epoch": 0.30146210080800306, + "flos": 603220248576.0, + "grad_norm": 0.029722715632080093, + "language_loss": 0.93181753, + "learning_rate": 0.0008188943952142509, + "loss": 0.94374579, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.9375, + "step": 1567, + "time_per_iteration": 2.7784945964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189204, + "balance_loss_mlp": 1.09588659, + "epoch": 0.30165448249326665, + "flos": 919286684160.0, + "grad_norm": 0.02698998287866622, + "language_loss": 0.91980577, + "learning_rate": 0.0008186543799168711, + "loss": 0.93169785, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.93212891, + "step": 1568, + "time_per_iteration": 3.1082897186279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188766, + "balance_loss_mlp": 1.09530556, + "epoch": 0.3018468641785302, + "flos": 778630164480.0, + "grad_norm": 0.02791954193910651, + "language_loss": 0.98386627, + "learning_rate": 0.0008184142409068892, + "loss": 0.99575394, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.93359375, + "step": 1569, + "time_per_iteration": 3.0047945976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187793, + "balance_loss_mlp": 1.09433293, + "epoch": 0.30203924586379377, + "flos": 523389040128.0, + "grad_norm": 0.023468489537567368, + "language_loss": 0.94207543, + "learning_rate": 0.000818173978277536, + "loss": 0.95395339, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.93359375, + "step": 1570, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119455, + "balance_loss_mlp": 1.10094678, + "epoch": 0.3022316275490573, + "flos": 525649052160.0, + "grad_norm": 0.028721303316250762, + "language_loss": 0.92132497, + "learning_rate": 0.000817933592122089, + "loss": 0.93327045, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.93505859, + "step": 1571, + "time_per_iteration": 2.683819055557251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119426, + "balance_loss_mlp": 1.10037029, + "epoch": 0.3024240092343209, + "flos": 480872684544.0, + "grad_norm": 0.028034832338571278, + "language_loss": 0.93476671, + "learning_rate": 0.0008176930825338749, + "loss": 0.94670928, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.93798828, + "step": 1572, + "time_per_iteration": 2.5472469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.09605432, + "epoch": 0.3026163909195845, + "flos": 688430879232.0, + "grad_norm": 0.025848261804373458, + "language_loss": 0.98155606, + "learning_rate": 0.0008174524496062679, + "loss": 0.9934541, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.93652344, + "step": 1573, + "time_per_iteration": 2.90840482711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185834, + "balance_loss_mlp": 1.0922308, + "epoch": 0.302808772604848, + "flos": 544086262272.0, + "grad_norm": 0.023993082839652336, + "language_loss": 0.9423182, + "learning_rate": 0.0008172116934326894, + "loss": 0.95417649, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.93505859, + "step": 1574, + "time_per_iteration": 2.735853433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197529, + "balance_loss_mlp": 1.10349655, + "epoch": 0.3030011542901116, + "flos": 476051215872.0, + "grad_norm": 0.025758910941944917, + "language_loss": 0.96492219, + "learning_rate": 0.0008169708141066097, + "loss": 0.97689748, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.93945312, + "step": 1575, + "time_per_iteration": 2.5468080043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195411, + "balance_loss_mlp": 1.10123575, + "epoch": 0.30319353597537513, + "flos": 482472685056.0, + "grad_norm": 0.02368764088299644, + "language_loss": 0.97863203, + "learning_rate": 0.0008167298117215465, + "loss": 0.99058616, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.94091797, + "step": 1576, + "time_per_iteration": 2.5703070163726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191699, + "balance_loss_mlp": 1.09747636, + "epoch": 0.3033859176606387, + "flos": 706112750592.0, + "grad_norm": 0.02517452757559557, + "language_loss": 0.96809077, + "learning_rate": 0.0008164886863710649, + "loss": 0.98000777, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.94140625, + "step": 1577, + "time_per_iteration": 2.9235777854919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194461, + "balance_loss_mlp": 1.09990454, + "epoch": 0.30357829934590225, + "flos": 766108862976.0, + "grad_norm": 0.022389524212240816, + "language_loss": 0.93041158, + "learning_rate": 0.0008162474381487783, + "loss": 0.94235623, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.94482422, + "step": 1578, + "time_per_iteration": 3.0875654220581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198648, + "balance_loss_mlp": 1.10399556, + "epoch": 0.30377068103116583, + "flos": 533448941568.0, + "grad_norm": 0.026496061930467673, + "language_loss": 0.94202471, + "learning_rate": 0.0008160060671483475, + "loss": 0.9540112, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.94580078, + "step": 1579, + "time_per_iteration": 2.69014048576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198759, + "balance_loss_mlp": 1.10415483, + "epoch": 0.3039630627164294, + "flos": 511223577600.0, + "grad_norm": 0.03174839578716906, + "language_loss": 0.93386602, + "learning_rate": 0.0008157645734634809, + "loss": 0.94585359, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.9453125, + "step": 1580, + "time_per_iteration": 2.602752923965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221184, + "balance_loss_mlp": 1.12791443, + "epoch": 0.30415544440169295, + "flos": 1509188084736.0, + "grad_norm": 0.0221653057193215, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78117669, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.93164062, + "step": 1581, + "time_per_iteration": 4.895219802856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.10334778, + "epoch": 0.30434782608695654, + "flos": 1461787133952.0, + "grad_norm": 0.012004742936218659, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74410546, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.92578125, + "step": 1582, + "time_per_iteration": 4.860503196716309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199297, + "balance_loss_mlp": 1.10526431, + "epoch": 0.3045402077722201, + "flos": 483534197760.0, + "grad_norm": 0.030796945736395555, + "language_loss": 0.93027633, + "learning_rate": 0.000815039357240067, + "loss": 0.94226933, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.93945312, + "step": 1583, + "time_per_iteration": 2.6209895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200124, + "balance_loss_mlp": 1.10613978, + "epoch": 0.30473258945748366, + "flos": 544626751488.0, + "grad_norm": 0.03019985050023197, + "language_loss": 0.95277119, + "learning_rate": 0.0008147973737554952, + "loss": 0.9647724, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.93896484, + "step": 1584, + "time_per_iteration": 2.7421703338623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194047, + "balance_loss_mlp": 1.10039604, + "epoch": 0.3049249711427472, + "flos": 568121746944.0, + "grad_norm": 0.05356410902969654, + "language_loss": 0.96138752, + "learning_rate": 0.000814555268055744, + "loss": 0.97332799, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.93554688, + "step": 1585, + "time_per_iteration": 2.632770299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191549, + "balance_loss_mlp": 1.09804094, + "epoch": 0.3051173528280108, + "flos": 529289485824.0, + "grad_norm": 0.02648444030223836, + "language_loss": 0.96492249, + "learning_rate": 0.0008143130402348073, + "loss": 0.97683799, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.93408203, + "step": 1586, + "time_per_iteration": 2.67673659324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201208, + "balance_loss_mlp": 1.10746217, + "epoch": 0.3053097345132743, + "flos": 587599002624.0, + "grad_norm": 0.026229801397330138, + "language_loss": 0.86860031, + "learning_rate": 0.0008140706903867265, + "loss": 0.88061237, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.93652344, + "step": 1587, + "time_per_iteration": 2.800891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198977, + "balance_loss_mlp": 1.10518289, + "epoch": 0.3055021161985379, + "flos": 608200171008.0, + "grad_norm": 0.031935519152889405, + "language_loss": 1.00360334, + "learning_rate": 0.0008138282186055897, + "loss": 1.01559317, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.93701172, + "step": 1588, + "time_per_iteration": 2.735144853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119001, + "balance_loss_mlp": 1.09645426, + "epoch": 0.3056944978838015, + "flos": 574962181632.0, + "grad_norm": 0.02354328369726863, + "language_loss": 0.90634608, + "learning_rate": 0.0008135856249855331, + "loss": 0.91824615, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.93457031, + "step": 1589, + "time_per_iteration": 2.676589012145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193478, + "balance_loss_mlp": 1.0996846, + "epoch": 0.305886879569065, + "flos": 635071085568.0, + "grad_norm": 0.031037281782467684, + "language_loss": 0.99387443, + "learning_rate": 0.0008133429096207398, + "loss": 1.00580931, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.93701172, + "step": 1590, + "time_per_iteration": 2.7601518630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232346, + "balance_loss_mlp": 1.14117432, + "epoch": 0.3060792612543286, + "flos": 1372131065856.0, + "grad_norm": 0.03086145734446917, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76544607, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.91015625, + "step": 1591, + "time_per_iteration": 4.945107460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194266, + "balance_loss_mlp": 1.10051942, + "epoch": 0.30627164293959214, + "flos": 519618350592.0, + "grad_norm": 0.024964882972055902, + "language_loss": 0.95062864, + "learning_rate": 0.0008128571140339123, + "loss": 0.96257126, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.93652344, + "step": 1592, + "time_per_iteration": 2.6392171382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201642, + "balance_loss_mlp": 1.10780036, + "epoch": 0.3064640246248557, + "flos": 456533027328.0, + "grad_norm": 0.029487227531667784, + "language_loss": 0.98122042, + "learning_rate": 0.0008126140340004805, + "loss": 0.9932369, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.9375, + "step": 1593, + "time_per_iteration": 2.504150629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199461, + "balance_loss_mlp": 1.10561943, + "epoch": 0.30665640631011926, + "flos": 851608203264.0, + "grad_norm": 0.026956571268616787, + "language_loss": 0.91923594, + "learning_rate": 0.0008123708325995172, + "loss": 0.93123049, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.9375, + "step": 1594, + "time_per_iteration": 3.184525489807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190831, + "balance_loss_mlp": 1.09713268, + "epoch": 0.30684878799538284, + "flos": 759615535104.0, + "grad_norm": 0.022474213305982697, + "language_loss": 0.88990366, + "learning_rate": 0.0008121275099254414, + "loss": 0.90181196, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.93603516, + "step": 1595, + "time_per_iteration": 2.892902374267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200579, + "balance_loss_mlp": 1.10668933, + "epoch": 0.3070411696806464, + "flos": 518595769344.0, + "grad_norm": 0.025855927391394404, + "language_loss": 0.96650064, + "learning_rate": 0.0008118840660727194, + "loss": 0.97850645, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.93798828, + "step": 1596, + "time_per_iteration": 2.696312665939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191708, + "balance_loss_mlp": 1.09805715, + "epoch": 0.30723355136590996, + "flos": 845790349824.0, + "grad_norm": 0.023513083336694603, + "language_loss": 0.94521677, + "learning_rate": 0.0008116405011358644, + "loss": 0.95713389, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.93554688, + "step": 1597, + "time_per_iteration": 3.1500890254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118938, + "balance_loss_mlp": 1.09572959, + "epoch": 0.30742593305117355, + "flos": 467079023616.0, + "grad_norm": 0.024597056369147573, + "language_loss": 0.89059556, + "learning_rate": 0.0008113968152094369, + "loss": 0.90248942, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.93554688, + "step": 1598, + "time_per_iteration": 2.502336263656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191327, + "balance_loss_mlp": 1.09781969, + "epoch": 0.3076183147364371, + "flos": 687816529920.0, + "grad_norm": 0.025330429780868927, + "language_loss": 0.90385377, + "learning_rate": 0.0008111530083880438, + "loss": 0.91576707, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.93408203, + "step": 1599, + "time_per_iteration": 2.8846051692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192126, + "balance_loss_mlp": 1.09847498, + "epoch": 0.30781069642170067, + "flos": 615179593728.0, + "grad_norm": 0.02627563558110635, + "language_loss": 0.95310938, + "learning_rate": 0.0008109090807663399, + "loss": 0.96503073, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.93554688, + "step": 1600, + "time_per_iteration": 2.8132736682891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119763, + "balance_loss_mlp": 1.10402679, + "epoch": 0.3080030781069642, + "flos": 591508680192.0, + "grad_norm": 0.027223292643472258, + "language_loss": 0.96310741, + "learning_rate": 0.0008106650324390257, + "loss": 0.97508371, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.93505859, + "step": 1601, + "time_per_iteration": 2.8477296829223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188215, + "balance_loss_mlp": 1.0948981, + "epoch": 0.3081954597922278, + "flos": 563691045888.0, + "grad_norm": 0.027322987260225157, + "language_loss": 0.89918464, + "learning_rate": 0.0008104208635008493, + "loss": 0.91106677, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.93212891, + "step": 1602, + "time_per_iteration": 2.6639676094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192245, + "balance_loss_mlp": 1.09859383, + "epoch": 0.3083878414774913, + "flos": 448761335808.0, + "grad_norm": 0.031035394068971153, + "language_loss": 0.93496901, + "learning_rate": 0.0008101765740466058, + "loss": 0.94689143, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.93554688, + "step": 1603, + "time_per_iteration": 2.4892899990081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.09465039, + "epoch": 0.3085802231627549, + "flos": 494544821760.0, + "grad_norm": 0.029709960428380106, + "language_loss": 0.93853128, + "learning_rate": 0.0008099321641711364, + "loss": 0.95041513, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.93652344, + "step": 1604, + "time_per_iteration": 2.638798952102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011875, + "balance_loss_mlp": 1.09380174, + "epoch": 0.3087726048480185, + "flos": 488690038272.0, + "grad_norm": 0.02367908107469003, + "language_loss": 0.91951108, + "learning_rate": 0.0008096876339693295, + "loss": 0.93138611, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.93603516, + "step": 1605, + "time_per_iteration": 2.6115643978118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189736, + "balance_loss_mlp": 1.09603786, + "epoch": 0.308964986533282, + "flos": 731887223808.0, + "grad_norm": 0.029121548764615916, + "language_loss": 0.90058184, + "learning_rate": 0.0008094429835361206, + "loss": 0.91247922, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.93603516, + "step": 1606, + "time_per_iteration": 2.9361119270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185725, + "balance_loss_mlp": 1.09226441, + "epoch": 0.3091573682185456, + "flos": 606515576832.0, + "grad_norm": 0.024539043330914945, + "language_loss": 0.94318593, + "learning_rate": 0.0008091982129664908, + "loss": 0.95504314, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.93359375, + "step": 1607, + "time_per_iteration": 2.750641345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191863, + "balance_loss_mlp": 1.09821212, + "epoch": 0.30934974990380915, + "flos": 461306832384.0, + "grad_norm": 0.02635007664096696, + "language_loss": 0.92281848, + "learning_rate": 0.0008089533223554687, + "loss": 0.93473709, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.93554688, + "step": 1608, + "time_per_iteration": 2.733422040939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187457, + "balance_loss_mlp": 1.09380579, + "epoch": 0.30954213158907273, + "flos": 554567130624.0, + "grad_norm": 0.025571984513822792, + "language_loss": 0.94345558, + "learning_rate": 0.0008087083117981294, + "loss": 0.95533013, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.93554688, + "step": 1609, + "time_per_iteration": 2.919583797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189683, + "balance_loss_mlp": 1.09665251, + "epoch": 0.30973451327433627, + "flos": 554113236480.0, + "grad_norm": 0.028700236773969223, + "language_loss": 0.98730469, + "learning_rate": 0.0008084631813895943, + "loss": 0.99920154, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.92919922, + "step": 1610, + "time_per_iteration": 2.7721197605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192773, + "balance_loss_mlp": 1.09955156, + "epoch": 0.30992689495959985, + "flos": 566762792448.0, + "grad_norm": 0.027612542910463767, + "language_loss": 0.93469882, + "learning_rate": 0.0008082179312250315, + "loss": 0.94662654, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.93115234, + "step": 1611, + "time_per_iteration": 2.658564805984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219437, + "balance_loss_mlp": 1.12769318, + "epoch": 0.3101192766448634, + "flos": 1445560270848.0, + "grad_norm": 0.021240149379623804, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81075287, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.91601562, + "step": 1612, + "time_per_iteration": 4.8431174755096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.13497162, + "epoch": 0.31031165833012697, + "flos": 1535127742464.0, + "grad_norm": 0.019393089292119553, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77856624, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.921875, + "step": 1613, + "time_per_iteration": 5.043596029281616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191397, + "balance_loss_mlp": 1.09850931, + "epoch": 0.31050404001539056, + "flos": 993632409600.0, + "grad_norm": 0.029090005547288914, + "language_loss": 0.90590245, + "learning_rate": 0.0008074814631475545, + "loss": 0.91781646, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.92773438, + "step": 1614, + "time_per_iteration": 3.3308844566345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011972, + "balance_loss_mlp": 1.10450339, + "epoch": 0.3106964217006541, + "flos": 446972682240.0, + "grad_norm": 0.029174032275502568, + "language_loss": 0.8959738, + "learning_rate": 0.0008072357349114907, + "loss": 0.90794587, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.92578125, + "step": 1615, + "time_per_iteration": 2.660557746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194484, + "balance_loss_mlp": 1.10169172, + "epoch": 0.3108888033859177, + "flos": 511494822912.0, + "grad_norm": 0.027617375290548026, + "language_loss": 0.9836188, + "learning_rate": 0.0008069898873959363, + "loss": 0.99556363, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.92675781, + "step": 1616, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203555, + "balance_loss_mlp": 1.11076295, + "epoch": 0.3110811850711812, + "flos": 521778306048.0, + "grad_norm": 0.027380341091067188, + "language_loss": 0.94434142, + "learning_rate": 0.0008067439206963375, + "loss": 0.95637697, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.92675781, + "step": 1617, + "time_per_iteration": 2.6584017276763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120371, + "balance_loss_mlp": 1.11082232, + "epoch": 0.3112735667564448, + "flos": 687729934848.0, + "grad_norm": 0.029016410329411102, + "language_loss": 0.95023614, + "learning_rate": 0.0008064978349081873, + "loss": 0.96227324, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.92773438, + "step": 1618, + "time_per_iteration": 2.911677122116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199948, + "balance_loss_mlp": 1.10720289, + "epoch": 0.31146594844170833, + "flos": 534165348864.0, + "grad_norm": 0.025439718165996668, + "language_loss": 0.95660365, + "learning_rate": 0.0008062516301270245, + "loss": 0.96860307, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.92626953, + "step": 1619, + "time_per_iteration": 2.669111490249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196196, + "balance_loss_mlp": 1.10388064, + "epoch": 0.3116583301269719, + "flos": 680841836544.0, + "grad_norm": 0.024218225399572888, + "language_loss": 0.96279341, + "learning_rate": 0.0008060053064484343, + "loss": 0.97475541, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.921875, + "step": 1620, + "time_per_iteration": 2.924476385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189886, + "balance_loss_mlp": 1.09733212, + "epoch": 0.31185071181223545, + "flos": 587329758720.0, + "grad_norm": 0.02529679167102671, + "language_loss": 0.92711556, + "learning_rate": 0.0008057588639680482, + "loss": 0.93901443, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.92431641, + "step": 1621, + "time_per_iteration": 2.74631667137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119125, + "balance_loss_mlp": 1.09817135, + "epoch": 0.31204309349749904, + "flos": 726657523200.0, + "grad_norm": 0.03522846239796161, + "language_loss": 0.93884659, + "learning_rate": 0.0008055123027815434, + "loss": 0.95075905, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.9296875, + "step": 1622, + "time_per_iteration": 2.90444016456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189249, + "balance_loss_mlp": 1.09631383, + "epoch": 0.3122354751827626, + "flos": 577894940160.0, + "grad_norm": 0.026492717763192643, + "language_loss": 0.93252558, + "learning_rate": 0.0008052656229846436, + "loss": 0.94441813, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.92822266, + "step": 1623, + "time_per_iteration": 2.680220603942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_mlp": 1.09519064, + "epoch": 0.31242785686802615, + "flos": 577028811264.0, + "grad_norm": 0.026617450345468772, + "language_loss": 1.00026262, + "learning_rate": 0.0008050188246731182, + "loss": 1.01214242, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.92675781, + "step": 1624, + "time_per_iteration": 2.6526694297790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190099, + "balance_loss_mlp": 1.09711611, + "epoch": 0.31262023855328974, + "flos": 738195901440.0, + "grad_norm": 0.023806346866415393, + "language_loss": 0.9048847, + "learning_rate": 0.0008047719079427834, + "loss": 0.91678566, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.92871094, + "step": 1625, + "time_per_iteration": 3.0077152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119944, + "balance_loss_mlp": 1.108078, + "epoch": 0.3128126202385533, + "flos": 1562591539200.0, + "grad_norm": 0.020013754894949238, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.7555114, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.91210938, + "step": 1626, + "time_per_iteration": 4.793031215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194528, + "balance_loss_mlp": 1.10111523, + "epoch": 0.31300500192381686, + "flos": 515942988288.0, + "grad_norm": 0.023349922932092686, + "language_loss": 0.95821261, + "learning_rate": 0.0008042777196091757, + "loss": 0.97015792, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.93310547, + "step": 1627, + "time_per_iteration": 2.679588556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196127, + "balance_loss_mlp": 1.10281038, + "epoch": 0.3131973836090804, + "flos": 527661287424.0, + "grad_norm": 0.026058472156191805, + "language_loss": 0.91163933, + "learning_rate": 0.0008040304481977643, + "loss": 0.92360055, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.93212891, + "step": 1628, + "time_per_iteration": 2.6339213848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.11335361, + "epoch": 0.313389765294344, + "flos": 824209534464.0, + "grad_norm": 0.028324849871922998, + "language_loss": 0.96729648, + "learning_rate": 0.0008037830587512649, + "loss": 0.97936368, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.93261719, + "step": 1629, + "time_per_iteration": 3.052304744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191904, + "balance_loss_mlp": 1.09896827, + "epoch": 0.31358214697960757, + "flos": 394702599168.0, + "grad_norm": 0.026724204555937114, + "language_loss": 0.89292234, + "learning_rate": 0.0008035355513657224, + "loss": 0.90484136, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.92822266, + "step": 1630, + "time_per_iteration": 2.470526695251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198859, + "balance_loss_mlp": 1.1059711, + "epoch": 0.3137745286648711, + "flos": 573097666560.0, + "grad_norm": 0.025006494531642755, + "language_loss": 1.00651205, + "learning_rate": 0.0008032879261372279, + "loss": 1.01850057, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.92773438, + "step": 1631, + "time_per_iteration": 2.7967746257781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194023, + "balance_loss_mlp": 1.10418701, + "epoch": 0.3139669103501347, + "flos": 1501629241344.0, + "grad_norm": 0.01894627505164378, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80829865, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.89648438, + "step": 1632, + "time_per_iteration": 5.690793991088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187718, + "balance_loss_mlp": 1.09478259, + "epoch": 0.3141592920353982, + "flos": 526358728704.0, + "grad_norm": 0.023739615719740217, + "language_loss": 0.94780874, + "learning_rate": 0.0008027923225359748, + "loss": 0.95968592, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.92822266, + "step": 1633, + "time_per_iteration": 2.619640827178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182027, + "balance_loss_mlp": 1.08894837, + "epoch": 0.3143516737206618, + "flos": 594387044352.0, + "grad_norm": 0.024020227962995952, + "language_loss": 0.97166598, + "learning_rate": 0.0008025443443556267, + "loss": 0.98348624, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.9296875, + "step": 1634, + "time_per_iteration": 2.7105367183685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187192, + "balance_loss_mlp": 1.09397042, + "epoch": 0.31454405540592534, + "flos": 649679208960.0, + "grad_norm": 0.024579905610689918, + "language_loss": 0.95561564, + "learning_rate": 0.000802296248717147, + "loss": 0.96748757, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.93115234, + "step": 1635, + "time_per_iteration": 2.954427480697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189389, + "balance_loss_mlp": 1.09616756, + "epoch": 0.3147364370911889, + "flos": 644069474304.0, + "grad_norm": 0.026460377875643523, + "language_loss": 0.89723325, + "learning_rate": 0.0008020480357168554, + "loss": 0.90912724, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.93115234, + "step": 1636, + "time_per_iteration": 2.7983195781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118902, + "balance_loss_mlp": 1.09575093, + "epoch": 0.31492881877645246, + "flos": 472821015552.0, + "grad_norm": 0.024118652497695542, + "language_loss": 0.95980144, + "learning_rate": 0.0008017997054511165, + "loss": 0.97169161, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.93164062, + "step": 1637, + "time_per_iteration": 2.543381690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188761, + "balance_loss_mlp": 1.09544361, + "epoch": 0.31512120046171604, + "flos": 630629650944.0, + "grad_norm": 0.026442486928658162, + "language_loss": 0.94192296, + "learning_rate": 0.0008015512580163407, + "loss": 0.95381057, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.93212891, + "step": 1638, + "time_per_iteration": 2.8069217205047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189537, + "balance_loss_mlp": 1.09645832, + "epoch": 0.31531358214697963, + "flos": 705053239296.0, + "grad_norm": 0.0247809696854931, + "language_loss": 0.89687169, + "learning_rate": 0.0008013026935089838, + "loss": 0.9087671, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.9296875, + "step": 1639, + "time_per_iteration": 2.8575150966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189099, + "balance_loss_mlp": 1.09592521, + "epoch": 0.31550596383224316, + "flos": 573631425024.0, + "grad_norm": 0.026868409426578303, + "language_loss": 0.92173505, + "learning_rate": 0.0008010540120255472, + "loss": 0.93362606, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.93066406, + "step": 1640, + "time_per_iteration": 2.6781005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118909, + "balance_loss_mlp": 1.09591639, + "epoch": 0.31569834551750675, + "flos": 659512800768.0, + "grad_norm": 0.03030176261580671, + "language_loss": 0.95734656, + "learning_rate": 0.0008008052136625774, + "loss": 0.96923745, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.93066406, + "step": 1641, + "time_per_iteration": 2.8858654499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192627, + "balance_loss_mlp": 1.09950101, + "epoch": 0.3158907272027703, + "flos": 567403338240.0, + "grad_norm": 0.026165343030711524, + "language_loss": 0.94310361, + "learning_rate": 0.0008005562985166666, + "loss": 0.9550299, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.93017578, + "step": 1642, + "time_per_iteration": 2.7097506523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193912, + "balance_loss_mlp": 1.10102403, + "epoch": 0.31608310888803387, + "flos": 537972968448.0, + "grad_norm": 0.020568762002796243, + "language_loss": 0.9172346, + "learning_rate": 0.0008003072666844524, + "loss": 0.92917377, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.92773438, + "step": 1643, + "time_per_iteration": 2.6982197761535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194419, + "balance_loss_mlp": 1.10181749, + "epoch": 0.3162754905732974, + "flos": 487639259136.0, + "grad_norm": 0.02816029335024998, + "language_loss": 0.90344775, + "learning_rate": 0.0008000581182626173, + "loss": 0.91539198, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.92480469, + "step": 1644, + "time_per_iteration": 2.546762466430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193569, + "balance_loss_mlp": 1.10048997, + "epoch": 0.316467872258561, + "flos": 531095603712.0, + "grad_norm": 0.024394566764596542, + "language_loss": 0.93082815, + "learning_rate": 0.0007998088533478894, + "loss": 0.94276381, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.9296875, + "step": 1645, + "time_per_iteration": 2.6320817470550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188922, + "balance_loss_mlp": 1.09622455, + "epoch": 0.3166602539438245, + "flos": 444413227008.0, + "grad_norm": 0.029455070645316363, + "language_loss": 0.9479661, + "learning_rate": 0.000799559472037042, + "loss": 0.95985526, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.92578125, + "step": 1646, + "time_per_iteration": 2.535414457321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187182, + "balance_loss_mlp": 1.09458041, + "epoch": 0.3168526356290881, + "flos": 647102289408.0, + "grad_norm": 0.02168302123393663, + "language_loss": 0.94649625, + "learning_rate": 0.0007993099744268932, + "loss": 0.95836812, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.92480469, + "step": 1647, + "time_per_iteration": 2.912095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182437, + "balance_loss_mlp": 1.08988261, + "epoch": 0.3170450173143517, + "flos": 587257900032.0, + "grad_norm": 0.023943172344495993, + "language_loss": 0.96008313, + "learning_rate": 0.000799060360614307, + "loss": 0.97190744, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.92431641, + "step": 1648, + "time_per_iteration": 2.6763339042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.09482586, + "epoch": 0.3172373989996152, + "flos": 828573106176.0, + "grad_norm": 0.025050943971751935, + "language_loss": 0.91967106, + "learning_rate": 0.0007988106306961917, + "loss": 0.93154484, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.92431641, + "step": 1649, + "time_per_iteration": 3.1265392303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183645, + "balance_loss_mlp": 1.09151971, + "epoch": 0.3174297806848788, + "flos": 528434090496.0, + "grad_norm": 0.026893421102733506, + "language_loss": 0.92866611, + "learning_rate": 0.0007985607847695014, + "loss": 0.94050252, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.91992188, + "step": 1650, + "time_per_iteration": 2.640529155731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184032, + "balance_loss_mlp": 1.09152567, + "epoch": 0.31762216237014235, + "flos": 714481327104.0, + "grad_norm": 0.024008942139765378, + "language_loss": 0.9102264, + "learning_rate": 0.0007983108229312345, + "loss": 0.92206669, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.92382812, + "step": 1651, + "time_per_iteration": 2.890881299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183971, + "balance_loss_mlp": 1.09170341, + "epoch": 0.31781454405540593, + "flos": 484799826432.0, + "grad_norm": 0.027702532543066302, + "language_loss": 0.9509185, + "learning_rate": 0.0007980607452784351, + "loss": 0.96275818, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.92138672, + "step": 1652, + "time_per_iteration": 2.5693578720092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118418, + "balance_loss_mlp": 1.09186423, + "epoch": 0.31800692574066947, + "flos": 549804059136.0, + "grad_norm": 0.028510736103347943, + "language_loss": 0.99507928, + "learning_rate": 0.0007978105519081919, + "loss": 1.00692105, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.921875, + "step": 1653, + "time_per_iteration": 2.674062967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181597, + "balance_loss_mlp": 1.08947253, + "epoch": 0.31819930742593305, + "flos": 517916292096.0, + "grad_norm": 0.029899238666621586, + "language_loss": 0.96953475, + "learning_rate": 0.0007975602429176385, + "loss": 0.98135078, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.91992188, + "step": 1654, + "time_per_iteration": 2.595107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011812, + "balance_loss_mlp": 1.08907461, + "epoch": 0.31839168911119664, + "flos": 456969457152.0, + "grad_norm": 0.02327460697487094, + "language_loss": 0.90136862, + "learning_rate": 0.0007973098184039536, + "loss": 0.91318059, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.91992188, + "step": 1655, + "time_per_iteration": 2.654873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184047, + "balance_loss_mlp": 1.09192252, + "epoch": 0.3185840707964602, + "flos": 627295391232.0, + "grad_norm": 0.025652000789891626, + "language_loss": 0.955365, + "learning_rate": 0.0007970592784643602, + "loss": 0.96720552, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.91992188, + "step": 1656, + "time_per_iteration": 2.8485612869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183486, + "balance_loss_mlp": 1.09107482, + "epoch": 0.31877645248172376, + "flos": 568540712448.0, + "grad_norm": 0.02977939264047221, + "language_loss": 0.94253254, + "learning_rate": 0.0007968086231961272, + "loss": 0.9543674, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.92285156, + "step": 1657, + "time_per_iteration": 2.6949312686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182357, + "balance_loss_mlp": 1.09004128, + "epoch": 0.3189688341669873, + "flos": 490552551936.0, + "grad_norm": 0.03598298081414456, + "language_loss": 0.95643866, + "learning_rate": 0.0007965578526965671, + "loss": 0.96826226, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.921875, + "step": 1658, + "time_per_iteration": 2.5717341899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182583, + "balance_loss_mlp": 1.09012401, + "epoch": 0.3191612158522509, + "flos": 577380647424.0, + "grad_norm": 0.02594626841132509, + "language_loss": 0.93226576, + "learning_rate": 0.0007963069670630377, + "loss": 0.94409156, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.92333984, + "step": 1659, + "time_per_iteration": 2.7431960105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187517, + "balance_loss_mlp": 1.09486747, + "epoch": 0.3193535975375144, + "flos": 539192934912.0, + "grad_norm": 0.026552556196046555, + "language_loss": 0.97412628, + "learning_rate": 0.0007960559663929416, + "loss": 0.98600149, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.92529297, + "step": 1660, + "time_per_iteration": 2.631037473678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186332, + "balance_loss_mlp": 1.09382606, + "epoch": 0.319545979222778, + "flos": 735627714048.0, + "grad_norm": 0.022912970149823363, + "language_loss": 0.94840437, + "learning_rate": 0.0007958048507837259, + "loss": 0.96026772, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.92382812, + "step": 1661, + "time_per_iteration": 2.925752878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191424, + "balance_loss_mlp": 1.09872651, + "epoch": 0.31973836090804153, + "flos": 765767760384.0, + "grad_norm": 0.030797304976158044, + "language_loss": 0.98320282, + "learning_rate": 0.0007955536203328822, + "loss": 0.99511707, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.92578125, + "step": 1662, + "time_per_iteration": 2.9076955318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187513, + "balance_loss_mlp": 1.09486389, + "epoch": 0.3199307425933051, + "flos": 561741937152.0, + "grad_norm": 0.02511010738984868, + "language_loss": 0.90468192, + "learning_rate": 0.0007953022751379469, + "loss": 0.91655713, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.92529297, + "step": 1663, + "time_per_iteration": 2.7703394889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188156, + "balance_loss_mlp": 1.09564936, + "epoch": 0.3201231242785687, + "flos": 752671041024.0, + "grad_norm": 0.029121282383782986, + "language_loss": 0.92101777, + "learning_rate": 0.000795050815296501, + "loss": 0.93289936, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.92382812, + "step": 1664, + "time_per_iteration": 2.966632843017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188504, + "balance_loss_mlp": 1.0960933, + "epoch": 0.32031550596383224, + "flos": 497384254464.0, + "grad_norm": 0.02307975398987516, + "language_loss": 1.00050378, + "learning_rate": 0.0007947992409061695, + "loss": 1.01238883, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.92285156, + "step": 1665, + "time_per_iteration": 2.6264171600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193124, + "balance_loss_mlp": 1.10080826, + "epoch": 0.3205078876490958, + "flos": 732874876416.0, + "grad_norm": 0.02454331261307917, + "language_loss": 0.93550396, + "learning_rate": 0.0007945475520646226, + "loss": 0.9474352, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.921875, + "step": 1666, + "time_per_iteration": 2.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191587, + "balance_loss_mlp": 1.09941399, + "epoch": 0.32070026933435936, + "flos": 550474804224.0, + "grad_norm": 0.02796219722650757, + "language_loss": 0.9429689, + "learning_rate": 0.0007942957488695743, + "loss": 0.95488477, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.92041016, + "step": 1667, + "time_per_iteration": 2.621396780014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186724, + "balance_loss_mlp": 1.09421742, + "epoch": 0.32089265101962294, + "flos": 746684000256.0, + "grad_norm": 0.022875326013334737, + "language_loss": 0.87680244, + "learning_rate": 0.0007940438314187833, + "loss": 0.88866973, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.92382812, + "step": 1668, + "time_per_iteration": 3.0475997924804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187112, + "balance_loss_mlp": 1.0947485, + "epoch": 0.3210850327048865, + "flos": 495196101120.0, + "grad_norm": 0.03400858364934581, + "language_loss": 0.88502395, + "learning_rate": 0.0007937917998100529, + "loss": 0.89689511, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.92236328, + "step": 1669, + "time_per_iteration": 2.6158430576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188853, + "balance_loss_mlp": 1.09658515, + "epoch": 0.32127741439015006, + "flos": 531673022976.0, + "grad_norm": 0.029937804889017615, + "language_loss": 0.92354518, + "learning_rate": 0.0007935396541412302, + "loss": 0.93543375, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.92138672, + "step": 1670, + "time_per_iteration": 2.6148414611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188159, + "balance_loss_mlp": 1.09589148, + "epoch": 0.3214697960754136, + "flos": 502223187456.0, + "grad_norm": 0.027719397006423088, + "language_loss": 0.94146281, + "learning_rate": 0.0007932873945102068, + "loss": 0.95334446, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.92138672, + "step": 1671, + "time_per_iteration": 2.5756680965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189911, + "balance_loss_mlp": 1.09950256, + "epoch": 0.3216621777606772, + "flos": 1386402089472.0, + "grad_norm": 0.015471737686433536, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76951689, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.90234375, + "step": 1672, + "time_per_iteration": 4.848818778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181453, + "balance_loss_mlp": 1.08975732, + "epoch": 0.32185455944594077, + "flos": 572635040256.0, + "grad_norm": 0.021338606013939526, + "language_loss": 0.94597888, + "learning_rate": 0.0007927825337533461, + "loss": 0.95779347, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.91552734, + "step": 1673, + "time_per_iteration": 2.6742517948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181114, + "balance_loss_mlp": 1.08975172, + "epoch": 0.3220469411312043, + "flos": 544936928256.0, + "grad_norm": 0.029706455848313437, + "language_loss": 0.9645716, + "learning_rate": 0.0007925299328235131, + "loss": 0.97638273, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.91210938, + "step": 1674, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182375, + "balance_loss_mlp": 1.09101272, + "epoch": 0.3222393228164679, + "flos": 492161284608.0, + "grad_norm": 0.02873592636128419, + "language_loss": 0.969607, + "learning_rate": 0.000792277218323488, + "loss": 0.98143071, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.91210938, + "step": 1675, + "time_per_iteration": 2.589118719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182718, + "balance_loss_mlp": 1.0914042, + "epoch": 0.3224317045017314, + "flos": 491362285056.0, + "grad_norm": 0.026517432951267347, + "language_loss": 0.94174361, + "learning_rate": 0.0007920243903513833, + "loss": 0.95357084, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.91162109, + "step": 1676, + "time_per_iteration": 2.5541775226593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08832622, + "epoch": 0.322624086186995, + "flos": 576870357504.0, + "grad_norm": 0.028460659829427477, + "language_loss": 0.94868386, + "learning_rate": 0.0007917714490053556, + "loss": 0.96047986, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.91113281, + "step": 1677, + "time_per_iteration": 2.685833215713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.10454535, + "epoch": 0.32281646787225854, + "flos": 630571253760.0, + "grad_norm": 0.02861547850998442, + "language_loss": 0.93624204, + "learning_rate": 0.0007915183943836055, + "loss": 0.94820398, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.91503906, + "step": 1678, + "time_per_iteration": 2.8957157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184806, + "balance_loss_mlp": 1.09363461, + "epoch": 0.3230088495575221, + "flos": 782807084544.0, + "grad_norm": 0.029736135795599906, + "language_loss": 0.92990124, + "learning_rate": 0.0007912652265843773, + "loss": 0.94174933, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.91015625, + "step": 1679, + "time_per_iteration": 3.0256145000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187663, + "balance_loss_mlp": 1.09620523, + "epoch": 0.3232012312427857, + "flos": 537200165376.0, + "grad_norm": 0.0299548546326655, + "language_loss": 0.88938797, + "learning_rate": 0.0007910119457059597, + "loss": 0.90126455, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.91308594, + "step": 1680, + "time_per_iteration": 2.7195773124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118719, + "balance_loss_mlp": 1.09601843, + "epoch": 0.32339361292804925, + "flos": 706232272896.0, + "grad_norm": 0.03079987155163935, + "language_loss": 0.89790422, + "learning_rate": 0.0007907585518466849, + "loss": 0.90977609, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.91015625, + "step": 1681, + "time_per_iteration": 2.9635961055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186411, + "balance_loss_mlp": 1.09523988, + "epoch": 0.32358599461331283, + "flos": 453257164800.0, + "grad_norm": 0.027692195030378806, + "language_loss": 0.99450397, + "learning_rate": 0.000790505045104929, + "loss": 1.00636816, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.91015625, + "step": 1682, + "time_per_iteration": 2.5084030628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186896, + "balance_loss_mlp": 1.09553456, + "epoch": 0.32377837629857636, + "flos": 602091606528.0, + "grad_norm": 0.028152445524849662, + "language_loss": 0.96712899, + "learning_rate": 0.0007902514255791125, + "loss": 0.97899795, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.91210938, + "step": 1683, + "time_per_iteration": 2.7732536792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185338, + "balance_loss_mlp": 1.09388101, + "epoch": 0.32397075798383995, + "flos": 808898465280.0, + "grad_norm": 0.02645952871958238, + "language_loss": 0.9579218, + "learning_rate": 0.0007899976933676986, + "loss": 0.9697752, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.91308594, + "step": 1684, + "time_per_iteration": 2.985987424850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184012, + "balance_loss_mlp": 1.09274495, + "epoch": 0.3241631396691035, + "flos": 602792550912.0, + "grad_norm": 0.02682215462305332, + "language_loss": 0.96423018, + "learning_rate": 0.0007897438485691955, + "loss": 0.97607034, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.91113281, + "step": 1685, + "time_per_iteration": 2.673083543777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185177, + "balance_loss_mlp": 1.09386301, + "epoch": 0.32435552135436707, + "flos": 475176354816.0, + "grad_norm": 0.030260846574811467, + "language_loss": 0.93327641, + "learning_rate": 0.0007894898912821542, + "loss": 0.9451282, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.91162109, + "step": 1686, + "time_per_iteration": 2.526704788208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181419, + "balance_loss_mlp": 1.09015274, + "epoch": 0.3245479030396306, + "flos": 539219131392.0, + "grad_norm": 0.02519584895765407, + "language_loss": 0.95407552, + "learning_rate": 0.0007892358216051695, + "loss": 0.96588969, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.91113281, + "step": 1687, + "time_per_iteration": 2.718292713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186611, + "balance_loss_mlp": 1.09543955, + "epoch": 0.3247402847248942, + "flos": 548696884224.0, + "grad_norm": 0.02873183694146744, + "language_loss": 1.00761271, + "learning_rate": 0.0007889816396368803, + "loss": 1.0194788, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.91015625, + "step": 1688, + "time_per_iteration": 2.6112852096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179714, + "balance_loss_mlp": 1.08835161, + "epoch": 0.3249326664101578, + "flos": 378992030208.0, + "grad_norm": 0.0263136625306578, + "language_loss": 0.95246112, + "learning_rate": 0.0007887273454759687, + "loss": 0.96425825, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.91210938, + "step": 1689, + "time_per_iteration": 2.466093063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185248, + "balance_loss_mlp": 1.09407663, + "epoch": 0.3251250480954213, + "flos": 529122299904.0, + "grad_norm": 0.02633136368880149, + "language_loss": 0.91763788, + "learning_rate": 0.0007884729392211603, + "loss": 0.92949039, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.91015625, + "step": 1690, + "time_per_iteration": 2.633387804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182102, + "balance_loss_mlp": 1.09054887, + "epoch": 0.3253174297806849, + "flos": 450558721536.0, + "grad_norm": 0.03256384134880849, + "language_loss": 0.96271229, + "learning_rate": 0.0007882184209712245, + "loss": 0.97453332, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.9140625, + "step": 1691, + "time_per_iteration": 2.511629104614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183951, + "balance_loss_mlp": 1.09239864, + "epoch": 0.32550981146594843, + "flos": 705489669120.0, + "grad_norm": 0.02306884235196454, + "language_loss": 0.92818689, + "learning_rate": 0.000787963790824974, + "loss": 0.9400264, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.9140625, + "step": 1692, + "time_per_iteration": 2.953939914703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118506, + "balance_loss_mlp": 1.0935545, + "epoch": 0.325702193151212, + "flos": 393558494208.0, + "grad_norm": 0.026666894987577915, + "language_loss": 0.98025191, + "learning_rate": 0.0007877090488812651, + "loss": 0.9921025, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.91357422, + "step": 1693, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178009, + "balance_loss_mlp": 1.08659911, + "epoch": 0.32589457483647555, + "flos": 578583149568.0, + "grad_norm": 0.029080232987036207, + "language_loss": 0.92532402, + "learning_rate": 0.0007874541952389973, + "loss": 0.93710411, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.91259766, + "step": 1694, + "time_per_iteration": 2.660390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179003, + "balance_loss_mlp": 1.08792675, + "epoch": 0.32608695652173914, + "flos": 499329360384.0, + "grad_norm": 0.023433013698769337, + "language_loss": 0.93903476, + "learning_rate": 0.0007871992299971136, + "loss": 0.9508248, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.90917969, + "step": 1695, + "time_per_iteration": 2.5506269931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179394, + "balance_loss_mlp": 1.08822274, + "epoch": 0.32627933820700267, + "flos": 592300948992.0, + "grad_norm": 0.02355558557065364, + "language_loss": 0.91491008, + "learning_rate": 0.0007869441532546001, + "loss": 0.92670405, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.91015625, + "step": 1696, + "time_per_iteration": 2.7493326663970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177542, + "balance_loss_mlp": 1.08618009, + "epoch": 0.32647171989226625, + "flos": 610273531392.0, + "grad_norm": 0.02705729718991907, + "language_loss": 0.87004846, + "learning_rate": 0.0007866889651104867, + "loss": 0.8818239, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.91210938, + "step": 1697, + "time_per_iteration": 2.7824432849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179221, + "balance_loss_mlp": 1.08785892, + "epoch": 0.32666410157752984, + "flos": 478189704192.0, + "grad_norm": 0.028152017440838794, + "language_loss": 0.94142878, + "learning_rate": 0.000786433665663846, + "loss": 0.95322108, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.91210938, + "step": 1698, + "time_per_iteration": 2.6674411296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187877, + "balance_loss_mlp": 1.09670568, + "epoch": 0.3268564832627934, + "flos": 719693563392.0, + "grad_norm": 0.040459779361444057, + "language_loss": 0.95728016, + "learning_rate": 0.0007861782550137942, + "loss": 0.96915889, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.91015625, + "step": 1699, + "time_per_iteration": 2.923370599746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187429, + "balance_loss_mlp": 1.09625793, + "epoch": 0.32704886494805696, + "flos": 770105135616.0, + "grad_norm": 0.025720199745930695, + "language_loss": 0.93479955, + "learning_rate": 0.0007859227332594901, + "loss": 0.94667387, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.91015625, + "step": 1700, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191948, + "balance_loss_mlp": 1.10120583, + "epoch": 0.3272412466333205, + "flos": 851404087296.0, + "grad_norm": 0.0329500691508657, + "language_loss": 0.94768298, + "learning_rate": 0.0007856671005001365, + "loss": 0.95960248, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.90576172, + "step": 1701, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118211, + "balance_loss_mlp": 1.09065294, + "epoch": 0.3274336283185841, + "flos": 833040737280.0, + "grad_norm": 0.029774404200988806, + "language_loss": 0.90405869, + "learning_rate": 0.0007854113568349787, + "loss": 0.91587985, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.91308594, + "step": 1702, + "time_per_iteration": 3.107083559036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186026, + "balance_loss_mlp": 1.09471202, + "epoch": 0.3276260100038476, + "flos": 693252347904.0, + "grad_norm": 0.029328613393929583, + "language_loss": 0.89606428, + "learning_rate": 0.0007851555023633052, + "loss": 0.90792453, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.91162109, + "step": 1703, + "time_per_iteration": 2.8335254192352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011877, + "balance_loss_mlp": 1.09643364, + "epoch": 0.3278183916891112, + "flos": 436977908736.0, + "grad_norm": 0.03479764223743197, + "language_loss": 0.91987431, + "learning_rate": 0.0007848995371844474, + "loss": 0.93175125, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.91113281, + "step": 1704, + "time_per_iteration": 2.51261043548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118827, + "balance_loss_mlp": 1.09728956, + "epoch": 0.3280107733743748, + "flos": 462016508928.0, + "grad_norm": 0.027955151013136243, + "language_loss": 0.90236068, + "learning_rate": 0.0007846434613977801, + "loss": 0.91424334, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.90820312, + "step": 1705, + "time_per_iteration": 2.51505708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185335, + "balance_loss_mlp": 1.09464061, + "epoch": 0.3282031550596383, + "flos": 680528931840.0, + "grad_norm": 0.0285448105624817, + "language_loss": 0.86403298, + "learning_rate": 0.0007843872751027203, + "loss": 0.87588632, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.90527344, + "step": 1706, + "time_per_iteration": 2.7977733612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183945, + "balance_loss_mlp": 1.0931555, + "epoch": 0.3283955367449019, + "flos": 546254949888.0, + "grad_norm": 0.024438576566567966, + "language_loss": 0.93906903, + "learning_rate": 0.0007841309783987287, + "loss": 0.95090854, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.90625, + "step": 1707, + "time_per_iteration": 2.737680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178748, + "balance_loss_mlp": 1.08757639, + "epoch": 0.32858791843016544, + "flos": 482240371200.0, + "grad_norm": 0.027193371904651382, + "language_loss": 0.97315758, + "learning_rate": 0.0007838745713853084, + "loss": 0.98494506, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.91015625, + "step": 1708, + "time_per_iteration": 2.5702459812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189964, + "balance_loss_mlp": 1.09879303, + "epoch": 0.328780300115429, + "flos": 567915629568.0, + "grad_norm": 0.029427091701823335, + "language_loss": 0.93208408, + "learning_rate": 0.0007836180541620053, + "loss": 0.94398379, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.91015625, + "step": 1709, + "time_per_iteration": 2.7365195751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189596, + "balance_loss_mlp": 1.09852052, + "epoch": 0.32897268180069256, + "flos": 476991204864.0, + "grad_norm": 0.02924752300223344, + "language_loss": 0.94609785, + "learning_rate": 0.0007833614268284082, + "loss": 0.95799387, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.90917969, + "step": 1710, + "time_per_iteration": 2.575416326522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186913, + "balance_loss_mlp": 1.09745789, + "epoch": 0.32916506348595614, + "flos": 1580450603520.0, + "grad_norm": 0.014653073497659498, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75296688, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.89257812, + "step": 1711, + "time_per_iteration": 4.8569114208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117837, + "balance_loss_mlp": 1.08681703, + "epoch": 0.3293574451712197, + "flos": 483851105280.0, + "grad_norm": 0.027096123044633498, + "language_loss": 0.8678506, + "learning_rate": 0.0007828478422289016, + "loss": 0.87963432, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.9140625, + "step": 1712, + "time_per_iteration": 2.5748305320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181971, + "balance_loss_mlp": 1.09041798, + "epoch": 0.32954982685648326, + "flos": 623724088320.0, + "grad_norm": 0.027491608740018197, + "language_loss": 0.97854888, + "learning_rate": 0.0007825908851623833, + "loss": 0.99036855, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.9140625, + "step": 1713, + "time_per_iteration": 2.7387707233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180742, + "balance_loss_mlp": 1.0893327, + "epoch": 0.32974220854174685, + "flos": 546070299648.0, + "grad_norm": 0.028986059756107307, + "language_loss": 0.93660253, + "learning_rate": 0.0007823338183843533, + "loss": 0.94840991, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.91259766, + "step": 1714, + "time_per_iteration": 2.7061285972595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.10341084, + "epoch": 0.3299345902270104, + "flos": 983822286336.0, + "grad_norm": 0.02918308821255402, + "language_loss": 0.89344442, + "learning_rate": 0.0007820766419946141, + "loss": 0.90539211, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.91210938, + "step": 1715, + "time_per_iteration": 3.2698333263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119133, + "balance_loss_mlp": 1.10206604, + "epoch": 0.33012697191227397, + "flos": 1406901926400.0, + "grad_norm": 0.008988097140154246, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.8086381, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.890625, + "step": 1716, + "time_per_iteration": 4.931420564651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193588, + "balance_loss_mlp": 1.10213029, + "epoch": 0.3303193535975375, + "flos": 506169795072.0, + "grad_norm": 0.03043585823380059, + "language_loss": 0.87317824, + "learning_rate": 0.0007815619607794288, + "loss": 0.88511419, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.91308594, + "step": 1717, + "time_per_iteration": 2.611924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198413, + "balance_loss_mlp": 1.10676467, + "epoch": 0.3305117352828011, + "flos": 939484349952.0, + "grad_norm": 0.029759763631388395, + "language_loss": 0.92828202, + "learning_rate": 0.0007813044561538001, + "loss": 0.94026613, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.91503906, + "step": 1718, + "time_per_iteration": 3.188633680343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186368, + "balance_loss_mlp": 1.09495842, + "epoch": 0.3307041169680646, + "flos": 722793507840.0, + "grad_norm": 0.027827869889066197, + "language_loss": 0.97286105, + "learning_rate": 0.0007810468423160958, + "loss": 0.9847247, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.91259766, + "step": 1719, + "time_per_iteration": 2.8963494300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179653, + "balance_loss_mlp": 1.08829057, + "epoch": 0.3308964986533282, + "flos": 584815965696.0, + "grad_norm": 0.0232486528054596, + "language_loss": 0.89203978, + "learning_rate": 0.0007807891193663306, + "loss": 0.90383637, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.91210938, + "step": 1720, + "time_per_iteration": 2.784005880355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188579, + "balance_loss_mlp": 1.09712148, + "epoch": 0.33108888033859174, + "flos": 474525075456.0, + "grad_norm": 0.03234593548431852, + "language_loss": 0.92577451, + "learning_rate": 0.0007805312874045614, + "loss": 0.93766028, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.91308594, + "step": 1721, + "time_per_iteration": 2.5072579383850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187856, + "balance_loss_mlp": 1.09635103, + "epoch": 0.3312812620238553, + "flos": 386996035584.0, + "grad_norm": 0.030880666413309405, + "language_loss": 0.96009982, + "learning_rate": 0.0007802733465308874, + "loss": 0.97197837, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.91357422, + "step": 1722, + "time_per_iteration": 2.460878372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193288, + "balance_loss_mlp": 1.10173571, + "epoch": 0.3314736437091189, + "flos": 495604333056.0, + "grad_norm": 0.02871647017272099, + "language_loss": 0.9219079, + "learning_rate": 0.0007800152968454501, + "loss": 0.93384075, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.9140625, + "step": 1723, + "time_per_iteration": 2.6537680625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185112, + "balance_loss_mlp": 1.09365499, + "epoch": 0.33166602539438245, + "flos": 654930376704.0, + "grad_norm": 0.0223046700763118, + "language_loss": 0.96869862, + "learning_rate": 0.0007797571384484334, + "loss": 0.98054969, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.91308594, + "step": 1724, + "time_per_iteration": 2.8509135246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180603, + "balance_loss_mlp": 1.08909798, + "epoch": 0.33185840707964603, + "flos": 521834701824.0, + "grad_norm": 0.02731483808063424, + "language_loss": 1.00636935, + "learning_rate": 0.0007794988714400633, + "loss": 1.01817536, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.91357422, + "step": 1725, + "time_per_iteration": 2.5883586406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180377, + "balance_loss_mlp": 1.08901501, + "epoch": 0.33205078876490957, + "flos": 437898432000.0, + "grad_norm": 0.028871117282170154, + "language_loss": 0.94438303, + "learning_rate": 0.0007792404959206079, + "loss": 0.95618677, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.91210938, + "step": 1726, + "time_per_iteration": 2.522392988204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196305, + "balance_loss_mlp": 1.10499096, + "epoch": 0.33224317045017315, + "flos": 770094402048.0, + "grad_norm": 0.026417182809826974, + "language_loss": 0.89548182, + "learning_rate": 0.0007789820119903774, + "loss": 0.90744483, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.91162109, + "step": 1727, + "time_per_iteration": 3.015399217605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119368, + "balance_loss_mlp": 1.10441589, + "epoch": 0.3324355521354367, + "flos": 1469293584384.0, + "grad_norm": 0.009201187704085647, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79686344, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.890625, + "step": 1728, + "time_per_iteration": 4.849627494812012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187682, + "balance_loss_mlp": 1.09641564, + "epoch": 0.3326279338207003, + "flos": 497799217152.0, + "grad_norm": 0.02618775195690524, + "language_loss": 0.91979456, + "learning_rate": 0.0007784647192990428, + "loss": 0.93167138, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.91113281, + "step": 1729, + "time_per_iteration": 2.6944785118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178599, + "balance_loss_mlp": 1.08761811, + "epoch": 0.33282031550596386, + "flos": 637053121536.0, + "grad_norm": 0.02771760173732663, + "language_loss": 0.88792735, + "learning_rate": 0.0007782059107387696, + "loss": 0.89971334, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.90820312, + "step": 1730, + "time_per_iteration": 2.8583710193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179548, + "balance_loss_mlp": 1.0887109, + "epoch": 0.3330126971912274, + "flos": 690721090560.0, + "grad_norm": 0.027739782699759397, + "language_loss": 0.98025161, + "learning_rate": 0.0007779469941693826, + "loss": 0.99204707, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.90673828, + "step": 1731, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184359, + "balance_loss_mlp": 1.09361696, + "epoch": 0.333205078876491, + "flos": 567553059840.0, + "grad_norm": 0.03096728777448764, + "language_loss": 0.86715639, + "learning_rate": 0.0007776879696914029, + "loss": 0.87899995, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.90576172, + "step": 1732, + "time_per_iteration": 2.8331797122955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179804, + "balance_loss_mlp": 1.08906233, + "epoch": 0.3333974605617545, + "flos": 642170030592.0, + "grad_norm": 0.024377484958938406, + "language_loss": 0.95668435, + "learning_rate": 0.000777428837405392, + "loss": 0.96848238, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.90576172, + "step": 1733, + "time_per_iteration": 2.8495984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.087345, + "epoch": 0.3335898422470181, + "flos": 462778578432.0, + "grad_norm": 0.02888991438897714, + "language_loss": 0.96001673, + "learning_rate": 0.0007771695974119544, + "loss": 0.97179955, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.90771484, + "step": 1734, + "time_per_iteration": 2.581843614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193993, + "balance_loss_mlp": 1.10267842, + "epoch": 0.33378222393228163, + "flos": 854336845824.0, + "grad_norm": 0.031032438471150628, + "language_loss": 0.84453082, + "learning_rate": 0.0007769102498117359, + "loss": 0.85647076, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.91162109, + "step": 1735, + "time_per_iteration": 3.092892646789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118579, + "balance_loss_mlp": 1.09471452, + "epoch": 0.3339746056175452, + "flos": 956308824576.0, + "grad_norm": 0.02638013374987503, + "language_loss": 0.87690091, + "learning_rate": 0.000776650794705424, + "loss": 0.88875878, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.90917969, + "step": 1736, + "time_per_iteration": 3.26749587059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188294, + "balance_loss_mlp": 1.09693241, + "epoch": 0.33416698730280875, + "flos": 545894381568.0, + "grad_norm": 0.025194797458818457, + "language_loss": 0.89670336, + "learning_rate": 0.0007763912321937483, + "loss": 0.90858638, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.91210938, + "step": 1737, + "time_per_iteration": 2.680321455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.09522188, + "epoch": 0.33435936898807234, + "flos": 1015875237888.0, + "grad_norm": 0.02847992800895855, + "language_loss": 0.91932124, + "learning_rate": 0.0007761315623774799, + "loss": 0.93118894, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.9140625, + "step": 1738, + "time_per_iteration": 3.3992278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.10014248, + "epoch": 0.3345517506733359, + "flos": 616371362304.0, + "grad_norm": 0.027566762490977777, + "language_loss": 0.97487831, + "learning_rate": 0.0007758717853574313, + "loss": 0.9867962, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.91503906, + "step": 1739, + "time_per_iteration": 2.7331244945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195023, + "balance_loss_mlp": 1.10327947, + "epoch": 0.33474413235859946, + "flos": 495569404416.0, + "grad_norm": 0.027457607023843998, + "language_loss": 0.9961037, + "learning_rate": 0.0007756119012344571, + "loss": 1.00805402, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.91601562, + "step": 1740, + "time_per_iteration": 2.5305063724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189378, + "balance_loss_mlp": 1.09772944, + "epoch": 0.33493651404386304, + "flos": 629487547392.0, + "grad_norm": 0.029043894294382887, + "language_loss": 0.93616855, + "learning_rate": 0.0007753519101094535, + "loss": 0.9480623, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.91503906, + "step": 1741, + "time_per_iteration": 2.7408056259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177762, + "balance_loss_mlp": 1.08630431, + "epoch": 0.3351288957291266, + "flos": 514742487552.0, + "grad_norm": 0.027889242250670986, + "language_loss": 0.95720202, + "learning_rate": 0.0007750918120833575, + "loss": 0.96897966, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.91308594, + "step": 1742, + "time_per_iteration": 2.5787625312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179593, + "balance_loss_mlp": 1.08818376, + "epoch": 0.33532127741439016, + "flos": 648482711040.0, + "grad_norm": 0.029208114264274002, + "language_loss": 0.95614851, + "learning_rate": 0.0007748316072571485, + "loss": 0.96794444, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.91259766, + "step": 1743, + "time_per_iteration": 2.751394033432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178526, + "balance_loss_mlp": 1.08764088, + "epoch": 0.3355136590996537, + "flos": 769788228096.0, + "grad_norm": 0.02678280054581141, + "language_loss": 0.86505532, + "learning_rate": 0.0007745712957318467, + "loss": 0.87684047, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.90722656, + "step": 1744, + "time_per_iteration": 2.9703569412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179715, + "balance_loss_mlp": 1.088925, + "epoch": 0.3357060407849173, + "flos": 596649057792.0, + "grad_norm": 0.023433474800662903, + "language_loss": 0.94101429, + "learning_rate": 0.0007743108776085141, + "loss": 0.95281148, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.90625, + "step": 1745, + "time_per_iteration": 2.7529683113098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184954, + "balance_loss_mlp": 1.09435499, + "epoch": 0.3358984224701808, + "flos": 599801395200.0, + "grad_norm": 0.02538707782704008, + "language_loss": 0.88967884, + "learning_rate": 0.0007740503529882543, + "loss": 0.9015283, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.90429688, + "step": 1746, + "time_per_iteration": 2.79131817817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188552, + "balance_loss_mlp": 1.09780991, + "epoch": 0.3360908041554444, + "flos": 579429812736.0, + "grad_norm": 0.028485119021284356, + "language_loss": 0.99668056, + "learning_rate": 0.0007737897219722114, + "loss": 1.00856614, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.90576172, + "step": 1747, + "time_per_iteration": 2.685925006866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189008, + "balance_loss_mlp": 1.09836173, + "epoch": 0.336283185840708, + "flos": 514620963840.0, + "grad_norm": 0.027318502045144608, + "language_loss": 0.90481317, + "learning_rate": 0.0007735289846615716, + "loss": 0.91670322, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.90478516, + "step": 1748, + "time_per_iteration": 2.62443470954895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189766, + "balance_loss_mlp": 1.09902358, + "epoch": 0.3364755675259715, + "flos": 526013623296.0, + "grad_norm": 0.026723032477842582, + "language_loss": 0.90137696, + "learning_rate": 0.0007732681411575621, + "loss": 0.91327465, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.90576172, + "step": 1749, + "time_per_iteration": 2.646358013153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182694, + "balance_loss_mlp": 1.09209466, + "epoch": 0.3366679492112351, + "flos": 555973748736.0, + "grad_norm": 0.023573972968583972, + "language_loss": 0.93333745, + "learning_rate": 0.0007730071915614514, + "loss": 0.94516432, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.90429688, + "step": 1750, + "time_per_iteration": 2.6758012771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08901942, + "epoch": 0.33686033089649864, + "flos": 428164170240.0, + "grad_norm": 0.030830494146199924, + "language_loss": 0.97502697, + "learning_rate": 0.0007727461359745489, + "loss": 0.98682547, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.90673828, + "step": 1751, + "time_per_iteration": 2.4563541412353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182248, + "balance_loss_mlp": 1.09145832, + "epoch": 0.3370527125817622, + "flos": 542840099328.0, + "grad_norm": 0.023246790346845608, + "language_loss": 0.93729055, + "learning_rate": 0.0007724849744982056, + "loss": 0.94911301, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.90625, + "step": 1752, + "time_per_iteration": 2.668113946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179422, + "balance_loss_mlp": 1.08858418, + "epoch": 0.33724509426702576, + "flos": 543230866944.0, + "grad_norm": 0.02371236203418416, + "language_loss": 0.90932786, + "learning_rate": 0.0007722237072338131, + "loss": 0.92112207, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.90673828, + "step": 1753, + "time_per_iteration": 2.69787335395813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.08753431, + "epoch": 0.33743747595228935, + "flos": 473752272384.0, + "grad_norm": 0.029898359882718887, + "language_loss": 0.95709926, + "learning_rate": 0.0007719623342828046, + "loss": 0.96888256, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.90625, + "step": 1754, + "time_per_iteration": 2.4994091987609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183652, + "balance_loss_mlp": 1.09295714, + "epoch": 0.33762985763755293, + "flos": 470836978176.0, + "grad_norm": 0.02665869511949433, + "language_loss": 0.93777692, + "learning_rate": 0.000771700855746654, + "loss": 0.94961339, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.90527344, + "step": 1755, + "time_per_iteration": 2.58086895942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178715, + "balance_loss_mlp": 1.08792567, + "epoch": 0.33782223932281646, + "flos": 493250995200.0, + "grad_norm": 0.024252070816233498, + "language_loss": 0.95916575, + "learning_rate": 0.0007714392717268763, + "loss": 0.97095293, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.90625, + "step": 1756, + "time_per_iteration": 2.5631322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180772, + "balance_loss_mlp": 1.08988702, + "epoch": 0.33801462100808005, + "flos": 466017510912.0, + "grad_norm": 0.025388958299120416, + "language_loss": 0.95127004, + "learning_rate": 0.0007711775823250273, + "loss": 0.96307778, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.90722656, + "step": 1757, + "time_per_iteration": 2.5053045749664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178431, + "balance_loss_mlp": 1.08754551, + "epoch": 0.3382070026933436, + "flos": 797067374592.0, + "grad_norm": 0.024419621343361942, + "language_loss": 0.92107689, + "learning_rate": 0.0007709157876427039, + "loss": 0.93286121, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.90722656, + "step": 1758, + "time_per_iteration": 3.1007301807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178269, + "balance_loss_mlp": 1.08738351, + "epoch": 0.33839938437860717, + "flos": 509428193280.0, + "grad_norm": 0.024832384176200758, + "language_loss": 0.94253516, + "learning_rate": 0.0007706538877815439, + "loss": 0.95431781, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.90722656, + "step": 1759, + "time_per_iteration": 2.588744640350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178646, + "balance_loss_mlp": 1.0878557, + "epoch": 0.3385917660638707, + "flos": 485273186304.0, + "grad_norm": 0.02369115174437829, + "language_loss": 0.89945841, + "learning_rate": 0.0007703918828432259, + "loss": 0.91124481, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.90625, + "step": 1760, + "time_per_iteration": 2.5859875679016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178403, + "balance_loss_mlp": 1.08770907, + "epoch": 0.3387841477491343, + "flos": 546415405056.0, + "grad_norm": 0.02534991906570622, + "language_loss": 0.96946132, + "learning_rate": 0.000770129772929469, + "loss": 0.9812454, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.90527344, + "step": 1761, + "time_per_iteration": 2.633229970932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117744, + "balance_loss_mlp": 1.08684063, + "epoch": 0.3389765294343978, + "flos": 721063251456.0, + "grad_norm": 0.027907228809642075, + "language_loss": 0.96886694, + "learning_rate": 0.0007698675581420334, + "loss": 0.98064131, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.90429688, + "step": 1762, + "time_per_iteration": 2.8309946060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190138, + "balance_loss_mlp": 1.09987259, + "epoch": 0.3391689111196614, + "flos": 701263084032.0, + "grad_norm": 0.028701846645649853, + "language_loss": 0.87853253, + "learning_rate": 0.0007696052385827199, + "loss": 0.89043397, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.90087891, + "step": 1763, + "time_per_iteration": 2.9673497676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183311, + "balance_loss_mlp": 1.09304607, + "epoch": 0.339361292804925, + "flos": 628248115200.0, + "grad_norm": 0.027144566695111814, + "language_loss": 0.85910845, + "learning_rate": 0.00076934281435337, + "loss": 0.87094158, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.90087891, + "step": 1764, + "time_per_iteration": 2.7069530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011791, + "balance_loss_mlp": 1.08869135, + "epoch": 0.33955367449018853, + "flos": 610794554880.0, + "grad_norm": 0.025973604998757366, + "language_loss": 0.94002628, + "learning_rate": 0.0007690802855558658, + "loss": 0.95181727, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.90234375, + "step": 1765, + "time_per_iteration": 2.8596885204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198868, + "balance_loss_mlp": 1.11151123, + "epoch": 0.3397460561754521, + "flos": 1456586357760.0, + "grad_norm": 0.018873382807181687, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77573818, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.87109375, + "step": 1766, + "time_per_iteration": 4.900039434432983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183458, + "balance_loss_mlp": 1.09304976, + "epoch": 0.33993843786071565, + "flos": 488290538496.0, + "grad_norm": 0.033631077459875626, + "language_loss": 1.00266671, + "learning_rate": 0.0007685549146641262, + "loss": 1.01450121, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.90234375, + "step": 1767, + "time_per_iteration": 2.521587610244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176512, + "balance_loss_mlp": 1.08557928, + "epoch": 0.34013081954597923, + "flos": 418232523264.0, + "grad_norm": 0.024531175575557927, + "language_loss": 0.95696396, + "learning_rate": 0.0007682920727738579, + "loss": 0.96872908, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.90771484, + "step": 1768, + "time_per_iteration": 2.4606878757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177177, + "balance_loss_mlp": 1.08614898, + "epoch": 0.34032320123124277, + "flos": 438430189056.0, + "grad_norm": 0.027457130501572214, + "language_loss": 0.93990809, + "learning_rate": 0.000768029126723369, + "loss": 0.95167989, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.90869141, + "step": 1769, + "time_per_iteration": 2.494699478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181852, + "balance_loss_mlp": 1.09077609, + "epoch": 0.34051558291650635, + "flos": 458543261184.0, + "grad_norm": 0.027949795017340132, + "language_loss": 0.90377855, + "learning_rate": 0.0007677660766147447, + "loss": 0.91559708, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.90917969, + "step": 1770, + "time_per_iteration": 2.5302748680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183578, + "balance_loss_mlp": 1.09469604, + "epoch": 0.3407079646017699, + "flos": 1562137645056.0, + "grad_norm": 0.011444512115251876, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73654521, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.88671875, + "step": 1771, + "time_per_iteration": 4.913311004638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188847, + "balance_loss_mlp": 1.09758055, + "epoch": 0.3409003462870335, + "flos": 493530972672.0, + "grad_norm": 0.032062498304007335, + "language_loss": 0.91194993, + "learning_rate": 0.0007672396646316306, + "loss": 0.92383844, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.91113281, + "step": 1772, + "time_per_iteration": 2.539181709289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_mlp": 1.08885825, + "epoch": 0.34109272797229706, + "flos": 809820989952.0, + "grad_norm": 0.028470010979029077, + "language_loss": 0.88439053, + "learning_rate": 0.000766976302961512, + "loss": 0.89618981, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.90917969, + "step": 1773, + "time_per_iteration": 3.006547212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181829, + "balance_loss_mlp": 1.09094357, + "epoch": 0.3412851096575606, + "flos": 471099491328.0, + "grad_norm": 0.02901021255147234, + "language_loss": 0.91066158, + "learning_rate": 0.0007667128376420003, + "loss": 0.92247993, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.90722656, + "step": 1774, + "time_per_iteration": 2.534266233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118318, + "balance_loss_mlp": 1.09253371, + "epoch": 0.3414774913428242, + "flos": 596770581504.0, + "grad_norm": 0.02876896591079206, + "language_loss": 0.92739397, + "learning_rate": 0.0007664492687753817, + "loss": 0.93922579, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.90478516, + "step": 1775, + "time_per_iteration": 2.671475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181574, + "balance_loss_mlp": 1.09102285, + "epoch": 0.3416698730280877, + "flos": 528507950592.0, + "grad_norm": 0.025483549401886952, + "language_loss": 0.89018893, + "learning_rate": 0.000766185596463983, + "loss": 0.90200466, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.90380859, + "step": 1776, + "time_per_iteration": 2.6099884510040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177935, + "balance_loss_mlp": 1.08719325, + "epoch": 0.3418622547133513, + "flos": 876117047808.0, + "grad_norm": 0.026020404961979337, + "language_loss": 0.84743214, + "learning_rate": 0.0007659218208101706, + "loss": 0.8592115, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.90576172, + "step": 1777, + "time_per_iteration": 3.1272366046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118093, + "balance_loss_mlp": 1.08994997, + "epoch": 0.34205463639861483, + "flos": 604876644864.0, + "grad_norm": 0.024068405360429687, + "language_loss": 0.91582745, + "learning_rate": 0.0007656579419163515, + "loss": 0.92763674, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.90820312, + "step": 1778, + "time_per_iteration": 2.7243831157684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.0894556, + "epoch": 0.3422470180838784, + "flos": 464714952192.0, + "grad_norm": 0.02739040164484414, + "language_loss": 0.86445272, + "learning_rate": 0.0007653939598849724, + "loss": 0.87625706, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.90820312, + "step": 1779, + "time_per_iteration": 2.4913573265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180695, + "balance_loss_mlp": 1.09143066, + "epoch": 0.34243939976914195, + "flos": 1589816291328.0, + "grad_norm": 0.01051605552964957, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.84060901, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.890625, + "step": 1780, + "time_per_iteration": 4.891184091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176554, + "balance_loss_mlp": 1.085621, + "epoch": 0.34263178145440554, + "flos": 874443187200.0, + "grad_norm": 0.026322112436007235, + "language_loss": 0.88782489, + "learning_rate": 0.000764865686819522, + "loss": 0.89959043, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.90771484, + "step": 1781, + "time_per_iteration": 3.048123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176352, + "balance_loss_mlp": 1.08551466, + "epoch": 0.3428241631396691, + "flos": 507873854976.0, + "grad_norm": 0.024622696081698998, + "language_loss": 0.93515933, + "learning_rate": 0.0007646013959905449, + "loss": 0.94692284, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.90673828, + "step": 1782, + "time_per_iteration": 2.565661907196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176257, + "balance_loss_mlp": 1.08565772, + "epoch": 0.34301654482493266, + "flos": 881524667904.0, + "grad_norm": 0.0252118274748732, + "language_loss": 0.880337, + "learning_rate": 0.0007643370024341949, + "loss": 0.89209956, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.90429688, + "step": 1783, + "time_per_iteration": 3.0695888996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180625, + "balance_loss_mlp": 1.08959711, + "epoch": 0.34320892651019624, + "flos": 432668731392.0, + "grad_norm": 0.024350173092139916, + "language_loss": 0.89407057, + "learning_rate": 0.0007640725062531195, + "loss": 0.90587682, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.90869141, + "step": 1784, + "time_per_iteration": 2.5120832920074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184023, + "balance_loss_mlp": 1.09294736, + "epoch": 0.3434013081954598, + "flos": 464593428480.0, + "grad_norm": 0.02877111448667641, + "language_loss": 0.95969987, + "learning_rate": 0.0007638079075500047, + "loss": 0.97154009, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.90917969, + "step": 1785, + "time_per_iteration": 2.5176198482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194351, + "balance_loss_mlp": 1.10546875, + "epoch": 0.34359368988072336, + "flos": 1560674631168.0, + "grad_norm": 0.01088995253456435, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.7637502, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.88671875, + "step": 1786, + "time_per_iteration": 5.021549463272095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183341, + "balance_loss_mlp": 1.09278917, + "epoch": 0.3437860715659869, + "flos": 496572519936.0, + "grad_norm": 0.024204144242014246, + "language_loss": 0.90540475, + "learning_rate": 0.0007632784029886026, + "loss": 0.91723818, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.90380859, + "step": 1787, + "time_per_iteration": 2.6350793838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178894, + "balance_loss_mlp": 1.08791375, + "epoch": 0.3439784532512505, + "flos": 719608969728.0, + "grad_norm": 0.025958683961259412, + "language_loss": 0.93068433, + "learning_rate": 0.0007630134973358873, + "loss": 0.94247323, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.90820312, + "step": 1788, + "time_per_iteration": 2.93084454536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178793, + "balance_loss_mlp": 1.08785999, + "epoch": 0.34417083493651407, + "flos": 566921246208.0, + "grad_norm": 0.025032512144454056, + "language_loss": 0.92506206, + "learning_rate": 0.0007627484895722763, + "loss": 0.93685007, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.90771484, + "step": 1789, + "time_per_iteration": 2.649689197540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177857, + "balance_loss_mlp": 1.08706772, + "epoch": 0.3443632166217776, + "flos": 797701189632.0, + "grad_norm": 0.027302991531117576, + "language_loss": 0.89870507, + "learning_rate": 0.0007624833798006552, + "loss": 0.9104836, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.90625, + "step": 1790, + "time_per_iteration": 3.0469179153442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117862, + "balance_loss_mlp": 1.08811665, + "epoch": 0.3445555983070412, + "flos": 570392492544.0, + "grad_norm": 0.0288389056738737, + "language_loss": 0.92729777, + "learning_rate": 0.0007622181681239483, + "loss": 0.93908393, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.90332031, + "step": 1791, + "time_per_iteration": 2.6440184116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178949, + "balance_loss_mlp": 1.08849263, + "epoch": 0.3447479799923047, + "flos": 569980257792.0, + "grad_norm": 0.022982775931836206, + "language_loss": 0.91584516, + "learning_rate": 0.0007619528546451202, + "loss": 0.9276346, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.90283203, + "step": 1792, + "time_per_iteration": 2.797133445739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177091, + "balance_loss_mlp": 1.08673048, + "epoch": 0.3449403616775683, + "flos": 969331683840.0, + "grad_norm": 0.02628926210615307, + "language_loss": 0.90923131, + "learning_rate": 0.0007616874394671745, + "loss": 0.92100227, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.90185547, + "step": 1793, + "time_per_iteration": 3.3191378116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178301, + "balance_loss_mlp": 1.08784556, + "epoch": 0.34513274336283184, + "flos": 569676085248.0, + "grad_norm": 0.03267712320672132, + "language_loss": 0.9558928, + "learning_rate": 0.0007614219226931547, + "loss": 0.96767581, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.90283203, + "step": 1794, + "time_per_iteration": 2.677525043487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178051, + "balance_loss_mlp": 1.0875473, + "epoch": 0.3453251250480954, + "flos": 461858055168.0, + "grad_norm": 0.024689469906648515, + "language_loss": 0.92397773, + "learning_rate": 0.0007611563044261435, + "loss": 0.93575823, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.90332031, + "step": 1795, + "time_per_iteration": 2.5183908939361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178812, + "balance_loss_mlp": 1.08835602, + "epoch": 0.34551750673335896, + "flos": 416519731200.0, + "grad_norm": 0.027710199676415265, + "language_loss": 0.96473086, + "learning_rate": 0.0007608905847692631, + "loss": 0.97651899, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.90283203, + "step": 1796, + "time_per_iteration": 2.4600772857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182482, + "balance_loss_mlp": 1.09212101, + "epoch": 0.34570988841862255, + "flos": 589114409472.0, + "grad_norm": 0.023363368939277738, + "language_loss": 0.92555124, + "learning_rate": 0.0007606247638256749, + "loss": 0.93737608, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.90185547, + "step": 1797, + "time_per_iteration": 2.8326525688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183395, + "balance_loss_mlp": 1.09565735, + "epoch": 0.34590227010388613, + "flos": 1571142764544.0, + "grad_norm": 0.009651567236440416, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79353684, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.875, + "step": 1798, + "time_per_iteration": 4.921091794967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.09259033, + "epoch": 0.34609465178914967, + "flos": 1540928131584.0, + "grad_norm": 0.004186018133500934, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.8050791, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.87890625, + "step": 1799, + "time_per_iteration": 4.76463508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177428, + "balance_loss_mlp": 1.08692396, + "epoch": 0.34628703347441325, + "flos": 610516578816.0, + "grad_norm": 0.027319297321258894, + "language_loss": 0.94778776, + "learning_rate": 0.0007598266943068686, + "loss": 0.95956194, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.90332031, + "step": 1800, + "time_per_iteration": 2.741830348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180421, + "balance_loss_mlp": 1.0898217, + "epoch": 0.3464794151596768, + "flos": 474264563712.0, + "grad_norm": 0.0268607754896097, + "language_loss": 0.91417915, + "learning_rate": 0.0007595604692488507, + "loss": 0.92598337, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.90429688, + "step": 1801, + "time_per_iteration": 2.5253777503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117756, + "balance_loss_mlp": 1.08719921, + "epoch": 0.34667179684494037, + "flos": 606821750784.0, + "grad_norm": 0.0251267071243342, + "language_loss": 0.907076, + "learning_rate": 0.0007592941434205215, + "loss": 0.91885161, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.90185547, + "step": 1802, + "time_per_iteration": 2.7729735374450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175873, + "balance_loss_mlp": 1.0877533, + "epoch": 0.3468641785302039, + "flos": 1568359727616.0, + "grad_norm": 0.004114808875680539, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74746931, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.87890625, + "step": 1803, + "time_per_iteration": 5.036771774291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178076, + "balance_loss_mlp": 1.08776271, + "epoch": 0.3470565602154675, + "flos": 908723223552.0, + "grad_norm": 0.03174792037748739, + "language_loss": 0.90712535, + "learning_rate": 0.0007587611898665566, + "loss": 0.91890609, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.90136719, + "step": 1804, + "time_per_iteration": 3.0725910663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177414, + "balance_loss_mlp": 1.08719671, + "epoch": 0.347248941900731, + "flos": 640059740160.0, + "grad_norm": 0.023310551488003612, + "language_loss": 0.90306699, + "learning_rate": 0.0007584945623478315, + "loss": 0.91484118, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.90039062, + "step": 1805, + "time_per_iteration": 2.8080646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176916, + "balance_loss_mlp": 1.08655512, + "epoch": 0.3474413235859946, + "flos": 848781505536.0, + "grad_norm": 0.027596494202169034, + "language_loss": 0.90514499, + "learning_rate": 0.000758227834472617, + "loss": 0.91691411, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.90185547, + "step": 1806, + "time_per_iteration": 3.0443291664123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179899, + "balance_loss_mlp": 1.08972931, + "epoch": 0.3476337052712582, + "flos": 516696325632.0, + "grad_norm": 0.02724510251762829, + "language_loss": 0.86438924, + "learning_rate": 0.0007579610063444664, + "loss": 0.87618828, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.89990234, + "step": 1807, + "time_per_iteration": 2.716522455215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177066, + "balance_loss_mlp": 1.08694386, + "epoch": 0.34782608695652173, + "flos": 915114493440.0, + "grad_norm": 0.02927822844999151, + "language_loss": 0.96424794, + "learning_rate": 0.0007576940780669712, + "loss": 0.97601861, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.89941406, + "step": 1808, + "time_per_iteration": 3.21464204788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08734941, + "epoch": 0.3480184686417853, + "flos": 775083056640.0, + "grad_norm": 0.026376675364870938, + "language_loss": 0.91835052, + "learning_rate": 0.0007574270497437624, + "loss": 0.93012476, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.89892578, + "step": 1809, + "time_per_iteration": 2.965306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177298, + "balance_loss_mlp": 1.0874145, + "epoch": 0.34821085032704885, + "flos": 578003728896.0, + "grad_norm": 0.024336980271772477, + "language_loss": 0.95592844, + "learning_rate": 0.000757159921478509, + "loss": 0.96770144, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.89697266, + "step": 1810, + "time_per_iteration": 2.781496047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177093, + "balance_loss_mlp": 1.088974, + "epoch": 0.34840323201231244, + "flos": 1528039531008.0, + "grad_norm": 0.007178450494277746, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75627732, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.87890625, + "step": 1811, + "time_per_iteration": 4.719515562057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176704, + "balance_loss_mlp": 1.08691561, + "epoch": 0.34859561369757597, + "flos": 510181530624.0, + "grad_norm": 0.02648580139398905, + "language_loss": 0.96071857, + "learning_rate": 0.0007566253655367423, + "loss": 0.97248554, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.89599609, + "step": 1812, + "time_per_iteration": 2.5699198246002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177921, + "balance_loss_mlp": 1.08822834, + "epoch": 0.34878799538283956, + "flos": 549756395520.0, + "grad_norm": 0.036663453377328174, + "language_loss": 0.96810794, + "learning_rate": 0.000756357938067762, + "loss": 0.97988713, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.89501953, + "step": 1813, + "time_per_iteration": 2.6622092723846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179077, + "balance_loss_mlp": 1.08885992, + "epoch": 0.34898037706810314, + "flos": 985193975808.0, + "grad_norm": 0.026013801782247825, + "language_loss": 0.90032709, + "learning_rate": 0.0007560904110718033, + "loss": 0.91211784, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.90039062, + "step": 1814, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.08639514, + "epoch": 0.3491727587533667, + "flos": 682836607488.0, + "grad_norm": 0.025025787643359835, + "language_loss": 0.91824377, + "learning_rate": 0.0007558227846527297, + "loss": 0.93000984, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.90039062, + "step": 1815, + "time_per_iteration": 2.870858907699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176853, + "balance_loss_mlp": 1.08673084, + "epoch": 0.34936514043863026, + "flos": 394889250816.0, + "grad_norm": 0.0291076708707547, + "language_loss": 0.91979998, + "learning_rate": 0.0007555550589144429, + "loss": 0.9315685, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.89941406, + "step": 1816, + "time_per_iteration": 2.4363009929656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177424, + "balance_loss_mlp": 1.08739722, + "epoch": 0.3495575221238938, + "flos": 462340147200.0, + "grad_norm": 0.02440335273431038, + "language_loss": 0.92281306, + "learning_rate": 0.000755287233960883, + "loss": 0.9345873, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.8984375, + "step": 1817, + "time_per_iteration": 2.538250207901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117706, + "balance_loss_mlp": 1.08693826, + "epoch": 0.3497499038091574, + "flos": 725428824576.0, + "grad_norm": 0.028430093115180927, + "language_loss": 0.88002723, + "learning_rate": 0.0007550193098960292, + "loss": 0.89179784, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.89941406, + "step": 1818, + "time_per_iteration": 2.8685545921325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08411181, + "epoch": 0.3499422854944209, + "flos": 829196187648.0, + "grad_norm": 0.021653398091314287, + "language_loss": 0.92103571, + "learning_rate": 0.0007547512868238988, + "loss": 0.93277991, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.90136719, + "step": 1819, + "time_per_iteration": 3.115814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118204, + "balance_loss_mlp": 1.092013, + "epoch": 0.3501346671796845, + "flos": 494542820352.0, + "grad_norm": 0.026515438979626053, + "language_loss": 0.9198699, + "learning_rate": 0.0007544831648485473, + "loss": 0.93169028, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.8984375, + "step": 1820, + "time_per_iteration": 2.6666150093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178247, + "balance_loss_mlp": 1.08783865, + "epoch": 0.35032704886494803, + "flos": 579848778240.0, + "grad_norm": 0.026574936148936048, + "language_loss": 0.89372301, + "learning_rate": 0.0007542149440740694, + "loss": 0.90550542, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.90234375, + "step": 1821, + "time_per_iteration": 2.6776442527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178869, + "balance_loss_mlp": 1.08841276, + "epoch": 0.3505194305502116, + "flos": 585831816192.0, + "grad_norm": 0.02674162112947977, + "language_loss": 0.9602831, + "learning_rate": 0.000753946624604597, + "loss": 0.97207189, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.90283203, + "step": 1822, + "time_per_iteration": 2.746363639831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175368, + "balance_loss_mlp": 1.08491182, + "epoch": 0.3507118122354752, + "flos": 527978194944.0, + "grad_norm": 0.02703682960411951, + "language_loss": 0.95658362, + "learning_rate": 0.0007536782065443015, + "loss": 0.9683373, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.90283203, + "step": 1823, + "time_per_iteration": 2.5945184230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175188, + "balance_loss_mlp": 1.08458936, + "epoch": 0.35090419392073874, + "flos": 512545602048.0, + "grad_norm": 0.03278557538641046, + "language_loss": 0.86822712, + "learning_rate": 0.0007534096899973919, + "loss": 0.87997901, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.90429688, + "step": 1824, + "time_per_iteration": 2.56933331489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184456, + "balance_loss_mlp": 1.0944289, + "epoch": 0.3510965756060023, + "flos": 565195719168.0, + "grad_norm": 0.023191753507183704, + "language_loss": 0.89392567, + "learning_rate": 0.0007531410750681154, + "loss": 0.90577018, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.8984375, + "step": 1825, + "time_per_iteration": 2.7223169803619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186327, + "balance_loss_mlp": 1.09630024, + "epoch": 0.35128895729126586, + "flos": 1022253046272.0, + "grad_norm": 0.026424599574572643, + "language_loss": 0.93470478, + "learning_rate": 0.0007528723618607575, + "loss": 0.94656801, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.8984375, + "step": 1826, + "time_per_iteration": 3.404395580291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182394, + "balance_loss_mlp": 1.09236717, + "epoch": 0.35148133897652944, + "flos": 589424586240.0, + "grad_norm": 0.02767542011563751, + "language_loss": 0.89242589, + "learning_rate": 0.0007526035504796422, + "loss": 0.90424991, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.8984375, + "step": 1827, + "time_per_iteration": 2.820510149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117853, + "balance_loss_mlp": 1.08850324, + "epoch": 0.351673720661793, + "flos": 496285811712.0, + "grad_norm": 0.02845608163714707, + "language_loss": 0.94670665, + "learning_rate": 0.0007523346410291312, + "loss": 0.95849192, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.8984375, + "step": 1828, + "time_per_iteration": 2.763277053833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177518, + "balance_loss_mlp": 1.08753836, + "epoch": 0.35186610234705656, + "flos": 763998572544.0, + "grad_norm": 0.028566964886064136, + "language_loss": 0.91855693, + "learning_rate": 0.0007520656336136245, + "loss": 0.93033206, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.89794922, + "step": 1829, + "time_per_iteration": 2.9501917362213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179113, + "balance_loss_mlp": 1.08908641, + "epoch": 0.3520584840323201, + "flos": 627388717056.0, + "grad_norm": 0.0235814228834027, + "language_loss": 0.94624627, + "learning_rate": 0.0007517965283375599, + "loss": 0.95803738, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.8984375, + "step": 1830, + "time_per_iteration": 2.8197402954101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179857, + "balance_loss_mlp": 1.08992577, + "epoch": 0.3522508657175837, + "flos": 538448329728.0, + "grad_norm": 0.025024391475303026, + "language_loss": 0.97205818, + "learning_rate": 0.0007515273253054132, + "loss": 0.9838568, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.89746094, + "step": 1831, + "time_per_iteration": 2.6376330852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191124, + "balance_loss_mlp": 1.10109711, + "epoch": 0.35244324740284727, + "flos": 568501780992.0, + "grad_norm": 0.029882616882314406, + "language_loss": 0.9266001, + "learning_rate": 0.0007512580246216988, + "loss": 0.93851131, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.8984375, + "step": 1832, + "time_per_iteration": 2.708432912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179716, + "balance_loss_mlp": 1.08964145, + "epoch": 0.3526356290881108, + "flos": 514054278144.0, + "grad_norm": 0.030813246422457925, + "language_loss": 0.91671479, + "learning_rate": 0.000750988626390968, + "loss": 0.92851192, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.89892578, + "step": 1833, + "time_per_iteration": 2.592047929763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179987, + "balance_loss_mlp": 1.09010315, + "epoch": 0.3528280107733744, + "flos": 596972696064.0, + "grad_norm": 0.024705197674389605, + "language_loss": 0.91622353, + "learning_rate": 0.0007507191307178108, + "loss": 0.9280234, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.89697266, + "step": 1834, + "time_per_iteration": 2.7884535789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176506, + "balance_loss_mlp": 1.08652651, + "epoch": 0.3530203924586379, + "flos": 552298386432.0, + "grad_norm": 0.0302975798262418, + "language_loss": 0.83893424, + "learning_rate": 0.0007504495377068543, + "loss": 0.85069931, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.89794922, + "step": 1835, + "time_per_iteration": 2.7751786708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175764, + "balance_loss_mlp": 1.08573675, + "epoch": 0.3532127741439015, + "flos": 654305293824.0, + "grad_norm": 0.027517554164180617, + "language_loss": 0.90655488, + "learning_rate": 0.0007501798474627642, + "loss": 0.91831255, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.8984375, + "step": 1836, + "time_per_iteration": 2.9638845920562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179149, + "balance_loss_mlp": 1.08926523, + "epoch": 0.35340515582916504, + "flos": 724150460928.0, + "grad_norm": 0.024568481275515953, + "language_loss": 0.91140759, + "learning_rate": 0.0007499100600902433, + "loss": 0.92319906, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.89697266, + "step": 1837, + "time_per_iteration": 2.9948322772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184038, + "balance_loss_mlp": 1.09396327, + "epoch": 0.35359753751442863, + "flos": 595997778432.0, + "grad_norm": 0.031821297821065, + "language_loss": 0.92654896, + "learning_rate": 0.0007496401756940324, + "loss": 0.9383893, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.89892578, + "step": 1838, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176486, + "balance_loss_mlp": 1.08665001, + "epoch": 0.3537899191996922, + "flos": 633805456896.0, + "grad_norm": 0.02718368250353396, + "language_loss": 0.91091663, + "learning_rate": 0.0007493701943789098, + "loss": 0.92268145, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.89648438, + "step": 1839, + "time_per_iteration": 2.779574155807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175825, + "balance_loss_mlp": 1.08608413, + "epoch": 0.35398230088495575, + "flos": 507352831488.0, + "grad_norm": 0.028671493841357993, + "language_loss": 0.91863656, + "learning_rate": 0.000749100116249692, + "loss": 0.93039483, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.89550781, + "step": 1840, + "time_per_iteration": 2.607614755630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189406, + "balance_loss_mlp": 1.09980869, + "epoch": 0.35417468257021933, + "flos": 509046157824.0, + "grad_norm": 0.03229862826848899, + "language_loss": 0.95953786, + "learning_rate": 0.0007488299414112321, + "loss": 0.97143197, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.89404297, + "step": 1841, + "time_per_iteration": 2.566596746444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181321, + "balance_loss_mlp": 1.09210455, + "epoch": 0.35436706425548287, + "flos": 657659019264.0, + "grad_norm": 0.02732135002339032, + "language_loss": 0.86453879, + "learning_rate": 0.0007485596699684215, + "loss": 0.87635195, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.89013672, + "step": 1842, + "time_per_iteration": 2.8111371994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185021, + "balance_loss_mlp": 1.09575689, + "epoch": 0.35455944594074645, + "flos": 653888329728.0, + "grad_norm": 0.026686949506238997, + "language_loss": 0.92940086, + "learning_rate": 0.000748289302026189, + "loss": 0.94125104, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.890625, + "step": 1843, + "time_per_iteration": 2.8244054317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187203, + "balance_loss_mlp": 1.09793901, + "epoch": 0.35475182762601, + "flos": 850010204160.0, + "grad_norm": 0.02649701564047654, + "language_loss": 0.9307664, + "learning_rate": 0.0007480188376895004, + "loss": 0.94263846, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.890625, + "step": 1844, + "time_per_iteration": 3.041001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187935, + "balance_loss_mlp": 1.10115051, + "epoch": 0.3549442093112736, + "flos": 1524775128576.0, + "grad_norm": 0.01173136965559212, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74999273, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.86914062, + "step": 1845, + "time_per_iteration": 4.865761756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183261, + "balance_loss_mlp": 1.09390223, + "epoch": 0.3551365909965371, + "flos": 652714025472.0, + "grad_norm": 0.028658093872898062, + "language_loss": 0.85614175, + "learning_rate": 0.0007474776202528074, + "loss": 0.8679744, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.89160156, + "step": 1846, + "time_per_iteration": 2.9342904090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184977, + "balance_loss_mlp": 1.0954746, + "epoch": 0.3553289726818007, + "flos": 898921832448.0, + "grad_norm": 0.03609141350995601, + "language_loss": 0.89849555, + "learning_rate": 0.000747206867362922, + "loss": 0.91034532, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.89306641, + "step": 1847, + "time_per_iteration": 3.1089484691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185041, + "balance_loss_mlp": 1.09553862, + "epoch": 0.3555213543670643, + "flos": 689733437952.0, + "grad_norm": 0.0286779566522822, + "language_loss": 0.9096849, + "learning_rate": 0.0007469360184988194, + "loss": 0.92153525, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.89306641, + "step": 1848, + "time_per_iteration": 2.820265293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183493, + "balance_loss_mlp": 1.09399033, + "epoch": 0.3557137360523278, + "flos": 539603168256.0, + "grad_norm": 0.02648998316664428, + "language_loss": 0.93967247, + "learning_rate": 0.0007466650737656518, + "loss": 0.95150745, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.89306641, + "step": 1849, + "time_per_iteration": 2.596639394760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183541, + "balance_loss_mlp": 1.09427702, + "epoch": 0.3559061177375914, + "flos": 403153767936.0, + "grad_norm": 0.02765421607491624, + "language_loss": 0.97574586, + "learning_rate": 0.0007463940332686098, + "loss": 0.98758125, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.890625, + "step": 1850, + "time_per_iteration": 2.478158473968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177245, + "balance_loss_mlp": 1.08764756, + "epoch": 0.35609849942285493, + "flos": 697893895680.0, + "grad_norm": 0.023379973164811964, + "language_loss": 0.90857208, + "learning_rate": 0.0007461228971129205, + "loss": 0.92034447, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.89404297, + "step": 1851, + "time_per_iteration": 2.9202487468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179211, + "balance_loss_mlp": 1.08966124, + "epoch": 0.3562908811081185, + "flos": 570001724928.0, + "grad_norm": 0.028863121832353986, + "language_loss": 0.92692959, + "learning_rate": 0.0007458516654038483, + "loss": 0.93872178, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.89355469, + "step": 1852, + "time_per_iteration": 2.658867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179202, + "balance_loss_mlp": 1.08936572, + "epoch": 0.35648326279338205, + "flos": 683609410560.0, + "grad_norm": 0.028040747176241956, + "language_loss": 0.94642723, + "learning_rate": 0.0007455803382466946, + "loss": 0.95821923, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.89648438, + "step": 1853, + "time_per_iteration": 2.86330509185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183408, + "balance_loss_mlp": 1.09376252, + "epoch": 0.35667564447864564, + "flos": 630340941312.0, + "grad_norm": 0.02553826751691769, + "language_loss": 0.94946796, + "learning_rate": 0.0007453089157467979, + "loss": 0.96130198, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.89453125, + "step": 1854, + "time_per_iteration": 2.792577028274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180437, + "balance_loss_mlp": 1.09093451, + "epoch": 0.35686802616390917, + "flos": 815504584704.0, + "grad_norm": 0.02468703395074296, + "language_loss": 0.8986901, + "learning_rate": 0.0007450373980095341, + "loss": 0.91049451, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.89306641, + "step": 1855, + "time_per_iteration": 3.0555014610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182657, + "balance_loss_mlp": 1.09334552, + "epoch": 0.35706040784917276, + "flos": 527205391872.0, + "grad_norm": 0.02890256158864057, + "language_loss": 0.93639445, + "learning_rate": 0.0007447657851403155, + "loss": 0.94822103, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.89111328, + "step": 1856, + "time_per_iteration": 2.589708089828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182935, + "balance_loss_mlp": 1.09367096, + "epoch": 0.35725278953443634, + "flos": 513064624128.0, + "grad_norm": 0.032008561774258475, + "language_loss": 0.88987339, + "learning_rate": 0.0007444940772445915, + "loss": 0.9017027, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.890625, + "step": 1857, + "time_per_iteration": 2.7185556888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180668, + "balance_loss_mlp": 1.09169042, + "epoch": 0.3574451712196999, + "flos": 488492653056.0, + "grad_norm": 0.02708223160327311, + "language_loss": 0.88387084, + "learning_rate": 0.0007442222744278484, + "loss": 0.89567751, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.88769531, + "step": 1858, + "time_per_iteration": 2.6569979190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182567, + "balance_loss_mlp": 1.09339869, + "epoch": 0.35763755290496346, + "flos": 551821023744.0, + "grad_norm": 0.023402609147138306, + "language_loss": 0.90506786, + "learning_rate": 0.0007439503767956099, + "loss": 0.91689354, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.88964844, + "step": 1859, + "time_per_iteration": 2.7072699069976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180801, + "balance_loss_mlp": 1.09249115, + "epoch": 0.357829934590227, + "flos": 1507225514496.0, + "grad_norm": 0.010565166743096084, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80852401, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.88085938, + "step": 1860, + "time_per_iteration": 4.9006147384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177337, + "balance_loss_mlp": 1.08835948, + "epoch": 0.3580223162754906, + "flos": 569841269760.0, + "grad_norm": 0.022894220472823423, + "language_loss": 0.92520916, + "learning_rate": 0.000743406297506922, + "loss": 0.93698251, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.88769531, + "step": 1861, + "time_per_iteration": 2.7065579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09741747, + "epoch": 0.3582146979607541, + "flos": 627760018944.0, + "grad_norm": 0.02759787968542248, + "language_loss": 0.91638815, + "learning_rate": 0.0007431341160617031, + "loss": 0.92825067, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.88623047, + "step": 1862, + "time_per_iteration": 2.9316203594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179684, + "balance_loss_mlp": 1.09089661, + "epoch": 0.3584070796460177, + "flos": 508319016960.0, + "grad_norm": 0.024526236298265516, + "language_loss": 0.95309365, + "learning_rate": 0.0007428618402234491, + "loss": 0.96489048, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.88574219, + "step": 1863, + "time_per_iteration": 2.648061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179939, + "balance_loss_mlp": 1.09129453, + "epoch": 0.3585994613312813, + "flos": 607640216064.0, + "grad_norm": 0.026400757424935653, + "language_loss": 0.88735509, + "learning_rate": 0.0007425894700978668, + "loss": 0.89915442, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.88427734, + "step": 1864, + "time_per_iteration": 2.7512128353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178956, + "balance_loss_mlp": 1.0905509, + "epoch": 0.3587918430165448, + "flos": 1415087675904.0, + "grad_norm": 0.025937088976099313, + "language_loss": 0.86489892, + "learning_rate": 0.0007423170057906996, + "loss": 0.87668848, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.88183594, + "step": 1865, + "time_per_iteration": 3.8491222858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181386, + "balance_loss_mlp": 1.0926944, + "epoch": 0.3589842247018084, + "flos": 479513730048.0, + "grad_norm": 0.0296684402619103, + "language_loss": 0.94328964, + "learning_rate": 0.0007420444474077275, + "loss": 0.95510352, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.88476562, + "step": 1866, + "time_per_iteration": 2.5396502017974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.09458029, + "epoch": 0.35917660638707194, + "flos": 505705167360.0, + "grad_norm": 0.030930075238968464, + "language_loss": 0.98337018, + "learning_rate": 0.0007417717950547671, + "loss": 0.99520147, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.88330078, + "step": 1867, + "time_per_iteration": 2.562638759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182945, + "balance_loss_mlp": 1.09654236, + "epoch": 0.3593689880723355, + "flos": 1495481745408.0, + "grad_norm": 0.008554058370081398, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77179551, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.86523438, + "step": 1868, + "time_per_iteration": 4.885401487350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184482, + "balance_loss_mlp": 1.09583843, + "epoch": 0.35956136975759906, + "flos": 529671521280.0, + "grad_norm": 0.02257875970711003, + "language_loss": 0.91369003, + "learning_rate": 0.0007412262088623299, + "loss": 0.92553484, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.88427734, + "step": 1869, + "time_per_iteration": 2.755620241165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184303, + "balance_loss_mlp": 1.09584975, + "epoch": 0.35975375144286265, + "flos": 535999664640.0, + "grad_norm": 0.02945163599469251, + "language_loss": 0.8810817, + "learning_rate": 0.0007409532752346684, + "loss": 0.89292467, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.88232422, + "step": 1870, + "time_per_iteration": 2.6426498889923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09860992, + "epoch": 0.3599461331281262, + "flos": 505928749056.0, + "grad_norm": 0.025692069404306732, + "language_loss": 0.95194697, + "learning_rate": 0.0007406802480606491, + "loss": 0.96382141, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.88623047, + "step": 1871, + "time_per_iteration": 2.6156716346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180117, + "balance_loss_mlp": 1.09123456, + "epoch": 0.36013851481338977, + "flos": 512536869888.0, + "grad_norm": 0.029138864413584674, + "language_loss": 0.9874596, + "learning_rate": 0.0007404071274462707, + "loss": 0.99926078, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.88671875, + "step": 1872, + "time_per_iteration": 2.5790889263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179425, + "balance_loss_mlp": 1.09054244, + "epoch": 0.36033089649865335, + "flos": 548631756288.0, + "grad_norm": 0.029675252163234106, + "language_loss": 0.91584998, + "learning_rate": 0.0007401339134975682, + "loss": 0.92764425, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.88671875, + "step": 1873, + "time_per_iteration": 2.6279983520507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185016, + "balance_loss_mlp": 1.09613371, + "epoch": 0.3605232781839169, + "flos": 459613506048.0, + "grad_norm": 0.030657976300352024, + "language_loss": 0.92556155, + "learning_rate": 0.0007398606063206122, + "loss": 0.93741173, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.88671875, + "step": 1874, + "time_per_iteration": 2.5750958919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178477, + "balance_loss_mlp": 1.0895946, + "epoch": 0.36071565986918047, + "flos": 510563566080.0, + "grad_norm": 0.029863822651947862, + "language_loss": 0.87000763, + "learning_rate": 0.0007395872060215101, + "loss": 0.88179243, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.88671875, + "step": 1875, + "time_per_iteration": 2.599595546722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180043, + "balance_loss_mlp": 1.09101713, + "epoch": 0.360908041554444, + "flos": 560256729600.0, + "grad_norm": 0.02914010843617622, + "language_loss": 0.95866597, + "learning_rate": 0.0007393137127064056, + "loss": 0.97046638, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.88818359, + "step": 1876, + "time_per_iteration": 2.629855155944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179718, + "balance_loss_mlp": 1.09064531, + "epoch": 0.3611004232397076, + "flos": 524878250496.0, + "grad_norm": 0.029199641876594032, + "language_loss": 0.93452048, + "learning_rate": 0.0007390401264814779, + "loss": 0.94631773, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.88867188, + "step": 1877, + "time_per_iteration": 2.6057403087615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182123, + "balance_loss_mlp": 1.0932405, + "epoch": 0.3612928049249711, + "flos": 542032367616.0, + "grad_norm": 0.029384759310162312, + "language_loss": 0.93887711, + "learning_rate": 0.0007387664474529427, + "loss": 0.95069838, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.88671875, + "step": 1878, + "time_per_iteration": 2.612924814224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181149, + "balance_loss_mlp": 1.09207559, + "epoch": 0.3614851866102347, + "flos": 553629143040.0, + "grad_norm": 0.028847856052759763, + "language_loss": 0.99400896, + "learning_rate": 0.0007384926757270518, + "loss": 1.00582051, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.88867188, + "step": 1879, + "time_per_iteration": 2.631417751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183007, + "balance_loss_mlp": 1.09364784, + "epoch": 0.36167756829549824, + "flos": 773426660352.0, + "grad_norm": 0.027790454764264987, + "language_loss": 0.87101346, + "learning_rate": 0.0007382188114100924, + "loss": 0.88284349, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.89160156, + "step": 1880, + "time_per_iteration": 3.0146212577819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182663, + "balance_loss_mlp": 1.09330404, + "epoch": 0.36186994998076183, + "flos": 713187500544.0, + "grad_norm": 0.025874200926848077, + "language_loss": 0.89437282, + "learning_rate": 0.0007379448546083884, + "loss": 0.90619946, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.89160156, + "step": 1881, + "time_per_iteration": 2.9882314205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182414, + "balance_loss_mlp": 1.09305489, + "epoch": 0.3620623316660254, + "flos": 748900351488.0, + "grad_norm": 0.028120122690860328, + "language_loss": 0.95218164, + "learning_rate": 0.0007376708054282992, + "loss": 0.96400583, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.89160156, + "step": 1882, + "time_per_iteration": 2.937251329421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185259, + "balance_loss_mlp": 1.09609008, + "epoch": 0.36225471335128895, + "flos": 483534197760.0, + "grad_norm": 0.025051425069896712, + "language_loss": 0.90089262, + "learning_rate": 0.0007373966639762201, + "loss": 0.91274524, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.88964844, + "step": 1883, + "time_per_iteration": 2.5956366062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189104, + "balance_loss_mlp": 1.09964943, + "epoch": 0.36244709503655254, + "flos": 507910785024.0, + "grad_norm": 0.028814908336841725, + "language_loss": 0.97620124, + "learning_rate": 0.0007371224303585822, + "loss": 0.9880923, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.89257812, + "step": 1884, + "time_per_iteration": 2.5689563751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188454, + "balance_loss_mlp": 1.10205078, + "epoch": 0.36263947672181607, + "flos": 1397052145152.0, + "grad_norm": 0.012535477100621303, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.8154552, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.86523438, + "step": 1885, + "time_per_iteration": 4.708393573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184768, + "balance_loss_mlp": 1.09531295, + "epoch": 0.36283185840707965, + "flos": 654522144768.0, + "grad_norm": 0.026882878095346403, + "language_loss": 0.90798199, + "learning_rate": 0.0007365736870525335, + "loss": 0.91982961, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.89257812, + "step": 1886, + "time_per_iteration": 2.8096718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188121, + "balance_loss_mlp": 1.09842801, + "epoch": 0.3630242400923432, + "flos": 489844876800.0, + "grad_norm": 0.028488669634490066, + "language_loss": 0.90766525, + "learning_rate": 0.000736299177577164, + "loss": 0.91954637, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.89501953, + "step": 1887, + "time_per_iteration": 2.5731940269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184527, + "balance_loss_mlp": 1.09488153, + "epoch": 0.3632166217776068, + "flos": 518231198208.0, + "grad_norm": 0.0291282657352475, + "language_loss": 0.90900671, + "learning_rate": 0.0007360245763623174, + "loss": 0.92085195, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.89453125, + "step": 1888, + "time_per_iteration": 2.6255550384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184122, + "balance_loss_mlp": 1.09457171, + "epoch": 0.36340900346287036, + "flos": 647347338240.0, + "grad_norm": 0.024297388169127104, + "language_loss": 0.96519047, + "learning_rate": 0.0007357498835146039, + "loss": 0.97703171, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.89355469, + "step": 1889, + "time_per_iteration": 2.8253488540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183322, + "balance_loss_mlp": 1.09386766, + "epoch": 0.3636013851481339, + "flos": 554410678272.0, + "grad_norm": 0.02538543495771105, + "language_loss": 0.93937147, + "learning_rate": 0.0007354750991406684, + "loss": 0.95120472, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.89257812, + "step": 1890, + "time_per_iteration": 2.692335844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182823, + "balance_loss_mlp": 1.09336889, + "epoch": 0.3637937668333975, + "flos": 547691767296.0, + "grad_norm": 0.028084450652072174, + "language_loss": 0.88223994, + "learning_rate": 0.0007352002233471919, + "loss": 0.89406812, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.89257812, + "step": 1891, + "time_per_iteration": 2.620753765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181153, + "balance_loss_mlp": 1.09212756, + "epoch": 0.363986148518661, + "flos": 539210399232.0, + "grad_norm": 0.027970426809957948, + "language_loss": 0.87592262, + "learning_rate": 0.0007349252562408906, + "loss": 0.88773412, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.88818359, + "step": 1892, + "time_per_iteration": 2.6963558197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186893, + "balance_loss_mlp": 1.09762907, + "epoch": 0.3641785302039246, + "flos": 661510299648.0, + "grad_norm": 0.026164868426956554, + "language_loss": 0.89186442, + "learning_rate": 0.0007346501979285158, + "loss": 0.90373337, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.890625, + "step": 1893, + "time_per_iteration": 2.880326747894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_mlp": 1.10150909, + "epoch": 0.36437091188918813, + "flos": 1472082077184.0, + "grad_norm": 0.013556454199407954, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81727207, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.8671875, + "step": 1894, + "time_per_iteration": 4.7823100090026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189424, + "balance_loss_mlp": 1.10011292, + "epoch": 0.3645632935744517, + "flos": 598444442112.0, + "grad_norm": 0.028411509484180794, + "language_loss": 0.93676329, + "learning_rate": 0.0007340998081127308, + "loss": 0.94865751, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.89111328, + "step": 1895, + "time_per_iteration": 2.7800211906433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179101, + "balance_loss_mlp": 1.08998048, + "epoch": 0.36475567525971525, + "flos": 600695721984.0, + "grad_norm": 0.025932670803143428, + "language_loss": 0.98669052, + "learning_rate": 0.0007338244768230007, + "loss": 0.99848151, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.88916016, + "step": 1896, + "time_per_iteration": 2.7945594787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180722, + "balance_loss_mlp": 1.09169638, + "epoch": 0.36494805694497884, + "flos": 799830945792.0, + "grad_norm": 0.022772977260465788, + "language_loss": 0.94548512, + "learning_rate": 0.0007335490547545578, + "loss": 0.95729244, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.88818359, + "step": 1897, + "time_per_iteration": 3.031527280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182826, + "balance_loss_mlp": 1.09389579, + "epoch": 0.3651404386302424, + "flos": 638477203968.0, + "grad_norm": 0.024439781626348547, + "language_loss": 0.90189934, + "learning_rate": 0.0007332735420143308, + "loss": 0.91372758, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.88720703, + "step": 1898, + "time_per_iteration": 2.743051767349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118252, + "balance_loss_mlp": 1.09363747, + "epoch": 0.36533282031550596, + "flos": 492562785792.0, + "grad_norm": 0.03052059755540218, + "language_loss": 0.95941794, + "learning_rate": 0.0007329979387092826, + "loss": 0.97124314, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.88671875, + "step": 1899, + "time_per_iteration": 2.5555779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181449, + "balance_loss_mlp": 1.09247124, + "epoch": 0.36552520200076954, + "flos": 857508648960.0, + "grad_norm": 0.02266050351879182, + "language_loss": 0.89947438, + "learning_rate": 0.0007327222449464124, + "loss": 0.91128886, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.88769531, + "step": 1900, + "time_per_iteration": 3.2362029552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181183, + "balance_loss_mlp": 1.09206235, + "epoch": 0.3657175836860331, + "flos": 484715232768.0, + "grad_norm": 0.026374750280255838, + "language_loss": 0.95288622, + "learning_rate": 0.0007324464608327538, + "loss": 0.96469808, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.88916016, + "step": 1901, + "time_per_iteration": 2.5933730602264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179798, + "balance_loss_mlp": 1.09058213, + "epoch": 0.36590996537129666, + "flos": 435721012224.0, + "grad_norm": 0.02685373461110618, + "language_loss": 0.96213037, + "learning_rate": 0.0007321705864753758, + "loss": 0.97392833, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.89013672, + "step": 1902, + "time_per_iteration": 2.6981201171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180605, + "balance_loss_mlp": 1.09124577, + "epoch": 0.3661023470565602, + "flos": 713513140224.0, + "grad_norm": 0.022756571637903334, + "language_loss": 0.91225153, + "learning_rate": 0.0007318946219813823, + "loss": 0.9240576, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.89160156, + "step": 1903, + "time_per_iteration": 2.992624044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183651, + "balance_loss_mlp": 1.09443474, + "epoch": 0.3662947287418238, + "flos": 565822803456.0, + "grad_norm": 0.027935940535232063, + "language_loss": 0.96619356, + "learning_rate": 0.000731618567457912, + "loss": 0.97803003, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.89013672, + "step": 1904, + "time_per_iteration": 2.685476064682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183785, + "balance_loss_mlp": 1.09433067, + "epoch": 0.3664871104270873, + "flos": 791201857536.0, + "grad_norm": 0.029459392082425068, + "language_loss": 0.95166355, + "learning_rate": 0.000731342423012139, + "loss": 0.96350139, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.89257812, + "step": 1905, + "time_per_iteration": 3.0574183464050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184501, + "balance_loss_mlp": 1.09480846, + "epoch": 0.3666794921123509, + "flos": 753980330496.0, + "grad_norm": 0.028631588758117728, + "language_loss": 0.89661896, + "learning_rate": 0.0007310661887512722, + "loss": 0.90846401, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.89501953, + "step": 1906, + "time_per_iteration": 3.024423122406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183077, + "balance_loss_mlp": 1.09343171, + "epoch": 0.3668718737976145, + "flos": 524607005184.0, + "grad_norm": 0.02900954708937733, + "language_loss": 0.89823443, + "learning_rate": 0.0007307898647825549, + "loss": 0.91006529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.89453125, + "step": 1907, + "time_per_iteration": 2.6485068798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182186, + "balance_loss_mlp": 1.09277892, + "epoch": 0.367064255482878, + "flos": 573045273600.0, + "grad_norm": 0.031417651983294596, + "language_loss": 0.98967636, + "learning_rate": 0.0007305134512132659, + "loss": 1.00149822, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.89208984, + "step": 1908, + "time_per_iteration": 2.646838903427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180724, + "balance_loss_mlp": 1.09107888, + "epoch": 0.3672566371681416, + "flos": 448053660672.0, + "grad_norm": 0.03289649974011927, + "language_loss": 0.93253779, + "learning_rate": 0.0007302369481507183, + "loss": 0.94434512, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.89453125, + "step": 1909, + "time_per_iteration": 2.562856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_mlp": 1.10011292, + "epoch": 0.36744901885340514, + "flos": 1543364061696.0, + "grad_norm": 0.010877058892954462, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81150377, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.8828125, + "step": 1910, + "time_per_iteration": 4.90735387802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011789, + "balance_loss_mlp": 1.08949292, + "epoch": 0.36764140053866873, + "flos": 564761290752.0, + "grad_norm": 0.024499581587470617, + "language_loss": 0.92626876, + "learning_rate": 0.000729683673975274, + "loss": 0.93805778, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.89208984, + "step": 1911, + "time_per_iteration": 2.6646595001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182116, + "balance_loss_mlp": 1.09285223, + "epoch": 0.36783378222393226, + "flos": 1218650895360.0, + "grad_norm": 0.021973130552363645, + "language_loss": 0.89050859, + "learning_rate": 0.0007294069030771774, + "loss": 0.90232974, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.890625, + "step": 1912, + "time_per_iteration": 3.6834843158721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189865, + "balance_loss_mlp": 1.10021913, + "epoch": 0.36802616390919585, + "flos": 499720128000.0, + "grad_norm": 0.028676866730684987, + "language_loss": 0.97328013, + "learning_rate": 0.0007291300431154224, + "loss": 0.98517883, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.89453125, + "step": 1913, + "time_per_iteration": 2.587052822113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195931, + "balance_loss_mlp": 1.10838318, + "epoch": 0.36821854559445943, + "flos": 1585615902720.0, + "grad_norm": 0.013013835157786544, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71585667, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.87695312, + "step": 1914, + "time_per_iteration": 4.952203989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185283, + "balance_loss_mlp": 1.09582841, + "epoch": 0.36841092727972297, + "flos": 837089402880.0, + "grad_norm": 0.02834339080565921, + "language_loss": 0.8768307, + "learning_rate": 0.0007285760564309179, + "loss": 0.88868356, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.89257812, + "step": 1915, + "time_per_iteration": 3.100893974304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185476, + "balance_loss_mlp": 1.09602106, + "epoch": 0.36860330896498655, + "flos": 691209913344.0, + "grad_norm": 0.028423235038061073, + "language_loss": 0.92041719, + "learning_rate": 0.0007282989299232448, + "loss": 0.93227196, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.89257812, + "step": 1916, + "time_per_iteration": 3.0683393478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_mlp": 1.10048962, + "epoch": 0.3687956906502501, + "flos": 555239877120.0, + "grad_norm": 0.03332088686108748, + "language_loss": 0.92434603, + "learning_rate": 0.0007280217147820668, + "loss": 0.93624407, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.89111328, + "step": 1917, + "time_per_iteration": 2.635451078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188211, + "balance_loss_mlp": 1.09894717, + "epoch": 0.3689880723355137, + "flos": 577819078656.0, + "grad_norm": 0.027623597033391085, + "language_loss": 0.8697632, + "learning_rate": 0.0007277444111150079, + "loss": 0.88164532, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.890625, + "step": 1918, + "time_per_iteration": 2.810635805130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184664, + "balance_loss_mlp": 1.09540033, + "epoch": 0.3691804540207772, + "flos": 529886370816.0, + "grad_norm": 0.029489830132381867, + "language_loss": 0.91299617, + "learning_rate": 0.0007274670190297272, + "loss": 0.92484283, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.890625, + "step": 1919, + "time_per_iteration": 2.615386486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118238, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3693728357060408, + "flos": 562180368384.0, + "grad_norm": 0.025570373781710027, + "language_loss": 0.90037912, + "learning_rate": 0.0007271895386339179, + "loss": 0.91220295, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.88476562, + "step": 1920, + "time_per_iteration": 2.7868921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192586, + "balance_loss_mlp": 1.10375118, + "epoch": 0.3695652173913043, + "flos": 580899557376.0, + "grad_norm": 0.02893533685872539, + "language_loss": 0.90819347, + "learning_rate": 0.0007269119700353073, + "loss": 0.92011935, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.88623047, + "step": 1921, + "time_per_iteration": 2.7836573123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178636, + "balance_loss_mlp": 1.09023082, + "epoch": 0.3697575990765679, + "flos": 514059007488.0, + "grad_norm": 0.024390447267758214, + "language_loss": 0.90977228, + "learning_rate": 0.0007266343133416571, + "loss": 0.92155862, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.8828125, + "step": 1922, + "time_per_iteration": 2.800387382507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173615, + "balance_loss_mlp": 1.08816528, + "epoch": 0.3699499807618315, + "flos": 1573903607808.0, + "grad_norm": 0.0066311072211368925, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78290522, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.85546875, + "step": 1923, + "time_per_iteration": 4.845300912857056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176045, + "balance_loss_mlp": 1.08844995, + "epoch": 0.37014236244709503, + "flos": 498324243456.0, + "grad_norm": 0.031949393340513096, + "language_loss": 0.9351213, + "learning_rate": 0.0007260787361004556, + "loss": 0.94688171, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.87744141, + "step": 1924, + "time_per_iteration": 2.5984597206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175598, + "balance_loss_mlp": 1.0905304, + "epoch": 0.3703347441323586, + "flos": 1447605433344.0, + "grad_norm": 0.008500773473990196, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74937099, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.8515625, + "step": 1925, + "time_per_iteration": 4.886027097702026 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197031, + "balance_loss_mlp": 1.10862505, + "epoch": 0.37052712581762215, + "flos": 564713627136.0, + "grad_norm": 0.03178088368953176, + "language_loss": 0.94516188, + "learning_rate": 0.0007255228077730903, + "loss": 0.95713222, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.88183594, + "step": 1926, + "time_per_iteration": 2.6847593784332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185383, + "balance_loss_mlp": 1.09731126, + "epoch": 0.37071950750288574, + "flos": 927570667008.0, + "grad_norm": 0.029564625514678724, + "language_loss": 0.89603549, + "learning_rate": 0.0007252447122218632, + "loss": 0.90788931, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.88037109, + "step": 1927, + "time_per_iteration": 3.106748342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179784, + "balance_loss_mlp": 1.0919987, + "epoch": 0.37091188918814927, + "flos": 419200710144.0, + "grad_norm": 0.03402230349378661, + "language_loss": 0.98334146, + "learning_rate": 0.0007249665292228834, + "loss": 0.99513936, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.87939453, + "step": 1928, + "time_per_iteration": 2.5786120891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186321, + "balance_loss_mlp": 1.09801054, + "epoch": 0.37110427087341286, + "flos": 464146265088.0, + "grad_norm": 0.029271450765855984, + "language_loss": 0.9102214, + "learning_rate": 0.000724688258884151, + "loss": 0.92208457, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.88183594, + "step": 1929, + "time_per_iteration": 2.5388894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185686, + "balance_loss_mlp": 1.09780467, + "epoch": 0.3712966525586764, + "flos": 851080449024.0, + "grad_norm": 0.02435916983518334, + "language_loss": 0.9136247, + "learning_rate": 0.0007244099013137002, + "loss": 0.92548156, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.88037109, + "step": 1930, + "time_per_iteration": 3.0708000659942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179762, + "balance_loss_mlp": 1.09159458, + "epoch": 0.37148903424394, + "flos": 927557932032.0, + "grad_norm": 0.024720397528266293, + "language_loss": 0.95256186, + "learning_rate": 0.0007241314566195993, + "loss": 0.96435952, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.88232422, + "step": 1931, + "time_per_iteration": 3.2293543815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179876, + "balance_loss_mlp": 1.09180403, + "epoch": 0.37168141592920356, + "flos": 520820852736.0, + "grad_norm": 0.029266961451931986, + "language_loss": 0.92750597, + "learning_rate": 0.0007238529249099496, + "loss": 0.93930471, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.88232422, + "step": 1932, + "time_per_iteration": 2.6091582775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.10263062, + "epoch": 0.3718737976144671, + "flos": 1449059715072.0, + "grad_norm": 0.015165360012205364, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79045337, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.859375, + "step": 1933, + "time_per_iteration": 4.854676246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184357, + "balance_loss_mlp": 1.09614182, + "epoch": 0.3720661792997307, + "flos": 760953022464.0, + "grad_norm": 0.028795817149727888, + "language_loss": 0.88381398, + "learning_rate": 0.000723295600876581, + "loss": 0.89565754, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.8828125, + "step": 1934, + "time_per_iteration": 2.9830405712127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118189, + "balance_loss_mlp": 1.09396136, + "epoch": 0.3722585609849942, + "flos": 518044546560.0, + "grad_norm": 0.028690096062057496, + "language_loss": 0.95446575, + "learning_rate": 0.0007230168087692344, + "loss": 0.96628463, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.88085938, + "step": 1935, + "time_per_iteration": 2.651982307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181923, + "balance_loss_mlp": 1.09404159, + "epoch": 0.3724509426702578, + "flos": 783868597248.0, + "grad_norm": 0.02900654324264667, + "language_loss": 0.88952625, + "learning_rate": 0.0007227379300790839, + "loss": 0.90134549, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.88037109, + "step": 1936, + "time_per_iteration": 3.0127265453338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177948, + "balance_loss_mlp": 1.09006691, + "epoch": 0.37264332435552133, + "flos": 392599039488.0, + "grad_norm": 0.02836050450865214, + "language_loss": 0.94049299, + "learning_rate": 0.0007224589649143997, + "loss": 0.95227242, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.88037109, + "step": 1937, + "time_per_iteration": 2.5600061416625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178201, + "balance_loss_mlp": 1.09074926, + "epoch": 0.3728357060407849, + "flos": 543912345600.0, + "grad_norm": 0.027673862011078548, + "language_loss": 0.89373219, + "learning_rate": 0.0007221799133834861, + "loss": 0.90551418, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.87597656, + "step": 1938, + "time_per_iteration": 2.646632671356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011797, + "balance_loss_mlp": 1.0919621, + "epoch": 0.3730280877260485, + "flos": 434483581440.0, + "grad_norm": 0.03019004471989451, + "language_loss": 0.90666437, + "learning_rate": 0.00072190077559468, + "loss": 0.91846132, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.87890625, + "step": 1939, + "time_per_iteration": 2.5193679332733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118304, + "balance_loss_mlp": 1.0957315, + "epoch": 0.37322046941131204, + "flos": 532510953984.0, + "grad_norm": 0.02812892901872328, + "language_loss": 0.95514065, + "learning_rate": 0.0007216215516563527, + "loss": 0.96697104, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.87451172, + "step": 1940, + "time_per_iteration": 2.6975200176239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184025, + "balance_loss_mlp": 1.09666896, + "epoch": 0.3734128510965756, + "flos": 532576081920.0, + "grad_norm": 0.028733495674926814, + "language_loss": 0.91960251, + "learning_rate": 0.0007213422416769083, + "loss": 0.93144274, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.875, + "step": 1941, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183262, + "balance_loss_mlp": 1.09561944, + "epoch": 0.37360523278183916, + "flos": 501432920064.0, + "grad_norm": 0.028111058318233337, + "language_loss": 0.83044219, + "learning_rate": 0.0007210628457647849, + "loss": 0.84227479, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.87792969, + "step": 1942, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182498, + "balance_loss_mlp": 1.09475958, + "epoch": 0.37379761446710275, + "flos": 549111846912.0, + "grad_norm": 0.03172951338735415, + "language_loss": 0.86608446, + "learning_rate": 0.000720783364028453, + "loss": 0.87790942, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.87890625, + "step": 1943, + "time_per_iteration": 2.7782797813415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176645, + "balance_loss_mlp": 1.08909822, + "epoch": 0.3739899961523663, + "flos": 476739425280.0, + "grad_norm": 0.0265564263320471, + "language_loss": 0.94348681, + "learning_rate": 0.0007205037965764177, + "loss": 0.95525324, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.87695312, + "step": 1944, + "time_per_iteration": 2.5670034885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198539, + "balance_loss_mlp": 1.11003804, + "epoch": 0.37418237783762986, + "flos": 613076034048.0, + "grad_norm": 0.032068934234115415, + "language_loss": 0.94037992, + "learning_rate": 0.0007202241435172161, + "loss": 0.95236534, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.8828125, + "step": 1945, + "time_per_iteration": 2.7505762577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119095, + "balance_loss_mlp": 1.10283065, + "epoch": 0.3743747595228934, + "flos": 767628272640.0, + "grad_norm": 0.02891432689626354, + "language_loss": 0.95249915, + "learning_rate": 0.0007199444049594198, + "loss": 0.9644087, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.88085938, + "step": 1946, + "time_per_iteration": 2.9690663814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179721, + "balance_loss_mlp": 1.09188759, + "epoch": 0.374567141208157, + "flos": 525490598400.0, + "grad_norm": 0.029648083740235674, + "language_loss": 0.90769064, + "learning_rate": 0.0007196645810116322, + "loss": 0.91948783, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.87988281, + "step": 1947, + "time_per_iteration": 2.690214157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178535, + "balance_loss_mlp": 1.09065437, + "epoch": 0.37475952289342057, + "flos": 682613025792.0, + "grad_norm": 0.029716110952303924, + "language_loss": 0.91939867, + "learning_rate": 0.0007193846717824912, + "loss": 0.93118405, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.88037109, + "step": 1948, + "time_per_iteration": 2.9668121337890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179187, + "balance_loss_mlp": 1.09140122, + "epoch": 0.3749519045786841, + "flos": 461215507968.0, + "grad_norm": 0.032662314662123194, + "language_loss": 0.97396064, + "learning_rate": 0.0007191046773806669, + "loss": 0.98575246, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.87939453, + "step": 1949, + "time_per_iteration": 2.5580427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189402, + "balance_loss_mlp": 1.10166442, + "epoch": 0.3751442862639477, + "flos": 956386687488.0, + "grad_norm": 0.03764484603893814, + "language_loss": 0.94282359, + "learning_rate": 0.0007188245979148631, + "loss": 0.95471758, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.87890625, + "step": 1950, + "time_per_iteration": 3.1307644844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185097, + "balance_loss_mlp": 1.09678674, + "epoch": 0.3753366679492112, + "flos": 528805392384.0, + "grad_norm": 0.0321726971318772, + "language_loss": 0.95554888, + "learning_rate": 0.0007185444334938157, + "loss": 0.96739984, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.8828125, + "step": 1951, + "time_per_iteration": 2.7235019207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181124, + "balance_loss_mlp": 1.09324276, + "epoch": 0.3755290496344748, + "flos": 522848550912.0, + "grad_norm": 0.029170285322497422, + "language_loss": 0.91979843, + "learning_rate": 0.0007182641842262947, + "loss": 0.93160963, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.88037109, + "step": 1952, + "time_per_iteration": 2.6083579063415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179821, + "balance_loss_mlp": 1.09193957, + "epoch": 0.37572143131973834, + "flos": 622371864576.0, + "grad_norm": 0.029206332986401715, + "language_loss": 0.85116351, + "learning_rate": 0.0007179838502211022, + "loss": 0.86296165, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.88037109, + "step": 1953, + "time_per_iteration": 2.8308520317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185603, + "balance_loss_mlp": 1.0973407, + "epoch": 0.37591381300500193, + "flos": 772273823232.0, + "grad_norm": 0.030259488278154622, + "language_loss": 0.94510454, + "learning_rate": 0.0007177034315870738, + "loss": 0.9569605, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.88232422, + "step": 1954, + "time_per_iteration": 2.966627359390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187445, + "balance_loss_mlp": 1.09908688, + "epoch": 0.37610619469026546, + "flos": 521480864256.0, + "grad_norm": 0.02960656624392615, + "language_loss": 0.99060822, + "learning_rate": 0.0007174229284330773, + "loss": 1.00248265, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.88330078, + "step": 1955, + "time_per_iteration": 2.642186403274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182076, + "balance_loss_mlp": 1.09338391, + "epoch": 0.37629857637552905, + "flos": 599970582528.0, + "grad_norm": 0.025408092842649905, + "language_loss": 0.92700577, + "learning_rate": 0.0007171423408680141, + "loss": 0.93882644, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.88671875, + "step": 1956, + "time_per_iteration": 2.8501906394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180409, + "balance_loss_mlp": 1.09138381, + "epoch": 0.37649095806079264, + "flos": 566018187264.0, + "grad_norm": 0.027446848492574977, + "language_loss": 0.96095192, + "learning_rate": 0.0007168616690008176, + "loss": 0.97275609, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.88818359, + "step": 1957, + "time_per_iteration": 2.658282995223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183288, + "balance_loss_mlp": 1.09440601, + "epoch": 0.37668333974605617, + "flos": 593568579072.0, + "grad_norm": 0.029268558303355535, + "language_loss": 0.93381131, + "learning_rate": 0.0007165809129404545, + "loss": 0.9456442, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.88671875, + "step": 1958, + "time_per_iteration": 2.738896608352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185047, + "balance_loss_mlp": 1.09621239, + "epoch": 0.37687572143131975, + "flos": 420364280832.0, + "grad_norm": 0.028940223287944336, + "language_loss": 0.94791234, + "learning_rate": 0.0007163000727959239, + "loss": 0.95976275, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.88623047, + "step": 1959, + "time_per_iteration": 2.5175514221191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122541, + "balance_loss_mlp": 1.14034271, + "epoch": 0.3770681031165833, + "flos": 1360384568832.0, + "grad_norm": 0.031863979933265396, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79184484, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.8515625, + "step": 1960, + "time_per_iteration": 4.834294557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187625, + "balance_loss_mlp": 1.0985992, + "epoch": 0.3772604848018469, + "flos": 646153568256.0, + "grad_norm": 0.027699188267120346, + "language_loss": 0.9236567, + "learning_rate": 0.00071573814069052, + "loss": 0.93553299, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.88818359, + "step": 1961, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195985, + "balance_loss_mlp": 1.10681665, + "epoch": 0.3774528664871104, + "flos": 903200810496.0, + "grad_norm": 0.025601029742712816, + "language_loss": 0.93588847, + "learning_rate": 0.0007154570489478081, + "loss": 0.94784832, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.88964844, + "step": 1962, + "time_per_iteration": 3.2312510013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198663, + "balance_loss_mlp": 1.1095897, + "epoch": 0.377645248172374, + "flos": 789462868992.0, + "grad_norm": 0.028157211525065163, + "language_loss": 0.92405236, + "learning_rate": 0.0007151758735572514, + "loss": 0.93603897, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.88867188, + "step": 1963, + "time_per_iteration": 3.0338857173919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192995, + "balance_loss_mlp": 1.10396981, + "epoch": 0.3778376298576376, + "flos": 587924642304.0, + "grad_norm": 0.030822839560022956, + "language_loss": 0.89740217, + "learning_rate": 0.0007148946146280119, + "loss": 0.90933216, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.88818359, + "step": 1964, + "time_per_iteration": 2.795830488204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193161, + "balance_loss_mlp": 1.10656738, + "epoch": 0.3780300115429011, + "flos": 1399669997568.0, + "grad_norm": 0.013238700163895742, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.7338531, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.8671875, + "step": 1965, + "time_per_iteration": 4.866962909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120089, + "balance_loss_mlp": 1.11372375, + "epoch": 0.3782223932281647, + "flos": 1360631619072.0, + "grad_norm": 0.015556792607008025, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76542836, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.87304688, + "step": 1966, + "time_per_iteration": 4.942438364028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179172, + "balance_loss_mlp": 1.09114802, + "epoch": 0.37841477491342823, + "flos": 705515865600.0, + "grad_norm": 0.024767419651172896, + "language_loss": 0.90831983, + "learning_rate": 0.0007140503377003022, + "loss": 0.92011154, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.88183594, + "step": 1967, + "time_per_iteration": 2.9852232933044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118121, + "balance_loss_mlp": 1.09318614, + "epoch": 0.3786071565986918, + "flos": 530155614720.0, + "grad_norm": 0.02676934241732637, + "language_loss": 0.92451024, + "learning_rate": 0.000713768745708599, + "loss": 0.93632239, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.88183594, + "step": 1968, + "time_per_iteration": 2.6276321411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180899, + "balance_loss_mlp": 1.09311283, + "epoch": 0.37879953828395535, + "flos": 994900039680.0, + "grad_norm": 0.026029915049846697, + "language_loss": 0.85207623, + "learning_rate": 0.0007134870707245085, + "loss": 0.86388516, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.87939453, + "step": 1969, + "time_per_iteration": 3.2757370471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118867, + "balance_loss_mlp": 1.10074103, + "epoch": 0.37899191996921894, + "flos": 627792219648.0, + "grad_norm": 0.029282968357198087, + "language_loss": 0.91297084, + "learning_rate": 0.0007132053128573864, + "loss": 0.92485756, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.88085938, + "step": 1970, + "time_per_iteration": 2.713987350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184407, + "balance_loss_mlp": 1.09633517, + "epoch": 0.37918430165448247, + "flos": 687519088128.0, + "grad_norm": 0.026716081838251738, + "language_loss": 0.91701669, + "learning_rate": 0.0007129234722166211, + "loss": 0.92886078, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.88232422, + "step": 1971, + "time_per_iteration": 2.830312728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178089, + "balance_loss_mlp": 1.09025514, + "epoch": 0.37937668333974606, + "flos": 476617901568.0, + "grad_norm": 0.023390773702336033, + "language_loss": 0.97041333, + "learning_rate": 0.0007126415489116328, + "loss": 0.98219419, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.87988281, + "step": 1972, + "time_per_iteration": 2.6577088832855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186585, + "balance_loss_mlp": 1.09903812, + "epoch": 0.37956906502500964, + "flos": 708823928832.0, + "grad_norm": 0.02822522227358307, + "language_loss": 0.89341533, + "learning_rate": 0.0007123595430518736, + "loss": 0.90528119, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.87695312, + "step": 1973, + "time_per_iteration": 2.8803040981292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187247, + "balance_loss_mlp": 1.09974778, + "epoch": 0.3797614467102732, + "flos": 427558553088.0, + "grad_norm": 0.030455517002935972, + "language_loss": 0.93240166, + "learning_rate": 0.0007120774547468282, + "loss": 0.94427419, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.87646484, + "step": 1974, + "time_per_iteration": 2.5190658569335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185963, + "balance_loss_mlp": 1.09836841, + "epoch": 0.37995382839553676, + "flos": 482880916992.0, + "grad_norm": 0.028219754054602288, + "language_loss": 0.89357984, + "learning_rate": 0.0007117952841060128, + "loss": 0.9054395, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.87744141, + "step": 1975, + "time_per_iteration": 2.6428894996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184241, + "balance_loss_mlp": 1.09631252, + "epoch": 0.3801462100808003, + "flos": 561670078464.0, + "grad_norm": 0.02907805968320273, + "language_loss": 0.90876186, + "learning_rate": 0.0007115130312389756, + "loss": 0.92060423, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.88085938, + "step": 1976, + "time_per_iteration": 2.669287919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188644, + "balance_loss_mlp": 1.10066783, + "epoch": 0.3803385917660639, + "flos": 465887255040.0, + "grad_norm": 0.031138982719559682, + "language_loss": 0.88565898, + "learning_rate": 0.0007112306962552973, + "loss": 0.89754546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.88134766, + "step": 1977, + "time_per_iteration": 2.617105007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188488, + "balance_loss_mlp": 1.10055935, + "epoch": 0.3805309734513274, + "flos": 522904946688.0, + "grad_norm": 0.027881475391737562, + "language_loss": 0.92461807, + "learning_rate": 0.0007109482792645896, + "loss": 0.93650293, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.88085938, + "step": 1978, + "time_per_iteration": 2.7350404262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191644, + "balance_loss_mlp": 1.10352468, + "epoch": 0.380723355136591, + "flos": 592552728576.0, + "grad_norm": 0.03010131618310245, + "language_loss": 0.91373634, + "learning_rate": 0.0007106657803764969, + "loss": 0.92565274, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.88183594, + "step": 1979, + "time_per_iteration": 2.7113609313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188099, + "balance_loss_mlp": 1.10007489, + "epoch": 0.38091573682185453, + "flos": 623854344192.0, + "grad_norm": 0.03122566409921124, + "language_loss": 0.90192807, + "learning_rate": 0.0007103831997006948, + "loss": 0.91380906, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.88183594, + "step": 1980, + "time_per_iteration": 2.7460203170776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183293, + "balance_loss_mlp": 1.09507859, + "epoch": 0.3811081185071181, + "flos": 570175641600.0, + "grad_norm": 0.027157726640451497, + "language_loss": 0.92157245, + "learning_rate": 0.0007101005373468908, + "loss": 0.9334054, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.8828125, + "step": 1981, + "time_per_iteration": 2.869722604751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176795, + "balance_loss_mlp": 1.08891392, + "epoch": 0.3813005001923817, + "flos": 585990269952.0, + "grad_norm": 0.026054611177121254, + "language_loss": 0.92786968, + "learning_rate": 0.0007098177934248242, + "loss": 0.9396376, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.88037109, + "step": 1982, + "time_per_iteration": 2.7341668605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179814, + "balance_loss_mlp": 1.09188521, + "epoch": 0.38149288187764524, + "flos": 622810295808.0, + "grad_norm": 0.03120804506271422, + "language_loss": 0.94404829, + "learning_rate": 0.0007095349680442661, + "loss": 0.95584643, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.88085938, + "step": 1983, + "time_per_iteration": 2.845836639404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182966, + "balance_loss_mlp": 1.09522831, + "epoch": 0.3816852635629088, + "flos": 571797109248.0, + "grad_norm": 0.027372063240090748, + "language_loss": 0.86448967, + "learning_rate": 0.0007092520613150188, + "loss": 0.87631935, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.87890625, + "step": 1984, + "time_per_iteration": 2.6740176677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178711, + "balance_loss_mlp": 1.09106863, + "epoch": 0.38187764524817236, + "flos": 566678198784.0, + "grad_norm": 0.03160695384354602, + "language_loss": 0.87573516, + "learning_rate": 0.0007089690733469165, + "loss": 0.88752234, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.87792969, + "step": 1985, + "time_per_iteration": 2.717921733856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178571, + "balance_loss_mlp": 1.09073794, + "epoch": 0.38207002693343595, + "flos": 632398838784.0, + "grad_norm": 0.031031403109496963, + "language_loss": 0.90504575, + "learning_rate": 0.000708686004249825, + "loss": 0.91683149, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.87988281, + "step": 1986, + "time_per_iteration": 2.758554697036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179432, + "balance_loss_mlp": 1.09164619, + "epoch": 0.3822624086186995, + "flos": 549840989184.0, + "grad_norm": 0.025201133141653974, + "language_loss": 0.97533029, + "learning_rate": 0.0007084028541336413, + "loss": 0.98712462, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.87939453, + "step": 1987, + "time_per_iteration": 2.6981115341186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187219, + "balance_loss_mlp": 1.09909916, + "epoch": 0.38245479030396307, + "flos": 615066802176.0, + "grad_norm": 0.02853553744793089, + "language_loss": 0.9291808, + "learning_rate": 0.0007081196231082942, + "loss": 0.94105303, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.8828125, + "step": 1988, + "time_per_iteration": 2.7912278175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186636, + "balance_loss_mlp": 1.09851646, + "epoch": 0.38264717198922665, + "flos": 669303458304.0, + "grad_norm": 0.029318681320032423, + "language_loss": 0.88455558, + "learning_rate": 0.0007078363112837436, + "loss": 0.89642197, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.8828125, + "step": 1989, + "time_per_iteration": 2.8133885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187352, + "balance_loss_mlp": 1.09927964, + "epoch": 0.3828395536744902, + "flos": 455686364160.0, + "grad_norm": 0.029265262626364436, + "language_loss": 0.9249233, + "learning_rate": 0.000707552918769981, + "loss": 0.93679678, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.88232422, + "step": 1990, + "time_per_iteration": 2.538587808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180802, + "balance_loss_mlp": 1.09277809, + "epoch": 0.3830319353597538, + "flos": 500482197504.0, + "grad_norm": 0.02588536582900798, + "language_loss": 0.91112638, + "learning_rate": 0.000707269445677029, + "loss": 0.92293441, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.88183594, + "step": 1991, + "time_per_iteration": 2.7578041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183391, + "balance_loss_mlp": 1.09536684, + "epoch": 0.3832243170450173, + "flos": 745466035200.0, + "grad_norm": 0.02707218781991338, + "language_loss": 0.91718936, + "learning_rate": 0.0007069858921149416, + "loss": 0.92902327, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.88183594, + "step": 1992, + "time_per_iteration": 2.948418617248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184259, + "balance_loss_mlp": 1.09613955, + "epoch": 0.3834166987302809, + "flos": 579345219072.0, + "grad_norm": 0.02587271093699699, + "language_loss": 0.92343616, + "learning_rate": 0.0007067022581938043, + "loss": 0.93527877, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.8828125, + "step": 1993, + "time_per_iteration": 2.881967782974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118782, + "balance_loss_mlp": 1.09965289, + "epoch": 0.3836090804155444, + "flos": 537608397312.0, + "grad_norm": 0.029882536442049617, + "language_loss": 0.91833031, + "learning_rate": 0.0007064185440237334, + "loss": 0.9302085, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.88330078, + "step": 1994, + "time_per_iteration": 2.7481510639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190014, + "balance_loss_mlp": 1.10189474, + "epoch": 0.383801462100808, + "flos": 603051061248.0, + "grad_norm": 0.027232179622410133, + "language_loss": 0.91516536, + "learning_rate": 0.0007061347497148764, + "loss": 0.92706549, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.8828125, + "step": 1995, + "time_per_iteration": 2.762807846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191619, + "balance_loss_mlp": 1.10321367, + "epoch": 0.38399384378607154, + "flos": 573798610944.0, + "grad_norm": 0.03191203592253993, + "language_loss": 0.9478448, + "learning_rate": 0.0007058508753774122, + "loss": 0.95976096, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.88476562, + "step": 1996, + "time_per_iteration": 2.7208473682403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185202, + "balance_loss_mlp": 1.09708297, + "epoch": 0.38418622547133513, + "flos": 537779586048.0, + "grad_norm": 0.03234926235653744, + "language_loss": 0.93760306, + "learning_rate": 0.0007055669211215505, + "loss": 0.94945514, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.8828125, + "step": 1997, + "time_per_iteration": 2.6605474948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182194, + "balance_loss_mlp": 1.09397876, + "epoch": 0.3843786071565987, + "flos": 574013460480.0, + "grad_norm": 0.03558568539094479, + "language_loss": 0.86620909, + "learning_rate": 0.0007052828870575322, + "loss": 0.87803102, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.88378906, + "step": 1998, + "time_per_iteration": 2.6478962898254395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179215, + "balance_loss_mlp": 1.09100008, + "epoch": 0.38457098884186225, + "flos": 730079104512.0, + "grad_norm": 0.027610192556292087, + "language_loss": 0.94167769, + "learning_rate": 0.0007049987732956291, + "loss": 0.95346981, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.88378906, + "step": 1999, + "time_per_iteration": 2.9643850326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190926, + "balance_loss_mlp": 1.10199583, + "epoch": 0.38476337052712584, + "flos": 584620581888.0, + "grad_norm": 0.023866575274933036, + "language_loss": 0.8787694, + "learning_rate": 0.0007047145799461439, + "loss": 0.89067864, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.88720703, + "step": 2000, + "time_per_iteration": 2.8542819023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191076, + "balance_loss_mlp": 1.10200322, + "epoch": 0.38495575221238937, + "flos": 554158898688.0, + "grad_norm": 0.025960095413567152, + "language_loss": 0.89154112, + "learning_rate": 0.00070443030711941, + "loss": 0.90345186, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.88867188, + "step": 2001, + "time_per_iteration": 2.770023822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189246, + "balance_loss_mlp": 1.10084057, + "epoch": 0.38514813389765296, + "flos": 655676983296.0, + "grad_norm": 0.026490656569535233, + "language_loss": 0.88696259, + "learning_rate": 0.0007041459549257924, + "loss": 0.89885509, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.88476562, + "step": 2002, + "time_per_iteration": 4.357714414596558 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_mlp": 1.09392142, + "epoch": 0.3853405155829165, + "flos": 869645913600.0, + "grad_norm": 0.03138294802585753, + "language_loss": 0.86704218, + "learning_rate": 0.0007038615234756859, + "loss": 0.87886453, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.88476562, + "step": 2003, + "time_per_iteration": 3.154315233230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09135854, + "epoch": 0.3855328972681801, + "flos": 547468185600.0, + "grad_norm": 0.030993794918127784, + "language_loss": 0.91032863, + "learning_rate": 0.000703577012879517, + "loss": 0.92212439, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.88378906, + "step": 2004, + "time_per_iteration": 2.6320230960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184907, + "balance_loss_mlp": 1.09673953, + "epoch": 0.3857252789534436, + "flos": 535098607104.0, + "grad_norm": 0.029525133384240967, + "language_loss": 0.9687134, + "learning_rate": 0.0007032924232477423, + "loss": 0.98056245, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.88330078, + "step": 2005, + "time_per_iteration": 2.650982618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184324, + "balance_loss_mlp": 1.09630013, + "epoch": 0.3859176606387072, + "flos": 492766901760.0, + "grad_norm": 0.029334702789067958, + "language_loss": 0.8823278, + "learning_rate": 0.0007030077546908493, + "loss": 0.89417106, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.88183594, + "step": 2006, + "time_per_iteration": 2.642333745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203979, + "balance_loss_mlp": 1.11700439, + "epoch": 0.3861100423239708, + "flos": 1490155991040.0, + "grad_norm": 0.02217822259323008, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84268641, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.87109375, + "step": 2007, + "time_per_iteration": 4.759521961212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184336, + "balance_loss_mlp": 1.09635913, + "epoch": 0.3863024240092343, + "flos": 474692261376.0, + "grad_norm": 0.030825589148035897, + "language_loss": 0.87378025, + "learning_rate": 0.0007024381812438117, + "loss": 0.88562357, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.88134766, + "step": 2008, + "time_per_iteration": 2.5227372646331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184691, + "balance_loss_mlp": 1.09728634, + "epoch": 0.3864948056944979, + "flos": 717978769920.0, + "grad_norm": 0.032935981886219476, + "language_loss": 0.91112518, + "learning_rate": 0.0007021532765747951, + "loss": 0.92297208, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.87548828, + "step": 2009, + "time_per_iteration": 2.963550567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182737, + "balance_loss_mlp": 1.0952853, + "epoch": 0.38668718737976143, + "flos": 728954465280.0, + "grad_norm": 0.030267959416106823, + "language_loss": 0.86631739, + "learning_rate": 0.0007018682934229162, + "loss": 0.87814474, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.87597656, + "step": 2010, + "time_per_iteration": 2.955132246017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179617, + "balance_loss_mlp": 1.09235525, + "epoch": 0.386879569065025, + "flos": 526488984576.0, + "grad_norm": 0.02588052645359636, + "language_loss": 0.89375025, + "learning_rate": 0.0007015832318988152, + "loss": 0.90554643, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.87402344, + "step": 2011, + "time_per_iteration": 2.612443208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117942, + "balance_loss_mlp": 1.09454346, + "epoch": 0.38707195075028855, + "flos": 1530724512768.0, + "grad_norm": 0.010241364382771095, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.75069499, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.84960938, + "step": 2012, + "time_per_iteration": 4.952507495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187813, + "balance_loss_mlp": 1.10040927, + "epoch": 0.38726433243555214, + "flos": 558385483776.0, + "grad_norm": 0.026729103388188073, + "language_loss": 0.89776802, + "learning_rate": 0.0007010128741766604, + "loss": 0.90964615, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.87548828, + "step": 2013, + "time_per_iteration": 2.759916067123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184734, + "balance_loss_mlp": 1.09756815, + "epoch": 0.38745671412081567, + "flos": 554755783680.0, + "grad_norm": 0.0314384592840016, + "language_loss": 0.91517645, + "learning_rate": 0.0007007275782000391, + "loss": 0.92702377, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.87304688, + "step": 2014, + "time_per_iteration": 2.6659133434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181864, + "balance_loss_mlp": 1.09469819, + "epoch": 0.38764909580607926, + "flos": 459344262144.0, + "grad_norm": 0.028810992523736655, + "language_loss": 0.92611015, + "learning_rate": 0.0007004422042940605, + "loss": 0.9379288, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.87304688, + "step": 2015, + "time_per_iteration": 2.4901411533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180932, + "balance_loss_mlp": 1.09376657, + "epoch": 0.38784147749134285, + "flos": 523258784256.0, + "grad_norm": 0.030339968140386194, + "language_loss": 0.98432136, + "learning_rate": 0.0007001567525695169, + "loss": 0.99613065, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.87304688, + "step": 2016, + "time_per_iteration": 2.605134963989258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182969, + "balance_loss_mlp": 1.09575546, + "epoch": 0.3880338591766064, + "flos": 667400011776.0, + "grad_norm": 0.023304348995526428, + "language_loss": 0.90603948, + "learning_rate": 0.0006998712231372303, + "loss": 0.91786909, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.87353516, + "step": 2017, + "time_per_iteration": 2.9866511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187647, + "balance_loss_mlp": 1.10024321, + "epoch": 0.38822624086186996, + "flos": 595175310336.0, + "grad_norm": 0.027834044235160192, + "language_loss": 0.92810535, + "learning_rate": 0.0006995856161080532, + "loss": 0.93998176, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.87548828, + "step": 2018, + "time_per_iteration": 2.8917806148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181908, + "balance_loss_mlp": 1.09426534, + "epoch": 0.3884186225471335, + "flos": 613681651200.0, + "grad_norm": 0.030912624722110756, + "language_loss": 0.90135586, + "learning_rate": 0.0006992999315928679, + "loss": 0.91317499, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.87792969, + "step": 2019, + "time_per_iteration": 2.821570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179846, + "balance_loss_mlp": 1.0924896, + "epoch": 0.3886110042323971, + "flos": 608243831808.0, + "grad_norm": 0.025167723735071885, + "language_loss": 0.91748118, + "learning_rate": 0.0006990141697025871, + "loss": 0.92927969, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.875, + "step": 2020, + "time_per_iteration": 2.774073600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181915, + "balance_loss_mlp": 1.09684753, + "epoch": 0.3888033859176606, + "flos": 1531193869824.0, + "grad_norm": 0.011544022481713089, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77541554, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.8515625, + "step": 2021, + "time_per_iteration": 4.741650581359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174887, + "balance_loss_mlp": 1.08734, + "epoch": 0.3889957676029242, + "flos": 693671313408.0, + "grad_norm": 0.03334226176751645, + "language_loss": 0.90383756, + "learning_rate": 0.0006984424142405392, + "loss": 0.91558647, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.87695312, + "step": 2022, + "time_per_iteration": 2.839838981628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174992, + "balance_loss_mlp": 1.08734977, + "epoch": 0.3891881492881878, + "flos": 516194767872.0, + "grad_norm": 0.031660307701904165, + "language_loss": 0.90829813, + "learning_rate": 0.0006981564208907474, + "loss": 0.92004812, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.87792969, + "step": 2023, + "time_per_iteration": 2.6160523891448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179623, + "balance_loss_mlp": 1.09178972, + "epoch": 0.3893805309734513, + "flos": 630175756800.0, + "grad_norm": 0.02822603249283798, + "language_loss": 0.96692258, + "learning_rate": 0.0006978703506098102, + "loss": 0.97871882, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.87988281, + "step": 2024, + "time_per_iteration": 2.770775556564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177682, + "balance_loss_mlp": 1.08994389, + "epoch": 0.3895729126587149, + "flos": 545206172160.0, + "grad_norm": 0.026225366557941037, + "language_loss": 0.95314252, + "learning_rate": 0.00069758420350879, + "loss": 0.96491939, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.87890625, + "step": 2025, + "time_per_iteration": 2.615687608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179844, + "balance_loss_mlp": 1.09201062, + "epoch": 0.38976529434397844, + "flos": 619406178816.0, + "grad_norm": 0.03181269468531491, + "language_loss": 0.9379099, + "learning_rate": 0.000697297979698779, + "loss": 0.94970834, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.87988281, + "step": 2026, + "time_per_iteration": 2.723860740661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187768, + "balance_loss_mlp": 1.10007727, + "epoch": 0.38995767602924203, + "flos": 836344797696.0, + "grad_norm": 0.025703512313876988, + "language_loss": 0.89683533, + "learning_rate": 0.0006970116792908992, + "loss": 0.90871298, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.87841797, + "step": 2027, + "time_per_iteration": 3.0871434211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117977, + "balance_loss_mlp": 1.09203207, + "epoch": 0.39015005771450556, + "flos": 542646716928.0, + "grad_norm": 0.03022946762166595, + "language_loss": 0.88945854, + "learning_rate": 0.000696725302396302, + "loss": 0.9012562, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.87890625, + "step": 2028, + "time_per_iteration": 2.632178783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174959, + "balance_loss_mlp": 1.0871253, + "epoch": 0.39034243939976915, + "flos": 1009140864000.0, + "grad_norm": 0.026055335602768993, + "language_loss": 0.92111158, + "learning_rate": 0.0006964388491261692, + "loss": 0.93286121, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.87988281, + "step": 2029, + "time_per_iteration": 3.2683680057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174119, + "balance_loss_mlp": 1.08633304, + "epoch": 0.3905348210850327, + "flos": 680240222208.0, + "grad_norm": 0.029787695509808892, + "language_loss": 0.96251416, + "learning_rate": 0.0006961523195917114, + "loss": 0.97425532, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.87939453, + "step": 2030, + "time_per_iteration": 2.807161331176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182527, + "balance_loss_mlp": 1.09459865, + "epoch": 0.39072720277029627, + "flos": 549988709376.0, + "grad_norm": 0.03099080969443711, + "language_loss": 0.86433041, + "learning_rate": 0.0006958657139041696, + "loss": 0.87615567, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.88085938, + "step": 2031, + "time_per_iteration": 2.728208065032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119693, + "balance_loss_mlp": 1.11052704, + "epoch": 0.39091958445555985, + "flos": 1551051159552.0, + "grad_norm": 0.01789751173127641, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77909899, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.86523438, + "step": 2032, + "time_per_iteration": 4.911708354949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179573, + "balance_loss_mlp": 1.09193051, + "epoch": 0.3911119661408234, + "flos": 505051886592.0, + "grad_norm": 0.03095157096826047, + "language_loss": 0.85940099, + "learning_rate": 0.0006952922745149434, + "loss": 0.87119675, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.87792969, + "step": 2033, + "time_per_iteration": 2.649538040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_mlp": 1.08903146, + "epoch": 0.391304347826087, + "flos": 558329088000.0, + "grad_norm": 0.028319463440814277, + "language_loss": 0.94666743, + "learning_rate": 0.000695005441035888, + "loss": 0.95843232, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.87597656, + "step": 2034, + "time_per_iteration": 2.6671407222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178574, + "balance_loss_mlp": 1.09293365, + "epoch": 0.3914967295113505, + "flos": 1502941807104.0, + "grad_norm": 0.0063133772361172544, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7490201, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.85742188, + "step": 2035, + "time_per_iteration": 4.863725423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180506, + "balance_loss_mlp": 1.09338748, + "epoch": 0.3916891111966141, + "flos": 708329101824.0, + "grad_norm": 0.025753563122139746, + "language_loss": 0.86980474, + "learning_rate": 0.0006944315470656863, + "loss": 0.88160974, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.87255859, + "step": 2036, + "time_per_iteration": 2.936588764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188418, + "balance_loss_mlp": 1.10110939, + "epoch": 0.3918814928818776, + "flos": 557408564736.0, + "grad_norm": 0.031943380680049066, + "language_loss": 0.99613088, + "learning_rate": 0.000694144486797345, + "loss": 1.00801504, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.87451172, + "step": 2037, + "time_per_iteration": 2.676107883453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193756, + "balance_loss_mlp": 1.10868835, + "epoch": 0.3920738745671412, + "flos": 1541685471744.0, + "grad_norm": 0.012882287356254449, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.8071419, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.8515625, + "step": 2038, + "time_per_iteration": 4.63246750831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178826, + "balance_loss_mlp": 1.0916127, + "epoch": 0.39226625625240474, + "flos": 499804721664.0, + "grad_norm": 0.027391930017631044, + "language_loss": 0.96627682, + "learning_rate": 0.0006935701402514156, + "loss": 0.97806513, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.87353516, + "step": 2039, + "time_per_iteration": 2.5613086223602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177521, + "balance_loss_mlp": 1.092453, + "epoch": 0.39245863793766833, + "flos": 1350450920448.0, + "grad_norm": 0.011737641894846437, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74212414, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.8515625, + "step": 2040, + "time_per_iteration": 4.902123689651489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176176, + "balance_loss_mlp": 1.08881962, + "epoch": 0.3926510196229319, + "flos": 1348114142208.0, + "grad_norm": 0.028665962134257456, + "language_loss": 0.92107272, + "learning_rate": 0.0006929954931031422, + "loss": 0.93283451, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.875, + "step": 2041, + "time_per_iteration": 3.7387020587921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.08902013, + "epoch": 0.39284340130819545, + "flos": 500603721216.0, + "grad_norm": 0.024641039111334598, + "language_loss": 0.95021844, + "learning_rate": 0.0006927080570819805, + "loss": 0.96198076, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.87353516, + "step": 2042, + "time_per_iteration": 2.5837514400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117531, + "balance_loss_mlp": 1.08814418, + "epoch": 0.39303578299345904, + "flos": 521341876224.0, + "grad_norm": 0.03605238478740547, + "language_loss": 0.89998531, + "learning_rate": 0.0006924205462449161, + "loss": 0.9117384, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.87304688, + "step": 2043, + "time_per_iteration": 2.560842514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.08664155, + "epoch": 0.39322816467872257, + "flos": 909537686016.0, + "grad_norm": 0.029197625514705252, + "language_loss": 0.89668262, + "learning_rate": 0.0006921329607035702, + "loss": 0.90841925, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.87158203, + "step": 2044, + "time_per_iteration": 3.2215418815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185916, + "balance_loss_mlp": 1.09860718, + "epoch": 0.39342054636398616, + "flos": 518641431552.0, + "grad_norm": 0.026194219642157263, + "language_loss": 0.94294739, + "learning_rate": 0.0006918453005695938, + "loss": 0.95480657, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.87451172, + "step": 2045, + "time_per_iteration": 2.637197732925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183114, + "balance_loss_mlp": 1.09594774, + "epoch": 0.3936129280492497, + "flos": 549011790336.0, + "grad_norm": 0.026944227420126074, + "language_loss": 0.91576457, + "learning_rate": 0.0006915575659546662, + "loss": 0.92759573, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.87304688, + "step": 2046, + "time_per_iteration": 2.7570858001708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185485, + "balance_loss_mlp": 1.098176, + "epoch": 0.3938053097345133, + "flos": 527140263936.0, + "grad_norm": 0.02948359624940754, + "language_loss": 0.88347399, + "learning_rate": 0.0006912697569704959, + "loss": 0.89532876, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.87451172, + "step": 2047, + "time_per_iteration": 2.635467290878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186252, + "balance_loss_mlp": 1.09899104, + "epoch": 0.39399769141977686, + "flos": 472588701696.0, + "grad_norm": 0.02995196024762557, + "language_loss": 0.93503523, + "learning_rate": 0.0006909818737288205, + "loss": 0.94689775, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.87402344, + "step": 2048, + "time_per_iteration": 2.558013916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181668, + "balance_loss_mlp": 1.09488404, + "epoch": 0.3941900731050404, + "flos": 502726746624.0, + "grad_norm": 0.02878603575662113, + "language_loss": 0.88763595, + "learning_rate": 0.000690693916341406, + "loss": 0.89945263, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.86914062, + "step": 2049, + "time_per_iteration": 2.5820720195770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178505, + "balance_loss_mlp": 1.09152949, + "epoch": 0.394382454790304, + "flos": 582006732288.0, + "grad_norm": 0.024885306311727563, + "language_loss": 0.90003175, + "learning_rate": 0.0006904058849200475, + "loss": 0.91181684, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.87109375, + "step": 2050, + "time_per_iteration": 2.7304697036743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118427, + "balance_loss_mlp": 1.09700906, + "epoch": 0.3945748364755675, + "flos": 514844545536.0, + "grad_norm": 0.02745844528377672, + "language_loss": 0.91741204, + "learning_rate": 0.0006901177795765683, + "loss": 0.92925465, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.87402344, + "step": 2051, + "time_per_iteration": 2.610621213912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180664, + "balance_loss_mlp": 1.09368861, + "epoch": 0.3947672181608311, + "flos": 595057789440.0, + "grad_norm": 0.03028158635704326, + "language_loss": 0.89240891, + "learning_rate": 0.0006898296004228213, + "loss": 0.90421557, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.87109375, + "step": 2052, + "time_per_iteration": 2.747377395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119046, + "balance_loss_mlp": 1.10634613, + "epoch": 0.39495959984609463, + "flos": 1551049158144.0, + "grad_norm": 0.018267218432335405, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.793172, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.84179688, + "step": 2053, + "time_per_iteration": 4.871596336364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117553, + "balance_loss_mlp": 1.08845937, + "epoch": 0.3951519815313582, + "flos": 497523242496.0, + "grad_norm": 0.028876315996474663, + "language_loss": 0.87133646, + "learning_rate": 0.0006892530211320763, + "loss": 0.88309175, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.87207031, + "step": 2054, + "time_per_iteration": 2.696796417236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117541, + "balance_loss_mlp": 1.08824456, + "epoch": 0.39534436321662175, + "flos": 532222244352.0, + "grad_norm": 0.031248767008087052, + "language_loss": 0.9121244, + "learning_rate": 0.000688964621218926, + "loss": 0.92387855, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.87304688, + "step": 2055, + "time_per_iteration": 2.6398446559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176401, + "balance_loss_mlp": 1.08899677, + "epoch": 0.39553674490188534, + "flos": 703724484096.0, + "grad_norm": 0.031024749515969993, + "language_loss": 0.88066703, + "learning_rate": 0.0006886761479432037, + "loss": 0.89243108, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.87548828, + "step": 2056, + "time_per_iteration": 2.896899700164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184707, + "balance_loss_mlp": 1.09720743, + "epoch": 0.3957291265871489, + "flos": 410656215552.0, + "grad_norm": 0.031805347037857014, + "language_loss": 0.92354834, + "learning_rate": 0.0006883876014169045, + "loss": 0.93539548, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.87646484, + "step": 2057, + "time_per_iteration": 2.49245023727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118858, + "balance_loss_mlp": 1.10108006, + "epoch": 0.39592150827241246, + "flos": 619638492672.0, + "grad_norm": 0.03245947566344542, + "language_loss": 0.97519982, + "learning_rate": 0.000688098981752052, + "loss": 0.98708564, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.87646484, + "step": 2058, + "time_per_iteration": 2.7079999446868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183973, + "balance_loss_mlp": 1.09642518, + "epoch": 0.39611388995767605, + "flos": 822720324096.0, + "grad_norm": 0.029593298786174956, + "language_loss": 0.88381338, + "learning_rate": 0.0006878102890606982, + "loss": 0.89565313, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.87695312, + "step": 2059, + "time_per_iteration": 3.089268922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182646, + "balance_loss_mlp": 1.09524131, + "epoch": 0.3963062716429396, + "flos": 493214065152.0, + "grad_norm": 0.03350279358204369, + "language_loss": 0.88991904, + "learning_rate": 0.0006875215234549239, + "loss": 0.9017455, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.87548828, + "step": 2060, + "time_per_iteration": 2.538806200027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182648, + "balance_loss_mlp": 1.09533882, + "epoch": 0.39649865332820317, + "flos": 585833817600.0, + "grad_norm": 0.030947291001002426, + "language_loss": 0.93147129, + "learning_rate": 0.0006872326850468376, + "loss": 0.9432978, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.87451172, + "step": 2061, + "time_per_iteration": 2.6593003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179357, + "balance_loss_mlp": 1.09214342, + "epoch": 0.3966910350134667, + "flos": 459511448064.0, + "grad_norm": 0.03264577108022065, + "language_loss": 0.89072591, + "learning_rate": 0.0006869437739485762, + "loss": 0.90251946, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.87353516, + "step": 2062, + "time_per_iteration": 2.605191230773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180604, + "balance_loss_mlp": 1.0932951, + "epoch": 0.3968834166987303, + "flos": 509614844928.0, + "grad_norm": 0.02743430972643364, + "language_loss": 0.9889155, + "learning_rate": 0.0006866547902723053, + "loss": 1.00072145, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.87451172, + "step": 2063, + "time_per_iteration": 2.6466383934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178614, + "balance_loss_mlp": 1.09116209, + "epoch": 0.3970757983839938, + "flos": 573742215168.0, + "grad_norm": 0.030016333454088624, + "language_loss": 0.87640852, + "learning_rate": 0.000686365734130218, + "loss": 0.88819462, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.87597656, + "step": 2064, + "time_per_iteration": 2.6795899868011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178875, + "balance_loss_mlp": 1.09161353, + "epoch": 0.3972681800692574, + "flos": 482585476608.0, + "grad_norm": 0.03115409384976, + "language_loss": 0.90479839, + "learning_rate": 0.000686076605634536, + "loss": 0.91658711, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.87402344, + "step": 2065, + "time_per_iteration": 2.6956639289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176026, + "balance_loss_mlp": 1.0887177, + "epoch": 0.397460561754521, + "flos": 488904887808.0, + "grad_norm": 0.028660372999824147, + "language_loss": 0.91924292, + "learning_rate": 0.0006857874048975088, + "loss": 0.93100321, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.87451172, + "step": 2066, + "time_per_iteration": 2.541707992553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182319, + "balance_loss_mlp": 1.09515274, + "epoch": 0.3976529434397845, + "flos": 422895538176.0, + "grad_norm": 0.03007540042591745, + "language_loss": 0.93814421, + "learning_rate": 0.0006854981320314142, + "loss": 0.94996738, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.87304688, + "step": 2067, + "time_per_iteration": 2.455916166305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118284, + "balance_loss_mlp": 1.09586513, + "epoch": 0.3978453251250481, + "flos": 546621522432.0, + "grad_norm": 0.0330596148196893, + "language_loss": 0.94973123, + "learning_rate": 0.0006852087871485579, + "loss": 0.96155965, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.87109375, + "step": 2068, + "time_per_iteration": 2.609492063522339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175372, + "balance_loss_mlp": 1.08801544, + "epoch": 0.39803770681031164, + "flos": 652001620992.0, + "grad_norm": 0.0336676185790188, + "language_loss": 0.8912071, + "learning_rate": 0.0006849193703612735, + "loss": 0.90296078, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.875, + "step": 2069, + "time_per_iteration": 2.816309690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.09071827, + "epoch": 0.39823008849557523, + "flos": 741426101760.0, + "grad_norm": 0.026625397702565265, + "language_loss": 0.84925234, + "learning_rate": 0.0006846298817819225, + "loss": 0.86102879, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.87060547, + "step": 2070, + "time_per_iteration": 2.9875504970550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175555, + "balance_loss_mlp": 1.088485, + "epoch": 0.39842247018083876, + "flos": 385888860672.0, + "grad_norm": 0.03226539532166374, + "language_loss": 0.89664173, + "learning_rate": 0.0006843403215228945, + "loss": 0.90839732, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.87207031, + "step": 2071, + "time_per_iteration": 2.4326088428497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173604, + "balance_loss_mlp": 1.08648539, + "epoch": 0.39861485186610235, + "flos": 534762233856.0, + "grad_norm": 0.028550920618746804, + "language_loss": 0.88238078, + "learning_rate": 0.0006840506896966065, + "loss": 0.89411676, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.87255859, + "step": 2072, + "time_per_iteration": 2.6961326599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_mlp": 1.09084272, + "epoch": 0.39880723355136594, + "flos": 644412578304.0, + "grad_norm": 0.03366874484709253, + "language_loss": 0.90951228, + "learning_rate": 0.0006837609864155038, + "loss": 0.9212895, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.87011719, + "step": 2073, + "time_per_iteration": 2.8584561347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119321, + "balance_loss_mlp": 1.10623515, + "epoch": 0.39899961523662947, + "flos": 516891709440.0, + "grad_norm": 0.031985803275243696, + "language_loss": 0.90341693, + "learning_rate": 0.0006834712117920592, + "loss": 0.91534901, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.87109375, + "step": 2074, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186501, + "balance_loss_mlp": 1.09933496, + "epoch": 0.39919199692189306, + "flos": 465338033664.0, + "grad_norm": 0.0320663192521817, + "language_loss": 0.92968071, + "learning_rate": 0.0006831813659387729, + "loss": 0.94154572, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.87304688, + "step": 2075, + "time_per_iteration": 2.5216238498687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184926, + "balance_loss_mlp": 1.09785569, + "epoch": 0.3993843786071566, + "flos": 532678139904.0, + "grad_norm": 0.03441409861038799, + "language_loss": 0.91210699, + "learning_rate": 0.0006828914489681733, + "loss": 0.92395616, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.87207031, + "step": 2076, + "time_per_iteration": 2.686810255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186966, + "balance_loss_mlp": 1.10008633, + "epoch": 0.3995767602924202, + "flos": 505023688704.0, + "grad_norm": 0.02837279486305722, + "language_loss": 0.91445708, + "learning_rate": 0.0006826014609928162, + "loss": 0.92632675, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.87011719, + "step": 2077, + "time_per_iteration": 2.6775381565093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225517, + "balance_loss_mlp": 1.13892365, + "epoch": 0.3997691419776837, + "flos": 1457471225856.0, + "grad_norm": 0.023004253676312834, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84424907, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.8671875, + "step": 2078, + "time_per_iteration": 4.87092661857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117794, + "balance_loss_mlp": 1.09134626, + "epoch": 0.3999615236629473, + "flos": 531755615232.0, + "grad_norm": 0.028989200184594895, + "language_loss": 0.86860782, + "learning_rate": 0.0006820212724781896, + "loss": 0.88038719, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.8671875, + "step": 2079, + "time_per_iteration": 2.6908116340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176834, + "balance_loss_mlp": 1.09033561, + "epoch": 0.4001539053482108, + "flos": 696361024512.0, + "grad_norm": 0.02837619494351951, + "language_loss": 0.90808308, + "learning_rate": 0.0006817310721641694, + "loss": 0.91985142, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.86621094, + "step": 2080, + "time_per_iteration": 2.8117949962615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190878, + "balance_loss_mlp": 1.10437989, + "epoch": 0.4003462870334744, + "flos": 521378806272.0, + "grad_norm": 0.0346474179870518, + "language_loss": 0.91806537, + "learning_rate": 0.00068144080129589, + "loss": 0.9299742, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.86621094, + "step": 2081, + "time_per_iteration": 2.596397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190824, + "balance_loss_mlp": 1.10422993, + "epoch": 0.400538668718738, + "flos": 493502774784.0, + "grad_norm": 0.03225854359639043, + "language_loss": 0.90241659, + "learning_rate": 0.0006811504599860441, + "loss": 0.91432476, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.8671875, + "step": 2082, + "time_per_iteration": 2.5100014209747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187111, + "balance_loss_mlp": 1.10075557, + "epoch": 0.40073105040400153, + "flos": 491451608064.0, + "grad_norm": 0.02371927790759806, + "language_loss": 0.91368544, + "learning_rate": 0.0006808600483473526, + "loss": 0.92555654, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.86474609, + "step": 2083, + "time_per_iteration": 2.9103221893310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178586, + "balance_loss_mlp": 1.0923264, + "epoch": 0.4009234320892651, + "flos": 563539322880.0, + "grad_norm": 0.025152017879447597, + "language_loss": 0.9285866, + "learning_rate": 0.0006805695664925629, + "loss": 0.94037247, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.86376953, + "step": 2084, + "time_per_iteration": 2.804859161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170802, + "balance_loss_mlp": 1.08444667, + "epoch": 0.40111581377452865, + "flos": 426852879360.0, + "grad_norm": 0.029415551527707178, + "language_loss": 0.90934992, + "learning_rate": 0.0006802790145344506, + "loss": 0.92105794, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.86474609, + "step": 2085, + "time_per_iteration": 2.476952075958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117314, + "balance_loss_mlp": 1.0870235, + "epoch": 0.40130819545979224, + "flos": 613642719744.0, + "grad_norm": 0.028611036161279673, + "language_loss": 0.93620002, + "learning_rate": 0.0006799883925858176, + "loss": 0.94793141, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.86230469, + "step": 2086, + "time_per_iteration": 2.8800101280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118738, + "balance_loss_mlp": 1.10112, + "epoch": 0.40150057714505577, + "flos": 524450552832.0, + "grad_norm": 0.02956813955479834, + "language_loss": 0.92602348, + "learning_rate": 0.0006796977007594933, + "loss": 0.93789732, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.86376953, + "step": 2087, + "time_per_iteration": 2.6013576984405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191969, + "balance_loss_mlp": 1.10537529, + "epoch": 0.40169295883031936, + "flos": 562553671680.0, + "grad_norm": 0.03319927890150985, + "language_loss": 0.92797327, + "learning_rate": 0.0006794069391683345, + "loss": 0.93989295, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.8671875, + "step": 2088, + "time_per_iteration": 2.7359838485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177019, + "balance_loss_mlp": 1.09095037, + "epoch": 0.4018853405155829, + "flos": 520019851776.0, + "grad_norm": 0.03157379152927814, + "language_loss": 0.87612534, + "learning_rate": 0.0006791161079252248, + "loss": 0.88789552, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.86181641, + "step": 2089, + "time_per_iteration": 2.596851348876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118277, + "balance_loss_mlp": 1.09655797, + "epoch": 0.4020777222008465, + "flos": 527287984128.0, + "grad_norm": 0.02654740933555753, + "language_loss": 0.89437628, + "learning_rate": 0.0006788252071430747, + "loss": 0.90620387, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.86328125, + "step": 2090, + "time_per_iteration": 2.8311312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184846, + "balance_loss_mlp": 1.09853876, + "epoch": 0.40227010388611006, + "flos": 526840820736.0, + "grad_norm": 0.026844852664274194, + "language_loss": 0.92195117, + "learning_rate": 0.0006785342369348222, + "loss": 0.93379962, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.86425781, + "step": 2091, + "time_per_iteration": 2.7458736896514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191242, + "balance_loss_mlp": 1.10488725, + "epoch": 0.4024624855713736, + "flos": 433226684928.0, + "grad_norm": 0.031284534475277, + "language_loss": 0.86698365, + "learning_rate": 0.0006782431974134316, + "loss": 0.87889606, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.86474609, + "step": 2092, + "time_per_iteration": 2.607151985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176082, + "balance_loss_mlp": 1.08996522, + "epoch": 0.4026548672566372, + "flos": 768090898944.0, + "grad_norm": 0.02657615147076362, + "language_loss": 0.96284211, + "learning_rate": 0.0006779520886918949, + "loss": 0.97460294, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.86230469, + "step": 2093, + "time_per_iteration": 3.03474760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173331, + "balance_loss_mlp": 1.08711922, + "epoch": 0.4028472489419007, + "flos": 644117137920.0, + "grad_norm": 0.02625373299959776, + "language_loss": 0.87827718, + "learning_rate": 0.0006776609108832301, + "loss": 0.89001048, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.86328125, + "step": 2094, + "time_per_iteration": 2.7667970657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171496, + "balance_loss_mlp": 1.08537877, + "epoch": 0.4030396306271643, + "flos": 492823297536.0, + "grad_norm": 0.02676539061642846, + "language_loss": 0.91710174, + "learning_rate": 0.0006773696641004828, + "loss": 0.92881668, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.86230469, + "step": 2095, + "time_per_iteration": 2.6013715267181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177786, + "balance_loss_mlp": 1.09119189, + "epoch": 0.40323201231242783, + "flos": 903194079744.0, + "grad_norm": 0.03019422222161545, + "language_loss": 0.84170926, + "learning_rate": 0.0006770783484567247, + "loss": 0.85348713, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.8671875, + "step": 2096, + "time_per_iteration": 3.1032629013061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180554, + "balance_loss_mlp": 1.09405565, + "epoch": 0.4034243939976914, + "flos": 571729979904.0, + "grad_norm": 0.026575026001379017, + "language_loss": 0.91571426, + "learning_rate": 0.000676786964065055, + "loss": 0.9275198, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.86621094, + "step": 2097, + "time_per_iteration": 2.8030343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179089, + "balance_loss_mlp": 1.09254348, + "epoch": 0.403616775682955, + "flos": 508460006400.0, + "grad_norm": 0.029415731928054877, + "language_loss": 0.85702783, + "learning_rate": 0.0006764955110385986, + "loss": 0.86881876, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.86669922, + "step": 2098, + "time_per_iteration": 2.7224180698394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175119, + "balance_loss_mlp": 1.08857322, + "epoch": 0.40380915736821854, + "flos": 520410619392.0, + "grad_norm": 0.02850929110585318, + "language_loss": 0.87608683, + "learning_rate": 0.0006762039894905083, + "loss": 0.88783801, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.86669922, + "step": 2099, + "time_per_iteration": 2.5972354412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08313072, + "epoch": 0.40400153905348213, + "flos": 442887086592.0, + "grad_norm": 0.05130464738927161, + "language_loss": 0.88512945, + "learning_rate": 0.000675912399533962, + "loss": 0.89682674, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.8671875, + "step": 2100, + "time_per_iteration": 2.502772808074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168649, + "balance_loss_mlp": 1.08210301, + "epoch": 0.40419392073874566, + "flos": 773704636416.0, + "grad_norm": 0.02210637201548751, + "language_loss": 0.90372586, + "learning_rate": 0.0006756207412821656, + "loss": 0.91541237, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.86669922, + "step": 2101, + "time_per_iteration": 2.991191864013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169884, + "balance_loss_mlp": 1.08319497, + "epoch": 0.40438630242400925, + "flos": 767988840960.0, + "grad_norm": 0.03154624750871164, + "language_loss": 0.88513219, + "learning_rate": 0.0006753290148483505, + "loss": 0.89683104, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.86816406, + "step": 2102, + "time_per_iteration": 3.005350112915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166151, + "balance_loss_mlp": 1.07950926, + "epoch": 0.4045786841092728, + "flos": 416128963584.0, + "grad_norm": 0.026413403572192035, + "language_loss": 0.86387646, + "learning_rate": 0.0006750372203457752, + "loss": 0.87553799, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.86767578, + "step": 2103, + "time_per_iteration": 2.4381816387176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168631, + "balance_loss_mlp": 1.08203721, + "epoch": 0.40477106579453637, + "flos": 540308841984.0, + "grad_norm": 0.025857351914300337, + "language_loss": 0.93101668, + "learning_rate": 0.0006747453578877242, + "loss": 0.94270301, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.8671875, + "step": 2104, + "time_per_iteration": 2.7268197536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169336, + "balance_loss_mlp": 1.08269489, + "epoch": 0.4049634474797999, + "flos": 828091014144.0, + "grad_norm": 0.03225143111931073, + "language_loss": 0.91022515, + "learning_rate": 0.0006744534275875085, + "loss": 0.92191851, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.86767578, + "step": 2105, + "time_per_iteration": 3.0087900161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176017, + "balance_loss_mlp": 1.08970928, + "epoch": 0.4051558291650635, + "flos": 573752948736.0, + "grad_norm": 0.02821186929772288, + "language_loss": 0.92500931, + "learning_rate": 0.0006741614295584657, + "loss": 0.93676949, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.86425781, + "step": 2106, + "time_per_iteration": 2.666135787963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183174, + "balance_loss_mlp": 1.09691453, + "epoch": 0.4053482108503271, + "flos": 733244176896.0, + "grad_norm": 0.04647201706044112, + "language_loss": 0.85025966, + "learning_rate": 0.0006738693639139595, + "loss": 0.86209136, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.86376953, + "step": 2107, + "time_per_iteration": 2.9633677005767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177787, + "balance_loss_mlp": 1.09100294, + "epoch": 0.4055405925355906, + "flos": 1214949336576.0, + "grad_norm": 0.0302025425082437, + "language_loss": 0.85097325, + "learning_rate": 0.0006735772307673796, + "loss": 0.86275113, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.86914062, + "step": 2108, + "time_per_iteration": 3.5333871841430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177556, + "balance_loss_mlp": 1.09105742, + "epoch": 0.4057329742208542, + "flos": 717107911680.0, + "grad_norm": 0.026166055652869804, + "language_loss": 0.8899157, + "learning_rate": 0.0006732850302321421, + "loss": 0.90169132, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.86621094, + "step": 2109, + "time_per_iteration": 2.8610079288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170548, + "balance_loss_mlp": 1.0842886, + "epoch": 0.4059253559061177, + "flos": 565953059328.0, + "grad_norm": 0.026405563608612303, + "language_loss": 0.90377712, + "learning_rate": 0.00067299276242169, + "loss": 0.91548264, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.86376953, + "step": 2110, + "time_per_iteration": 2.709127426147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197311, + "balance_loss_mlp": 1.11319733, + "epoch": 0.4061177375913813, + "flos": 1597186481664.0, + "grad_norm": 0.02594110918583908, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75579476, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.84179688, + "step": 2111, + "time_per_iteration": 4.906593322753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117304, + "balance_loss_mlp": 1.08654153, + "epoch": 0.40631011927664484, + "flos": 616621140480.0, + "grad_norm": 0.028870166263774127, + "language_loss": 0.85570323, + "learning_rate": 0.0006724080254290395, + "loss": 0.86743361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.86621094, + "step": 2112, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168033, + "balance_loss_mlp": 1.08134389, + "epoch": 0.40650250096190843, + "flos": 558748053504.0, + "grad_norm": 0.030551496532206422, + "language_loss": 0.96733952, + "learning_rate": 0.0006721155564738566, + "loss": 0.97901982, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.86816406, + "step": 2113, + "time_per_iteration": 2.6917896270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174904, + "balance_loss_mlp": 1.08964539, + "epoch": 0.40669488264717196, + "flos": 1583542542336.0, + "grad_norm": 0.010618058744132962, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79797542, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.85351562, + "step": 2114, + "time_per_iteration": 4.959328651428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.08476496, + "epoch": 0.40688726433243555, + "flos": 508655390208.0, + "grad_norm": 0.033503716654157654, + "language_loss": 0.93188733, + "learning_rate": 0.0006715304182135078, + "loss": 0.9436028, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.86914062, + "step": 2115, + "time_per_iteration": 2.6056840419769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172073, + "balance_loss_mlp": 1.08528888, + "epoch": 0.40707964601769914, + "flos": 590351840256.0, + "grad_norm": 0.028307470802153102, + "language_loss": 0.95287716, + "learning_rate": 0.0006712377491355127, + "loss": 0.96459788, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.86914062, + "step": 2116, + "time_per_iteration": 2.8985562324523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177825, + "balance_loss_mlp": 1.09146965, + "epoch": 0.40727202770296267, + "flos": 581650893312.0, + "grad_norm": 0.026081347286493965, + "language_loss": 0.86969304, + "learning_rate": 0.0006709450135771274, + "loss": 0.88147128, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.86474609, + "step": 2117, + "time_per_iteration": 2.938913345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116718, + "balance_loss_mlp": 1.08058655, + "epoch": 0.40746440938822626, + "flos": 505108282368.0, + "grad_norm": 0.02500723808493834, + "language_loss": 0.92501736, + "learning_rate": 0.0006706522116520023, + "loss": 0.93668914, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.8671875, + "step": 2118, + "time_per_iteration": 2.6295557022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169158, + "balance_loss_mlp": 1.08246934, + "epoch": 0.4076567910734898, + "flos": 606710960640.0, + "grad_norm": 0.031046149511695622, + "language_loss": 0.91392642, + "learning_rate": 0.0006703593434738127, + "loss": 0.92561805, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.86816406, + "step": 2119, + "time_per_iteration": 2.6925787925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170168, + "balance_loss_mlp": 1.08371782, + "epoch": 0.4078491727587534, + "flos": 480518846976.0, + "grad_norm": 0.026436329156680958, + "language_loss": 0.85361552, + "learning_rate": 0.0006700664091562604, + "loss": 0.86531723, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.86572266, + "step": 2120, + "time_per_iteration": 2.567094087600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177249, + "balance_loss_mlp": 1.09065557, + "epoch": 0.4080415544440169, + "flos": 511418961408.0, + "grad_norm": 0.02549175858454111, + "language_loss": 0.92328954, + "learning_rate": 0.0006697734088130725, + "loss": 0.93506193, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.8671875, + "step": 2121, + "time_per_iteration": 2.618701934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175348, + "balance_loss_mlp": 1.0889926, + "epoch": 0.4082339361292805, + "flos": 735927157248.0, + "grad_norm": 0.030272250235271202, + "language_loss": 0.93378723, + "learning_rate": 0.0006694803425580018, + "loss": 0.94554067, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.86474609, + "step": 2122, + "time_per_iteration": 2.983313798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174826, + "balance_loss_mlp": 1.08851826, + "epoch": 0.4084263178145441, + "flos": 458404273152.0, + "grad_norm": 0.031322708915370194, + "language_loss": 0.925843, + "learning_rate": 0.0006691872105048268, + "loss": 0.93759131, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.86425781, + "step": 2123, + "time_per_iteration": 2.570157766342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171971, + "balance_loss_mlp": 1.08566332, + "epoch": 0.4086186994998076, + "flos": 564025417728.0, + "grad_norm": 0.026602974246623758, + "language_loss": 0.91457534, + "learning_rate": 0.0006688940127673513, + "loss": 0.92629504, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.86425781, + "step": 2124, + "time_per_iteration": 2.6775970458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172213, + "balance_loss_mlp": 1.08609629, + "epoch": 0.4088110811850712, + "flos": 574893050880.0, + "grad_norm": 0.023493992507127005, + "language_loss": 0.90594321, + "learning_rate": 0.0006686007494594049, + "loss": 0.91766536, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.86230469, + "step": 2125, + "time_per_iteration": 2.8212904930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166923, + "balance_loss_mlp": 1.08028209, + "epoch": 0.40900346287033473, + "flos": 457846319616.0, + "grad_norm": 0.03600016157180187, + "language_loss": 0.89846623, + "learning_rate": 0.0006683074206948425, + "loss": 0.91013545, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.86767578, + "step": 2126, + "time_per_iteration": 2.4914121627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165958, + "balance_loss_mlp": 1.07926905, + "epoch": 0.4091958445555983, + "flos": 618594444288.0, + "grad_norm": 0.027616550174826966, + "language_loss": 0.88032037, + "learning_rate": 0.0006680140265875443, + "loss": 0.89197993, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.86816406, + "step": 2127, + "time_per_iteration": 2.8309690952301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164825, + "balance_loss_mlp": 1.07846975, + "epoch": 0.40938822624086185, + "flos": 473370236928.0, + "grad_norm": 0.02755246393115647, + "language_loss": 1.01638341, + "learning_rate": 0.0006677205672514162, + "loss": 1.02803159, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.86474609, + "step": 2128, + "time_per_iteration": 2.716601610183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170358, + "balance_loss_mlp": 1.08395457, + "epoch": 0.40958060792612544, + "flos": 571117632000.0, + "grad_norm": 0.024298637355030545, + "language_loss": 0.93714547, + "learning_rate": 0.000667427042800389, + "loss": 0.94884908, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.86523438, + "step": 2129, + "time_per_iteration": 2.7863857746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181584, + "balance_loss_mlp": 1.09499085, + "epoch": 0.40977298961138897, + "flos": 610470916608.0, + "grad_norm": 0.027297656005279614, + "language_loss": 0.89951032, + "learning_rate": 0.0006671334533484192, + "loss": 0.91132617, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.8671875, + "step": 2130, + "time_per_iteration": 2.7272608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177813, + "balance_loss_mlp": 1.09160113, + "epoch": 0.40996537129665256, + "flos": 582872861184.0, + "grad_norm": 0.02438545141207517, + "language_loss": 0.89143705, + "learning_rate": 0.0006668397990094881, + "loss": 0.90321517, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.86328125, + "step": 2131, + "time_per_iteration": 2.74776554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173755, + "balance_loss_mlp": 1.08739984, + "epoch": 0.41015775298191615, + "flos": 517553722368.0, + "grad_norm": 0.026155362463659675, + "language_loss": 0.91776133, + "learning_rate": 0.0006665460798976027, + "loss": 0.92949885, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.86474609, + "step": 2132, + "time_per_iteration": 2.728180170059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172912, + "balance_loss_mlp": 1.08679533, + "epoch": 0.4103501346671797, + "flos": 511445157888.0, + "grad_norm": 0.02671704384652658, + "language_loss": 0.87880147, + "learning_rate": 0.0006662522961267947, + "loss": 0.89053059, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.86230469, + "step": 2133, + "time_per_iteration": 2.6707494258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172576, + "balance_loss_mlp": 1.08636391, + "epoch": 0.41054251635244327, + "flos": 550926696960.0, + "grad_norm": 0.02310158230225749, + "language_loss": 0.93120432, + "learning_rate": 0.0006659584478111211, + "loss": 0.9429301, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.86328125, + "step": 2134, + "time_per_iteration": 2.7634923458099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167834, + "balance_loss_mlp": 1.08162224, + "epoch": 0.4107348980377068, + "flos": 841298523648.0, + "grad_norm": 0.0323112144897684, + "language_loss": 0.91370595, + "learning_rate": 0.000665664535064664, + "loss": 0.9253844, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.86328125, + "step": 2135, + "time_per_iteration": 3.028343677520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170594, + "balance_loss_mlp": 1.08447671, + "epoch": 0.4109272797229704, + "flos": 504763176960.0, + "grad_norm": 0.026958983372987907, + "language_loss": 0.8977797, + "learning_rate": 0.0006653705580015303, + "loss": 0.90948564, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.86230469, + "step": 2136, + "time_per_iteration": 2.6786246299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173433, + "balance_loss_mlp": 1.08731592, + "epoch": 0.4111196614082339, + "flos": 612023253504.0, + "grad_norm": 0.02687154551301225, + "language_loss": 0.92936879, + "learning_rate": 0.0006650765167358523, + "loss": 0.9411031, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.86230469, + "step": 2137, + "time_per_iteration": 2.765503168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170304, + "balance_loss_mlp": 1.08409154, + "epoch": 0.4113120430934975, + "flos": 454103827968.0, + "grad_norm": 0.029691236683527498, + "language_loss": 0.97143424, + "learning_rate": 0.0006647824113817864, + "loss": 0.98313725, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.86328125, + "step": 2138, + "time_per_iteration": 2.490111827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179698, + "balance_loss_mlp": 1.09329462, + "epoch": 0.41150442477876104, + "flos": 542709843456.0, + "grad_norm": 0.027637209651618533, + "language_loss": 0.88423729, + "learning_rate": 0.000664488242053515, + "loss": 0.89603424, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.86523438, + "step": 2139, + "time_per_iteration": 2.7109243869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193887, + "balance_loss_mlp": 1.10748434, + "epoch": 0.4116968064640246, + "flos": 577391380992.0, + "grad_norm": 0.026757188222196804, + "language_loss": 0.8939023, + "learning_rate": 0.0006641940088652445, + "loss": 0.90584123, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.86523438, + "step": 2140, + "time_per_iteration": 2.7461891174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186164, + "balance_loss_mlp": 1.09952235, + "epoch": 0.4118891881492882, + "flos": 497149939200.0, + "grad_norm": 0.030186458882164903, + "language_loss": 0.90177953, + "learning_rate": 0.0006638997119312065, + "loss": 0.91364121, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.86767578, + "step": 2141, + "time_per_iteration": 2.7632482051849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206482, + "balance_loss_mlp": 1.11969757, + "epoch": 0.41208156983455174, + "flos": 1541570678784.0, + "grad_norm": 0.01865751049600735, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76269788, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.86914062, + "step": 2142, + "time_per_iteration": 4.916187286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117268, + "balance_loss_mlp": 1.0864203, + "epoch": 0.41227395151981533, + "flos": 586057399296.0, + "grad_norm": 0.03006664462158482, + "language_loss": 0.91539335, + "learning_rate": 0.000663310927282877, + "loss": 0.92712009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.86376953, + "step": 2143, + "time_per_iteration": 2.783862829208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178317, + "balance_loss_mlp": 1.09220016, + "epoch": 0.41246633320507886, + "flos": 443892203520.0, + "grad_norm": 0.03021664461702893, + "language_loss": 0.92787349, + "learning_rate": 0.000663016439797172, + "loss": 0.93965667, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.86230469, + "step": 2144, + "time_per_iteration": 2.617626428604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177938, + "balance_loss_mlp": 1.09177303, + "epoch": 0.41265871489034245, + "flos": 581094941184.0, + "grad_norm": 0.031114344129188405, + "language_loss": 0.87895894, + "learning_rate": 0.0006627218890228724, + "loss": 0.89073837, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.86279297, + "step": 2145, + "time_per_iteration": 2.823136329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172469, + "balance_loss_mlp": 1.08611357, + "epoch": 0.412851096575606, + "flos": 762528827904.0, + "grad_norm": 0.03009040753958223, + "language_loss": 0.9065426, + "learning_rate": 0.0006624272750743326, + "loss": 0.91826725, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.86474609, + "step": 2146, + "time_per_iteration": 3.009969472885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172508, + "balance_loss_mlp": 1.08615267, + "epoch": 0.41304347826086957, + "flos": 556520968704.0, + "grad_norm": 0.023356325653820006, + "language_loss": 0.88529593, + "learning_rate": 0.0006621325980659322, + "loss": 0.89702094, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.86474609, + "step": 2147, + "time_per_iteration": 2.7459471225738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176953, + "balance_loss_mlp": 1.09083641, + "epoch": 0.41323585994613315, + "flos": 666893724672.0, + "grad_norm": 0.029406479855093332, + "language_loss": 0.8760705, + "learning_rate": 0.000661837858112075, + "loss": 0.88783997, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.86230469, + "step": 2148, + "time_per_iteration": 2.816408634185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173515, + "balance_loss_mlp": 1.08763647, + "epoch": 0.4134282416313967, + "flos": 549784593408.0, + "grad_norm": 0.02816234486414791, + "language_loss": 0.9661653, + "learning_rate": 0.0006615430553271888, + "loss": 0.97790039, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.85986328, + "step": 2149, + "time_per_iteration": 2.7518115043640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174425, + "balance_loss_mlp": 1.08859468, + "epoch": 0.4136206233166603, + "flos": 647512522752.0, + "grad_norm": 0.025697121170903614, + "language_loss": 0.9133321, + "learning_rate": 0.0006612481898257264, + "loss": 0.92507643, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.859375, + "step": 2150, + "time_per_iteration": 2.841632127761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179143, + "balance_loss_mlp": 1.09364581, + "epoch": 0.4138130050019238, + "flos": 518363455488.0, + "grad_norm": 0.029278566016903075, + "language_loss": 0.9170779, + "learning_rate": 0.000660953261722165, + "loss": 0.92886931, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.85595703, + "step": 2151, + "time_per_iteration": 2.6203365325927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178641, + "balance_loss_mlp": 1.09309638, + "epoch": 0.4140053866871874, + "flos": 610368858624.0, + "grad_norm": 0.02858072061503926, + "language_loss": 0.90138143, + "learning_rate": 0.0006606582711310055, + "loss": 0.91316783, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.85644531, + "step": 2152, + "time_per_iteration": 2.71352481842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167147, + "balance_loss_mlp": 1.08103001, + "epoch": 0.4141977683724509, + "flos": 580845163008.0, + "grad_norm": 0.02998636441804494, + "language_loss": 0.9075436, + "learning_rate": 0.0006603632181667736, + "loss": 0.91921502, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.86230469, + "step": 2153, + "time_per_iteration": 2.766855478286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175224, + "balance_loss_mlp": 1.09034729, + "epoch": 0.4143901500577145, + "flos": 1310176386048.0, + "grad_norm": 0.007725969282803628, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.80118549, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.84960938, + "step": 2154, + "time_per_iteration": 4.895019292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175162, + "balance_loss_mlp": 1.08890247, + "epoch": 0.41458253174297804, + "flos": 461122182144.0, + "grad_norm": 0.032062709167589486, + "language_loss": 0.89760709, + "learning_rate": 0.0006597729255773153, + "loss": 0.90935868, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.86376953, + "step": 2155, + "time_per_iteration": 2.5811779499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170487, + "balance_loss_mlp": 1.08413148, + "epoch": 0.41477491342824163, + "flos": 554438876160.0, + "grad_norm": 0.02646748417883587, + "language_loss": 0.88947552, + "learning_rate": 0.0006594776861812608, + "loss": 0.90118033, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.86474609, + "step": 2156, + "time_per_iteration": 2.6486780643463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174434, + "balance_loss_mlp": 1.08803129, + "epoch": 0.4149672951135052, + "flos": 699085664256.0, + "grad_norm": 0.02893226937169889, + "language_loss": 0.92862517, + "learning_rate": 0.0006591823848704776, + "loss": 0.94036949, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.86523438, + "step": 2157, + "time_per_iteration": 2.9617741107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175673, + "balance_loss_mlp": 1.08946109, + "epoch": 0.41515967679876875, + "flos": 566836652544.0, + "grad_norm": 0.025963915394380376, + "language_loss": 0.87666786, + "learning_rate": 0.0006588870217596117, + "loss": 0.88842458, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.86328125, + "step": 2158, + "time_per_iteration": 2.7438344955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175578, + "balance_loss_mlp": 1.08927035, + "epoch": 0.41535205848403234, + "flos": 502177525248.0, + "grad_norm": 0.03336248103115958, + "language_loss": 0.93542749, + "learning_rate": 0.0006585915969633334, + "loss": 0.94718325, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.86425781, + "step": 2159, + "time_per_iteration": 2.5621583461761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170555, + "balance_loss_mlp": 1.08429492, + "epoch": 0.41554444016929587, + "flos": 608701728768.0, + "grad_norm": 0.03070944646834424, + "language_loss": 0.95915914, + "learning_rate": 0.0006582961105963366, + "loss": 0.97086465, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.86376953, + "step": 2160, + "time_per_iteration": 2.798051118850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171192, + "balance_loss_mlp": 1.08498013, + "epoch": 0.41573682185455946, + "flos": 530155614720.0, + "grad_norm": 0.02743693152360054, + "language_loss": 0.85023397, + "learning_rate": 0.0006580005627733395, + "loss": 0.86194587, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.86328125, + "step": 2161, + "time_per_iteration": 2.6954233646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168175, + "balance_loss_mlp": 1.08234429, + "epoch": 0.415929203539823, + "flos": 506037537792.0, + "grad_norm": 0.027357224978205523, + "language_loss": 0.88365781, + "learning_rate": 0.0006577049536090838, + "loss": 0.89533949, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.859375, + "step": 2162, + "time_per_iteration": 2.6762402057647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167145, + "balance_loss_mlp": 1.08140957, + "epoch": 0.4161215852250866, + "flos": 583823583744.0, + "grad_norm": 0.02816159229600616, + "language_loss": 0.92433643, + "learning_rate": 0.000657409283218335, + "loss": 0.93600792, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.85839844, + "step": 2163, + "time_per_iteration": 2.708815574645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116891, + "balance_loss_mlp": 1.0833174, + "epoch": 0.4163139669103501, + "flos": 491759783424.0, + "grad_norm": 0.02622965675004396, + "language_loss": 0.87195617, + "learning_rate": 0.0006571135517158829, + "loss": 0.8836453, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.85693359, + "step": 2164, + "time_per_iteration": 2.7412045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177162, + "balance_loss_mlp": 1.0930481, + "epoch": 0.4165063485956137, + "flos": 1291020767232.0, + "grad_norm": 0.0113690904759025, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77941221, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.84179688, + "step": 2165, + "time_per_iteration": 4.793722867965698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172447, + "balance_loss_mlp": 1.08680665, + "epoch": 0.4166987302808773, + "flos": 496257613824.0, + "grad_norm": 0.031372404533623194, + "language_loss": 0.90335643, + "learning_rate": 0.0006565219058351444, + "loss": 0.9150809, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.85742188, + "step": 2166, + "time_per_iteration": 2.5605039596557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169955, + "balance_loss_mlp": 1.08412397, + "epoch": 0.4168911119661408, + "flos": 465066788352.0, + "grad_norm": 0.02745374217966413, + "language_loss": 0.89900762, + "learning_rate": 0.0006562259916865553, + "loss": 0.91070712, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.859375, + "step": 2167, + "time_per_iteration": 2.5815963745117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011761, + "balance_loss_mlp": 1.09055507, + "epoch": 0.4170834936514044, + "flos": 537942769152.0, + "grad_norm": 0.0279390150832869, + "language_loss": 0.86569649, + "learning_rate": 0.0006559300168856573, + "loss": 0.8774575, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.85644531, + "step": 2168, + "time_per_iteration": 2.7917275428771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181119, + "balance_loss_mlp": 1.09547901, + "epoch": 0.41727587533666793, + "flos": 551749165056.0, + "grad_norm": 0.026888463962073755, + "language_loss": 0.92254919, + "learning_rate": 0.0006556339815473577, + "loss": 0.93436038, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.85742188, + "step": 2169, + "time_per_iteration": 2.640456438064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170658, + "balance_loss_mlp": 1.08492219, + "epoch": 0.4174682570219315, + "flos": 632377371648.0, + "grad_norm": 0.027558904728032622, + "language_loss": 0.91870886, + "learning_rate": 0.000655337885786588, + "loss": 0.93041539, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.85839844, + "step": 2170, + "time_per_iteration": 2.885754108428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170686, + "balance_loss_mlp": 1.08485556, + "epoch": 0.41766063870719505, + "flos": 520755724800.0, + "grad_norm": 0.031037248087189308, + "language_loss": 0.9245193, + "learning_rate": 0.0006550417297183025, + "loss": 0.93622619, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.859375, + "step": 2171, + "time_per_iteration": 2.607590436935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175474, + "balance_loss_mlp": 1.08945298, + "epoch": 0.41785302039245864, + "flos": 559054227456.0, + "grad_norm": 0.02737354340834092, + "language_loss": 0.87721866, + "learning_rate": 0.0006547455134574793, + "loss": 0.88897336, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.86132812, + "step": 2172, + "time_per_iteration": 2.7324562072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184116, + "balance_loss_mlp": 1.09833348, + "epoch": 0.41804540207772223, + "flos": 790027553280.0, + "grad_norm": 0.06230752646239431, + "language_loss": 0.90406793, + "learning_rate": 0.0006544492371191198, + "loss": 0.91590911, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.85888672, + "step": 2173, + "time_per_iteration": 3.1248764991760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186676, + "balance_loss_mlp": 1.10089302, + "epoch": 0.41823778376298576, + "flos": 905890521600.0, + "grad_norm": 0.03053935653615099, + "language_loss": 0.9052453, + "learning_rate": 0.0006541529008182485, + "loss": 0.91711211, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.85888672, + "step": 2174, + "time_per_iteration": 3.2052760124206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169526, + "balance_loss_mlp": 1.08383834, + "epoch": 0.41843016544824935, + "flos": 512573799936.0, + "grad_norm": 0.02722476190126499, + "language_loss": 0.93815506, + "learning_rate": 0.0006538565046699136, + "loss": 0.94985026, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.85791016, + "step": 2175, + "time_per_iteration": 2.578150987625122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167473, + "balance_loss_mlp": 1.08183265, + "epoch": 0.4186225471335129, + "flos": 654289830912.0, + "grad_norm": 0.03154991846739093, + "language_loss": 0.89587617, + "learning_rate": 0.0006535600487891862, + "loss": 0.90755087, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.85742188, + "step": 2176, + "time_per_iteration": 2.8699960708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167918, + "balance_loss_mlp": 1.08218253, + "epoch": 0.41881492881877647, + "flos": 570225306624.0, + "grad_norm": 0.027441287945076498, + "language_loss": 0.94665354, + "learning_rate": 0.0006532635332911603, + "loss": 0.95833272, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.85839844, + "step": 2177, + "time_per_iteration": 2.695180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168273, + "balance_loss_mlp": 1.08239508, + "epoch": 0.41900731050404, + "flos": 913484293632.0, + "grad_norm": 0.030353783790969455, + "language_loss": 0.86808872, + "learning_rate": 0.0006529669582909541, + "loss": 0.87977153, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.85986328, + "step": 2178, + "time_per_iteration": 3.2746284008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116623, + "balance_loss_mlp": 1.08073354, + "epoch": 0.4191996921893036, + "flos": 536783201280.0, + "grad_norm": 0.031775111638151596, + "language_loss": 0.93350971, + "learning_rate": 0.0006526703239037077, + "loss": 0.94517195, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.85595703, + "step": 2179, + "time_per_iteration": 2.6485140323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167238, + "balance_loss_mlp": 1.08159792, + "epoch": 0.4193920738745671, + "flos": 583730257920.0, + "grad_norm": 0.027399178820930566, + "language_loss": 0.92623031, + "learning_rate": 0.0006523736302445851, + "loss": 0.93790269, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.85742188, + "step": 2180, + "time_per_iteration": 2.8337948322296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116728, + "balance_loss_mlp": 1.08149683, + "epoch": 0.4195844555598307, + "flos": 1337800459776.0, + "grad_norm": 0.031235958835637387, + "language_loss": 0.83915186, + "learning_rate": 0.0006520768774287728, + "loss": 0.85082471, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.85888672, + "step": 2181, + "time_per_iteration": 3.725524663925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170743, + "balance_loss_mlp": 1.08505547, + "epoch": 0.4197768372450943, + "flos": 599996779008.0, + "grad_norm": 0.025797087070179033, + "language_loss": 0.91158509, + "learning_rate": 0.0006517800655714806, + "loss": 0.92329252, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.85791016, + "step": 2182, + "time_per_iteration": 2.8207623958587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172108, + "balance_loss_mlp": 1.08646846, + "epoch": 0.4199692189303578, + "flos": 736595900928.0, + "grad_norm": 0.0300192342725077, + "language_loss": 0.91644537, + "learning_rate": 0.0006514831947879407, + "loss": 0.92816639, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.85742188, + "step": 2183, + "time_per_iteration": 2.9593582153320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170186, + "balance_loss_mlp": 1.08454573, + "epoch": 0.4201616006156214, + "flos": 751661921280.0, + "grad_norm": 0.02826942186100045, + "language_loss": 0.84773123, + "learning_rate": 0.0006511862651934091, + "loss": 0.85943305, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.85742188, + "step": 2184, + "time_per_iteration": 3.1170709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168703, + "balance_loss_mlp": 1.08301497, + "epoch": 0.42035398230088494, + "flos": 548091267072.0, + "grad_norm": 0.027950639773315498, + "language_loss": 0.89124084, + "learning_rate": 0.0006508892769031638, + "loss": 0.90292788, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.85791016, + "step": 2185, + "time_per_iteration": 2.6419410705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116924, + "balance_loss_mlp": 1.08379054, + "epoch": 0.42054636398614853, + "flos": 618047224320.0, + "grad_norm": 0.03133969262582121, + "language_loss": 0.94198585, + "learning_rate": 0.000650592230032506, + "loss": 0.95367819, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.85546875, + "step": 2186, + "time_per_iteration": 2.7254862785339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175, + "balance_loss_mlp": 1.08935976, + "epoch": 0.42073874567141206, + "flos": 641666471424.0, + "grad_norm": 0.02942747497692904, + "language_loss": 0.9171921, + "learning_rate": 0.0006502951246967595, + "loss": 0.92894208, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.85742188, + "step": 2187, + "time_per_iteration": 2.8912041187286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174567, + "balance_loss_mlp": 1.08897436, + "epoch": 0.42093112735667565, + "flos": 494822797824.0, + "grad_norm": 0.02515329577356359, + "language_loss": 0.92510098, + "learning_rate": 0.0006499979610112706, + "loss": 0.93684661, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.85693359, + "step": 2188, + "time_per_iteration": 2.710610866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119078, + "balance_loss_mlp": 1.1055218, + "epoch": 0.4211235090419392, + "flos": 543436984320.0, + "grad_norm": 0.027549100686041793, + "language_loss": 0.89267701, + "learning_rate": 0.000649700739091409, + "loss": 0.90458483, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.85351562, + "step": 2189, + "time_per_iteration": 2.770158290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177139, + "balance_loss_mlp": 1.09321594, + "epoch": 0.42131589072720277, + "flos": 1535388254208.0, + "grad_norm": 0.007480893247264192, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.75013411, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.83984375, + "step": 2190, + "time_per_iteration": 4.826355218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168739, + "balance_loss_mlp": 1.08381474, + "epoch": 0.42150827241246636, + "flos": 567935095296.0, + "grad_norm": 0.025807507169531153, + "language_loss": 0.91430855, + "learning_rate": 0.0006491061210101557, + "loss": 0.92599595, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.85009766, + "step": 2191, + "time_per_iteration": 2.6813712120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170756, + "balance_loss_mlp": 1.08568799, + "epoch": 0.4217006540977299, + "flos": 708841393152.0, + "grad_norm": 0.02710796189326301, + "language_loss": 0.90667284, + "learning_rate": 0.0006488087250796157, + "loss": 0.91838038, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.8515625, + "step": 2192, + "time_per_iteration": 2.8864076137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117035, + "balance_loss_mlp": 1.08528221, + "epoch": 0.4218930357829935, + "flos": 628561019904.0, + "grad_norm": 0.0271709214243351, + "language_loss": 0.87769991, + "learning_rate": 0.0006485112713764049, + "loss": 0.8894034, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.8515625, + "step": 2193, + "time_per_iteration": 2.9007742404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170953, + "balance_loss_mlp": 1.08578944, + "epoch": 0.422085417468257, + "flos": 461289368064.0, + "grad_norm": 0.026123872435626132, + "language_loss": 0.89901912, + "learning_rate": 0.0006482137600160051, + "loss": 0.91072869, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.85253906, + "step": 2194, + "time_per_iteration": 2.4960973262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170401, + "balance_loss_mlp": 1.08533287, + "epoch": 0.4222777991535206, + "flos": 474980971008.0, + "grad_norm": 0.02685495955741856, + "language_loss": 0.90204549, + "learning_rate": 0.0006479161911139206, + "loss": 0.91374946, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.8515625, + "step": 2195, + "time_per_iteration": 2.574496030807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170408, + "balance_loss_mlp": 1.08534062, + "epoch": 0.4224701808387841, + "flos": 471844096512.0, + "grad_norm": 0.03212817551635824, + "language_loss": 0.93686366, + "learning_rate": 0.0006476185647856778, + "loss": 0.94856775, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.8515625, + "step": 2196, + "time_per_iteration": 2.558581829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169081, + "balance_loss_mlp": 1.08401346, + "epoch": 0.4226625625240477, + "flos": 678822870528.0, + "grad_norm": 0.034209207392335836, + "language_loss": 0.88652933, + "learning_rate": 0.0006473208811468255, + "loss": 0.89822018, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.8515625, + "step": 2197, + "time_per_iteration": 2.8745005130767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169989, + "balance_loss_mlp": 1.08487344, + "epoch": 0.4228549442093113, + "flos": 504559060992.0, + "grad_norm": 0.02694559660877684, + "language_loss": 0.9045344, + "learning_rate": 0.0006470231403129347, + "loss": 0.91623431, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.85205078, + "step": 2198, + "time_per_iteration": 2.6385552883148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171157, + "balance_loss_mlp": 1.08594668, + "epoch": 0.42304732589457483, + "flos": 613074032640.0, + "grad_norm": 0.02362792419875934, + "language_loss": 0.86769903, + "learning_rate": 0.0006467253423995988, + "loss": 0.87941062, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.85302734, + "step": 2199, + "time_per_iteration": 2.8800480365753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169589, + "balance_loss_mlp": 1.08418751, + "epoch": 0.4232397075798384, + "flos": 516648662016.0, + "grad_norm": 0.0345778065938135, + "language_loss": 0.86613309, + "learning_rate": 0.000646427487522433, + "loss": 0.87782902, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.85498047, + "step": 2200, + "time_per_iteration": 2.658045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170112, + "balance_loss_mlp": 1.08451986, + "epoch": 0.42343208926510195, + "flos": 590933262336.0, + "grad_norm": 0.02424061904629306, + "language_loss": 0.89308071, + "learning_rate": 0.0006461295757970749, + "loss": 0.90478176, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.85693359, + "step": 2201, + "time_per_iteration": 2.8574764728546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170293, + "balance_loss_mlp": 1.08465314, + "epoch": 0.42362447095036554, + "flos": 641818194432.0, + "grad_norm": 0.03053594684877434, + "language_loss": 0.89224029, + "learning_rate": 0.0006458316073391839, + "loss": 0.90394318, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.85742188, + "step": 2202, + "time_per_iteration": 2.932666063308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168878, + "balance_loss_mlp": 1.08318996, + "epoch": 0.42381685263562907, + "flos": 513717904896.0, + "grad_norm": 0.025745877239568934, + "language_loss": 0.93694568, + "learning_rate": 0.0006455335822644422, + "loss": 0.94863445, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.85791016, + "step": 2203, + "time_per_iteration": 2.6537110805511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169969, + "balance_loss_mlp": 1.0842818, + "epoch": 0.42400923432089266, + "flos": 547822023168.0, + "grad_norm": 0.028367329203477194, + "language_loss": 0.84440267, + "learning_rate": 0.0006452355006885527, + "loss": 0.85610235, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.85791016, + "step": 2204, + "time_per_iteration": 2.639218330383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169105, + "balance_loss_mlp": 1.08346462, + "epoch": 0.4242016160061562, + "flos": 623287658496.0, + "grad_norm": 0.03537327431533643, + "language_loss": 0.96295106, + "learning_rate": 0.0006449373627272412, + "loss": 0.9746421, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.85742188, + "step": 2205, + "time_per_iteration": 2.728724956512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168771, + "balance_loss_mlp": 1.08317852, + "epoch": 0.4243939976914198, + "flos": 572971413504.0, + "grad_norm": 0.029625174738980242, + "language_loss": 0.88551587, + "learning_rate": 0.0006446391684962553, + "loss": 0.89720356, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.85693359, + "step": 2206, + "time_per_iteration": 2.6687116622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167518, + "balance_loss_mlp": 1.08192575, + "epoch": 0.42458637937668336, + "flos": 449664394752.0, + "grad_norm": 0.02816858253159587, + "language_loss": 0.89565998, + "learning_rate": 0.000644340918111364, + "loss": 0.90733516, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.85693359, + "step": 2207, + "time_per_iteration": 2.620295763015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167512, + "balance_loss_mlp": 1.08206332, + "epoch": 0.4247787610619469, + "flos": 436335361536.0, + "grad_norm": 0.0303416400904182, + "language_loss": 0.92792743, + "learning_rate": 0.0006440426116883585, + "loss": 0.93960261, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.85546875, + "step": 2208, + "time_per_iteration": 2.5411367416381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171139, + "balance_loss_mlp": 1.08602309, + "epoch": 0.4249711427472105, + "flos": 497121741312.0, + "grad_norm": 0.025596497409994177, + "language_loss": 0.92383361, + "learning_rate": 0.0006437442493430519, + "loss": 0.93554503, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.85205078, + "step": 2209, + "time_per_iteration": 2.6431679725646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172012, + "balance_loss_mlp": 1.08694398, + "epoch": 0.425163524432474, + "flos": 657107796480.0, + "grad_norm": 0.030657116246539617, + "language_loss": 0.93065524, + "learning_rate": 0.000643445831191278, + "loss": 0.94237542, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.8515625, + "step": 2210, + "time_per_iteration": 2.9031519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117009, + "balance_loss_mlp": 1.08502185, + "epoch": 0.4253559061177376, + "flos": 651778039296.0, + "grad_norm": 0.031032190975230387, + "language_loss": 0.88729775, + "learning_rate": 0.0006431473573488937, + "loss": 0.89899862, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.8515625, + "step": 2211, + "time_per_iteration": 2.745398759841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170674, + "balance_loss_mlp": 1.08560598, + "epoch": 0.42554828780300114, + "flos": 555202947072.0, + "grad_norm": 0.03338022114707726, + "language_loss": 0.92210639, + "learning_rate": 0.0006428488279317765, + "loss": 0.93381315, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.8515625, + "step": 2212, + "time_per_iteration": 2.6822004318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172615, + "balance_loss_mlp": 1.08797669, + "epoch": 0.4257406694882647, + "flos": 515421964800.0, + "grad_norm": 0.02921339084637532, + "language_loss": 0.9444955, + "learning_rate": 0.0006425502430558259, + "loss": 0.95622164, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.84716797, + "step": 2213, + "time_per_iteration": 2.6147451400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173123, + "balance_loss_mlp": 1.08824575, + "epoch": 0.42593305117352825, + "flos": 516705057792.0, + "grad_norm": 0.028975617453248656, + "language_loss": 0.90705556, + "learning_rate": 0.0006422516028369628, + "loss": 0.91878676, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.84960938, + "step": 2214, + "time_per_iteration": 2.634315013885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169159, + "balance_loss_mlp": 1.08423436, + "epoch": 0.42612543285879184, + "flos": 589237934592.0, + "grad_norm": 0.02737510916321625, + "language_loss": 0.88997841, + "learning_rate": 0.0006419529073911296, + "loss": 0.90166998, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.85009766, + "step": 2215, + "time_per_iteration": 2.934429168701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168143, + "balance_loss_mlp": 1.08321857, + "epoch": 0.42631781454405543, + "flos": 636751676928.0, + "grad_norm": 0.02841677319990709, + "language_loss": 0.91541028, + "learning_rate": 0.0006416541568342901, + "loss": 0.92709166, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.85009766, + "step": 2216, + "time_per_iteration": 2.924881935119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167669, + "balance_loss_mlp": 1.08269632, + "epoch": 0.42651019622931896, + "flos": 542245215744.0, + "grad_norm": 0.024048936266806608, + "language_loss": 0.89849669, + "learning_rate": 0.0006413553512824297, + "loss": 0.91017342, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.85058594, + "step": 2217, + "time_per_iteration": 2.7312259674072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166506, + "balance_loss_mlp": 1.08096182, + "epoch": 0.42670257791458255, + "flos": 559223414784.0, + "grad_norm": 0.030670266673020908, + "language_loss": 0.90927672, + "learning_rate": 0.0006410564908515549, + "loss": 0.92094177, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.85644531, + "step": 2218, + "time_per_iteration": 2.646705389022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165047, + "balance_loss_mlp": 1.07964516, + "epoch": 0.4268949595998461, + "flos": 622449727488.0, + "grad_norm": 0.03126891192332862, + "language_loss": 0.92295194, + "learning_rate": 0.0006407575756576935, + "loss": 0.93460238, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.85498047, + "step": 2219, + "time_per_iteration": 2.750229597091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163243, + "balance_loss_mlp": 1.07769799, + "epoch": 0.42708734128510967, + "flos": 539015015424.0, + "grad_norm": 0.029393225010211587, + "language_loss": 0.93690813, + "learning_rate": 0.0006404586058168951, + "loss": 0.94854057, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.85644531, + "step": 2220, + "time_per_iteration": 2.75992488861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166043, + "balance_loss_mlp": 1.08049834, + "epoch": 0.4272797229703732, + "flos": 503862119424.0, + "grad_norm": 0.0277791101580606, + "language_loss": 0.93672097, + "learning_rate": 0.0006401595814452296, + "loss": 0.94838136, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.85644531, + "step": 2221, + "time_per_iteration": 2.6034135818481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166502, + "balance_loss_mlp": 1.08081436, + "epoch": 0.4274721046556368, + "flos": 493437646848.0, + "grad_norm": 0.028798228067485887, + "language_loss": 0.8755163, + "learning_rate": 0.000639860502658789, + "loss": 0.88718128, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.85791016, + "step": 2222, + "time_per_iteration": 2.6364476680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168114, + "balance_loss_mlp": 1.08242607, + "epoch": 0.4276644863409004, + "flos": 569461235712.0, + "grad_norm": 0.025058965600795662, + "language_loss": 0.90727627, + "learning_rate": 0.0006395613695736853, + "loss": 0.91895741, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.85791016, + "step": 2223, + "time_per_iteration": 2.7128536701202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170105, + "balance_loss_mlp": 1.08432245, + "epoch": 0.4278568680261639, + "flos": 608562740736.0, + "grad_norm": 0.029982203504376047, + "language_loss": 0.88910139, + "learning_rate": 0.0006392621823060529, + "loss": 0.90080237, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.85888672, + "step": 2224, + "time_per_iteration": 2.7404489517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167658, + "balance_loss_mlp": 1.08177996, + "epoch": 0.4280492497114275, + "flos": 561578754048.0, + "grad_norm": 0.03210591854722722, + "language_loss": 0.92597878, + "learning_rate": 0.0006389629409720465, + "loss": 0.93765533, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.85986328, + "step": 2225, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170504, + "balance_loss_mlp": 1.08467305, + "epoch": 0.428241631396691, + "flos": 721901182464.0, + "grad_norm": 0.03010502161811575, + "language_loss": 0.95236158, + "learning_rate": 0.0006386636456878417, + "loss": 0.96406662, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.859375, + "step": 2226, + "time_per_iteration": 2.866391897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168906, + "balance_loss_mlp": 1.08307493, + "epoch": 0.4284340130819546, + "flos": 430369787904.0, + "grad_norm": 0.032531705768225685, + "language_loss": 0.99370027, + "learning_rate": 0.0006383642965696353, + "loss": 1.00538921, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.859375, + "step": 2227, + "time_per_iteration": 2.4586703777313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169599, + "balance_loss_mlp": 1.08376861, + "epoch": 0.42862639476721814, + "flos": 526159342080.0, + "grad_norm": 0.030010487503704626, + "language_loss": 0.90640998, + "learning_rate": 0.000638064893733645, + "loss": 0.91810596, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.859375, + "step": 2228, + "time_per_iteration": 2.71899676322937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168473, + "balance_loss_mlp": 1.08269, + "epoch": 0.42881877645248173, + "flos": 466378079232.0, + "grad_norm": 0.029133853286813928, + "language_loss": 0.95973945, + "learning_rate": 0.000637765437296109, + "loss": 0.97142416, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.85888672, + "step": 2229, + "time_per_iteration": 2.6824750900268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166344, + "balance_loss_mlp": 1.08075178, + "epoch": 0.42901115813774526, + "flos": 561355172352.0, + "grad_norm": 0.028234307189641095, + "language_loss": 0.92378092, + "learning_rate": 0.000637465927373287, + "loss": 0.93544424, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.85693359, + "step": 2230, + "time_per_iteration": 2.65869402885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166629, + "balance_loss_mlp": 1.08137035, + "epoch": 0.42920353982300885, + "flos": 562527475200.0, + "grad_norm": 0.03139177124565146, + "language_loss": 0.86247277, + "learning_rate": 0.000637166364081459, + "loss": 0.87413907, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.85351562, + "step": 2231, + "time_per_iteration": 2.7071642875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165657, + "balance_loss_mlp": 1.080446, + "epoch": 0.42939592150827244, + "flos": 557315238912.0, + "grad_norm": 0.03049902562345181, + "language_loss": 0.89974546, + "learning_rate": 0.0006368667475369256, + "loss": 0.91140211, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.85302734, + "step": 2232, + "time_per_iteration": 2.74843168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166412, + "balance_loss_mlp": 1.08363342, + "epoch": 0.42958830319353597, + "flos": 1524942314496.0, + "grad_norm": 0.009964168253272706, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79694188, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.828125, + "step": 2233, + "time_per_iteration": 4.862222909927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.08236694, + "epoch": 0.42978068487879956, + "flos": 1498869672960.0, + "grad_norm": 0.007691227120989337, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.80060571, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.828125, + "step": 2234, + "time_per_iteration": 4.816195011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167111, + "balance_loss_mlp": 1.08242488, + "epoch": 0.4299730665640631, + "flos": 548063069184.0, + "grad_norm": 0.02593969644103988, + "language_loss": 0.92186785, + "learning_rate": 0.0006359675795504112, + "loss": 0.93353903, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.84765625, + "step": 2235, + "time_per_iteration": 2.6802918910980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167601, + "balance_loss_mlp": 1.08300984, + "epoch": 0.4301654482493267, + "flos": 1131115124736.0, + "grad_norm": 0.035304816631346984, + "language_loss": 0.82753956, + "learning_rate": 0.0006356677511584775, + "loss": 0.83921564, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.84667969, + "step": 2236, + "time_per_iteration": 3.444307327270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169724, + "balance_loss_mlp": 1.08522856, + "epoch": 0.4303578299345902, + "flos": 496741707264.0, + "grad_norm": 0.0313639268125667, + "language_loss": 0.9209317, + "learning_rate": 0.0006353678700956511, + "loss": 0.93262899, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.84570312, + "step": 2237, + "time_per_iteration": 2.5677876472473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164152, + "balance_loss_mlp": 1.07965648, + "epoch": 0.4305502116198538, + "flos": 616929315840.0, + "grad_norm": 0.02814766917627989, + "language_loss": 0.90743506, + "learning_rate": 0.0006350679364783569, + "loss": 0.91907656, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.84570312, + "step": 2238, + "time_per_iteration": 2.7363951206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175266, + "balance_loss_mlp": 1.09081805, + "epoch": 0.4307425933051173, + "flos": 560321857536.0, + "grad_norm": 0.032687311784007, + "language_loss": 0.92748511, + "learning_rate": 0.0006347679504230393, + "loss": 0.93923771, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.84521484, + "step": 2239, + "time_per_iteration": 2.6805875301361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172185, + "balance_loss_mlp": 1.08749855, + "epoch": 0.4309349749903809, + "flos": 973816779264.0, + "grad_norm": 0.03249158230487725, + "language_loss": 0.83304834, + "learning_rate": 0.0006344679120461632, + "loss": 0.84477019, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.84765625, + "step": 2240, + "time_per_iteration": 3.4101555347442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166292, + "balance_loss_mlp": 1.08146274, + "epoch": 0.4311273566756445, + "flos": 542972356608.0, + "grad_norm": 0.03524791345855764, + "language_loss": 0.87825459, + "learning_rate": 0.0006341678214642134, + "loss": 0.88991749, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.84912109, + "step": 2241, + "time_per_iteration": 2.625896692276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165486, + "balance_loss_mlp": 1.08041823, + "epoch": 0.43131973836090803, + "flos": 763110976512.0, + "grad_norm": 0.027424867307564667, + "language_loss": 0.89878041, + "learning_rate": 0.0006338676787936963, + "loss": 0.91043526, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.8515625, + "step": 2242, + "time_per_iteration": 3.063455820083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167252, + "balance_loss_mlp": 1.08199346, + "epoch": 0.4315121200461716, + "flos": 555602446848.0, + "grad_norm": 0.031429355894507384, + "language_loss": 0.916659, + "learning_rate": 0.0006335674841511367, + "loss": 0.92833149, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.85351562, + "step": 2243, + "time_per_iteration": 2.666233777999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192352, + "balance_loss_mlp": 1.10804749, + "epoch": 0.43170450173143515, + "flos": 1488686972928.0, + "grad_norm": 0.015912473948710273, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80373514, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.84375, + "step": 2244, + "time_per_iteration": 4.980380535125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183075, + "balance_loss_mlp": 1.09877014, + "epoch": 0.43189688341669874, + "flos": 1476907548672.0, + "grad_norm": 0.014137336443723746, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78548628, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.84375, + "step": 2245, + "time_per_iteration": 4.896914005279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011678, + "balance_loss_mlp": 1.08254158, + "epoch": 0.43208926510196227, + "flos": 493984866816.0, + "grad_norm": 0.02893589890767333, + "language_loss": 0.89212227, + "learning_rate": 0.0006326665895567652, + "loss": 0.90380025, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.85351562, + "step": 2246, + "time_per_iteration": 2.6488964557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169613, + "balance_loss_mlp": 1.08430731, + "epoch": 0.43228164678722586, + "flos": 521302944768.0, + "grad_norm": 0.0351368535627373, + "language_loss": 0.94705987, + "learning_rate": 0.0006323661881916976, + "loss": 0.95875597, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.85400391, + "step": 2247, + "time_per_iteration": 2.7094948291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170289, + "balance_loss_mlp": 1.08522093, + "epoch": 0.4324740284724894, + "flos": 797395015680.0, + "grad_norm": 0.0300569180656374, + "language_loss": 0.88277382, + "learning_rate": 0.0006320657354375179, + "loss": 0.89447677, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.8515625, + "step": 2248, + "time_per_iteration": 2.942108154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166997, + "balance_loss_mlp": 1.08188176, + "epoch": 0.432666410157753, + "flos": 483097767936.0, + "grad_norm": 0.027676603795042543, + "language_loss": 0.93945193, + "learning_rate": 0.0006317652314108726, + "loss": 0.95112193, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.85205078, + "step": 2249, + "time_per_iteration": 2.559255838394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167847, + "balance_loss_mlp": 1.08268416, + "epoch": 0.43285879184301657, + "flos": 501209338368.0, + "grad_norm": 0.028764721331973258, + "language_loss": 0.98109567, + "learning_rate": 0.0006314646762284277, + "loss": 0.99277413, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.85253906, + "step": 2250, + "time_per_iteration": 2.6713576316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188225, + "balance_loss_mlp": 1.10582733, + "epoch": 0.4330511735282801, + "flos": 1513790701056.0, + "grad_norm": 0.02095115440391329, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76614058, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.82421875, + "step": 2251, + "time_per_iteration": 4.936391592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170203, + "balance_loss_mlp": 1.08518302, + "epoch": 0.4332435552135437, + "flos": 700837387776.0, + "grad_norm": 0.037779543880407794, + "language_loss": 0.84241956, + "learning_rate": 0.0006308634128629022, + "loss": 0.85412163, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.85107422, + "step": 2252, + "time_per_iteration": 2.890848398208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168176, + "balance_loss_mlp": 1.0830133, + "epoch": 0.4334359368988072, + "flos": 593481984000.0, + "grad_norm": 0.0295787243575072, + "language_loss": 0.93934762, + "learning_rate": 0.0006305627049132531, + "loss": 0.95102942, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.85253906, + "step": 2253, + "time_per_iteration": 2.7571680545806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167414, + "balance_loss_mlp": 1.08220303, + "epoch": 0.4336283185840708, + "flos": 844274942976.0, + "grad_norm": 0.0242542623992157, + "language_loss": 0.90322375, + "learning_rate": 0.0006302619462746662, + "loss": 0.91489786, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.85302734, + "step": 2254, + "time_per_iteration": 3.1296751499176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167279, + "balance_loss_mlp": 1.0821631, + "epoch": 0.43382070026933434, + "flos": 627401452032.0, + "grad_norm": 0.02849659363202695, + "language_loss": 0.96522522, + "learning_rate": 0.0006299611370639069, + "loss": 0.97689807, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.85205078, + "step": 2255, + "time_per_iteration": 2.7125463485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167069, + "balance_loss_mlp": 1.08181024, + "epoch": 0.4340130819545979, + "flos": 592209624576.0, + "grad_norm": 0.029264792527705672, + "language_loss": 0.85361564, + "learning_rate": 0.0006296602773977593, + "loss": 0.86528635, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.85351562, + "step": 2256, + "time_per_iteration": 2.692830801010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166353, + "balance_loss_mlp": 1.0810945, + "epoch": 0.4342054636398615, + "flos": 491955167232.0, + "grad_norm": 0.02531800088280138, + "language_loss": 0.92533612, + "learning_rate": 0.0006293593673930277, + "loss": 0.93699974, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.85351562, + "step": 2257, + "time_per_iteration": 2.6522371768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118061, + "balance_loss_mlp": 1.09568477, + "epoch": 0.43439784532512504, + "flos": 700259968512.0, + "grad_norm": 0.028144633410819173, + "language_loss": 0.84340745, + "learning_rate": 0.0006290584071665358, + "loss": 0.85521352, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.85009766, + "step": 2258, + "time_per_iteration": 2.878753662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179592, + "balance_loss_mlp": 1.09452426, + "epoch": 0.43459022701038863, + "flos": 486801328128.0, + "grad_norm": 0.028951325004384125, + "language_loss": 0.88270766, + "learning_rate": 0.0006287573968351266, + "loss": 0.89450359, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.8515625, + "step": 2259, + "time_per_iteration": 2.55161190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173139, + "balance_loss_mlp": 1.08830976, + "epoch": 0.43478260869565216, + "flos": 644266859520.0, + "grad_norm": 0.030714073024811012, + "language_loss": 0.91379642, + "learning_rate": 0.0006284563365156626, + "loss": 0.92552781, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.84912109, + "step": 2260, + "time_per_iteration": 2.778975009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177671, + "balance_loss_mlp": 1.09274662, + "epoch": 0.43497499038091575, + "flos": 427009331712.0, + "grad_norm": 0.03207934204379992, + "language_loss": 0.94470251, + "learning_rate": 0.0006281552263250261, + "loss": 0.95647919, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.85009766, + "step": 2261, + "time_per_iteration": 2.540102005004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175407, + "balance_loss_mlp": 1.09281921, + "epoch": 0.4351673720661793, + "flos": 1541525016576.0, + "grad_norm": 0.010664027023399645, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81866938, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.82617188, + "step": 2262, + "time_per_iteration": 4.828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167291, + "balance_loss_mlp": 1.08260465, + "epoch": 0.43535975375144287, + "flos": 750465423360.0, + "grad_norm": 0.02969029135984414, + "language_loss": 0.88281786, + "learning_rate": 0.0006275528567978593, + "loss": 0.89449072, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.84765625, + "step": 2263, + "time_per_iteration": 2.9683096408843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167048, + "balance_loss_mlp": 1.08193278, + "epoch": 0.4355521354367064, + "flos": 862751084544.0, + "grad_norm": 0.03226302104273745, + "language_loss": 0.89985508, + "learning_rate": 0.0006272515976951898, + "loss": 0.91152549, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.85205078, + "step": 2264, + "time_per_iteration": 4.429616689682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166942, + "balance_loss_mlp": 1.08182704, + "epoch": 0.43574451712197, + "flos": 735842563584.0, + "grad_norm": 0.02499576623287147, + "language_loss": 0.84365284, + "learning_rate": 0.0006269502891890687, + "loss": 0.8553223, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.85205078, + "step": 2265, + "time_per_iteration": 3.0444254875183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166782, + "balance_loss_mlp": 1.08214331, + "epoch": 0.4359368988072336, + "flos": 571712515584.0, + "grad_norm": 0.02707186340155289, + "language_loss": 0.93191004, + "learning_rate": 0.0006266489313964743, + "loss": 0.94357783, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.84716797, + "step": 2266, + "time_per_iteration": 2.7227466106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164913, + "balance_loss_mlp": 1.0802747, + "epoch": 0.4361292804924971, + "flos": 556670690304.0, + "grad_norm": 0.03376827968070452, + "language_loss": 0.92200565, + "learning_rate": 0.0006263475244344041, + "loss": 0.93365479, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.84716797, + "step": 2267, + "time_per_iteration": 2.845227003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167657, + "balance_loss_mlp": 1.08335233, + "epoch": 0.4363216621777607, + "flos": 558348553728.0, + "grad_norm": 0.031080273211388402, + "language_loss": 0.91650617, + "learning_rate": 0.0006260460684198746, + "loss": 0.92818272, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.84375, + "step": 2268, + "time_per_iteration": 2.652310371398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165668, + "balance_loss_mlp": 1.08141088, + "epoch": 0.4365140438630242, + "flos": 479196822528.0, + "grad_norm": 0.029843008840560653, + "language_loss": 0.92140841, + "learning_rate": 0.0006257445634699213, + "loss": 0.93306512, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.84326172, + "step": 2269, + "time_per_iteration": 2.5779240131378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164543, + "balance_loss_mlp": 1.08042932, + "epoch": 0.4367064255482878, + "flos": 580007232000.0, + "grad_norm": 0.028296510675920098, + "language_loss": 0.89645165, + "learning_rate": 0.0006254430097015993, + "loss": 0.90809709, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.84179688, + "step": 2270, + "time_per_iteration": 2.6566953659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172028, + "balance_loss_mlp": 1.08963013, + "epoch": 0.43689880723355135, + "flos": 1462271953920.0, + "grad_norm": 0.010844604855090543, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77651119, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.82421875, + "step": 2271, + "time_per_iteration": 4.794802904129028 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170244, + "balance_loss_mlp": 1.08593976, + "epoch": 0.43709118891881493, + "flos": 668873759232.0, + "grad_norm": 0.024959132899117664, + "language_loss": 0.91526961, + "learning_rate": 0.0006248397561781609, + "loss": 0.92697203, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.84375, + "step": 2272, + "time_per_iteration": 2.8676164150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170164, + "balance_loss_mlp": 1.08562064, + "epoch": 0.43728357060407846, + "flos": 545913847296.0, + "grad_norm": 0.033809863548240594, + "language_loss": 0.93834352, + "learning_rate": 0.0006245380566572482, + "loss": 0.95004517, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.84619141, + "step": 2273, + "time_per_iteration": 2.6419596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169646, + "balance_loss_mlp": 1.08519816, + "epoch": 0.43747595228934205, + "flos": 748183944192.0, + "grad_norm": 0.02624268387252208, + "language_loss": 0.83012575, + "learning_rate": 0.0006242363087863744, + "loss": 0.84182227, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.84521484, + "step": 2274, + "time_per_iteration": 2.9927828311920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165646, + "balance_loss_mlp": 1.08057845, + "epoch": 0.43766833397460564, + "flos": 632529094656.0, + "grad_norm": 0.025411969041571628, + "language_loss": 0.92234564, + "learning_rate": 0.0006239345126826878, + "loss": 0.9340021, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.8515625, + "step": 2275, + "time_per_iteration": 2.8180527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164237, + "balance_loss_mlp": 1.07931209, + "epoch": 0.43786071565986917, + "flos": 532098719232.0, + "grad_norm": 0.028730665522240066, + "language_loss": 0.90992379, + "learning_rate": 0.0006236326684633561, + "loss": 0.92156613, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.85009766, + "step": 2276, + "time_per_iteration": 2.828425168991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163177, + "balance_loss_mlp": 1.07810962, + "epoch": 0.43805309734513276, + "flos": 539557506048.0, + "grad_norm": 0.03648062799061939, + "language_loss": 0.82486773, + "learning_rate": 0.0006233307762455658, + "loss": 0.83649945, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.8515625, + "step": 2277, + "time_per_iteration": 2.608886957168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.07909381, + "epoch": 0.4382454790303963, + "flos": 865963820544.0, + "grad_norm": 0.025903790262040906, + "language_loss": 0.90223956, + "learning_rate": 0.0006230288361465216, + "loss": 0.91388112, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.8515625, + "step": 2278, + "time_per_iteration": 3.036163568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171688, + "balance_loss_mlp": 1.08638203, + "epoch": 0.4384378607156599, + "flos": 766801075200.0, + "grad_norm": 0.03187081568607536, + "language_loss": 0.92773926, + "learning_rate": 0.0006227268482834473, + "loss": 0.93945611, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.85400391, + "step": 2279, + "time_per_iteration": 2.9320731163024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176636, + "balance_loss_mlp": 1.09137762, + "epoch": 0.4386302424009234, + "flos": 669796283904.0, + "grad_norm": 0.028047353495827182, + "language_loss": 0.9305023, + "learning_rate": 0.000622424812773585, + "loss": 0.94226873, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.85351562, + "step": 2280, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174317, + "balance_loss_mlp": 1.08901083, + "epoch": 0.438822624086187, + "flos": 486150048768.0, + "grad_norm": 0.03276492690852342, + "language_loss": 0.87875438, + "learning_rate": 0.000622122729734195, + "loss": 0.89049757, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.85400391, + "step": 2281, + "time_per_iteration": 2.5878114700317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.09008515, + "epoch": 0.4390150057714506, + "flos": 500258615808.0, + "grad_norm": 0.02649151217717187, + "language_loss": 0.92922705, + "learning_rate": 0.0006218205992825566, + "loss": 0.94098091, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.85400391, + "step": 2282, + "time_per_iteration": 2.6129069328308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.08652771, + "epoch": 0.4392073874567141, + "flos": 559351669248.0, + "grad_norm": 0.029077625047839704, + "language_loss": 0.88682199, + "learning_rate": 0.0006215184215359671, + "loss": 0.89853978, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.85351562, + "step": 2283, + "time_per_iteration": 2.7397634983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011712, + "balance_loss_mlp": 1.08594131, + "epoch": 0.4393997691419777, + "flos": 606422251008.0, + "grad_norm": 0.030174398524898192, + "language_loss": 0.92242193, + "learning_rate": 0.0006212161966117425, + "loss": 0.93413389, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.85351562, + "step": 2284, + "time_per_iteration": 2.710947275161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168513, + "balance_loss_mlp": 1.08349264, + "epoch": 0.43959215082724123, + "flos": 805483614720.0, + "grad_norm": 0.03159683391584848, + "language_loss": 0.8931039, + "learning_rate": 0.0006209139246272164, + "loss": 0.90478909, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.85107422, + "step": 2285, + "time_per_iteration": 2.9573750495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167409, + "balance_loss_mlp": 1.08229375, + "epoch": 0.4397845325125048, + "flos": 488607446016.0, + "grad_norm": 0.033192711624055064, + "language_loss": 0.89631027, + "learning_rate": 0.0006206116056997421, + "loss": 0.90798426, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.85205078, + "step": 2286, + "time_per_iteration": 2.5915918350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168495, + "balance_loss_mlp": 1.08380854, + "epoch": 0.43997691419776835, + "flos": 481784475648.0, + "grad_norm": 0.02920198010279229, + "language_loss": 0.88986552, + "learning_rate": 0.0006203092399466892, + "loss": 0.90155041, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.84765625, + "step": 2287, + "time_per_iteration": 2.6179182529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167372, + "balance_loss_mlp": 1.08282888, + "epoch": 0.44016929588303194, + "flos": 484129081344.0, + "grad_norm": 0.024305807708132735, + "language_loss": 0.91028094, + "learning_rate": 0.0006200068274854473, + "loss": 0.92195475, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.84619141, + "step": 2288, + "time_per_iteration": 2.6643898487091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168421, + "balance_loss_mlp": 1.08387816, + "epoch": 0.4403616775682955, + "flos": 573023806464.0, + "grad_norm": 0.025110382343061666, + "language_loss": 0.90969157, + "learning_rate": 0.0006197043684334229, + "loss": 0.92137575, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.84619141, + "step": 2289, + "time_per_iteration": 2.7810122966766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169529, + "balance_loss_mlp": 1.08503318, + "epoch": 0.44055405925355906, + "flos": 631999339008.0, + "grad_norm": 0.03160389670817918, + "language_loss": 0.85855997, + "learning_rate": 0.0006194018629080411, + "loss": 0.87025523, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.84570312, + "step": 2290, + "time_per_iteration": 2.7407448291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165877, + "balance_loss_mlp": 1.08147717, + "epoch": 0.44074644093882265, + "flos": 537825248256.0, + "grad_norm": 0.027939915930863316, + "language_loss": 0.87505877, + "learning_rate": 0.0006190993110267451, + "loss": 0.88671762, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.84472656, + "step": 2291, + "time_per_iteration": 2.7158915996551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167062, + "balance_loss_mlp": 1.08280444, + "epoch": 0.4409388226240862, + "flos": 464165730816.0, + "grad_norm": 0.03127864863359821, + "language_loss": 0.91365832, + "learning_rate": 0.0006187967129069958, + "loss": 0.92532897, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.84326172, + "step": 2292, + "time_per_iteration": 2.506866931915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167494, + "balance_loss_mlp": 1.08337986, + "epoch": 0.44113120430934977, + "flos": 567160290816.0, + "grad_norm": 0.024295125434261364, + "language_loss": 0.92081046, + "learning_rate": 0.0006184940686662722, + "loss": 0.93248534, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.84179688, + "step": 2293, + "time_per_iteration": 2.7406985759735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168054, + "balance_loss_mlp": 1.084131, + "epoch": 0.4413235859946133, + "flos": 544674415104.0, + "grad_norm": 0.02998433601693185, + "language_loss": 0.95718068, + "learning_rate": 0.0006181913784220714, + "loss": 0.96886122, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.83984375, + "step": 2294, + "time_per_iteration": 2.7276971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186783, + "balance_loss_mlp": 1.1034317, + "epoch": 0.4415159676798769, + "flos": 1573302720000.0, + "grad_norm": 0.012177255736314117, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.8174057, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.83398438, + "step": 2295, + "time_per_iteration": 4.898420333862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174829, + "balance_loss_mlp": 1.0908581, + "epoch": 0.4417083493651404, + "flos": 660012357120.0, + "grad_norm": 0.02926637357686751, + "language_loss": 0.86549121, + "learning_rate": 0.0006175858603933146, + "loss": 0.87723947, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.84033203, + "step": 2296, + "time_per_iteration": 2.866745710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166372, + "balance_loss_mlp": 1.08225799, + "epoch": 0.441900731050404, + "flos": 741816869376.0, + "grad_norm": 0.028401827027787777, + "language_loss": 0.8638438, + "learning_rate": 0.0006172830328438416, + "loss": 0.87550759, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.84179688, + "step": 2297, + "time_per_iteration": 2.9731123447418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165335, + "balance_loss_mlp": 1.08088684, + "epoch": 0.44209311273566754, + "flos": 540595550208.0, + "grad_norm": 0.030114194292861593, + "language_loss": 0.93111193, + "learning_rate": 0.0006169801597610572, + "loss": 0.94276524, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.84521484, + "step": 2298, + "time_per_iteration": 2.777326822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163943, + "balance_loss_mlp": 1.07959104, + "epoch": 0.4422854944209311, + "flos": 622729704960.0, + "grad_norm": 0.030043302620551878, + "language_loss": 0.96779996, + "learning_rate": 0.0006166772412625469, + "loss": 0.97943938, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.84423828, + "step": 2299, + "time_per_iteration": 2.8143997192382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164367, + "balance_loss_mlp": 1.08006215, + "epoch": 0.4424778761061947, + "flos": 660060020736.0, + "grad_norm": 0.031086205360051855, + "language_loss": 0.88609374, + "learning_rate": 0.0006163742774659141, + "loss": 0.89773744, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.84375, + "step": 2300, + "time_per_iteration": 2.8234009742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116513, + "balance_loss_mlp": 1.08087325, + "epoch": 0.44267025779145824, + "flos": 569702281728.0, + "grad_norm": 0.02554920530971592, + "language_loss": 0.92150819, + "learning_rate": 0.0006160712684887801, + "loss": 0.93315947, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.84326172, + "step": 2301, + "time_per_iteration": 2.733370542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170443, + "balance_loss_mlp": 1.08623374, + "epoch": 0.44286263947672183, + "flos": 497818682880.0, + "grad_norm": 0.02788747598953172, + "language_loss": 0.88145387, + "learning_rate": 0.0006157682144487832, + "loss": 0.89315832, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.84277344, + "step": 2302, + "time_per_iteration": 2.766334295272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171189, + "balance_loss_mlp": 1.08697963, + "epoch": 0.44305502116198536, + "flos": 610607903232.0, + "grad_norm": 0.028872273370365097, + "language_loss": 0.89961743, + "learning_rate": 0.0006154651154635793, + "loss": 0.91132939, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.84277344, + "step": 2303, + "time_per_iteration": 2.844402313232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08776116, + "epoch": 0.44324740284724895, + "flos": 471742038528.0, + "grad_norm": 0.028372285588360545, + "language_loss": 0.91810459, + "learning_rate": 0.0006151619716508421, + "loss": 0.92982763, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.84619141, + "step": 2304, + "time_per_iteration": 2.545243263244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166666, + "balance_loss_mlp": 1.08197927, + "epoch": 0.4434397845325125, + "flos": 579811848192.0, + "grad_norm": 0.029138508250266412, + "language_loss": 0.93279153, + "learning_rate": 0.0006148587831282625, + "loss": 0.94445825, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.84765625, + "step": 2305, + "time_per_iteration": 2.6743574142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179131, + "balance_loss_mlp": 1.09654236, + "epoch": 0.44363216621777607, + "flos": 1499995038720.0, + "grad_norm": 0.011431210063158581, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80355197, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.82617188, + "step": 2306, + "time_per_iteration": 4.870469570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177668, + "balance_loss_mlp": 1.09298158, + "epoch": 0.44382454790303966, + "flos": 478285031424.0, + "grad_norm": 0.03377230518223979, + "language_loss": 0.94630158, + "learning_rate": 0.0006142522724244255, + "loss": 0.95807827, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.84765625, + "step": 2307, + "time_per_iteration": 2.5165300369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181709, + "balance_loss_mlp": 1.09912109, + "epoch": 0.4440169295883032, + "flos": 1547303938560.0, + "grad_norm": 0.010354849447395944, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77666426, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.82617188, + "step": 2308, + "time_per_iteration": 4.86593222618103 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168989, + "balance_loss_mlp": 1.0843029, + "epoch": 0.4442093112735668, + "flos": 592290215424.0, + "grad_norm": 0.030546908540126056, + "language_loss": 0.84313834, + "learning_rate": 0.000613645584293942, + "loss": 0.85482824, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.84765625, + "step": 2309, + "time_per_iteration": 2.9245197772979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179296, + "balance_loss_mlp": 1.09465766, + "epoch": 0.4444016929588303, + "flos": 531327917568.0, + "grad_norm": 0.02954341623225009, + "language_loss": 0.89990199, + "learning_rate": 0.0006133421739881185, + "loss": 0.91169494, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.84716797, + "step": 2310, + "time_per_iteration": 2.6806466579437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173935, + "balance_loss_mlp": 1.08958304, + "epoch": 0.4445940746440939, + "flos": 621388214784.0, + "grad_norm": 0.03132503362752706, + "language_loss": 0.89829159, + "learning_rate": 0.0006130387196789605, + "loss": 0.91003096, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.84423828, + "step": 2311, + "time_per_iteration": 2.7674410343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171768, + "balance_loss_mlp": 1.08751106, + "epoch": 0.4447864563293574, + "flos": 630375869952.0, + "grad_norm": 0.024389617188914626, + "language_loss": 0.89820284, + "learning_rate": 0.0006127352214842795, + "loss": 0.90992051, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.84326172, + "step": 2312, + "time_per_iteration": 3.0181000232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170174, + "balance_loss_mlp": 1.08591735, + "epoch": 0.444978838014621, + "flos": 652001620992.0, + "grad_norm": 0.03266392614581568, + "language_loss": 0.92178452, + "learning_rate": 0.0006124316795219041, + "loss": 0.93348622, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.84326172, + "step": 2313, + "time_per_iteration": 2.7772133350372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172939, + "balance_loss_mlp": 1.08911133, + "epoch": 0.44517121969988455, + "flos": 613588325376.0, + "grad_norm": 0.026148577301855224, + "language_loss": 0.88032007, + "learning_rate": 0.0006121280939096794, + "loss": 0.89204955, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.83886719, + "step": 2314, + "time_per_iteration": 2.7472517490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173697, + "balance_loss_mlp": 1.09010756, + "epoch": 0.44536360138514813, + "flos": 489714620928.0, + "grad_norm": 0.031365562822013526, + "language_loss": 0.94548678, + "learning_rate": 0.000611824464765468, + "loss": 0.95722377, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.83642578, + "step": 2315, + "time_per_iteration": 2.5471882820129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188843, + "balance_loss_mlp": 1.10758972, + "epoch": 0.4455559830704117, + "flos": 1519053877248.0, + "grad_norm": 0.020817362108823283, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79783785, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.8125, + "step": 2316, + "time_per_iteration": 4.660900831222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_mlp": 1.08663106, + "epoch": 0.44574836475567525, + "flos": 616816524288.0, + "grad_norm": 0.03088300803415325, + "language_loss": 0.9123913, + "learning_rate": 0.000611217076352619, + "loss": 0.92409492, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.83789062, + "step": 2317, + "time_per_iteration": 2.7556822299957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171317, + "balance_loss_mlp": 1.08772719, + "epoch": 0.44594074644093884, + "flos": 507433422336.0, + "grad_norm": 0.026331926721779163, + "language_loss": 0.8931551, + "learning_rate": 0.0006109133173197905, + "loss": 0.90486825, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.83642578, + "step": 2318, + "time_per_iteration": 2.720372200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172625, + "balance_loss_mlp": 1.08908355, + "epoch": 0.44613312812620237, + "flos": 728311918080.0, + "grad_norm": 0.030991917971638312, + "language_loss": 0.91262019, + "learning_rate": 0.0006106095152265935, + "loss": 0.92434645, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.8359375, + "step": 2319, + "time_per_iteration": 2.8956825733184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171779, + "balance_loss_mlp": 1.08776009, + "epoch": 0.44632550981146596, + "flos": 637057850880.0, + "grad_norm": 0.02763281666385245, + "language_loss": 0.90440875, + "learning_rate": 0.0006103056701909739, + "loss": 0.91612655, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.84082031, + "step": 2320, + "time_per_iteration": 2.9104726314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175182, + "balance_loss_mlp": 1.09116352, + "epoch": 0.4465178914967295, + "flos": 828616766976.0, + "grad_norm": 0.02413420043376393, + "language_loss": 0.88773656, + "learning_rate": 0.0006100017823308956, + "loss": 0.89948833, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.84082031, + "step": 2321, + "time_per_iteration": 3.1638107299804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176807, + "balance_loss_mlp": 1.0927887, + "epoch": 0.4467102731819931, + "flos": 667032712704.0, + "grad_norm": 0.03201581013716374, + "language_loss": 0.87315178, + "learning_rate": 0.0006096978517643377, + "loss": 0.88491988, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.84082031, + "step": 2322, + "time_per_iteration": 2.7875144481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182039, + "balance_loss_mlp": 1.09792459, + "epoch": 0.4469026548672566, + "flos": 513969684480.0, + "grad_norm": 0.032089815412588485, + "language_loss": 0.90642822, + "learning_rate": 0.0006093938786092968, + "loss": 0.91824853, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.84179688, + "step": 2323, + "time_per_iteration": 2.6789090633392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181311, + "balance_loss_mlp": 1.097054, + "epoch": 0.4470950365525202, + "flos": 685285272576.0, + "grad_norm": 0.032095192334159584, + "language_loss": 0.95970643, + "learning_rate": 0.0006090898629837857, + "loss": 0.97151959, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.84326172, + "step": 2324, + "time_per_iteration": 2.842829704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174335, + "balance_loss_mlp": 1.08993506, + "epoch": 0.4472874182377838, + "flos": 628534823424.0, + "grad_norm": 0.02542366781046337, + "language_loss": 0.93390518, + "learning_rate": 0.0006087858050058337, + "loss": 0.94564855, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.84472656, + "step": 2325, + "time_per_iteration": 2.798461675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173301, + "balance_loss_mlp": 1.08899629, + "epoch": 0.4474797999230473, + "flos": 548240988672.0, + "grad_norm": 0.026872235695321916, + "language_loss": 0.8790192, + "learning_rate": 0.0006084817047934866, + "loss": 0.8907522, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.84375, + "step": 2326, + "time_per_iteration": 2.6333069801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170357, + "balance_loss_mlp": 1.08552742, + "epoch": 0.4476721816083109, + "flos": 456756609024.0, + "grad_norm": 0.03263470786125086, + "language_loss": 0.9605242, + "learning_rate": 0.0006081775624648066, + "loss": 0.97222769, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.84912109, + "step": 2327, + "time_per_iteration": 2.506568431854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171196, + "balance_loss_mlp": 1.08660555, + "epoch": 0.44786456329357444, + "flos": 482500882944.0, + "grad_norm": 0.030530219610100114, + "language_loss": 0.89424241, + "learning_rate": 0.0006078733781378721, + "loss": 0.90595436, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.84667969, + "step": 2328, + "time_per_iteration": 2.5324759483337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174464, + "balance_loss_mlp": 1.09006357, + "epoch": 0.448056944978838, + "flos": 553236374016.0, + "grad_norm": 0.028423200188041658, + "language_loss": 0.87742424, + "learning_rate": 0.0006075691519307781, + "loss": 0.88916886, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.84472656, + "step": 2329, + "time_per_iteration": 2.8329951763153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169966, + "balance_loss_mlp": 1.08580375, + "epoch": 0.44824932666410156, + "flos": 551916350976.0, + "grad_norm": 0.030957218182316032, + "language_loss": 0.88990253, + "learning_rate": 0.0006072648839616356, + "loss": 0.90160215, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.84228516, + "step": 2330, + "time_per_iteration": 2.6367061138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169901, + "balance_loss_mlp": 1.08612072, + "epoch": 0.44844170834936514, + "flos": 990271953408.0, + "grad_norm": 0.02484019388371453, + "language_loss": 0.87772298, + "learning_rate": 0.0006069605743485718, + "loss": 0.88942194, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.83837891, + "step": 2331, + "time_per_iteration": 3.3425865173339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177153, + "balance_loss_mlp": 1.09356356, + "epoch": 0.44863409003462873, + "flos": 592450670592.0, + "grad_norm": 0.02816420707323987, + "language_loss": 0.89319122, + "learning_rate": 0.0006066562232097303, + "loss": 0.90496272, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.83642578, + "step": 2332, + "time_per_iteration": 2.7754669189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178328, + "balance_loss_mlp": 1.09473884, + "epoch": 0.44882647171989226, + "flos": 725984776704.0, + "grad_norm": 0.02840681089712515, + "language_loss": 0.91798162, + "learning_rate": 0.0006063518306632708, + "loss": 0.92976487, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.83642578, + "step": 2333, + "time_per_iteration": 2.9270272254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174339, + "balance_loss_mlp": 1.09065437, + "epoch": 0.44901885340515585, + "flos": 535990932480.0, + "grad_norm": 0.029373675588589353, + "language_loss": 0.88265771, + "learning_rate": 0.0006060473968273688, + "loss": 0.89440107, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.83740234, + "step": 2334, + "time_per_iteration": 2.6593613624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199905, + "balance_loss_mlp": 1.11693573, + "epoch": 0.4492112350904194, + "flos": 1558690593792.0, + "grad_norm": 0.016875691883268894, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.79079443, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.83007812, + "step": 2335, + "time_per_iteration": 4.868390321731567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182762, + "balance_loss_mlp": 1.10017395, + "epoch": 0.44940361677568297, + "flos": 1526700768768.0, + "grad_norm": 0.009982769528938305, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82187974, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.82617188, + "step": 2336, + "time_per_iteration": 4.8639936447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176453, + "balance_loss_mlp": 1.09286392, + "epoch": 0.4495959984609465, + "flos": 383320673280.0, + "grad_norm": 0.04017386378382665, + "language_loss": 0.95653474, + "learning_rate": 0.0006051338487650047, + "loss": 0.96829921, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.83642578, + "step": 2337, + "time_per_iteration": 2.451195240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177188, + "balance_loss_mlp": 1.09364605, + "epoch": 0.4497883801462101, + "flos": 498882196992.0, + "grad_norm": 0.03424215683733749, + "language_loss": 0.88682485, + "learning_rate": 0.0006048292509534095, + "loss": 0.89859676, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.8359375, + "step": 2338, + "time_per_iteration": 2.5799245834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174646, + "balance_loss_mlp": 1.09139061, + "epoch": 0.4499807618314736, + "flos": 615589827072.0, + "grad_norm": 0.03300851417215051, + "language_loss": 0.85045063, + "learning_rate": 0.0006045246124434895, + "loss": 0.86219716, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.83300781, + "step": 2339, + "time_per_iteration": 2.732715368270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170546, + "balance_loss_mlp": 1.08738542, + "epoch": 0.4501731435167372, + "flos": 1007067503616.0, + "grad_norm": 0.0319502465029259, + "language_loss": 0.92538428, + "learning_rate": 0.0006042199333535162, + "loss": 0.9370898, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.83203125, + "step": 2340, + "time_per_iteration": 3.3100435733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170678, + "balance_loss_mlp": 1.08742249, + "epoch": 0.4503655252020008, + "flos": 822327555072.0, + "grad_norm": 0.024782286149646622, + "language_loss": 0.88794839, + "learning_rate": 0.0006039152138017763, + "loss": 0.89965516, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.83300781, + "step": 2341, + "time_per_iteration": 3.0845420360565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117382, + "balance_loss_mlp": 1.09027839, + "epoch": 0.4505579068872643, + "flos": 487413676032.0, + "grad_norm": 0.028274686754151398, + "language_loss": 0.8912791, + "learning_rate": 0.0006036104539065726, + "loss": 0.90301728, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.8359375, + "step": 2342, + "time_per_iteration": 2.704869270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170482, + "balance_loss_mlp": 1.08679724, + "epoch": 0.4507502885725279, + "flos": 886335403008.0, + "grad_norm": 0.02767032513042878, + "language_loss": 0.89237905, + "learning_rate": 0.000603305653786223, + "loss": 0.90408385, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.83740234, + "step": 2343, + "time_per_iteration": 3.143308162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169453, + "balance_loss_mlp": 1.08576834, + "epoch": 0.45094267025779144, + "flos": 579421080576.0, + "grad_norm": 0.028420960086658186, + "language_loss": 0.90634954, + "learning_rate": 0.0006030008135590622, + "loss": 0.91804409, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.83740234, + "step": 2344, + "time_per_iteration": 2.7383973598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177198, + "balance_loss_mlp": 1.09332275, + "epoch": 0.45113505194305503, + "flos": 526441320960.0, + "grad_norm": 0.025225422820390885, + "language_loss": 0.85642457, + "learning_rate": 0.0006026959333434387, + "loss": 0.86819655, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.83935547, + "step": 2345, + "time_per_iteration": 2.7594330310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177326, + "balance_loss_mlp": 1.09316456, + "epoch": 0.45132743362831856, + "flos": 503115512832.0, + "grad_norm": 0.026356266791679354, + "language_loss": 0.83258432, + "learning_rate": 0.0006023910132577181, + "loss": 0.84435755, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.84228516, + "step": 2346, + "time_per_iteration": 2.6426072120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174296, + "balance_loss_mlp": 1.09051549, + "epoch": 0.45151981531358215, + "flos": 432835917312.0, + "grad_norm": 0.03747446326611767, + "language_loss": 0.91464496, + "learning_rate": 0.0006020860534202806, + "loss": 0.92638797, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.83837891, + "step": 2347, + "time_per_iteration": 2.5375916957855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_mlp": 1.08799899, + "epoch": 0.4517121969988457, + "flos": 713493674496.0, + "grad_norm": 0.026159040948808, + "language_loss": 0.86486131, + "learning_rate": 0.0006017810539495224, + "loss": 0.87658435, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.84375, + "step": 2348, + "time_per_iteration": 2.935776472091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172944, + "balance_loss_mlp": 1.0886873, + "epoch": 0.45190457868410927, + "flos": 580556453376.0, + "grad_norm": 0.02859512200307389, + "language_loss": 0.8919422, + "learning_rate": 0.0006014760149638547, + "loss": 0.90367162, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.84326172, + "step": 2349, + "time_per_iteration": 4.1359429359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117423, + "balance_loss_mlp": 1.08982956, + "epoch": 0.45209696036937286, + "flos": 483627523584.0, + "grad_norm": 0.04225699722465749, + "language_loss": 0.94155228, + "learning_rate": 0.000601170936581704, + "loss": 0.95329458, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.84472656, + "step": 2350, + "time_per_iteration": 2.551886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171893, + "balance_loss_mlp": 1.08739793, + "epoch": 0.4522893420546364, + "flos": 541259564544.0, + "grad_norm": 0.03047412078786442, + "language_loss": 0.90869355, + "learning_rate": 0.0006008658189215121, + "loss": 0.92041242, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.84570312, + "step": 2351, + "time_per_iteration": 2.6196951866149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176582, + "balance_loss_mlp": 1.09175217, + "epoch": 0.4524817237399, + "flos": 497690428416.0, + "grad_norm": 0.03573709607194862, + "language_loss": 0.8682127, + "learning_rate": 0.0006005606621017366, + "loss": 0.87997848, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.84912109, + "step": 2352, + "time_per_iteration": 2.5675714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174694, + "balance_loss_mlp": 1.09024608, + "epoch": 0.4526741054251635, + "flos": 653840666112.0, + "grad_norm": 0.027536817578414453, + "language_loss": 0.86718237, + "learning_rate": 0.0006002554662408496, + "loss": 0.87892926, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.84521484, + "step": 2353, + "time_per_iteration": 2.887061595916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182285, + "balance_loss_mlp": 1.09774196, + "epoch": 0.4528664871104271, + "flos": 572003226624.0, + "grad_norm": 0.03098083736113463, + "language_loss": 0.96988797, + "learning_rate": 0.0005999502314573388, + "loss": 0.98171079, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.84619141, + "step": 2354, + "time_per_iteration": 2.6700878143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184968, + "balance_loss_mlp": 1.1005199, + "epoch": 0.45305886879569063, + "flos": 459678633984.0, + "grad_norm": 0.034884925425697356, + "language_loss": 0.93055832, + "learning_rate": 0.0005996449578697066, + "loss": 0.94240803, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.84521484, + "step": 2355, + "time_per_iteration": 2.6873598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180832, + "balance_loss_mlp": 1.09647942, + "epoch": 0.4532512504809542, + "flos": 506206725120.0, + "grad_norm": 0.028006133853455534, + "language_loss": 0.87364781, + "learning_rate": 0.0005993396455964709, + "loss": 0.88545609, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.84423828, + "step": 2356, + "time_per_iteration": 2.672428607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179518, + "balance_loss_mlp": 1.09545124, + "epoch": 0.4534436321662178, + "flos": 583311292416.0, + "grad_norm": 0.033764708533666976, + "language_loss": 0.88888013, + "learning_rate": 0.0005990342947561647, + "loss": 0.90067536, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.84130859, + "step": 2357, + "time_per_iteration": 2.7101337909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179265, + "balance_loss_mlp": 1.09529436, + "epoch": 0.45363601385148133, + "flos": 550772246016.0, + "grad_norm": 0.03168807299418994, + "language_loss": 0.84871709, + "learning_rate": 0.0005987289054673351, + "loss": 0.86050975, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.84033203, + "step": 2358, + "time_per_iteration": 2.6033973693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122184, + "balance_loss_mlp": 1.14096832, + "epoch": 0.4538283955367449, + "flos": 1477791141888.0, + "grad_norm": 0.02971290012878958, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.7779758, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.80859375, + "step": 2359, + "time_per_iteration": 4.841644525527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172875, + "balance_loss_mlp": 1.0889039, + "epoch": 0.45402077722200845, + "flos": 585796887552.0, + "grad_norm": 0.03208897744410929, + "language_loss": 0.98243296, + "learning_rate": 0.0005981180120183722, + "loss": 0.99416173, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.84033203, + "step": 2360, + "time_per_iteration": 2.76943302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183781, + "balance_loss_mlp": 1.09957135, + "epoch": 0.45421315890727204, + "flos": 532888986624.0, + "grad_norm": 0.026822351719262807, + "language_loss": 0.89930874, + "learning_rate": 0.0005978125080954089, + "loss": 0.91114652, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.84277344, + "step": 2361, + "time_per_iteration": 2.822767972946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180091, + "balance_loss_mlp": 1.09597707, + "epoch": 0.4544055405925356, + "flos": 786551577600.0, + "grad_norm": 0.034773976616178995, + "language_loss": 0.84516251, + "learning_rate": 0.000597506966198262, + "loss": 0.85696352, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.84179688, + "step": 2362, + "time_per_iteration": 2.952383518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177, + "balance_loss_mlp": 1.09288561, + "epoch": 0.45459792227779916, + "flos": 519201386496.0, + "grad_norm": 0.03664720273497137, + "language_loss": 0.91360861, + "learning_rate": 0.0005972013864455536, + "loss": 0.92537856, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.84179688, + "step": 2363, + "time_per_iteration": 2.6317927837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178521, + "balance_loss_mlp": 1.09450209, + "epoch": 0.4547903039630627, + "flos": 538598051328.0, + "grad_norm": 0.028772208334572696, + "language_loss": 0.91273308, + "learning_rate": 0.0005968957689559203, + "loss": 0.92451829, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.84082031, + "step": 2364, + "time_per_iteration": 2.6589906215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173596, + "balance_loss_mlp": 1.0895294, + "epoch": 0.4549826856483263, + "flos": 529690987008.0, + "grad_norm": 0.029727340486193105, + "language_loss": 0.95477283, + "learning_rate": 0.0005965901138480131, + "loss": 0.96650875, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.84130859, + "step": 2365, + "time_per_iteration": 2.595510959625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171355, + "balance_loss_mlp": 1.08700228, + "epoch": 0.45517506733358987, + "flos": 521982422016.0, + "grad_norm": 0.030829958952989886, + "language_loss": 0.94295681, + "learning_rate": 0.0005962844212404982, + "loss": 0.95467031, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.84423828, + "step": 2366, + "time_per_iteration": 2.662235736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177016, + "balance_loss_mlp": 1.09271073, + "epoch": 0.4553674490188534, + "flos": 452009000448.0, + "grad_norm": 0.02436634770305822, + "language_loss": 0.92783928, + "learning_rate": 0.0005959786912520558, + "loss": 0.93960941, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.84375, + "step": 2367, + "time_per_iteration": 2.573124408721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117318, + "balance_loss_mlp": 1.08906567, + "epoch": 0.455559830704117, + "flos": 547744160256.0, + "grad_norm": 0.037205613753220755, + "language_loss": 0.90209919, + "learning_rate": 0.0005956729240013806, + "loss": 0.913831, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.84179688, + "step": 2368, + "time_per_iteration": 2.772557020187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173597, + "balance_loss_mlp": 1.08943486, + "epoch": 0.4557522123893805, + "flos": 584865630720.0, + "grad_norm": 0.026144628796570656, + "language_loss": 0.97770655, + "learning_rate": 0.0005953671196071824, + "loss": 0.98944247, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.84228516, + "step": 2369, + "time_per_iteration": 2.7082910537719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.08819652, + "epoch": 0.4559445940746441, + "flos": 527483367936.0, + "grad_norm": 0.0309922218143565, + "language_loss": 0.8751142, + "learning_rate": 0.0005950612781881846, + "loss": 0.8868373, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.84179688, + "step": 2370, + "time_per_iteration": 2.7258613109588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172868, + "balance_loss_mlp": 1.08913577, + "epoch": 0.45613697575990764, + "flos": 653367306240.0, + "grad_norm": 0.03125586624235708, + "language_loss": 0.84058654, + "learning_rate": 0.0005947553998631259, + "loss": 0.85231519, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.83789062, + "step": 2371, + "time_per_iteration": 2.8463094234466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169614, + "balance_loss_mlp": 1.08626282, + "epoch": 0.4563293574451712, + "flos": 868623332352.0, + "grad_norm": 0.025158843177806284, + "language_loss": 0.84537494, + "learning_rate": 0.000594449484750758, + "loss": 0.85707104, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.83398438, + "step": 2372, + "time_per_iteration": 3.1793160438537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165382, + "balance_loss_mlp": 1.08193552, + "epoch": 0.45652173913043476, + "flos": 499131975168.0, + "grad_norm": 0.03016735007152292, + "language_loss": 0.8953886, + "learning_rate": 0.0005941435329698484, + "loss": 0.90704238, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.83496094, + "step": 2373, + "time_per_iteration": 2.6885011196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168274, + "balance_loss_mlp": 1.08458936, + "epoch": 0.45671412081569834, + "flos": 561958788096.0, + "grad_norm": 0.029049495784182693, + "language_loss": 0.89830238, + "learning_rate": 0.0005938375446391778, + "loss": 0.90998513, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.83740234, + "step": 2374, + "time_per_iteration": 2.7694103717803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169785, + "balance_loss_mlp": 1.08605206, + "epoch": 0.45690650250096193, + "flos": 504122631168.0, + "grad_norm": 0.032895841438659715, + "language_loss": 0.95283711, + "learning_rate": 0.0005935315198775415, + "loss": 0.96453488, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.83789062, + "step": 2375, + "time_per_iteration": 2.6797261238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117336, + "balance_loss_mlp": 1.08967507, + "epoch": 0.45709888418622546, + "flos": 431598486528.0, + "grad_norm": 0.029217874962507603, + "language_loss": 0.93084061, + "learning_rate": 0.0005932254588037486, + "loss": 0.94257426, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.83740234, + "step": 2376, + "time_per_iteration": 2.5119664669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170171, + "balance_loss_mlp": 1.08634305, + "epoch": 0.45729126587148905, + "flos": 526693100544.0, + "grad_norm": 0.033600967739372, + "language_loss": 0.91914618, + "learning_rate": 0.000592919361536623, + "loss": 0.93084788, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.83886719, + "step": 2377, + "time_per_iteration": 2.627753734588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172251, + "balance_loss_mlp": 1.08861363, + "epoch": 0.4574836475567526, + "flos": 639147949056.0, + "grad_norm": 0.02676395696709272, + "language_loss": 0.95213675, + "learning_rate": 0.0005926132281950017, + "loss": 0.9638592, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.83691406, + "step": 2378, + "time_per_iteration": 2.7404637336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171278, + "balance_loss_mlp": 1.08754539, + "epoch": 0.45767602924201617, + "flos": 650790386688.0, + "grad_norm": 0.03076010987013328, + "language_loss": 0.92175043, + "learning_rate": 0.0005923070588977367, + "loss": 0.93346316, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.83789062, + "step": 2379, + "time_per_iteration": 2.7948412895202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173225, + "balance_loss_mlp": 1.08944476, + "epoch": 0.4578684109272797, + "flos": 747962363904.0, + "grad_norm": 0.027484014603145524, + "language_loss": 0.92339164, + "learning_rate": 0.0005920008537636931, + "loss": 0.93512392, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.83837891, + "step": 2380, + "time_per_iteration": 2.903837203979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173408, + "balance_loss_mlp": 1.08972311, + "epoch": 0.4580607926125433, + "flos": 642727984128.0, + "grad_norm": 0.029077527756171735, + "language_loss": 0.92490625, + "learning_rate": 0.0005916946129117504, + "loss": 0.93664026, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.83740234, + "step": 2381, + "time_per_iteration": 2.902449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169328, + "balance_loss_mlp": 1.08569121, + "epoch": 0.4582531742978069, + "flos": 803239065600.0, + "grad_norm": 0.02842187637415346, + "language_loss": 0.86509985, + "learning_rate": 0.0005913883364608017, + "loss": 0.87679315, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.83691406, + "step": 2382, + "time_per_iteration": 3.0474140644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171424, + "balance_loss_mlp": 1.0876435, + "epoch": 0.4584455559830704, + "flos": 685517586432.0, + "grad_norm": 0.02678099894990505, + "language_loss": 0.94194049, + "learning_rate": 0.0005910820245297542, + "loss": 0.95365477, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.83837891, + "step": 2383, + "time_per_iteration": 2.879652261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171015, + "balance_loss_mlp": 1.08718669, + "epoch": 0.458637937668334, + "flos": 519281977344.0, + "grad_norm": 0.03033035418174317, + "language_loss": 0.87193358, + "learning_rate": 0.000590775677237529, + "loss": 0.88364375, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.83886719, + "step": 2384, + "time_per_iteration": 2.718327045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116823, + "balance_loss_mlp": 1.08478332, + "epoch": 0.4588303193535975, + "flos": 506532364800.0, + "grad_norm": 0.028303891516217768, + "language_loss": 0.87188554, + "learning_rate": 0.0005904692947030601, + "loss": 0.88356787, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.83496094, + "step": 2385, + "time_per_iteration": 2.5850000381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166672, + "balance_loss_mlp": 1.08303475, + "epoch": 0.4590227010388611, + "flos": 496908893184.0, + "grad_norm": 0.031451346934425, + "language_loss": 0.9665041, + "learning_rate": 0.0005901628770452963, + "loss": 0.97817081, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.83691406, + "step": 2386, + "time_per_iteration": 2.5478482246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172991, + "balance_loss_mlp": 1.08964002, + "epoch": 0.45921508272412465, + "flos": 494601217536.0, + "grad_norm": 0.030858044337890404, + "language_loss": 0.93199378, + "learning_rate": 0.000589856424383199, + "loss": 0.94372368, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.83398438, + "step": 2387, + "time_per_iteration": 2.6889121532440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170845, + "balance_loss_mlp": 1.08744633, + "epoch": 0.45940746440938823, + "flos": 692592336384.0, + "grad_norm": 0.02985924743030105, + "language_loss": 0.89320701, + "learning_rate": 0.000589549936835744, + "loss": 0.90491545, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.83447266, + "step": 2388, + "time_per_iteration": 2.929584264755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167353, + "balance_loss_mlp": 1.08390617, + "epoch": 0.45959984609465176, + "flos": 504736980480.0, + "grad_norm": 0.026272627268038303, + "language_loss": 0.85652947, + "learning_rate": 0.0005892434145219202, + "loss": 0.86820304, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.83496094, + "step": 2389, + "time_per_iteration": 2.6049258708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169189, + "balance_loss_mlp": 1.08593321, + "epoch": 0.45979222777991535, + "flos": 677839220736.0, + "grad_norm": 0.032142260667283734, + "language_loss": 0.89047158, + "learning_rate": 0.0005889368575607303, + "loss": 0.90216345, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.83300781, + "step": 2390, + "time_per_iteration": 2.8630926609039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170087, + "balance_loss_mlp": 1.08673584, + "epoch": 0.45998460946517894, + "flos": 779038396416.0, + "grad_norm": 0.02948026619685868, + "language_loss": 0.84149277, + "learning_rate": 0.00058863026607119, + "loss": 0.85319364, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.83398438, + "step": 2391, + "time_per_iteration": 3.0889787673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170542, + "balance_loss_mlp": 1.08709574, + "epoch": 0.46017699115044247, + "flos": 853021552128.0, + "grad_norm": 0.028406278062058678, + "language_loss": 0.85429174, + "learning_rate": 0.0005883236401723287, + "loss": 0.8659972, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.83496094, + "step": 2392, + "time_per_iteration": 3.1613874435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167478, + "balance_loss_mlp": 1.08403194, + "epoch": 0.46036937283570606, + "flos": 576963683328.0, + "grad_norm": 0.029157836827012555, + "language_loss": 0.90157199, + "learning_rate": 0.0005880169799831893, + "loss": 0.91324675, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.83496094, + "step": 2393, + "time_per_iteration": 2.6974027156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117291, + "balance_loss_mlp": 1.08955884, + "epoch": 0.4605617545209696, + "flos": 613119694848.0, + "grad_norm": 0.028584885066092792, + "language_loss": 0.87511885, + "learning_rate": 0.0005877102856228278, + "loss": 0.88684797, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.83398438, + "step": 2394, + "time_per_iteration": 2.862462043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169553, + "balance_loss_mlp": 1.08591628, + "epoch": 0.4607541362062332, + "flos": 534158618112.0, + "grad_norm": 0.03156913659667245, + "language_loss": 0.91444194, + "learning_rate": 0.0005874035572103133, + "loss": 0.92613751, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.83691406, + "step": 2395, + "time_per_iteration": 2.66796612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171842, + "balance_loss_mlp": 1.08830035, + "epoch": 0.4609465178914967, + "flos": 648473978880.0, + "grad_norm": 0.039315545211924735, + "language_loss": 0.89278555, + "learning_rate": 0.0005870967948647288, + "loss": 0.90450394, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.8359375, + "step": 2396, + "time_per_iteration": 2.7669596672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209076, + "balance_loss_mlp": 1.12553406, + "epoch": 0.4611388995767603, + "flos": 1469498426880.0, + "grad_norm": 0.015424486797259693, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.7551738, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.8359375, + "step": 2397, + "time_per_iteration": 5.5382936000823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.09377611, + "epoch": 0.46133128126202383, + "flos": 724476100608.0, + "grad_norm": 0.029375695907885992, + "language_loss": 0.91919947, + "learning_rate": 0.0005864831688507443, + "loss": 0.93097073, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.83398438, + "step": 2398, + "time_per_iteration": 2.95526123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171581, + "balance_loss_mlp": 1.08846855, + "epoch": 0.4615236629472874, + "flos": 549113848320.0, + "grad_norm": 0.030696537047505416, + "language_loss": 0.82409662, + "learning_rate": 0.0005861763054205754, + "loss": 0.83581245, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.83154297, + "step": 2399, + "time_per_iteration": 2.767615795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172709, + "balance_loss_mlp": 1.08973968, + "epoch": 0.461716044632551, + "flos": 603459293184.0, + "grad_norm": 0.02737063612292851, + "language_loss": 0.84976828, + "learning_rate": 0.0005858694085337976, + "loss": 0.86149538, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.83007812, + "step": 2400, + "time_per_iteration": 2.7964670658111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011724, + "balance_loss_mlp": 1.08966899, + "epoch": 0.46190842631781454, + "flos": 475436866560.0, + "grad_norm": 0.03229000781534058, + "language_loss": 0.9094255, + "learning_rate": 0.0005855624783095589, + "loss": 0.92114949, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.82763672, + "step": 2401, + "time_per_iteration": 2.534349203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170734, + "balance_loss_mlp": 1.08814597, + "epoch": 0.4621008080030781, + "flos": 438401991168.0, + "grad_norm": 0.027555285929390542, + "language_loss": 0.90607065, + "learning_rate": 0.00058525551486702, + "loss": 0.91777802, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.82617188, + "step": 2402, + "time_per_iteration": 2.5021228790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172463, + "balance_loss_mlp": 1.08987451, + "epoch": 0.46229318968834165, + "flos": 526497716736.0, + "grad_norm": 0.03262891309156314, + "language_loss": 0.88400978, + "learning_rate": 0.0005849485183253548, + "loss": 0.89573443, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.82617188, + "step": 2403, + "time_per_iteration": 2.6212213039398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165506, + "balance_loss_mlp": 1.08291745, + "epoch": 0.46248557137360524, + "flos": 440533748736.0, + "grad_norm": 0.02845192827842058, + "language_loss": 0.92361593, + "learning_rate": 0.0005846414888037501, + "loss": 0.93527102, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.82617188, + "step": 2404, + "time_per_iteration": 2.482285499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166688, + "balance_loss_mlp": 1.08409953, + "epoch": 0.4626779530588688, + "flos": 618772363776.0, + "grad_norm": 0.03074329225106782, + "language_loss": 0.881423, + "learning_rate": 0.0005843344264214049, + "loss": 0.89308989, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.82617188, + "step": 2405, + "time_per_iteration": 2.746795415878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170811, + "balance_loss_mlp": 1.08803225, + "epoch": 0.46287033474413236, + "flos": 671359354368.0, + "grad_norm": 0.02816556419491645, + "language_loss": 0.904742, + "learning_rate": 0.0005840273312975317, + "loss": 0.91645014, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.828125, + "step": 2406, + "time_per_iteration": 2.866894483566284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168834, + "balance_loss_mlp": 1.08572149, + "epoch": 0.46306271642939595, + "flos": 481198324224.0, + "grad_norm": 0.027370741977369897, + "language_loss": 0.96141434, + "learning_rate": 0.0005837202035513555, + "loss": 0.97310269, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.83154297, + "step": 2407, + "time_per_iteration": 2.589233636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168547, + "balance_loss_mlp": 1.08562469, + "epoch": 0.4632550981146595, + "flos": 581857010688.0, + "grad_norm": 0.028787881065009197, + "language_loss": 0.87249482, + "learning_rate": 0.0005834130433021136, + "loss": 0.88418025, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.82958984, + "step": 2408, + "time_per_iteration": 2.77109432220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176276, + "balance_loss_mlp": 1.09311593, + "epoch": 0.46344747979992307, + "flos": 525017238528.0, + "grad_norm": 0.03139748973768327, + "language_loss": 0.79860151, + "learning_rate": 0.0005831058506690563, + "loss": 0.81036425, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.83203125, + "step": 2409, + "time_per_iteration": 2.6422629356384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175968, + "balance_loss_mlp": 1.0931412, + "epoch": 0.4636398614851866, + "flos": 747812642304.0, + "grad_norm": 0.02712568041794283, + "language_loss": 0.9122293, + "learning_rate": 0.0005827986257714464, + "loss": 0.92398894, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.82861328, + "step": 2410, + "time_per_iteration": 2.915513515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175895, + "balance_loss_mlp": 1.09254348, + "epoch": 0.4638322431704502, + "flos": 597645442560.0, + "grad_norm": 0.03337742182336422, + "language_loss": 0.94969916, + "learning_rate": 0.0005824913687285591, + "loss": 0.96145809, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.83398438, + "step": 2411, + "time_per_iteration": 2.7729153633117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174985, + "balance_loss_mlp": 1.09168148, + "epoch": 0.4640246248557137, + "flos": 540532423680.0, + "grad_norm": 0.028926449520475586, + "language_loss": 0.87762833, + "learning_rate": 0.0005821840796596821, + "loss": 0.88937813, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.83349609, + "step": 2412, + "time_per_iteration": 2.7454707622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174854, + "balance_loss_mlp": 1.09155095, + "epoch": 0.4642170065409773, + "flos": 563808566784.0, + "grad_norm": 0.027243427778446835, + "language_loss": 0.85983133, + "learning_rate": 0.0005818767586841158, + "loss": 0.87157989, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.83349609, + "step": 2413, + "time_per_iteration": 2.7634999752044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174726, + "balance_loss_mlp": 1.09161353, + "epoch": 0.46440938822624084, + "flos": 532061789184.0, + "grad_norm": 0.026139841130999073, + "language_loss": 0.91185576, + "learning_rate": 0.0005815694059211726, + "loss": 0.923603, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.83154297, + "step": 2414, + "time_per_iteration": 2.6814608573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193306, + "balance_loss_mlp": 1.11109924, + "epoch": 0.4646017699115044, + "flos": 1529624795136.0, + "grad_norm": 0.015412108289742382, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82066941, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.82226562, + "step": 2415, + "time_per_iteration": 4.867271184921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183273, + "balance_loss_mlp": 1.10163879, + "epoch": 0.464794151596768, + "flos": 1544171793408.0, + "grad_norm": 0.012751682226462524, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78128332, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.81640625, + "step": 2416, + "time_per_iteration": 5.0150392055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166548, + "balance_loss_mlp": 1.08391249, + "epoch": 0.46498653328203154, + "flos": 502538093568.0, + "grad_norm": 0.028765151082888876, + "language_loss": 0.92239797, + "learning_rate": 0.0005806471581013931, + "loss": 0.93406343, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.82666016, + "step": 2417, + "time_per_iteration": 2.6913554668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165917, + "balance_loss_mlp": 1.08332872, + "epoch": 0.46517891496729513, + "flos": 677300732928.0, + "grad_norm": 0.03431254801555697, + "language_loss": 0.85110676, + "learning_rate": 0.0005803396793823146, + "loss": 0.86276597, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.82617188, + "step": 2418, + "time_per_iteration": 2.8245232105255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169421, + "balance_loss_mlp": 1.08702314, + "epoch": 0.46537129665255866, + "flos": 586511293440.0, + "grad_norm": 0.03532488466841911, + "language_loss": 0.93255758, + "learning_rate": 0.0005800321694726065, + "loss": 0.94425178, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.82421875, + "step": 2419, + "time_per_iteration": 2.74255108833313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117097, + "balance_loss_mlp": 1.08866799, + "epoch": 0.46556367833782225, + "flos": 588820970496.0, + "grad_norm": 0.031254530654890866, + "language_loss": 0.92505676, + "learning_rate": 0.0005797246284916545, + "loss": 0.93676651, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.82324219, + "step": 2420, + "time_per_iteration": 2.6942667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182114, + "balance_loss_mlp": 1.10238647, + "epoch": 0.4657560600230858, + "flos": 1488582187008.0, + "grad_norm": 0.01896402624903705, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78687304, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.796875, + "step": 2421, + "time_per_iteration": 4.965069532394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179806, + "balance_loss_mlp": 1.09740829, + "epoch": 0.46594844170834937, + "flos": 581392382976.0, + "grad_norm": 0.035008146137172264, + "language_loss": 0.92618293, + "learning_rate": 0.0005791094537936233, + "loss": 0.93798101, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.82421875, + "step": 2422, + "time_per_iteration": 2.7509443759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116805, + "balance_loss_mlp": 1.08555722, + "epoch": 0.4661408233936129, + "flos": 513570184704.0, + "grad_norm": 0.03182837491947037, + "language_loss": 0.88539767, + "learning_rate": 0.0005788018203153762, + "loss": 0.89707822, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.82519531, + "step": 2423, + "time_per_iteration": 2.6291344165802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163454, + "balance_loss_mlp": 1.08038855, + "epoch": 0.4663332050788765, + "flos": 492033030144.0, + "grad_norm": 0.03147692461991822, + "language_loss": 0.92034245, + "learning_rate": 0.000578494156243549, + "loss": 0.93197691, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.83105469, + "step": 2424, + "time_per_iteration": 2.5616393089294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167158, + "balance_loss_mlp": 1.08390224, + "epoch": 0.4665255867641401, + "flos": 513707171328.0, + "grad_norm": 0.028174773974589257, + "language_loss": 0.94988501, + "learning_rate": 0.0005781864616975878, + "loss": 0.96155655, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.83300781, + "step": 2425, + "time_per_iteration": 2.67893648147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178777, + "balance_loss_mlp": 1.09552157, + "epoch": 0.4667179684494036, + "flos": 425706772992.0, + "grad_norm": 0.03381525890081808, + "language_loss": 0.91298926, + "learning_rate": 0.0005778787367969502, + "loss": 0.92477703, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.83300781, + "step": 2426, + "time_per_iteration": 2.5708863735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180968, + "balance_loss_mlp": 1.09790349, + "epoch": 0.4669103501346672, + "flos": 709223428608.0, + "grad_norm": 0.031023375068471706, + "language_loss": 0.86979687, + "learning_rate": 0.0005775709816611053, + "loss": 0.88160658, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.83105469, + "step": 2427, + "time_per_iteration": 2.9488039016723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178737, + "balance_loss_mlp": 1.09543312, + "epoch": 0.4671027318199307, + "flos": 555945550848.0, + "grad_norm": 0.0268683026146142, + "language_loss": 0.8862977, + "learning_rate": 0.0005772631964095346, + "loss": 0.89808506, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.83349609, + "step": 2428, + "time_per_iteration": 2.6830828189849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176571, + "balance_loss_mlp": 1.09321952, + "epoch": 0.4672951135051943, + "flos": 568195607040.0, + "grad_norm": 0.029193722689313813, + "language_loss": 0.92024446, + "learning_rate": 0.000576955381161731, + "loss": 0.93201017, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.83398438, + "step": 2429, + "time_per_iteration": 2.7286531925201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172919, + "balance_loss_mlp": 1.08956802, + "epoch": 0.46748749519045785, + "flos": 425418063360.0, + "grad_norm": 0.030194965591673555, + "language_loss": 0.93541706, + "learning_rate": 0.0005766475360371985, + "loss": 0.94714624, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.83398438, + "step": 2430, + "time_per_iteration": 2.5866243839263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171537, + "balance_loss_mlp": 1.08809078, + "epoch": 0.46767987687572143, + "flos": 539370854400.0, + "grad_norm": 0.031323302876694416, + "language_loss": 0.91645998, + "learning_rate": 0.0005763396611554536, + "loss": 0.92817533, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.83496094, + "step": 2431, + "time_per_iteration": 2.644538402557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169389, + "balance_loss_mlp": 1.08622885, + "epoch": 0.467872258560985, + "flos": 825075663360.0, + "grad_norm": 0.035112660876247544, + "language_loss": 0.8720994, + "learning_rate": 0.0005760317566360237, + "loss": 0.88379329, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.83203125, + "step": 2432, + "time_per_iteration": 2.9847497940063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169145, + "balance_loss_mlp": 1.08598459, + "epoch": 0.46806464024624855, + "flos": 662853791232.0, + "grad_norm": 0.03130586605287321, + "language_loss": 0.92657965, + "learning_rate": 0.000575723822598448, + "loss": 0.93827116, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.83203125, + "step": 2433, + "time_per_iteration": 2.7757930755615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166037, + "balance_loss_mlp": 1.08325768, + "epoch": 0.46825702193151214, + "flos": 757054078464.0, + "grad_norm": 0.025972857143736858, + "language_loss": 0.87588978, + "learning_rate": 0.0005754158591622773, + "loss": 0.88755012, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.828125, + "step": 2434, + "time_per_iteration": 2.9586892127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167751, + "balance_loss_mlp": 1.08482957, + "epoch": 0.4684494036167757, + "flos": 440310167040.0, + "grad_norm": 0.03095385887839679, + "language_loss": 0.89792037, + "learning_rate": 0.0005751078664470732, + "loss": 0.90959787, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.82958984, + "step": 2435, + "time_per_iteration": 2.5508580207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167106, + "balance_loss_mlp": 1.08446991, + "epoch": 0.46864178530203926, + "flos": 533748384768.0, + "grad_norm": 0.02784458934890301, + "language_loss": 0.91441107, + "learning_rate": 0.0005747998445724094, + "loss": 0.92608213, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.82666016, + "step": 2436, + "time_per_iteration": 2.6264078617095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166893, + "balance_loss_mlp": 1.08435297, + "epoch": 0.4688341669873028, + "flos": 577825809408.0, + "grad_norm": 0.028098929039846225, + "language_loss": 0.94501269, + "learning_rate": 0.0005744917936578707, + "loss": 0.95668173, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.82568359, + "step": 2437, + "time_per_iteration": 2.7923285961151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163054, + "balance_loss_mlp": 1.0805608, + "epoch": 0.4690265486725664, + "flos": 540717073920.0, + "grad_norm": 0.02510139841230761, + "language_loss": 0.88352144, + "learning_rate": 0.0005741837138230526, + "loss": 0.89515197, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.82519531, + "step": 2438, + "time_per_iteration": 2.720592737197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117104, + "balance_loss_mlp": 1.08849919, + "epoch": 0.4692189303578299, + "flos": 771881054208.0, + "grad_norm": 0.031043213179005578, + "language_loss": 0.91746414, + "learning_rate": 0.0005738756051875627, + "loss": 0.92917454, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.82568359, + "step": 2439, + "time_per_iteration": 3.0688676834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179368, + "balance_loss_mlp": 1.09697056, + "epoch": 0.4694113120430935, + "flos": 572513516544.0, + "grad_norm": 0.031224617656339514, + "language_loss": 0.8895998, + "learning_rate": 0.0005735674678710192, + "loss": 0.90139341, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.82421875, + "step": 2440, + "time_per_iteration": 2.6647889614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180255, + "balance_loss_mlp": 1.09814322, + "epoch": 0.4696036937283571, + "flos": 750094121472.0, + "grad_norm": 0.03673041295896698, + "language_loss": 0.88509989, + "learning_rate": 0.0005732593019930517, + "loss": 0.89690244, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.82128906, + "step": 2441, + "time_per_iteration": 2.9219651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177833, + "balance_loss_mlp": 1.09553087, + "epoch": 0.4697960754136206, + "flos": 494442763776.0, + "grad_norm": 0.03186685029176949, + "language_loss": 0.93415046, + "learning_rate": 0.0005729511076733008, + "loss": 0.94592881, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.82324219, + "step": 2442, + "time_per_iteration": 2.6268982887268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163524, + "balance_loss_mlp": 1.08088803, + "epoch": 0.4699884570988842, + "flos": 726360081408.0, + "grad_norm": 0.03313850577325225, + "language_loss": 0.91418898, + "learning_rate": 0.000572642885031418, + "loss": 0.92582428, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.82666016, + "step": 2443, + "time_per_iteration": 2.847228527069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165965, + "balance_loss_mlp": 1.08337641, + "epoch": 0.47018083878414774, + "flos": 556577364480.0, + "grad_norm": 0.031620033102277616, + "language_loss": 0.86240256, + "learning_rate": 0.0005723346341870662, + "loss": 0.87406218, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.82617188, + "step": 2444, + "time_per_iteration": 2.7060024738311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171889, + "balance_loss_mlp": 1.08944428, + "epoch": 0.4703732204694113, + "flos": 424962167808.0, + "grad_norm": 0.03469194433982127, + "language_loss": 0.92819834, + "learning_rate": 0.0005720263552599188, + "loss": 0.93991721, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.82470703, + "step": 2445, + "time_per_iteration": 2.486546754837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175277, + "balance_loss_mlp": 1.09307039, + "epoch": 0.47056560215467486, + "flos": 704755797504.0, + "grad_norm": 0.03273224664010927, + "language_loss": 0.86175644, + "learning_rate": 0.0005717180483696604, + "loss": 0.87350929, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.82226562, + "step": 2446, + "time_per_iteration": 2.8490843772888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173534, + "balance_loss_mlp": 1.09123182, + "epoch": 0.47075798383993844, + "flos": 556012680192.0, + "grad_norm": 0.030967943008195494, + "language_loss": 0.88733399, + "learning_rate": 0.0005714097136359862, + "loss": 0.89906937, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.82324219, + "step": 2447, + "time_per_iteration": 2.6790409088134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172662, + "balance_loss_mlp": 1.09035945, + "epoch": 0.470950365525202, + "flos": 565493160960.0, + "grad_norm": 0.028459673893144737, + "language_loss": 0.91199988, + "learning_rate": 0.0005711013511786027, + "loss": 0.92372644, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.82324219, + "step": 2448, + "time_per_iteration": 2.871711492538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169516, + "balance_loss_mlp": 1.08745217, + "epoch": 0.47114274721046556, + "flos": 535498106880.0, + "grad_norm": 0.02665313173872239, + "language_loss": 0.88226557, + "learning_rate": 0.0005707929611172263, + "loss": 0.89396071, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.82080078, + "step": 2449, + "time_per_iteration": 2.69319748878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166092, + "balance_loss_mlp": 1.08402824, + "epoch": 0.47133512889572915, + "flos": 474077912064.0, + "grad_norm": 0.0332447507442279, + "language_loss": 0.90459168, + "learning_rate": 0.000570484543571585, + "loss": 0.91625261, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.82080078, + "step": 2450, + "time_per_iteration": 2.5612680912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164305, + "balance_loss_mlp": 1.08228934, + "epoch": 0.4715275105809927, + "flos": 459967343616.0, + "grad_norm": 0.03392229050190778, + "language_loss": 0.90577096, + "learning_rate": 0.0005701760986614171, + "loss": 0.91741407, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.8203125, + "step": 2451, + "time_per_iteration": 2.5571579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166326, + "balance_loss_mlp": 1.08435798, + "epoch": 0.47171989226625627, + "flos": 422886806016.0, + "grad_norm": 0.028518751420243762, + "language_loss": 0.93793362, + "learning_rate": 0.0005698676265064714, + "loss": 0.94959688, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.81982422, + "step": 2452, + "time_per_iteration": 2.476069211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169062, + "balance_loss_mlp": 1.08680761, + "epoch": 0.4719122739515198, + "flos": 458376075264.0, + "grad_norm": 0.03301356479716476, + "language_loss": 0.95592558, + "learning_rate": 0.0005695591272265074, + "loss": 0.9676162, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.82275391, + "step": 2453, + "time_per_iteration": 2.512503147125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169417, + "balance_loss_mlp": 1.08730555, + "epoch": 0.4721046556367834, + "flos": 516016848384.0, + "grad_norm": 0.02961212180136774, + "language_loss": 0.87225032, + "learning_rate": 0.0005692506009412954, + "loss": 0.88394439, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.82128906, + "step": 2454, + "time_per_iteration": 2.673123836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187157, + "balance_loss_mlp": 1.10609436, + "epoch": 0.4722970373220469, + "flos": 1575703721472.0, + "grad_norm": 0.017157731663316397, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78738415, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.81054688, + "step": 2455, + "time_per_iteration": 4.97356915473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164137, + "balance_loss_mlp": 1.08216834, + "epoch": 0.4724894190073105, + "flos": 587394886656.0, + "grad_norm": 0.02627427755104431, + "language_loss": 0.95142597, + "learning_rate": 0.0005686334678342593, + "loss": 0.96306741, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.81982422, + "step": 2456, + "time_per_iteration": 2.867849588394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165061, + "balance_loss_mlp": 1.08304489, + "epoch": 0.4726818006925741, + "flos": 869072497152.0, + "grad_norm": 0.03086214810478132, + "language_loss": 0.87917793, + "learning_rate": 0.0005683248612520274, + "loss": 0.89082849, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.8203125, + "step": 2457, + "time_per_iteration": 3.078068733215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08206928, + "epoch": 0.4728741823778376, + "flos": 754227380736.0, + "grad_norm": 0.03352301766800045, + "language_loss": 0.88896751, + "learning_rate": 0.0005680162281437321, + "loss": 0.90060842, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.8203125, + "step": 2458, + "time_per_iteration": 2.9237887859344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116424, + "balance_loss_mlp": 1.08260512, + "epoch": 0.4730665640631012, + "flos": 539657562624.0, + "grad_norm": 0.027635752733509208, + "language_loss": 0.89953935, + "learning_rate": 0.000567707568629195, + "loss": 0.91118181, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.81640625, + "step": 2459, + "time_per_iteration": 2.719519853591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166645, + "balance_loss_mlp": 1.08505821, + "epoch": 0.47325894574836475, + "flos": 492682308096.0, + "grad_norm": 0.027667404433321316, + "language_loss": 0.88089126, + "learning_rate": 0.0005673988828282486, + "loss": 0.89255774, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.81591797, + "step": 2460, + "time_per_iteration": 2.71736216545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165583, + "balance_loss_mlp": 1.0839963, + "epoch": 0.47345132743362833, + "flos": 765830886912.0, + "grad_norm": 0.028127891455978875, + "language_loss": 0.87479305, + "learning_rate": 0.0005670901708607352, + "loss": 0.88644892, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.81591797, + "step": 2461, + "time_per_iteration": 2.9727017879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165349, + "balance_loss_mlp": 1.08371425, + "epoch": 0.47364370911889186, + "flos": 541168240128.0, + "grad_norm": 0.03987357596495419, + "language_loss": 0.90376979, + "learning_rate": 0.0005667814328465076, + "loss": 0.91542327, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.81640625, + "step": 2462, + "time_per_iteration": 2.632636547088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163463, + "balance_loss_mlp": 1.0815897, + "epoch": 0.47383609080415545, + "flos": 407091643392.0, + "grad_norm": 0.03654753942721471, + "language_loss": 0.88796914, + "learning_rate": 0.0005664726689054285, + "loss": 0.89960378, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.81884766, + "step": 2463, + "time_per_iteration": 2.466054916381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170123, + "balance_loss_mlp": 1.08867884, + "epoch": 0.474028472489419, + "flos": 454438199808.0, + "grad_norm": 0.03923165930345575, + "language_loss": 0.8627066, + "learning_rate": 0.0005661638791573704, + "loss": 0.87440789, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.81445312, + "step": 2464, + "time_per_iteration": 2.7042744159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166183, + "balance_loss_mlp": 1.08450055, + "epoch": 0.47422085417468257, + "flos": 493194599424.0, + "grad_norm": 0.026684931914484025, + "language_loss": 0.92592585, + "learning_rate": 0.0005658550637222164, + "loss": 0.93758774, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.81689453, + "step": 2465, + "time_per_iteration": 2.6058290004730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168127, + "balance_loss_mlp": 1.08611059, + "epoch": 0.47441323585994616, + "flos": 740125544448.0, + "grad_norm": 0.026202374072225774, + "language_loss": 0.87139833, + "learning_rate": 0.0005655462227198592, + "loss": 0.88307959, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.8203125, + "step": 2466, + "time_per_iteration": 2.8945796489715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167919, + "balance_loss_mlp": 1.08590269, + "epoch": 0.4746056175452097, + "flos": 485674687488.0, + "grad_norm": 0.02746668082221095, + "language_loss": 0.89712787, + "learning_rate": 0.0005652373562702016, + "loss": 0.90880704, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.8203125, + "step": 2467, + "time_per_iteration": 2.576364278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166795, + "balance_loss_mlp": 1.08463609, + "epoch": 0.4747979992304733, + "flos": 462005775360.0, + "grad_norm": 0.03040478239716322, + "language_loss": 0.95003092, + "learning_rate": 0.000564928464493156, + "loss": 0.96169889, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.82177734, + "step": 2468, + "time_per_iteration": 2.5468242168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168306, + "balance_loss_mlp": 1.08624196, + "epoch": 0.4749903809157368, + "flos": 865879226880.0, + "grad_norm": 0.029413898751956376, + "language_loss": 0.88262731, + "learning_rate": 0.000564619547508645, + "loss": 0.89431041, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.82080078, + "step": 2469, + "time_per_iteration": 3.042994260787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116966, + "balance_loss_mlp": 1.08764374, + "epoch": 0.4751827626010004, + "flos": 506551830528.0, + "grad_norm": 0.035426943126194606, + "language_loss": 0.90271819, + "learning_rate": 0.0005643106054366008, + "loss": 0.91441476, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.8203125, + "step": 2470, + "time_per_iteration": 2.5660367012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168432, + "balance_loss_mlp": 1.0863688, + "epoch": 0.47537514428626393, + "flos": 560452113408.0, + "grad_norm": 0.029652672624791387, + "language_loss": 0.85815179, + "learning_rate": 0.000564001638396965, + "loss": 0.86983615, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.82080078, + "step": 2471, + "time_per_iteration": 2.7345728874206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167677, + "balance_loss_mlp": 1.08566117, + "epoch": 0.4755675259715275, + "flos": 835676054016.0, + "grad_norm": 0.029111814859825738, + "language_loss": 0.87706691, + "learning_rate": 0.0005636926465096897, + "loss": 0.8887437, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.8203125, + "step": 2472, + "time_per_iteration": 3.0570740699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166306, + "balance_loss_mlp": 1.08424211, + "epoch": 0.47575990765679105, + "flos": 509232809472.0, + "grad_norm": 0.030849533450069865, + "language_loss": 0.93407679, + "learning_rate": 0.0005633836298947363, + "loss": 0.94573981, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.82080078, + "step": 2473, + "time_per_iteration": 2.6804757118225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167624, + "balance_loss_mlp": 1.08570302, + "epoch": 0.47595228934205464, + "flos": 592962961920.0, + "grad_norm": 0.0319092637225127, + "language_loss": 0.77122205, + "learning_rate": 0.000563074588672075, + "loss": 0.78289831, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.81933594, + "step": 2474, + "time_per_iteration": 2.7190651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166922, + "balance_loss_mlp": 1.08500123, + "epoch": 0.4761446710273182, + "flos": 581683094016.0, + "grad_norm": 0.028375010801601097, + "language_loss": 0.91505527, + "learning_rate": 0.0005627655229616868, + "loss": 0.92672449, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.81933594, + "step": 2475, + "time_per_iteration": 2.689652919769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164128, + "balance_loss_mlp": 1.08235061, + "epoch": 0.47633705271258175, + "flos": 674079264768.0, + "grad_norm": 0.024988633596495675, + "language_loss": 0.94898891, + "learning_rate": 0.0005624564328835616, + "loss": 0.96063018, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.81787109, + "step": 2476, + "time_per_iteration": 2.8038489818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169163, + "balance_loss_mlp": 1.08728969, + "epoch": 0.47652943439784534, + "flos": 542970355200.0, + "grad_norm": 0.0285977430554916, + "language_loss": 0.89680123, + "learning_rate": 0.0005621473185576986, + "loss": 0.90849286, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.81884766, + "step": 2477, + "time_per_iteration": 2.7568743228912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165779, + "balance_loss_mlp": 1.08433557, + "epoch": 0.4767218160831089, + "flos": 525846437376.0, + "grad_norm": 0.0316668482667046, + "language_loss": 0.93167424, + "learning_rate": 0.0005618381801041068, + "loss": 0.94333208, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.81445312, + "step": 2478, + "time_per_iteration": 2.612211227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167228, + "balance_loss_mlp": 1.08545041, + "epoch": 0.47691419776837246, + "flos": 569126863872.0, + "grad_norm": 0.03238452738028376, + "language_loss": 0.88936818, + "learning_rate": 0.0005615290176428044, + "loss": 0.90104043, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.81787109, + "step": 2479, + "time_per_iteration": 2.649019241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168128, + "balance_loss_mlp": 1.08668435, + "epoch": 0.477106579453636, + "flos": 532024859136.0, + "grad_norm": 0.027888492093205767, + "language_loss": 0.91917288, + "learning_rate": 0.0005612198312938187, + "loss": 0.93085408, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.81445312, + "step": 2480, + "time_per_iteration": 2.739767551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08839524, + "epoch": 0.4772989611388996, + "flos": 595500950016.0, + "grad_norm": 0.027931665483744535, + "language_loss": 0.84935582, + "learning_rate": 0.0005609106211771868, + "loss": 0.86105514, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.81542969, + "step": 2481, + "time_per_iteration": 2.850339651107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169841, + "balance_loss_mlp": 1.08835006, + "epoch": 0.4774913428241631, + "flos": 545707729920.0, + "grad_norm": 0.027660076347337716, + "language_loss": 0.94426548, + "learning_rate": 0.0005606013874129543, + "loss": 0.95596385, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.81494141, + "step": 2482, + "time_per_iteration": 2.7403533458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169934, + "balance_loss_mlp": 1.08829987, + "epoch": 0.4776837245094267, + "flos": 541129308672.0, + "grad_norm": 0.02810737401227857, + "language_loss": 0.86136961, + "learning_rate": 0.0005602921301211768, + "loss": 0.87306893, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.81640625, + "step": 2483, + "time_per_iteration": 2.6941261291503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171891, + "balance_loss_mlp": 1.09016109, + "epoch": 0.4778761061946903, + "flos": 472755887616.0, + "grad_norm": 0.029011275825861695, + "language_loss": 0.8832168, + "learning_rate": 0.0005599828494219185, + "loss": 0.89493567, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.81738281, + "step": 2484, + "time_per_iteration": 2.5801451206207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116562, + "balance_loss_mlp": 1.08355606, + "epoch": 0.4780684878799538, + "flos": 727337000448.0, + "grad_norm": 0.03126301150284597, + "language_loss": 0.95766234, + "learning_rate": 0.0005596735454352527, + "loss": 0.96931851, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.82080078, + "step": 2485, + "time_per_iteration": 2.866809368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165282, + "balance_loss_mlp": 1.0832181, + "epoch": 0.4782608695652174, + "flos": 549953780736.0, + "grad_norm": 0.032811891631208345, + "language_loss": 0.91780031, + "learning_rate": 0.0005593642182812619, + "loss": 0.92945307, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.82080078, + "step": 2486, + "time_per_iteration": 2.6762824058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166087, + "balance_loss_mlp": 1.08388078, + "epoch": 0.47845325125048094, + "flos": 831401805312.0, + "grad_norm": 0.03291122574992765, + "language_loss": 0.91604954, + "learning_rate": 0.0005590548680800378, + "loss": 0.92771041, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.82226562, + "step": 2487, + "time_per_iteration": 3.1848442554473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159859, + "balance_loss_mlp": 1.07765198, + "epoch": 0.4786456329357445, + "flos": 515270241792.0, + "grad_norm": 0.02977291399963519, + "language_loss": 0.8241533, + "learning_rate": 0.0005587454949516804, + "loss": 0.83575195, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.82226562, + "step": 2488, + "time_per_iteration": 2.728825330734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163121, + "balance_loss_mlp": 1.08077133, + "epoch": 0.47883801462100806, + "flos": 565729477632.0, + "grad_norm": 0.034122039627151275, + "language_loss": 0.9412536, + "learning_rate": 0.0005584360990162993, + "loss": 0.95288485, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.82373047, + "step": 2489, + "time_per_iteration": 2.65055251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162976, + "balance_loss_mlp": 1.08076906, + "epoch": 0.47903039630627164, + "flos": 580704173568.0, + "grad_norm": 0.025976014522421025, + "language_loss": 0.89770818, + "learning_rate": 0.0005581266803940124, + "loss": 0.90933788, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.82226562, + "step": 2490, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164709, + "balance_loss_mlp": 1.08250248, + "epoch": 0.47922277799153523, + "flos": 620085656064.0, + "grad_norm": 0.030357385002024635, + "language_loss": 0.93398184, + "learning_rate": 0.0005578172392049471, + "loss": 0.94562888, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.82226562, + "step": 2491, + "time_per_iteration": 2.7492756843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.08214724, + "epoch": 0.47941515967679876, + "flos": 640858739712.0, + "grad_norm": 0.03220406636162171, + "language_loss": 0.9124878, + "learning_rate": 0.0005575077755692386, + "loss": 0.92413139, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.82226562, + "step": 2492, + "time_per_iteration": 2.8061015605926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166388, + "balance_loss_mlp": 1.08437181, + "epoch": 0.47960754136206235, + "flos": 520875247104.0, + "grad_norm": 0.02527329704122564, + "language_loss": 0.91187584, + "learning_rate": 0.0005571982896070316, + "loss": 0.92353964, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.8203125, + "step": 2493, + "time_per_iteration": 4.094395160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116615, + "balance_loss_mlp": 1.08399141, + "epoch": 0.4797999230473259, + "flos": 476031750144.0, + "grad_norm": 0.03303640593992076, + "language_loss": 0.95932508, + "learning_rate": 0.0005568887814384792, + "loss": 0.97098666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.82177734, + "step": 2494, + "time_per_iteration": 2.572852373123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011645, + "balance_loss_mlp": 1.08229315, + "epoch": 0.47999230473258947, + "flos": 533068907520.0, + "grad_norm": 0.028664161711311382, + "language_loss": 0.92573094, + "learning_rate": 0.000556579251183743, + "loss": 0.93737602, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.82226562, + "step": 2495, + "time_per_iteration": 2.6538801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162424, + "balance_loss_mlp": 1.08036053, + "epoch": 0.480184686417853, + "flos": 602605899264.0, + "grad_norm": 0.03331899292815792, + "language_loss": 0.86056805, + "learning_rate": 0.0005562696989629936, + "loss": 0.87219226, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.82080078, + "step": 2496, + "time_per_iteration": 2.687903881072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162213, + "balance_loss_mlp": 1.08019686, + "epoch": 0.4803770681031166, + "flos": 529261287936.0, + "grad_norm": 0.02923998603568501, + "language_loss": 0.88484073, + "learning_rate": 0.0005559601248964095, + "loss": 0.89646292, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.8203125, + "step": 2497, + "time_per_iteration": 2.6282827854156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161296, + "balance_loss_mlp": 1.07918417, + "epoch": 0.4805694497883801, + "flos": 512228694528.0, + "grad_norm": 0.02922528152793709, + "language_loss": 0.91127884, + "learning_rate": 0.0005556505291041783, + "loss": 0.92289186, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.82128906, + "step": 2498, + "time_per_iteration": 2.662783622741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161007, + "balance_loss_mlp": 1.07899094, + "epoch": 0.4807618314736437, + "flos": 601605511680.0, + "grad_norm": 0.02724196548061384, + "language_loss": 0.8966158, + "learning_rate": 0.0005553409117064954, + "loss": 0.90822583, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.8203125, + "step": 2499, + "time_per_iteration": 2.898850917816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164849, + "balance_loss_mlp": 1.08245122, + "epoch": 0.4809542131589073, + "flos": 570029922816.0, + "grad_norm": 0.028349491645904, + "language_loss": 0.91357303, + "learning_rate": 0.0005550312728235654, + "loss": 0.92522144, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.82421875, + "step": 2500, + "time_per_iteration": 2.754187822341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164619, + "balance_loss_mlp": 1.08217347, + "epoch": 0.4811465948441708, + "flos": 577165797888.0, + "grad_norm": 0.034664680835738745, + "language_loss": 0.91214681, + "learning_rate": 0.0005547216125756003, + "loss": 0.92379302, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.82470703, + "step": 2501, + "time_per_iteration": 2.778639078140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164023, + "balance_loss_mlp": 1.08143485, + "epoch": 0.4813389765294344, + "flos": 825297243648.0, + "grad_norm": 0.028167486861350455, + "language_loss": 0.87736559, + "learning_rate": 0.0005544119310828211, + "loss": 0.88900584, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.82617188, + "step": 2502, + "time_per_iteration": 3.0756351947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164093, + "balance_loss_mlp": 1.08174348, + "epoch": 0.48153135821469795, + "flos": 636699283968.0, + "grad_norm": 0.030410217991048386, + "language_loss": 0.91046345, + "learning_rate": 0.0005541022284654568, + "loss": 0.92210436, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.82373047, + "step": 2503, + "time_per_iteration": 2.892679214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163382, + "balance_loss_mlp": 1.08103192, + "epoch": 0.48172373989996153, + "flos": 504708782592.0, + "grad_norm": 0.02826951852510112, + "language_loss": 0.89667141, + "learning_rate": 0.0005537925048437446, + "loss": 0.90830529, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.82373047, + "step": 2504, + "time_per_iteration": 2.5750081539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179108, + "balance_loss_mlp": 1.09918976, + "epoch": 0.48191612158522507, + "flos": 1535566173696.0, + "grad_norm": 0.017261305400491866, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76930583, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.79882812, + "step": 2505, + "time_per_iteration": 4.912463426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162522, + "balance_loss_mlp": 1.07988608, + "epoch": 0.48210850327048865, + "flos": 703811805696.0, + "grad_norm": 0.027104005826713556, + "language_loss": 0.93955028, + "learning_rate": 0.0005531729950682664, + "loss": 0.95117545, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.82666016, + "step": 2506, + "time_per_iteration": 3.000925064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162538, + "balance_loss_mlp": 1.07999802, + "epoch": 0.4823008849557522, + "flos": 440700934656.0, + "grad_norm": 0.03451729562062639, + "language_loss": 0.91777337, + "learning_rate": 0.000552863209155015, + "loss": 0.92939872, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.82568359, + "step": 2507, + "time_per_iteration": 2.478809118270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159773, + "balance_loss_mlp": 1.07737529, + "epoch": 0.48249326664101577, + "flos": 472812283392.0, + "grad_norm": 0.02691149649688828, + "language_loss": 0.87363136, + "learning_rate": 0.0005525534027184461, + "loss": 0.88522899, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.82421875, + "step": 2508, + "time_per_iteration": 2.54645037651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161526, + "balance_loss_mlp": 1.07951045, + "epoch": 0.48268564832627936, + "flos": 564314127360.0, + "grad_norm": 0.023137570540037285, + "language_loss": 0.88137501, + "learning_rate": 0.0005522435758788365, + "loss": 0.89299035, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.8203125, + "step": 2509, + "time_per_iteration": 2.700540542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160536, + "balance_loss_mlp": 1.07842445, + "epoch": 0.4828780300115429, + "flos": 630842499072.0, + "grad_norm": 0.03372990027790351, + "language_loss": 0.86188895, + "learning_rate": 0.0005519337287564721, + "loss": 0.87349427, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.82128906, + "step": 2510, + "time_per_iteration": 2.8127758502960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161519, + "balance_loss_mlp": 1.07945526, + "epoch": 0.4830704116968065, + "flos": 633004455936.0, + "grad_norm": 0.029001937113396697, + "language_loss": 0.88535267, + "learning_rate": 0.000551623861471646, + "loss": 0.89696789, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.82080078, + "step": 2511, + "time_per_iteration": 2.7925469875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166962, + "balance_loss_mlp": 1.08647156, + "epoch": 0.48326279338207, + "flos": 1572616512000.0, + "grad_norm": 0.009161484988790693, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79985785, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.8046875, + "step": 2512, + "time_per_iteration": 4.850747108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159851, + "balance_loss_mlp": 1.07783449, + "epoch": 0.4834551750673336, + "flos": 510237926400.0, + "grad_norm": 0.028933780257729795, + "language_loss": 0.92768925, + "learning_rate": 0.0005510040668958211, + "loss": 0.93928778, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.8203125, + "step": 2513, + "time_per_iteration": 2.56387996673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165955, + "balance_loss_mlp": 1.08546448, + "epoch": 0.48364755675259713, + "flos": 1531825683456.0, + "grad_norm": 0.007133010503999018, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78926539, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.8046875, + "step": 2514, + "time_per_iteration": 4.836379289627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160977, + "balance_loss_mlp": 1.07938981, + "epoch": 0.4838399384378607, + "flos": 566046385152.0, + "grad_norm": 0.029153045334521625, + "language_loss": 0.89274001, + "learning_rate": 0.0005503841931138645, + "loss": 0.9043498, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.81591797, + "step": 2515, + "time_per_iteration": 2.6633048057556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160148, + "balance_loss_mlp": 1.07846582, + "epoch": 0.4840323201231243, + "flos": 388541641728.0, + "grad_norm": 0.03187042626689644, + "language_loss": 0.88861662, + "learning_rate": 0.0005500742268214025, + "loss": 0.90021807, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.81689453, + "step": 2516, + "time_per_iteration": 2.4762659072875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160045, + "balance_loss_mlp": 1.07845843, + "epoch": 0.48422470180838784, + "flos": 632175257088.0, + "grad_norm": 0.026732605532440536, + "language_loss": 0.9007901, + "learning_rate": 0.0005497642410884014, + "loss": 0.91239059, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.81591797, + "step": 2517, + "time_per_iteration": 2.7693819999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164478, + "balance_loss_mlp": 1.08246255, + "epoch": 0.4844170834936514, + "flos": 500313010176.0, + "grad_norm": 0.028128961210665323, + "language_loss": 0.90248644, + "learning_rate": 0.0005494542360352085, + "loss": 0.91413122, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.8203125, + "step": 2518, + "time_per_iteration": 2.6704978942871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163589, + "balance_loss_mlp": 1.08152497, + "epoch": 0.48460946517891496, + "flos": 552194327040.0, + "grad_norm": 0.02893400906180164, + "language_loss": 0.92442286, + "learning_rate": 0.0005491442117821783, + "loss": 0.93605876, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.82080078, + "step": 2519, + "time_per_iteration": 2.691898822784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167118, + "balance_loss_mlp": 1.08491123, + "epoch": 0.48480184686417854, + "flos": 530461788672.0, + "grad_norm": 0.03488173137086134, + "language_loss": 0.937814, + "learning_rate": 0.0005488341684496732, + "loss": 0.94948518, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.82226562, + "step": 2520, + "time_per_iteration": 2.6527535915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165597, + "balance_loss_mlp": 1.08343804, + "epoch": 0.4849942285494421, + "flos": 533047440384.0, + "grad_norm": 0.028537304261499467, + "language_loss": 0.97065389, + "learning_rate": 0.0005485241061580624, + "loss": 0.98230994, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.82177734, + "step": 2521, + "time_per_iteration": 2.7213969230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166792, + "balance_loss_mlp": 1.08463287, + "epoch": 0.48518661023470566, + "flos": 723972541440.0, + "grad_norm": 0.02938300657957885, + "language_loss": 0.90224278, + "learning_rate": 0.0005482140250277228, + "loss": 0.91391075, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.82177734, + "step": 2522, + "time_per_iteration": 2.9924206733703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08592129, + "epoch": 0.4853789919199692, + "flos": 507155446272.0, + "grad_norm": 0.030604201389603965, + "language_loss": 0.93692237, + "learning_rate": 0.0005479039251790387, + "loss": 0.94860315, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.82177734, + "step": 2523, + "time_per_iteration": 2.7099061012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167569, + "balance_loss_mlp": 1.08541012, + "epoch": 0.4855713736052328, + "flos": 661698952704.0, + "grad_norm": 0.03222198223164457, + "language_loss": 0.90574634, + "learning_rate": 0.0005475938067324014, + "loss": 0.917422, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.82177734, + "step": 2524, + "time_per_iteration": 2.8379342555999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117016, + "balance_loss_mlp": 1.08823884, + "epoch": 0.48576375529049637, + "flos": 437889699840.0, + "grad_norm": 0.03297241328571355, + "language_loss": 0.89402866, + "learning_rate": 0.0005472836698082098, + "loss": 0.90573025, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.81933594, + "step": 2525, + "time_per_iteration": 2.5135462284088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165117, + "balance_loss_mlp": 1.08300531, + "epoch": 0.4859561369757599, + "flos": 582844663296.0, + "grad_norm": 0.028434138704400515, + "language_loss": 0.88848263, + "learning_rate": 0.0005469735145268694, + "loss": 0.90013373, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.82128906, + "step": 2526, + "time_per_iteration": 2.7137279510498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162635, + "balance_loss_mlp": 1.08066678, + "epoch": 0.4861485186610235, + "flos": 488933085696.0, + "grad_norm": 0.028544121185286958, + "language_loss": 0.86922419, + "learning_rate": 0.0005466633410087933, + "loss": 0.88085049, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.81982422, + "step": 2527, + "time_per_iteration": 2.7106595039367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116584, + "balance_loss_mlp": 1.08554077, + "epoch": 0.486340900346287, + "flos": 1561111060992.0, + "grad_norm": 0.005447093154513016, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78426665, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.80273438, + "step": 2528, + "time_per_iteration": 4.841828346252441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162423, + "balance_loss_mlp": 1.08069348, + "epoch": 0.4865332820315506, + "flos": 483990093312.0, + "grad_norm": 0.026581719305211308, + "language_loss": 0.93869209, + "learning_rate": 0.0005460429397441214, + "loss": 0.95031631, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.81738281, + "step": 2529, + "time_per_iteration": 2.553438425064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164502, + "balance_loss_mlp": 1.08296263, + "epoch": 0.48672566371681414, + "flos": 536857061376.0, + "grad_norm": 0.02943507577689114, + "language_loss": 0.92893845, + "learning_rate": 0.0005457327122383866, + "loss": 0.94058347, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.81542969, + "step": 2530, + "time_per_iteration": 2.628859043121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167305, + "balance_loss_mlp": 1.08795929, + "epoch": 0.4869180454020777, + "flos": 1415830457856.0, + "grad_norm": 0.01207374103656724, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75803792, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.79296875, + "step": 2531, + "time_per_iteration": 4.798464775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163612, + "balance_loss_mlp": 1.08212042, + "epoch": 0.48711042708734126, + "flos": 574226308608.0, + "grad_norm": 0.027593185975689192, + "language_loss": 0.81384307, + "learning_rate": 0.0005451122040823244, + "loss": 0.82547921, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.81494141, + "step": 2532, + "time_per_iteration": 2.7749013900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116272, + "balance_loss_mlp": 1.08118057, + "epoch": 0.48730280877260485, + "flos": 627816414720.0, + "grad_norm": 0.02591805781842408, + "language_loss": 0.82129884, + "learning_rate": 0.0005448019236728997, + "loss": 0.83292603, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.81542969, + "step": 2533, + "time_per_iteration": 2.865239381790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164787, + "balance_loss_mlp": 1.08315206, + "epoch": 0.48749519045786843, + "flos": 513468126720.0, + "grad_norm": 0.03027053938911928, + "language_loss": 0.91336226, + "learning_rate": 0.0005444916258698255, + "loss": 0.92501009, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.81640625, + "step": 2534, + "time_per_iteration": 2.5986597537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164085, + "balance_loss_mlp": 1.08259368, + "epoch": 0.48768757214313196, + "flos": 526478251008.0, + "grad_norm": 0.02699578070604874, + "language_loss": 0.90958095, + "learning_rate": 0.0005441813107935704, + "loss": 0.92122173, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.81494141, + "step": 2535, + "time_per_iteration": 2.685478925704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162911, + "balance_loss_mlp": 1.08137167, + "epoch": 0.48787995382839555, + "flos": 506030807040.0, + "grad_norm": 0.02902824988643181, + "language_loss": 0.91504169, + "learning_rate": 0.0005438709785646091, + "loss": 0.92667079, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.81542969, + "step": 2536, + "time_per_iteration": 2.563302755355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164758, + "balance_loss_mlp": 1.08302808, + "epoch": 0.4880723355136591, + "flos": 576247276032.0, + "grad_norm": 0.028837521239882914, + "language_loss": 0.92468232, + "learning_rate": 0.0005435606293034234, + "loss": 0.93632984, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.81738281, + "step": 2537, + "time_per_iteration": 2.6447930335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117327, + "balance_loss_mlp": 1.09163582, + "epoch": 0.48826471719892267, + "flos": 562536207360.0, + "grad_norm": 0.0312247117460979, + "language_loss": 0.90714639, + "learning_rate": 0.0005432502631305016, + "loss": 0.91887903, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.81640625, + "step": 2538, + "time_per_iteration": 2.6652588844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173314, + "balance_loss_mlp": 1.09163225, + "epoch": 0.4884570988841862, + "flos": 727547847168.0, + "grad_norm": 0.027646073497336384, + "language_loss": 0.88003767, + "learning_rate": 0.0005429398801663386, + "loss": 0.89177084, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.81689453, + "step": 2539, + "time_per_iteration": 2.9378042221069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163435, + "balance_loss_mlp": 1.08180094, + "epoch": 0.4886494805694498, + "flos": 431924126208.0, + "grad_norm": 0.03488087397138866, + "language_loss": 0.90234458, + "learning_rate": 0.0005426294805314355, + "loss": 0.91397893, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.81640625, + "step": 2540, + "time_per_iteration": 2.538275718688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161042, + "balance_loss_mlp": 1.07935977, + "epoch": 0.4888418622547134, + "flos": 674344505856.0, + "grad_norm": 0.02710942555690322, + "language_loss": 0.8497895, + "learning_rate": 0.0005423190643463003, + "loss": 0.86139989, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.81689453, + "step": 2541, + "time_per_iteration": 2.9786784648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163064, + "balance_loss_mlp": 1.08133411, + "epoch": 0.4890342439399769, + "flos": 542935426560.0, + "grad_norm": 0.02908053911836938, + "language_loss": 0.88889569, + "learning_rate": 0.0005420086317314473, + "loss": 0.90052634, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.81738281, + "step": 2542, + "time_per_iteration": 2.650505781173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163957, + "balance_loss_mlp": 1.08198881, + "epoch": 0.4892266256252405, + "flos": 591862517760.0, + "grad_norm": 0.032456825889771945, + "language_loss": 0.86421382, + "learning_rate": 0.0005416981828073971, + "loss": 0.87585342, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.81982422, + "step": 2543, + "time_per_iteration": 2.756906032562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167862, + "balance_loss_mlp": 1.08718109, + "epoch": 0.48941900731050403, + "flos": 1519654216704.0, + "grad_norm": 0.009398242691954228, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78282875, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.80664062, + "step": 2544, + "time_per_iteration": 4.826622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163984, + "balance_loss_mlp": 1.08225381, + "epoch": 0.4896113889957676, + "flos": 471518456832.0, + "grad_norm": 0.03564931489131084, + "language_loss": 0.92759442, + "learning_rate": 0.000541077236513819, + "loss": 0.93923426, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.81738281, + "step": 2545, + "time_per_iteration": 2.5047078132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169885, + "balance_loss_mlp": 1.08848882, + "epoch": 0.48980377068103115, + "flos": 497551440384.0, + "grad_norm": 0.02644804149278648, + "language_loss": 0.87771875, + "learning_rate": 0.0005407667393853638, + "loss": 0.88941759, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.81396484, + "step": 2546, + "time_per_iteration": 2.615182876586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172802, + "balance_loss_mlp": 1.09116721, + "epoch": 0.48999615236629473, + "flos": 694107743232.0, + "grad_norm": 0.032384144791382644, + "language_loss": 0.89844877, + "learning_rate": 0.0005404562264298569, + "loss": 0.91017681, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.81640625, + "step": 2547, + "time_per_iteration": 2.8694136142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164836, + "balance_loss_mlp": 1.08310628, + "epoch": 0.49018853405155827, + "flos": 542748774912.0, + "grad_norm": 0.02932030725962162, + "language_loss": 0.90206313, + "learning_rate": 0.0005401456977678498, + "loss": 0.91371155, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.81738281, + "step": 2548, + "time_per_iteration": 2.644604444503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158708, + "balance_loss_mlp": 1.07702553, + "epoch": 0.49038091573682185, + "flos": 697108357632.0, + "grad_norm": 0.0348486432591887, + "language_loss": 0.83939159, + "learning_rate": 0.0005398351535199008, + "loss": 0.85097861, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.81689453, + "step": 2549, + "time_per_iteration": 3.064962863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158664, + "balance_loss_mlp": 1.07693398, + "epoch": 0.49057329742208544, + "flos": 598062406656.0, + "grad_norm": 0.028343941430048352, + "language_loss": 0.89488542, + "learning_rate": 0.0005395245938065735, + "loss": 0.90647209, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.81738281, + "step": 2550, + "time_per_iteration": 2.8023993968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162826, + "balance_loss_mlp": 1.08119094, + "epoch": 0.490765679107349, + "flos": 514416847872.0, + "grad_norm": 0.036438353865587, + "language_loss": 0.8920716, + "learning_rate": 0.0005392140187484379, + "loss": 0.90369982, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.81640625, + "step": 2551, + "time_per_iteration": 2.5544004440307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160822, + "balance_loss_mlp": 1.07928288, + "epoch": 0.49095806079261256, + "flos": 630842499072.0, + "grad_norm": 0.02833803159801528, + "language_loss": 0.95730108, + "learning_rate": 0.0005389034284660701, + "loss": 0.96890926, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.81542969, + "step": 2552, + "time_per_iteration": 2.787997245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156735, + "balance_loss_mlp": 1.07524312, + "epoch": 0.4911504424778761, + "flos": 916792356864.0, + "grad_norm": 0.03441290589053542, + "language_loss": 0.8892417, + "learning_rate": 0.000538592823080052, + "loss": 0.90080899, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.81494141, + "step": 2553, + "time_per_iteration": 3.1353423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159599, + "balance_loss_mlp": 1.07858455, + "epoch": 0.4913428241631397, + "flos": 439854271488.0, + "grad_norm": 0.03215354145178159, + "language_loss": 0.91146123, + "learning_rate": 0.000538282202710971, + "loss": 0.9230572, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.81005859, + "step": 2554, + "time_per_iteration": 2.524106025695801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158745, + "balance_loss_mlp": 1.0776825, + "epoch": 0.4915352058484032, + "flos": 637239773184.0, + "grad_norm": 0.03412299335020121, + "language_loss": 0.8861627, + "learning_rate": 0.000537971567479421, + "loss": 0.8977502, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.81054688, + "step": 2555, + "time_per_iteration": 2.750051736831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162188, + "balance_loss_mlp": 1.08107841, + "epoch": 0.4917275875336668, + "flos": 505509783552.0, + "grad_norm": 0.03289434989172404, + "language_loss": 0.93214262, + "learning_rate": 0.0005376609175060011, + "loss": 0.94376451, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.81103516, + "step": 2556, + "time_per_iteration": 2.588437557220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160453, + "balance_loss_mlp": 1.07924759, + "epoch": 0.49191996921893033, + "flos": 655733379072.0, + "grad_norm": 0.02731850736189593, + "language_loss": 0.86463559, + "learning_rate": 0.0005373502529113162, + "loss": 0.87624013, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.81201172, + "step": 2557, + "time_per_iteration": 2.775529146194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160279, + "balance_loss_mlp": 1.07897866, + "epoch": 0.4921123509041939, + "flos": 493398715392.0, + "grad_norm": 0.02896728411720768, + "language_loss": 0.88084292, + "learning_rate": 0.0005370395738159773, + "loss": 0.8924458, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.81298828, + "step": 2558, + "time_per_iteration": 2.638489007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162432, + "balance_loss_mlp": 1.08084488, + "epoch": 0.4923047325894575, + "flos": 547207673856.0, + "grad_norm": 0.030679841284503157, + "language_loss": 0.90182674, + "learning_rate": 0.0005367288803406003, + "loss": 0.91345102, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.81591797, + "step": 2559, + "time_per_iteration": 2.655319929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166456, + "balance_loss_mlp": 1.08477354, + "epoch": 0.49249711427472104, + "flos": 597589046784.0, + "grad_norm": 0.03258957792314928, + "language_loss": 0.88157088, + "learning_rate": 0.0005364181726058073, + "loss": 0.89323545, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.81689453, + "step": 2560, + "time_per_iteration": 2.7416017055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116275, + "balance_loss_mlp": 1.08111596, + "epoch": 0.4926894959599846, + "flos": 498808336896.0, + "grad_norm": 0.03132101057916933, + "language_loss": 0.88768357, + "learning_rate": 0.0005361074507322261, + "loss": 0.89931107, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.81640625, + "step": 2561, + "time_per_iteration": 2.6130712032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165857, + "balance_loss_mlp": 1.08446133, + "epoch": 0.49288187764524816, + "flos": 537182701056.0, + "grad_norm": 0.03057631912079697, + "language_loss": 0.88031554, + "learning_rate": 0.000535796714840489, + "loss": 0.89197409, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.81396484, + "step": 2562, + "time_per_iteration": 2.6463782787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167526, + "balance_loss_mlp": 1.08584368, + "epoch": 0.49307425933051174, + "flos": 642712521216.0, + "grad_norm": 0.037191189532270505, + "language_loss": 0.90339726, + "learning_rate": 0.0005354859650512348, + "loss": 0.91507256, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.81689453, + "step": 2563, + "time_per_iteration": 2.807185649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08831811, + "epoch": 0.4932666410157753, + "flos": 517265012736.0, + "grad_norm": 0.033499096438589164, + "language_loss": 0.92994809, + "learning_rate": 0.0005351752014851074, + "loss": 0.94164765, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.81640625, + "step": 2564, + "time_per_iteration": 2.574969530105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164544, + "balance_loss_mlp": 1.08310056, + "epoch": 0.49345902270103886, + "flos": 602651561472.0, + "grad_norm": 0.03279756121209128, + "language_loss": 0.89816988, + "learning_rate": 0.0005348644242627553, + "loss": 0.90981531, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.81445312, + "step": 2565, + "time_per_iteration": 2.718763828277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170387, + "balance_loss_mlp": 1.0912323, + "epoch": 0.49365140438630245, + "flos": 1496981689344.0, + "grad_norm": 0.010263800536892794, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76457012, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.79101562, + "step": 2566, + "time_per_iteration": 4.933185815811157 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116695, + "balance_loss_mlp": 1.08588743, + "epoch": 0.493843786071566, + "flos": 630788104704.0, + "grad_norm": 0.030129730382445888, + "language_loss": 0.87054515, + "learning_rate": 0.0005342428293320013, + "loss": 0.88221461, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.81054688, + "step": 2567, + "time_per_iteration": 2.7435762882232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167603, + "balance_loss_mlp": 1.08635032, + "epoch": 0.49403616775682957, + "flos": 618689771520.0, + "grad_norm": 0.03756496493147188, + "language_loss": 0.89032316, + "learning_rate": 0.0005339320118649238, + "loss": 0.90199912, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.8125, + "step": 2568, + "time_per_iteration": 2.732135057449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162688, + "balance_loss_mlp": 1.08148313, + "epoch": 0.4942285494420931, + "flos": 578813462016.0, + "grad_norm": 0.027001968550623295, + "language_loss": 0.91260755, + "learning_rate": 0.000533621181224271, + "loss": 0.92423451, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.81201172, + "step": 2569, + "time_per_iteration": 2.79868483543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164198, + "balance_loss_mlp": 1.08304083, + "epoch": 0.4944209311273567, + "flos": 631465580544.0, + "grad_norm": 0.0320565630919746, + "language_loss": 0.86978823, + "learning_rate": 0.0005333103375307182, + "loss": 0.88143021, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.81152344, + "step": 2570, + "time_per_iteration": 2.850125551223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159265, + "balance_loss_mlp": 1.07825053, + "epoch": 0.4946133128126202, + "flos": 588718912512.0, + "grad_norm": 0.030887982554767154, + "language_loss": 0.91666126, + "learning_rate": 0.0005329994809049451, + "loss": 0.92825389, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.81005859, + "step": 2571, + "time_per_iteration": 2.716823101043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115557, + "balance_loss_mlp": 1.07460296, + "epoch": 0.4948056944978838, + "flos": 584846164992.0, + "grad_norm": 0.031743542415023744, + "language_loss": 0.93336749, + "learning_rate": 0.0005326886114676375, + "loss": 0.94492316, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.80957031, + "step": 2572, + "time_per_iteration": 2.7895162105560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160915, + "balance_loss_mlp": 1.08004355, + "epoch": 0.49499807618314734, + "flos": 482780860416.0, + "grad_norm": 0.03097072525481985, + "language_loss": 0.93359911, + "learning_rate": 0.0005323777293394854, + "loss": 0.94520825, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.80859375, + "step": 2573, + "time_per_iteration": 2.5428624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161628, + "balance_loss_mlp": 1.08089912, + "epoch": 0.4951904578684109, + "flos": 520037316096.0, + "grad_norm": 0.029847836155631635, + "language_loss": 0.87235224, + "learning_rate": 0.000532066834641184, + "loss": 0.88396853, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.80712891, + "step": 2574, + "time_per_iteration": 2.666405439376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116292, + "balance_loss_mlp": 1.08195353, + "epoch": 0.4953828395536745, + "flos": 536577083904.0, + "grad_norm": 0.029607666498307577, + "language_loss": 0.91085738, + "learning_rate": 0.0005317559274934334, + "loss": 0.92248654, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.80957031, + "step": 2575, + "time_per_iteration": 2.694953441619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161488, + "balance_loss_mlp": 1.08056831, + "epoch": 0.49557522123893805, + "flos": 529606393344.0, + "grad_norm": 0.03416750639658743, + "language_loss": 0.87365144, + "learning_rate": 0.0005314450080169382, + "loss": 0.8852663, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.80908203, + "step": 2576, + "time_per_iteration": 2.6648805141448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160753, + "balance_loss_mlp": 1.07973826, + "epoch": 0.49576760292420163, + "flos": 428917507584.0, + "grad_norm": 0.028909192983869472, + "language_loss": 0.86833698, + "learning_rate": 0.0005311340763324083, + "loss": 0.87994456, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.81005859, + "step": 2577, + "time_per_iteration": 2.563143014907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.07945204, + "epoch": 0.49595998460946517, + "flos": 566315629056.0, + "grad_norm": 0.02703431344264104, + "language_loss": 0.87897325, + "learning_rate": 0.0005308231325605578, + "loss": 0.8905803, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.8125, + "step": 2578, + "time_per_iteration": 2.690247058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159003, + "balance_loss_mlp": 1.07746387, + "epoch": 0.49615236629472875, + "flos": 703813807104.0, + "grad_norm": 0.02447176932933424, + "language_loss": 0.81124884, + "learning_rate": 0.0005305121768221061, + "loss": 0.8228389, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.81542969, + "step": 2579, + "time_per_iteration": 3.1026089191436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011698, + "balance_loss_mlp": 1.08969116, + "epoch": 0.4963447479799923, + "flos": 1444752539136.0, + "grad_norm": 0.010536082657862093, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76208121, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.80078125, + "step": 2580, + "time_per_iteration": 4.814293146133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160566, + "balance_loss_mlp": 1.07912242, + "epoch": 0.49653712966525587, + "flos": 538663179264.0, + "grad_norm": 0.027995208065503225, + "language_loss": 0.97084171, + "learning_rate": 0.0005298902299282984, + "loss": 0.98244739, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.81445312, + "step": 2581, + "time_per_iteration": 2.6197092533111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115749, + "balance_loss_mlp": 1.07609439, + "epoch": 0.4967295113505194, + "flos": 608395554816.0, + "grad_norm": 0.029727926282221828, + "language_loss": 0.90264994, + "learning_rate": 0.0005295792390144033, + "loss": 0.91422486, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.81396484, + "step": 2582, + "time_per_iteration": 2.6830005645751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156586, + "balance_loss_mlp": 1.07528532, + "epoch": 0.496921893035783, + "flos": 475530192384.0, + "grad_norm": 0.034235181262718475, + "language_loss": 0.90576661, + "learning_rate": 0.0005292682366168294, + "loss": 0.91733253, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.81298828, + "step": 2583, + "time_per_iteration": 2.5291895866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158052, + "balance_loss_mlp": 1.07694244, + "epoch": 0.4971142747210466, + "flos": 598602895872.0, + "grad_norm": 0.029240794220739816, + "language_loss": 0.86485231, + "learning_rate": 0.0005289572228563181, + "loss": 0.8764329, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.81103516, + "step": 2584, + "time_per_iteration": 2.777571678161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159994, + "balance_loss_mlp": 1.0788368, + "epoch": 0.4973066564063101, + "flos": 600734653440.0, + "grad_norm": 0.030481884249605188, + "language_loss": 0.889974, + "learning_rate": 0.000528646197853616, + "loss": 0.90157396, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.81152344, + "step": 2585, + "time_per_iteration": 2.767935276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.08162796, + "epoch": 0.4974990380915737, + "flos": 650768919552.0, + "grad_norm": 0.027212373173769577, + "language_loss": 0.90572929, + "learning_rate": 0.0005283351617294735, + "loss": 0.91735625, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.81054688, + "step": 2586, + "time_per_iteration": 2.890571117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167969, + "balance_loss_mlp": 1.08862305, + "epoch": 0.49769141977683723, + "flos": 1532440032768.0, + "grad_norm": 0.00993779830792852, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77804637, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.79296875, + "step": 2587, + "time_per_iteration": 4.995927095413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116898, + "balance_loss_mlp": 1.08791721, + "epoch": 0.4978838014621008, + "flos": 537397550592.0, + "grad_norm": 0.03215658272946184, + "language_loss": 0.92911154, + "learning_rate": 0.0005277130565998916, + "loss": 0.94080132, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.81054688, + "step": 2588, + "time_per_iteration": 2.717165946960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162431, + "balance_loss_mlp": 1.08122599, + "epoch": 0.49807618314736435, + "flos": 540745271808.0, + "grad_norm": 0.02720148099542, + "language_loss": 0.86777204, + "learning_rate": 0.0005274019878359748, + "loss": 0.87939632, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.81201172, + "step": 2589, + "time_per_iteration": 2.71560001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162135, + "balance_loss_mlp": 1.08088183, + "epoch": 0.49826856483262794, + "flos": 543521577984.0, + "grad_norm": 0.03624054616449923, + "language_loss": 0.92995536, + "learning_rate": 0.0005270909084336628, + "loss": 0.94157672, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.8125, + "step": 2590, + "time_per_iteration": 2.6439368724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165371, + "balance_loss_mlp": 1.08435619, + "epoch": 0.4984609465178915, + "flos": 523360842240.0, + "grad_norm": 0.02994333023587166, + "language_loss": 0.94466031, + "learning_rate": 0.0005267798185137276, + "loss": 0.95631397, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.81005859, + "step": 2591, + "time_per_iteration": 2.6229867935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159677, + "balance_loss_mlp": 1.07851899, + "epoch": 0.49865332820315506, + "flos": 575704785408.0, + "grad_norm": 0.030323117469882623, + "language_loss": 0.94773531, + "learning_rate": 0.0005264687181969444, + "loss": 0.95933211, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.81152344, + "step": 2592, + "time_per_iteration": 2.7226686477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164286, + "balance_loss_mlp": 1.08303344, + "epoch": 0.49884570988841864, + "flos": 1015210497024.0, + "grad_norm": 0.0376584975450282, + "language_loss": 0.82159829, + "learning_rate": 0.0005261576076040937, + "loss": 0.83324111, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.8125, + "step": 2593, + "time_per_iteration": 3.2477946281433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169307, + "balance_loss_mlp": 1.08843529, + "epoch": 0.4990380915736822, + "flos": 560647497216.0, + "grad_norm": 0.03227625840551658, + "language_loss": 0.90092522, + "learning_rate": 0.0005258464868559591, + "loss": 0.91261828, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.80859375, + "step": 2594, + "time_per_iteration": 2.650367259979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167043, + "balance_loss_mlp": 1.08588493, + "epoch": 0.49923047325894576, + "flos": 499943709696.0, + "grad_norm": 0.030210069947970843, + "language_loss": 0.94528484, + "learning_rate": 0.0005255353560733284, + "loss": 0.95695531, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.81152344, + "step": 2595, + "time_per_iteration": 2.6242079734802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174149, + "balance_loss_mlp": 1.09518433, + "epoch": 0.4994228549442093, + "flos": 1499788194816.0, + "grad_norm": 0.015118012466641684, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76752794, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.7890625, + "step": 2596, + "time_per_iteration": 4.820875883102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116628, + "balance_loss_mlp": 1.08521724, + "epoch": 0.4996152366294729, + "flos": 558513738240.0, + "grad_norm": 0.031441861478263874, + "language_loss": 0.89123356, + "learning_rate": 0.0005249130648877492, + "loss": 0.9028964, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.81054688, + "step": 2597, + "time_per_iteration": 2.71932053565979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158102, + "balance_loss_mlp": 1.07699203, + "epoch": 0.4998076183147364, + "flos": 416482801152.0, + "grad_norm": 0.03314289919132309, + "language_loss": 0.90550959, + "learning_rate": 0.0005246019047263953, + "loss": 0.91709059, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.81103516, + "step": 2598, + "time_per_iteration": 2.4899134635925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158453, + "balance_loss_mlp": 1.07739091, + "epoch": 0.5, + "flos": 468325186560.0, + "grad_norm": 0.03341299307449988, + "language_loss": 0.88387024, + "learning_rate": 0.0005242907350137353, + "loss": 0.89545476, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.81054688, + "step": 2599, + "time_per_iteration": 2.553997039794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164809, + "balance_loss_mlp": 1.08369899, + "epoch": 0.5001923816852636, + "flos": 483755778048.0, + "grad_norm": 0.03321709561705903, + "language_loss": 0.85543942, + "learning_rate": 0.0005239795558705754, + "loss": 0.86708754, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.81103516, + "step": 2600, + "time_per_iteration": 2.6166868209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164506, + "balance_loss_mlp": 1.08339632, + "epoch": 0.5003847633705272, + "flos": 534855559680.0, + "grad_norm": 0.030012173683065246, + "language_loss": 0.95093107, + "learning_rate": 0.0005236683674177264, + "loss": 0.96257615, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.81103516, + "step": 2601, + "time_per_iteration": 2.6404433250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162684, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5005771450557907, + "flos": 739055299584.0, + "grad_norm": 0.032030290781944436, + "language_loss": 0.88311857, + "learning_rate": 0.0005233571697760021, + "loss": 0.89474535, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.81103516, + "step": 2602, + "time_per_iteration": 2.8534095287323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160577, + "balance_loss_mlp": 1.07937133, + "epoch": 0.5007695267410542, + "flos": 780306026496.0, + "grad_norm": 0.036141348793487994, + "language_loss": 0.90016913, + "learning_rate": 0.0005230459630662203, + "loss": 0.91177493, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.81201172, + "step": 2603, + "time_per_iteration": 2.952563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162299, + "balance_loss_mlp": 1.0812366, + "epoch": 0.5009619084263178, + "flos": 624618415104.0, + "grad_norm": 0.03600647163377571, + "language_loss": 0.88813984, + "learning_rate": 0.0005227347474092022, + "loss": 0.89976281, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.81054688, + "step": 2604, + "time_per_iteration": 2.70975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166549, + "balance_loss_mlp": 1.08543897, + "epoch": 0.5011542901115814, + "flos": 532192045056.0, + "grad_norm": 0.023202845192485378, + "language_loss": 0.88172328, + "learning_rate": 0.0005224235229257724, + "loss": 0.89338881, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.81103516, + "step": 2605, + "time_per_iteration": 2.6811788082122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165049, + "balance_loss_mlp": 1.08393872, + "epoch": 0.5013466717968449, + "flos": 528627472896.0, + "grad_norm": 0.02710312658737552, + "language_loss": 0.91735983, + "learning_rate": 0.0005221122897367589, + "loss": 0.92901027, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.81103516, + "step": 2606, + "time_per_iteration": 2.7866344451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115755, + "balance_loss_mlp": 1.07644022, + "epoch": 0.5015390534821085, + "flos": 567088432128.0, + "grad_norm": 0.035852557706828735, + "language_loss": 0.88253903, + "learning_rate": 0.0005218010479629932, + "loss": 0.89411449, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.81103516, + "step": 2607, + "time_per_iteration": 2.7290749549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157153, + "balance_loss_mlp": 1.07594728, + "epoch": 0.5017314351673721, + "flos": 567767909376.0, + "grad_norm": 0.03266328125205783, + "language_loss": 0.88539654, + "learning_rate": 0.0005214897977253102, + "loss": 0.89696807, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.81201172, + "step": 2608, + "time_per_iteration": 2.695686101913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158723, + "balance_loss_mlp": 1.07751739, + "epoch": 0.5019238168526357, + "flos": 523387038720.0, + "grad_norm": 0.02584859781626205, + "language_loss": 0.88962579, + "learning_rate": 0.0005211785391445473, + "loss": 0.90121305, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.81201172, + "step": 2609, + "time_per_iteration": 2.7320780754089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157806, + "balance_loss_mlp": 1.07674336, + "epoch": 0.5021161985378992, + "flos": 642636659712.0, + "grad_norm": 0.03213074952610081, + "language_loss": 0.85809815, + "learning_rate": 0.0005208672723415467, + "loss": 0.86967611, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.81054688, + "step": 2610, + "time_per_iteration": 2.8137152194976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115836, + "balance_loss_mlp": 1.07729781, + "epoch": 0.5023085802231627, + "flos": 592422472704.0, + "grad_norm": 0.03276582898634011, + "language_loss": 0.85898113, + "learning_rate": 0.0005205559974371525, + "loss": 0.8705647, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.81054688, + "step": 2611, + "time_per_iteration": 2.7611584663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158271, + "balance_loss_mlp": 1.07720828, + "epoch": 0.5025009619084263, + "flos": 473333306880.0, + "grad_norm": 0.02842666355233711, + "language_loss": 0.86990851, + "learning_rate": 0.0005202447145522123, + "loss": 0.88149118, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.81054688, + "step": 2612, + "time_per_iteration": 2.6646487712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161741, + "balance_loss_mlp": 1.08067882, + "epoch": 0.5026933435936899, + "flos": 456077131776.0, + "grad_norm": 0.031223796902704184, + "language_loss": 0.84174728, + "learning_rate": 0.0005199334238075769, + "loss": 0.85336471, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.81054688, + "step": 2613, + "time_per_iteration": 2.567990779876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163025, + "balance_loss_mlp": 1.08229649, + "epoch": 0.5028857252789535, + "flos": 492721239552.0, + "grad_norm": 0.02841040015147714, + "language_loss": 0.97840261, + "learning_rate": 0.0005196221253241, + "loss": 0.99003285, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.80712891, + "step": 2614, + "time_per_iteration": 2.5584659576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160421, + "balance_loss_mlp": 1.07988286, + "epoch": 0.503078106964217, + "flos": 626730706944.0, + "grad_norm": 0.03241817920698289, + "language_loss": 0.88891315, + "learning_rate": 0.0005193108192226383, + "loss": 0.90051734, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.80517578, + "step": 2615, + "time_per_iteration": 2.7840871810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164078, + "balance_loss_mlp": 1.0830152, + "epoch": 0.5032704886494805, + "flos": 580137487872.0, + "grad_norm": 0.02867464613296787, + "language_loss": 0.91759968, + "learning_rate": 0.000518999505624052, + "loss": 0.92924047, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.81054688, + "step": 2616, + "time_per_iteration": 2.6807193756103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161331, + "balance_loss_mlp": 1.08017337, + "epoch": 0.5034628703347441, + "flos": 472845210624.0, + "grad_norm": 0.027070743385767714, + "language_loss": 0.8816672, + "learning_rate": 0.000518688184649203, + "loss": 0.89328051, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.81152344, + "step": 2617, + "time_per_iteration": 2.7943994998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159886, + "balance_loss_mlp": 1.07877576, + "epoch": 0.5036552520200077, + "flos": 490813063680.0, + "grad_norm": 0.03074056287258418, + "language_loss": 0.88926733, + "learning_rate": 0.0005183768564189577, + "loss": 0.90086615, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.81103516, + "step": 2618, + "time_per_iteration": 2.549255609512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_mlp": 1.07860434, + "epoch": 0.5038476337052713, + "flos": 495215566848.0, + "grad_norm": 0.030783318052010424, + "language_loss": 0.87459326, + "learning_rate": 0.0005180655210541838, + "loss": 0.88619089, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.81152344, + "step": 2619, + "time_per_iteration": 2.5555741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157127, + "balance_loss_mlp": 1.0759213, + "epoch": 0.5040400153905348, + "flos": 601739770368.0, + "grad_norm": 0.036447475930772646, + "language_loss": 0.89893603, + "learning_rate": 0.0005177541786757527, + "loss": 0.91050732, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.81201172, + "step": 2620, + "time_per_iteration": 2.75068998336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157658, + "balance_loss_mlp": 1.07621455, + "epoch": 0.5042323970757984, + "flos": 812918932992.0, + "grad_norm": 0.03476449221513998, + "language_loss": 0.90274507, + "learning_rate": 0.000517442829404538, + "loss": 0.91432166, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.81445312, + "step": 2621, + "time_per_iteration": 2.981661558151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_mlp": 1.07854116, + "epoch": 0.504424778761062, + "flos": 628606682112.0, + "grad_norm": 0.030074963346690586, + "language_loss": 0.92839754, + "learning_rate": 0.0005171314733614166, + "loss": 0.93999791, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.81494141, + "step": 2622, + "time_per_iteration": 2.942354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160933, + "balance_loss_mlp": 1.07934618, + "epoch": 0.5046171604463255, + "flos": 516956837376.0, + "grad_norm": 0.029806335990833818, + "language_loss": 0.84097135, + "learning_rate": 0.0005168201106672671, + "loss": 0.85258067, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.81591797, + "step": 2623, + "time_per_iteration": 2.7703733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160048, + "balance_loss_mlp": 1.07841325, + "epoch": 0.504809542131589, + "flos": 528853056000.0, + "grad_norm": 0.03248441490058616, + "language_loss": 0.91679412, + "learning_rate": 0.0005165087414429717, + "loss": 0.92839456, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.81640625, + "step": 2624, + "time_per_iteration": 2.620872974395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116106, + "balance_loss_mlp": 1.07937741, + "epoch": 0.5050019238168526, + "flos": 555174749184.0, + "grad_norm": 0.03119977790816051, + "language_loss": 0.88980711, + "learning_rate": 0.0005161973658094144, + "loss": 0.90141767, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.81689453, + "step": 2625, + "time_per_iteration": 2.640408754348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161049, + "balance_loss_mlp": 1.07955778, + "epoch": 0.5051943055021162, + "flos": 575928367104.0, + "grad_norm": 0.024986408688213266, + "language_loss": 0.88551366, + "learning_rate": 0.000515885983887482, + "loss": 0.89712417, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.81494141, + "step": 2626, + "time_per_iteration": 2.7737276554107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161913, + "balance_loss_mlp": 1.08066046, + "epoch": 0.5053866871873798, + "flos": 497681696256.0, + "grad_norm": 0.03126501141119064, + "language_loss": 0.91551393, + "learning_rate": 0.0005155745957980636, + "loss": 0.92713308, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.8125, + "step": 2627, + "time_per_iteration": 2.5588245391845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159267, + "balance_loss_mlp": 1.07801354, + "epoch": 0.5055790688726434, + "flos": 503219572224.0, + "grad_norm": 0.028407663328603422, + "language_loss": 0.94095421, + "learning_rate": 0.000515263201662051, + "loss": 0.95254695, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.8125, + "step": 2628, + "time_per_iteration": 2.6333348751068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115977, + "balance_loss_mlp": 1.07851708, + "epoch": 0.5057714505579068, + "flos": 846767268864.0, + "grad_norm": 0.025627158908879104, + "language_loss": 0.8802768, + "learning_rate": 0.0005149518016003378, + "loss": 0.89187449, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.8125, + "step": 2629, + "time_per_iteration": 3.159515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115843, + "balance_loss_mlp": 1.07722509, + "epoch": 0.5059638322431704, + "flos": 498808336896.0, + "grad_norm": 0.032654832965012745, + "language_loss": 0.88445461, + "learning_rate": 0.0005146403957338206, + "loss": 0.89603889, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.81201172, + "step": 2630, + "time_per_iteration": 2.569671154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166774, + "balance_loss_mlp": 1.08571208, + "epoch": 0.506156213928434, + "flos": 619113466368.0, + "grad_norm": 0.027165343024338446, + "language_loss": 0.86742038, + "learning_rate": 0.0005143289841833975, + "loss": 0.8790881, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.81054688, + "step": 2631, + "time_per_iteration": 2.8505327701568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169752, + "balance_loss_mlp": 1.08911932, + "epoch": 0.5063485956136976, + "flos": 425789365248.0, + "grad_norm": 0.03495904047465476, + "language_loss": 0.89354646, + "learning_rate": 0.0005140175670699696, + "loss": 0.90524399, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.80615234, + "step": 2632, + "time_per_iteration": 2.5920779705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174002, + "balance_loss_mlp": 1.09341669, + "epoch": 0.5065409772989612, + "flos": 571069968384.0, + "grad_norm": 0.02494402323857881, + "language_loss": 0.86924809, + "learning_rate": 0.0005137061445144395, + "loss": 0.88098812, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.80566406, + "step": 2633, + "time_per_iteration": 2.8890433311462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172992, + "balance_loss_mlp": 1.09250152, + "epoch": 0.5067333589842247, + "flos": 629969639424.0, + "grad_norm": 0.03395805639170181, + "language_loss": 0.93242514, + "learning_rate": 0.000513394716637712, + "loss": 0.94415504, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.8046875, + "step": 2634, + "time_per_iteration": 2.7772305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171906, + "balance_loss_mlp": 1.09217834, + "epoch": 0.5069257406694883, + "flos": 1451096145408.0, + "grad_norm": 0.011960900894201355, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80363613, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.796875, + "step": 2635, + "time_per_iteration": 4.93586802482605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116392, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5071181223547518, + "flos": 640057738752.0, + "grad_norm": 0.03273720191955115, + "language_loss": 0.86367166, + "learning_rate": 0.0005127718454042958, + "loss": 0.87531078, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.80664062, + "step": 2636, + "time_per_iteration": 2.8407700061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115771, + "balance_loss_mlp": 1.07683849, + "epoch": 0.5073105040400154, + "flos": 714872094720.0, + "grad_norm": 0.03167408399625075, + "language_loss": 0.89809334, + "learning_rate": 0.0005124604022894269, + "loss": 0.90967047, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.80859375, + "step": 2637, + "time_per_iteration": 2.9438648223876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011651, + "balance_loss_mlp": 1.08575439, + "epoch": 0.5075028857252789, + "flos": 1439612161536.0, + "grad_norm": 0.009234713476178756, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78353328, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.79296875, + "step": 2638, + "time_per_iteration": 4.855467319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170042, + "balance_loss_mlp": 1.08950412, + "epoch": 0.5076952674105425, + "flos": 572307399168.0, + "grad_norm": 0.033371281415520225, + "language_loss": 0.89923447, + "learning_rate": 0.0005118375016679325, + "loss": 0.91093493, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.80517578, + "step": 2639, + "time_per_iteration": 2.7761123180389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168126, + "balance_loss_mlp": 1.08735013, + "epoch": 0.5078876490958061, + "flos": 517712176128.0, + "grad_norm": 0.04218063889538898, + "language_loss": 0.87796986, + "learning_rate": 0.0005115260444031382, + "loss": 0.88965112, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.80761719, + "step": 2640, + "time_per_iteration": 2.5914742946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164741, + "balance_loss_mlp": 1.08596802, + "epoch": 0.5080800307810697, + "flos": 1587619405824.0, + "grad_norm": 0.012463066852979446, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79896557, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.78710938, + "step": 2641, + "time_per_iteration": 4.9428391456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164884, + "balance_loss_mlp": 1.08420289, + "epoch": 0.5082724124663333, + "flos": 486186978816.0, + "grad_norm": 0.039006057605032056, + "language_loss": 0.93060952, + "learning_rate": 0.0005109031165700483, + "loss": 0.94225836, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.80664062, + "step": 2642, + "time_per_iteration": 2.5630409717559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164249, + "balance_loss_mlp": 1.08318675, + "epoch": 0.5084647941515967, + "flos": 683442224640.0, + "grad_norm": 0.03324563219825503, + "language_loss": 0.88873887, + "learning_rate": 0.0005105916462435945, + "loss": 0.90038145, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.81054688, + "step": 2643, + "time_per_iteration": 2.8135592937469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165445, + "balance_loss_mlp": 1.08438289, + "epoch": 0.5086571758368603, + "flos": 549812791296.0, + "grad_norm": 0.031221131167697595, + "language_loss": 0.92092431, + "learning_rate": 0.0005102801718050989, + "loss": 0.93257874, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.81054688, + "step": 2644, + "time_per_iteration": 2.684957981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.08413339, + "epoch": 0.5088495575221239, + "flos": 565078198272.0, + "grad_norm": 0.032204925975490975, + "language_loss": 0.95189679, + "learning_rate": 0.0005099686933754867, + "loss": 0.96354735, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.80908203, + "step": 2645, + "time_per_iteration": 2.6721112728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167263, + "balance_loss_mlp": 1.08620095, + "epoch": 0.5090419392073875, + "flos": 552511234560.0, + "grad_norm": 0.03332524240735616, + "language_loss": 0.90223062, + "learning_rate": 0.0005096572110756845, + "loss": 0.9139033, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.81054688, + "step": 2646, + "time_per_iteration": 2.6559739112854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.08686149, + "epoch": 0.509234320892651, + "flos": 568883816448.0, + "grad_norm": 0.029529111031728714, + "language_loss": 0.90596855, + "learning_rate": 0.0005093457250266205, + "loss": 0.91764688, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.80957031, + "step": 2647, + "time_per_iteration": 2.7653987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167889, + "balance_loss_mlp": 1.08673143, + "epoch": 0.5094267025779146, + "flos": 583693327872.0, + "grad_norm": 0.03457257756125772, + "language_loss": 0.89727396, + "learning_rate": 0.000509034235349224, + "loss": 0.90895277, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.81152344, + "step": 2648, + "time_per_iteration": 2.690363645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159753, + "balance_loss_mlp": 1.07854819, + "epoch": 0.5096190842631781, + "flos": 593138880000.0, + "grad_norm": 0.0341546457293008, + "language_loss": 0.88255095, + "learning_rate": 0.0005087227421644266, + "loss": 0.89414853, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.81201172, + "step": 2649, + "time_per_iteration": 2.6982481479644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.07891166, + "epoch": 0.5098114659484417, + "flos": 514584033792.0, + "grad_norm": 0.030485361797949893, + "language_loss": 0.92298341, + "learning_rate": 0.0005084112455931602, + "loss": 0.93458325, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.81054688, + "step": 2650, + "time_per_iteration": 2.5739448070526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162291, + "balance_loss_mlp": 1.08170521, + "epoch": 0.5100038476337053, + "flos": 485600827392.0, + "grad_norm": 0.03052985498468287, + "language_loss": 0.91529775, + "learning_rate": 0.0005080997457563586, + "loss": 0.92692065, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.80566406, + "step": 2651, + "time_per_iteration": 2.5381717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165638, + "balance_loss_mlp": 1.08514845, + "epoch": 0.5101962293189688, + "flos": 462554996736.0, + "grad_norm": 0.037278277228963375, + "language_loss": 0.86181092, + "learning_rate": 0.0005077882427749569, + "loss": 0.87346727, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.8046875, + "step": 2652, + "time_per_iteration": 2.490943670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158092, + "balance_loss_mlp": 1.07745898, + "epoch": 0.5103886110042324, + "flos": 588132761088.0, + "grad_norm": 0.03182463194953253, + "language_loss": 0.91334021, + "learning_rate": 0.0005074767367698913, + "loss": 0.9249211, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.80615234, + "step": 2653, + "time_per_iteration": 2.6900839805603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.07847476, + "epoch": 0.510580992689496, + "flos": 846677945856.0, + "grad_norm": 0.027057922805634398, + "language_loss": 0.89024949, + "learning_rate": 0.0005071652278620988, + "loss": 0.90184009, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.80566406, + "step": 2654, + "time_per_iteration": 3.044296979904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115973, + "balance_loss_mlp": 1.07919204, + "epoch": 0.5107733743747596, + "flos": 659810242560.0, + "grad_norm": 0.0315385737613105, + "language_loss": 0.89305294, + "learning_rate": 0.0005068537161725186, + "loss": 0.90465021, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.80517578, + "step": 2655, + "time_per_iteration": 2.770669937133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160641, + "balance_loss_mlp": 1.08000755, + "epoch": 0.510965756060023, + "flos": 702960413184.0, + "grad_norm": 0.03531630249392906, + "language_loss": 0.91070223, + "learning_rate": 0.0005065422018220893, + "loss": 0.92230862, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.80615234, + "step": 2656, + "time_per_iteration": 2.833031177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165495, + "balance_loss_mlp": 1.08490956, + "epoch": 0.5111581377452866, + "flos": 560940936192.0, + "grad_norm": 0.03615724120857576, + "language_loss": 0.85921729, + "learning_rate": 0.0005062306849317521, + "loss": 0.87087226, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.80566406, + "step": 2657, + "time_per_iteration": 2.800971031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.0873909, + "epoch": 0.5113505194305502, + "flos": 610145276928.0, + "grad_norm": 0.029932060678028026, + "language_loss": 0.88435352, + "learning_rate": 0.0005059191656224487, + "loss": 0.89603281, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.80517578, + "step": 2658, + "time_per_iteration": 2.7075443267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159414, + "balance_loss_mlp": 1.07882822, + "epoch": 0.5115429011158138, + "flos": 535535036928.0, + "grad_norm": 0.028231439832000826, + "language_loss": 0.94975483, + "learning_rate": 0.0005056076440151212, + "loss": 0.96134901, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.80566406, + "step": 2659, + "time_per_iteration": 2.6906392574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162323, + "balance_loss_mlp": 1.0835495, + "epoch": 0.5117352828010774, + "flos": 1365273166848.0, + "grad_norm": 0.00971890017277948, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77450442, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.78515625, + "step": 2660, + "time_per_iteration": 4.880187273025513 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160968, + "balance_loss_mlp": 1.07990551, + "epoch": 0.5119276644863409, + "flos": 634930096128.0, + "grad_norm": 0.027317751888226913, + "language_loss": 0.91815728, + "learning_rate": 0.0005049845943901691, + "loss": 0.92976695, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.81054688, + "step": 2661, + "time_per_iteration": 2.8184986114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160969, + "balance_loss_mlp": 1.08004987, + "epoch": 0.5121200461716044, + "flos": 586780537344.0, + "grad_norm": 0.02944382500923868, + "language_loss": 0.91654462, + "learning_rate": 0.0005046730666144338, + "loss": 0.92815423, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.80908203, + "step": 2662, + "time_per_iteration": 2.755974769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.0798161, + "epoch": 0.512312427856868, + "flos": 1034223124992.0, + "grad_norm": 0.029507171441845153, + "language_loss": 0.93013144, + "learning_rate": 0.0005043615370244532, + "loss": 0.94174021, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.81054688, + "step": 2663, + "time_per_iteration": 3.3488211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177849, + "balance_loss_mlp": 1.09907532, + "epoch": 0.5125048095421316, + "flos": 1540899207168.0, + "grad_norm": 0.013662934984579522, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79422235, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.78710938, + "step": 2664, + "time_per_iteration": 4.6237993240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162223, + "balance_loss_mlp": 1.08130419, + "epoch": 0.5126971912273951, + "flos": 592327145472.0, + "grad_norm": 0.024418914459260154, + "language_loss": 0.89686567, + "learning_rate": 0.0005037384728855425, + "loss": 0.90848792, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.80908203, + "step": 2665, + "time_per_iteration": 2.8003761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163774, + "balance_loss_mlp": 1.08299828, + "epoch": 0.5128895729126587, + "flos": 552717351936.0, + "grad_norm": 0.03867267783646357, + "language_loss": 0.9114759, + "learning_rate": 0.0005034269385785075, + "loss": 0.9231137, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.80761719, + "step": 2666, + "time_per_iteration": 2.664607286453247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161159, + "balance_loss_mlp": 1.08047831, + "epoch": 0.5130819545979223, + "flos": 482231639040.0, + "grad_norm": 0.037339426134761385, + "language_loss": 0.92204285, + "learning_rate": 0.0005031154029410168, + "loss": 0.93365449, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.80664062, + "step": 2667, + "time_per_iteration": 2.5419206619262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157708, + "balance_loss_mlp": 1.0769316, + "epoch": 0.5132743362831859, + "flos": 476767623168.0, + "grad_norm": 0.03576788906651519, + "language_loss": 0.93073893, + "learning_rate": 0.0005028038660940197, + "loss": 0.942316, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.80761719, + "step": 2668, + "time_per_iteration": 2.5499191284179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166102, + "balance_loss_mlp": 1.08542132, + "epoch": 0.5134667179684494, + "flos": 504902164992.0, + "grad_norm": 0.02981054719592371, + "language_loss": 0.89144588, + "learning_rate": 0.0005024923281584648, + "loss": 0.90310693, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.80664062, + "step": 2669, + "time_per_iteration": 2.6367011070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165888, + "balance_loss_mlp": 1.08496881, + "epoch": 0.5136590996537129, + "flos": 505004222976.0, + "grad_norm": 0.029270286325536108, + "language_loss": 0.87695622, + "learning_rate": 0.0005021807892553026, + "loss": 0.88861501, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.80908203, + "step": 2670, + "time_per_iteration": 2.697326421737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165807, + "balance_loss_mlp": 1.08522201, + "epoch": 0.5138514813389765, + "flos": 625799450112.0, + "grad_norm": 0.029434336289691197, + "language_loss": 0.8977018, + "learning_rate": 0.0005018692495054828, + "loss": 0.90935987, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.80566406, + "step": 2671, + "time_per_iteration": 2.848576784133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154912, + "balance_loss_mlp": 1.07394516, + "epoch": 0.5140438630242401, + "flos": 584633316864.0, + "grad_norm": 0.027486728027613972, + "language_loss": 0.85466325, + "learning_rate": 0.0005015577090299561, + "loss": 0.86621237, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.80957031, + "step": 2672, + "time_per_iteration": 2.698976993560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155424, + "balance_loss_mlp": 1.0744096, + "epoch": 0.5142362447095037, + "flos": 488904887808.0, + "grad_norm": 0.030629892529963922, + "language_loss": 0.92615306, + "learning_rate": 0.0005012461679496729, + "loss": 0.9377073, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.81005859, + "step": 2673, + "time_per_iteration": 2.5998294353485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115671, + "balance_loss_mlp": 1.07564759, + "epoch": 0.5144286263947672, + "flos": 527884869120.0, + "grad_norm": 0.029257555563523763, + "language_loss": 0.93652987, + "learning_rate": 0.0005009346263855848, + "loss": 0.94809699, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.81054688, + "step": 2674, + "time_per_iteration": 2.702364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156546, + "balance_loss_mlp": 1.07548332, + "epoch": 0.5146210080800308, + "flos": 487589594112.0, + "grad_norm": 0.025826040346785265, + "language_loss": 0.88576883, + "learning_rate": 0.0005006230844586422, + "loss": 0.89733428, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.81054688, + "step": 2675, + "time_per_iteration": 2.7889058589935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159564, + "balance_loss_mlp": 1.07845449, + "epoch": 0.5148133897652943, + "flos": 516974301696.0, + "grad_norm": 0.025127862595781116, + "language_loss": 0.83195055, + "learning_rate": 0.0005003115422897968, + "loss": 0.84354615, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.81103516, + "step": 2676, + "time_per_iteration": 2.7474374771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165139, + "balance_loss_mlp": 1.08436286, + "epoch": 0.5150057714505579, + "flos": 512211230208.0, + "grad_norm": 0.02805317572608274, + "language_loss": 0.92311704, + "learning_rate": 0.0005, + "loss": 0.93476844, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.80761719, + "step": 2677, + "time_per_iteration": 2.635801076889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167536, + "balance_loss_mlp": 1.08652139, + "epoch": 0.5151981531358215, + "flos": 912389853696.0, + "grad_norm": 0.03671017270530106, + "language_loss": 0.86270726, + "learning_rate": 0.0004996884577102033, + "loss": 0.87438262, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.81005859, + "step": 2678, + "time_per_iteration": 3.1016898155212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116356, + "balance_loss_mlp": 1.08264065, + "epoch": 0.515390534821085, + "flos": 472929804288.0, + "grad_norm": 0.02746999857609634, + "language_loss": 0.90178144, + "learning_rate": 0.000499376915541358, + "loss": 0.91341698, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.80908203, + "step": 2679, + "time_per_iteration": 2.7041540145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163327, + "balance_loss_mlp": 1.0826937, + "epoch": 0.5155829165063486, + "flos": 651357072384.0, + "grad_norm": 0.02786171231522906, + "language_loss": 0.85589147, + "learning_rate": 0.0004990653736144155, + "loss": 0.86752468, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.80615234, + "step": 2680, + "time_per_iteration": 2.883392572402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163916, + "balance_loss_mlp": 1.08280623, + "epoch": 0.5157752981916122, + "flos": 415160776704.0, + "grad_norm": 0.030701546031170052, + "language_loss": 0.92331398, + "learning_rate": 0.0004987538320503271, + "loss": 0.93495315, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.81103516, + "step": 2681, + "time_per_iteration": 2.4719676971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169918, + "balance_loss_mlp": 1.0890938, + "epoch": 0.5159676798768758, + "flos": 554931701760.0, + "grad_norm": 0.03041903817165714, + "language_loss": 0.89793313, + "learning_rate": 0.0004984422909700442, + "loss": 0.90963233, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.80810547, + "step": 2682, + "time_per_iteration": 2.7486019134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168893, + "balance_loss_mlp": 1.08816493, + "epoch": 0.5161600615621393, + "flos": 587620469760.0, + "grad_norm": 0.02833679783776788, + "language_loss": 0.89197505, + "learning_rate": 0.0004981307504945173, + "loss": 0.90366399, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.80712891, + "step": 2683, + "time_per_iteration": 2.6918153762817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161385, + "balance_loss_mlp": 1.08060837, + "epoch": 0.5163524432474028, + "flos": 589947611136.0, + "grad_norm": 0.03153559446680845, + "language_loss": 0.9527353, + "learning_rate": 0.0004978192107446976, + "loss": 0.96434915, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.80761719, + "step": 2684, + "time_per_iteration": 2.7622218132019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159701, + "balance_loss_mlp": 1.07906806, + "epoch": 0.5165448249326664, + "flos": 504904166400.0, + "grad_norm": 0.029863924033148703, + "language_loss": 0.92634213, + "learning_rate": 0.0004975076718415353, + "loss": 0.93793911, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.80615234, + "step": 2685, + "time_per_iteration": 2.644228219985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172411, + "balance_loss_mlp": 1.09220684, + "epoch": 0.51673720661793, + "flos": 417646371840.0, + "grad_norm": 0.031084732221220036, + "language_loss": 0.95470178, + "learning_rate": 0.0004971961339059806, + "loss": 0.96642584, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.80175781, + "step": 2686, + "time_per_iteration": 2.469081401824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160009, + "balance_loss_mlp": 1.0795666, + "epoch": 0.5169295883031936, + "flos": 600074641920.0, + "grad_norm": 0.03147701291149863, + "language_loss": 0.89665824, + "learning_rate": 0.0004968845970589832, + "loss": 0.90825832, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.80419922, + "step": 2687, + "time_per_iteration": 2.7054736614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159105, + "balance_loss_mlp": 1.07847178, + "epoch": 0.517121969988457, + "flos": 557910122496.0, + "grad_norm": 0.03772331123991374, + "language_loss": 0.90882772, + "learning_rate": 0.0004965730614214926, + "loss": 0.92041886, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.80615234, + "step": 2688, + "time_per_iteration": 2.6433985233306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159068, + "balance_loss_mlp": 1.0787214, + "epoch": 0.5173143516737206, + "flos": 470374351872.0, + "grad_norm": 0.031353493154565384, + "language_loss": 0.9113276, + "learning_rate": 0.0004962615271144576, + "loss": 0.92291832, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.80322266, + "step": 2689, + "time_per_iteration": 2.5081796646118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159566, + "balance_loss_mlp": 1.07912409, + "epoch": 0.5175067333589842, + "flos": 721378157568.0, + "grad_norm": 0.03531118205346665, + "language_loss": 0.88785195, + "learning_rate": 0.0004959499942588264, + "loss": 0.89944768, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.80419922, + "step": 2690, + "time_per_iteration": 2.8977034091949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165977, + "balance_loss_mlp": 1.08682251, + "epoch": 0.5176991150442478, + "flos": 1469341974528.0, + "grad_norm": 0.00940812354228104, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79365999, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.79101562, + "step": 2691, + "time_per_iteration": 4.744166851043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162816, + "balance_loss_mlp": 1.08227849, + "epoch": 0.5178914967295114, + "flos": 613783709184.0, + "grad_norm": 0.0285194405600695, + "language_loss": 0.91181535, + "learning_rate": 0.0004953269333855661, + "loss": 0.92344356, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.80517578, + "step": 2692, + "time_per_iteration": 2.7305634021759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164372, + "balance_loss_mlp": 1.0839293, + "epoch": 0.5180838784147749, + "flos": 501980140032.0, + "grad_norm": 0.03457473418848995, + "language_loss": 0.89626956, + "learning_rate": 0.0004950154056098309, + "loss": 0.90791321, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.80419922, + "step": 2693, + "time_per_iteration": 2.7358009815216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162458, + "balance_loss_mlp": 1.08215868, + "epoch": 0.5182762601000385, + "flos": 690041613312.0, + "grad_norm": 0.03333155233389222, + "language_loss": 0.90543425, + "learning_rate": 0.0004947038797692867, + "loss": 0.91705889, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.80273438, + "step": 2694, + "time_per_iteration": 2.8636367321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178055, + "balance_loss_mlp": 1.09775615, + "epoch": 0.518468641785302, + "flos": 666800398848.0, + "grad_norm": 0.03410817354988479, + "language_loss": 0.8335048, + "learning_rate": 0.0004943923559848789, + "loss": 0.84528536, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.80273438, + "step": 2695, + "time_per_iteration": 2.797072172164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117824, + "balance_loss_mlp": 1.09794104, + "epoch": 0.5186610234705656, + "flos": 567813571584.0, + "grad_norm": 0.02729227458516312, + "language_loss": 0.95474803, + "learning_rate": 0.0004940808343775515, + "loss": 0.96653044, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.80273438, + "step": 2696, + "time_per_iteration": 2.6839044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162534, + "balance_loss_mlp": 1.08204436, + "epoch": 0.5188534051558291, + "flos": 429792368640.0, + "grad_norm": 0.03355790964159957, + "language_loss": 0.87542081, + "learning_rate": 0.0004937693150682479, + "loss": 0.88704622, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.8046875, + "step": 2697, + "time_per_iteration": 2.5123825073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08045113, + "epoch": 0.5190457868410927, + "flos": 547411789824.0, + "grad_norm": 0.031455242836056954, + "language_loss": 0.81813598, + "learning_rate": 0.0004934577981779107, + "loss": 0.82974923, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.80859375, + "step": 2698, + "time_per_iteration": 2.662545919418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117247, + "balance_loss_mlp": 1.09159839, + "epoch": 0.5192381685263563, + "flos": 549745661952.0, + "grad_norm": 0.02804159255629041, + "language_loss": 0.86178321, + "learning_rate": 0.0004931462838274817, + "loss": 0.87350786, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.80859375, + "step": 2699, + "time_per_iteration": 2.877682685852051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172435, + "balance_loss_mlp": 1.09156311, + "epoch": 0.5194305502116199, + "flos": 576349334016.0, + "grad_norm": 0.03885998177020277, + "language_loss": 0.90400088, + "learning_rate": 0.0004928347721379011, + "loss": 0.91572523, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.80859375, + "step": 2700, + "time_per_iteration": 2.671849489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169932, + "balance_loss_mlp": 1.08906007, + "epoch": 0.5196229318968835, + "flos": 435217453056.0, + "grad_norm": 0.030583901836551724, + "language_loss": 0.87633044, + "learning_rate": 0.0004925232632301089, + "loss": 0.88802975, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.80859375, + "step": 2701, + "time_per_iteration": 2.57857608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166407, + "balance_loss_mlp": 1.08558309, + "epoch": 0.5198153135821469, + "flos": 559985484288.0, + "grad_norm": 0.03187287566803064, + "language_loss": 0.85556304, + "learning_rate": 0.0004922117572250431, + "loss": 0.86722708, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.80810547, + "step": 2702, + "time_per_iteration": 2.7037737369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166773, + "balance_loss_mlp": 1.08618808, + "epoch": 0.5200076952674105, + "flos": 566834651136.0, + "grad_norm": 0.03219739559056917, + "language_loss": 0.8641057, + "learning_rate": 0.0004919002542436414, + "loss": 0.87577343, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.80566406, + "step": 2703, + "time_per_iteration": 2.8919363021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_mlp": 1.08965361, + "epoch": 0.5202000769526741, + "flos": 572272470528.0, + "grad_norm": 0.0327510509858114, + "language_loss": 0.87948251, + "learning_rate": 0.0004915887544068399, + "loss": 0.89118207, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.80273438, + "step": 2704, + "time_per_iteration": 2.6497535705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169089, + "balance_loss_mlp": 1.08869386, + "epoch": 0.5203924586379377, + "flos": 695466697728.0, + "grad_norm": 0.02924473313894461, + "language_loss": 0.83824521, + "learning_rate": 0.0004912772578355736, + "loss": 0.84993607, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.80371094, + "step": 2705, + "time_per_iteration": 2.8862009048461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163429, + "balance_loss_mlp": 1.08274853, + "epoch": 0.5205848403232012, + "flos": 567690046464.0, + "grad_norm": 0.031189936278329552, + "language_loss": 0.88606453, + "learning_rate": 0.000490965764650776, + "loss": 0.89769882, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.80664062, + "step": 2706, + "time_per_iteration": 2.865553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163571, + "balance_loss_mlp": 1.08308065, + "epoch": 0.5207772220084648, + "flos": 1216204231680.0, + "grad_norm": 0.03053180986383906, + "language_loss": 0.8816222, + "learning_rate": 0.0004906542749733798, + "loss": 0.89325786, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.8046875, + "step": 2707, + "time_per_iteration": 3.6396875381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162365, + "balance_loss_mlp": 1.08197033, + "epoch": 0.5209696036937284, + "flos": 594031205376.0, + "grad_norm": 0.027334962594272247, + "language_loss": 0.90568572, + "learning_rate": 0.0004903427889243156, + "loss": 0.91730928, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.80371094, + "step": 2708, + "time_per_iteration": 2.853013753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116169, + "balance_loss_mlp": 1.08129489, + "epoch": 0.5211619853789919, + "flos": 523955725824.0, + "grad_norm": 0.032301377197285666, + "language_loss": 0.91200471, + "learning_rate": 0.0004900313066245134, + "loss": 0.92362165, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.80371094, + "step": 2709, + "time_per_iteration": 2.706407070159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161577, + "balance_loss_mlp": 1.08146846, + "epoch": 0.5213543670642555, + "flos": 503860118016.0, + "grad_norm": 0.02918491733204221, + "language_loss": 0.86683327, + "learning_rate": 0.0004897198281949012, + "loss": 0.87844902, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.80078125, + "step": 2710, + "time_per_iteration": 2.6603598594665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.07654023, + "epoch": 0.521546748749519, + "flos": 587071248384.0, + "grad_norm": 0.0328837537508598, + "language_loss": 0.84538651, + "learning_rate": 0.0004894083537564057, + "loss": 0.85695493, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.80273438, + "step": 2711, + "time_per_iteration": 2.740659236907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159708, + "balance_loss_mlp": 1.07955158, + "epoch": 0.5217391304347826, + "flos": 571265352192.0, + "grad_norm": 0.028894041826031003, + "language_loss": 0.85799223, + "learning_rate": 0.0004890968834299519, + "loss": 0.86958933, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.80126953, + "step": 2712, + "time_per_iteration": 2.7206225395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157432, + "balance_loss_mlp": 1.077371, + "epoch": 0.5219315121200462, + "flos": 543919076352.0, + "grad_norm": 0.029763432747936528, + "language_loss": 0.83741677, + "learning_rate": 0.0004887854173364633, + "loss": 0.84899104, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.80029297, + "step": 2713, + "time_per_iteration": 2.737755060195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160512, + "balance_loss_mlp": 1.08097565, + "epoch": 0.5221238938053098, + "flos": 551530312704.0, + "grad_norm": 0.028214516718367867, + "language_loss": 0.86704654, + "learning_rate": 0.0004884739555968617, + "loss": 0.87865162, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.79492188, + "step": 2714, + "time_per_iteration": 2.872819185256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168823, + "balance_loss_mlp": 1.09100342, + "epoch": 0.5223162754905732, + "flos": 1358389797888.0, + "grad_norm": 0.012476009787944744, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80145878, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.77539062, + "step": 2715, + "time_per_iteration": 4.96741795539856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170398, + "balance_loss_mlp": 1.09028971, + "epoch": 0.5225086571758368, + "flos": 568973139456.0, + "grad_norm": 0.03267804467904664, + "language_loss": 0.92675197, + "learning_rate": 0.0004878510456629992, + "loss": 0.93845594, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.80078125, + "step": 2716, + "time_per_iteration": 2.9626121520996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160651, + "balance_loss_mlp": 1.08054268, + "epoch": 0.5227010388611004, + "flos": 501135478272.0, + "grad_norm": 0.033781088666230946, + "language_loss": 0.9089278, + "learning_rate": 0.00048753959771057314, + "loss": 0.92053425, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.80078125, + "step": 2717, + "time_per_iteration": 2.611691951751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157135, + "balance_loss_mlp": 1.07702601, + "epoch": 0.522893420546364, + "flos": 598798279680.0, + "grad_norm": 0.032963356718883376, + "language_loss": 0.88626194, + "learning_rate": 0.0004872281545957044, + "loss": 0.89783323, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.80078125, + "step": 2718, + "time_per_iteration": 2.7218518257141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116303, + "balance_loss_mlp": 1.08287394, + "epoch": 0.5230858022316276, + "flos": 665921534976.0, + "grad_norm": 0.02884991307967795, + "language_loss": 0.91186881, + "learning_rate": 0.0004869167164393055, + "loss": 0.92349917, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.80126953, + "step": 2719, + "time_per_iteration": 2.932335376739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164195, + "balance_loss_mlp": 1.08403885, + "epoch": 0.5232781839168911, + "flos": 605033097216.0, + "grad_norm": 0.02708280335676697, + "language_loss": 0.94493294, + "learning_rate": 0.00048660528336228793, + "loss": 0.95657486, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.80126953, + "step": 2720, + "time_per_iteration": 2.8030405044555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158564, + "balance_loss_mlp": 1.07840788, + "epoch": 0.5234705656021547, + "flos": 551840489472.0, + "grad_norm": 0.028885887647779437, + "language_loss": 0.95077229, + "learning_rate": 0.0004862938554855606, + "loss": 0.96235794, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.80126953, + "step": 2721, + "time_per_iteration": 2.797297716140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.0790993, + "epoch": 0.5236629472874182, + "flos": 505294934016.0, + "grad_norm": 0.03214550067861962, + "language_loss": 0.91548902, + "learning_rate": 0.0004859824329300304, + "loss": 0.92708111, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.80078125, + "step": 2722, + "time_per_iteration": 2.589529037475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164653, + "balance_loss_mlp": 1.08444893, + "epoch": 0.5238553289726818, + "flos": 548696884224.0, + "grad_norm": 0.029959051591606282, + "language_loss": 0.88512689, + "learning_rate": 0.00048567101581660244, + "loss": 0.89677346, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.80175781, + "step": 2723, + "time_per_iteration": 2.6637237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160999, + "balance_loss_mlp": 1.08065164, + "epoch": 0.5240477106579453, + "flos": 533003779584.0, + "grad_norm": 0.031636293719806106, + "language_loss": 0.92529982, + "learning_rate": 0.00048535960426617956, + "loss": 0.93690991, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.80322266, + "step": 2724, + "time_per_iteration": 2.6061489582061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156405, + "balance_loss_mlp": 1.07620108, + "epoch": 0.5242400923432089, + "flos": 619089271296.0, + "grad_norm": 0.028230181756235023, + "language_loss": 0.87247139, + "learning_rate": 0.0004850481983996621, + "loss": 0.88403541, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.80175781, + "step": 2725, + "time_per_iteration": 2.7699060440063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.07787168, + "epoch": 0.5244324740284725, + "flos": 417589976064.0, + "grad_norm": 0.03201067328997522, + "language_loss": 0.93398654, + "learning_rate": 0.0004847367983379492, + "loss": 0.94556582, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.80029297, + "step": 2726, + "time_per_iteration": 2.521516799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07599604, + "epoch": 0.5246248557137361, + "flos": 627731821056.0, + "grad_norm": 0.028083517097400017, + "language_loss": 0.83866012, + "learning_rate": 0.00048442540420193643, + "loss": 0.8502202, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.79980469, + "step": 2727, + "time_per_iteration": 2.8968660831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155443, + "balance_loss_mlp": 1.07547724, + "epoch": 0.5248172373989997, + "flos": 1250401675776.0, + "grad_norm": 0.032601939018394276, + "language_loss": 0.85122609, + "learning_rate": 0.0004841140161125182, + "loss": 0.86278045, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.79931641, + "step": 2728, + "time_per_iteration": 3.585556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156303, + "balance_loss_mlp": 1.0764327, + "epoch": 0.5250096190842631, + "flos": 507882587136.0, + "grad_norm": 0.02942710549962748, + "language_loss": 0.90605354, + "learning_rate": 0.0004838026341905857, + "loss": 0.91761655, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.79833984, + "step": 2729, + "time_per_iteration": 2.7116506099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157734, + "balance_loss_mlp": 1.07781577, + "epoch": 0.5252020007695267, + "flos": 612507346944.0, + "grad_norm": 0.029260311632026755, + "language_loss": 0.9089191, + "learning_rate": 0.00048349125855702844, + "loss": 0.92049646, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.79882812, + "step": 2730, + "time_per_iteration": 2.772508144378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157575, + "balance_loss_mlp": 1.07780039, + "epoch": 0.5253943824547903, + "flos": 540291377664.0, + "grad_norm": 0.027039643287400304, + "language_loss": 0.86249292, + "learning_rate": 0.00048317988933273287, + "loss": 0.87406862, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.79736328, + "step": 2731, + "time_per_iteration": 2.7501025199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159148, + "balance_loss_mlp": 1.07918203, + "epoch": 0.5255867641400539, + "flos": 699337443840.0, + "grad_norm": 0.030025626211663315, + "language_loss": 0.87967253, + "learning_rate": 0.00048286852663858367, + "loss": 0.89126396, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.79931641, + "step": 2732, + "time_per_iteration": 2.9441256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156016, + "balance_loss_mlp": 1.07604992, + "epoch": 0.5257791458253175, + "flos": 668548119552.0, + "grad_norm": 0.03127119397180798, + "language_loss": 0.89405584, + "learning_rate": 0.000482557170595462, + "loss": 0.90561604, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.79931641, + "step": 2733, + "time_per_iteration": 2.875559091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158055, + "balance_loss_mlp": 1.07813704, + "epoch": 0.525971527510581, + "flos": 484604442624.0, + "grad_norm": 0.02914442262172993, + "language_loss": 0.93156296, + "learning_rate": 0.0004822458213242475, + "loss": 0.94314349, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.79882812, + "step": 2734, + "time_per_iteration": 2.5386509895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157288, + "balance_loss_mlp": 1.07737029, + "epoch": 0.5261639091958445, + "flos": 831347410944.0, + "grad_norm": 0.025020932409653307, + "language_loss": 0.90545583, + "learning_rate": 0.00048193447894581627, + "loss": 0.91702867, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.79882812, + "step": 2735, + "time_per_iteration": 3.087679862976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07853508, + "epoch": 0.5263562908811081, + "flos": 521732643840.0, + "grad_norm": 0.03948252554958876, + "language_loss": 0.93270254, + "learning_rate": 0.00048162314358104243, + "loss": 0.94428849, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.80029297, + "step": 2736, + "time_per_iteration": 2.601278305053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156345, + "balance_loss_mlp": 1.07633209, + "epoch": 0.5265486725663717, + "flos": 576097554432.0, + "grad_norm": 0.032044906976615765, + "language_loss": 0.89525604, + "learning_rate": 0.0004813118153507969, + "loss": 0.90681952, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.79980469, + "step": 2737, + "time_per_iteration": 2.7360177040100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160461, + "balance_loss_mlp": 1.0820694, + "epoch": 0.5267410542516352, + "flos": 1550558333952.0, + "grad_norm": 0.008730383218555248, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.8360759, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.78320312, + "step": 2738, + "time_per_iteration": 4.80830717086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160505, + "balance_loss_mlp": 1.08039653, + "epoch": 0.5269334359368988, + "flos": 931460878848.0, + "grad_norm": 0.03056162512939441, + "language_loss": 0.89627469, + "learning_rate": 0.00048068918077736163, + "loss": 0.90787971, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.80078125, + "step": 2739, + "time_per_iteration": 3.228745222091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160328, + "balance_loss_mlp": 1.08021903, + "epoch": 0.5271258176221624, + "flos": 656634436608.0, + "grad_norm": 0.03221347808604687, + "language_loss": 0.87126762, + "learning_rate": 0.0004803778746759001, + "loss": 0.88287091, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.80078125, + "step": 2740, + "time_per_iteration": 2.888040542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161897, + "balance_loss_mlp": 1.08217001, + "epoch": 0.527318199307426, + "flos": 544062067200.0, + "grad_norm": 0.03125376981830108, + "language_loss": 0.87138033, + "learning_rate": 0.00048006657619242317, + "loss": 0.8829993, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.796875, + "step": 2741, + "time_per_iteration": 2.6788547039031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156491, + "balance_loss_mlp": 1.07662046, + "epoch": 0.5275105809926895, + "flos": 448898322432.0, + "grad_norm": 0.035204553781932095, + "language_loss": 0.84527659, + "learning_rate": 0.00047975528544778775, + "loss": 0.8568415, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.79833984, + "step": 2742, + "time_per_iteration": 2.5953187942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156742, + "balance_loss_mlp": 1.07677603, + "epoch": 0.527702962677953, + "flos": 580052894208.0, + "grad_norm": 0.031790657619887884, + "language_loss": 0.9544906, + "learning_rate": 0.00047944400256284754, + "loss": 0.96605802, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.79931641, + "step": 2743, + "time_per_iteration": 2.6874876022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158128, + "balance_loss_mlp": 1.07821035, + "epoch": 0.5278953443632166, + "flos": 654009853440.0, + "grad_norm": 0.028533864641999515, + "language_loss": 0.84914398, + "learning_rate": 0.0004791327276584532, + "loss": 0.86072528, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.79882812, + "step": 2744, + "time_per_iteration": 2.851484537124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159082, + "balance_loss_mlp": 1.07902145, + "epoch": 0.5280877260484802, + "flos": 515048661504.0, + "grad_norm": 0.02936794285447426, + "language_loss": 0.85631824, + "learning_rate": 0.00047882146085545264, + "loss": 0.86790907, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.80029297, + "step": 2745, + "time_per_iteration": 2.6376991271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159996, + "balance_loss_mlp": 1.081604, + "epoch": 0.5282801077337438, + "flos": 1448712608256.0, + "grad_norm": 0.005116949586401208, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76562381, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.78125, + "step": 2746, + "time_per_iteration": 4.958376169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158639, + "balance_loss_mlp": 1.0789119, + "epoch": 0.5284724894190073, + "flos": 605966355456.0, + "grad_norm": 0.03386849685542916, + "language_loss": 0.85558748, + "learning_rate": 0.00047819895203700684, + "loss": 0.86717391, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.796875, + "step": 2747, + "time_per_iteration": 2.7103474140167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_mlp": 1.08326721, + "epoch": 0.5286648711042709, + "flos": 1498103600640.0, + "grad_norm": 0.005524480658063938, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76674092, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.78125, + "step": 2748, + "time_per_iteration": 4.636225938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.077088, + "epoch": 0.5288572527895344, + "flos": 598833208320.0, + "grad_norm": 0.030227845431380972, + "language_loss": 0.94071984, + "learning_rate": 0.0004775764770742277, + "loss": 0.95228899, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.79785156, + "step": 2749, + "time_per_iteration": 2.7894628047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154981, + "balance_loss_mlp": 1.07496762, + "epoch": 0.529049634474798, + "flos": 558439878144.0, + "grad_norm": 0.038921610012438906, + "language_loss": 0.92515904, + "learning_rate": 0.00047726525259079777, + "loss": 0.93670887, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.79980469, + "step": 2750, + "time_per_iteration": 2.8399362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156308, + "balance_loss_mlp": 1.07643819, + "epoch": 0.5292420161600616, + "flos": 582434429952.0, + "grad_norm": 0.03493339209419754, + "language_loss": 0.94807124, + "learning_rate": 0.0004769540369337798, + "loss": 0.9596343, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.79833984, + "step": 2751, + "time_per_iteration": 2.7520663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171177, + "balance_loss_mlp": 1.09097254, + "epoch": 0.5294343978453251, + "flos": 609563854848.0, + "grad_norm": 0.029200425139457874, + "language_loss": 0.90377945, + "learning_rate": 0.00047664283022399794, + "loss": 0.91549122, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.80175781, + "step": 2752, + "time_per_iteration": 2.827075719833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169344, + "balance_loss_mlp": 1.08904481, + "epoch": 0.5296267795305887, + "flos": 647709907968.0, + "grad_norm": 0.03322281077035965, + "language_loss": 0.85670567, + "learning_rate": 0.00047633163258227376, + "loss": 0.86839902, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.80273438, + "step": 2753, + "time_per_iteration": 2.8684630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168156, + "balance_loss_mlp": 1.08790445, + "epoch": 0.5298191612158523, + "flos": 560805950976.0, + "grad_norm": 0.0355054677596956, + "language_loss": 0.92337191, + "learning_rate": 0.0004760204441294247, + "loss": 0.93505347, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.80224609, + "step": 2754, + "time_per_iteration": 2.6347973346710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162052, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5300115429011159, + "flos": 515131253760.0, + "grad_norm": 0.03178410473183971, + "language_loss": 0.90992713, + "learning_rate": 0.00047570926498626486, + "loss": 0.92154765, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.80078125, + "step": 2755, + "time_per_iteration": 2.6713931560516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165293, + "balance_loss_mlp": 1.08513677, + "epoch": 0.5302039245863793, + "flos": 674049065472.0, + "grad_norm": 0.025883205751119107, + "language_loss": 0.86624229, + "learning_rate": 0.00047539809527360474, + "loss": 0.87789524, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.80126953, + "step": 2756, + "time_per_iteration": 2.855339765548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163284, + "balance_loss_mlp": 1.08312809, + "epoch": 0.5303963062716429, + "flos": 732156467712.0, + "grad_norm": 0.025616439830169112, + "language_loss": 0.86757731, + "learning_rate": 0.0004750869351122511, + "loss": 0.87921017, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.80126953, + "step": 2757, + "time_per_iteration": 2.9861788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157792, + "balance_loss_mlp": 1.07773066, + "epoch": 0.5305886879569065, + "flos": 574551948288.0, + "grad_norm": 0.030995691560080724, + "language_loss": 0.87564695, + "learning_rate": 0.00047477578462300685, + "loss": 0.88722491, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.80029297, + "step": 2758, + "time_per_iteration": 2.711434841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158236, + "balance_loss_mlp": 1.07817531, + "epoch": 0.5307810696421701, + "flos": 696728323584.0, + "grad_norm": 0.030944173565867344, + "language_loss": 0.85500729, + "learning_rate": 0.0004744646439266718, + "loss": 0.86658955, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.80029297, + "step": 2759, + "time_per_iteration": 3.012730121612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159665, + "balance_loss_mlp": 1.07965159, + "epoch": 0.5309734513274337, + "flos": 650202233856.0, + "grad_norm": 0.02922555436454367, + "language_loss": 0.9794637, + "learning_rate": 0.000474153513144041, + "loss": 0.99106038, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.79980469, + "step": 2760, + "time_per_iteration": 2.9069197177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158721, + "balance_loss_mlp": 1.07866037, + "epoch": 0.5311658330126972, + "flos": 606055678464.0, + "grad_norm": 0.0324154212137011, + "language_loss": 0.92613202, + "learning_rate": 0.00047384239239590633, + "loss": 0.93771923, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.80029297, + "step": 2761, + "time_per_iteration": 2.8556571006774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159506, + "balance_loss_mlp": 1.07949257, + "epoch": 0.5313582146979607, + "flos": 559316740608.0, + "grad_norm": 0.03061440617121834, + "language_loss": 0.94290936, + "learning_rate": 0.0004735312818030556, + "loss": 0.95450437, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.79980469, + "step": 2762, + "time_per_iteration": 2.6934847831726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157514, + "balance_loss_mlp": 1.07764399, + "epoch": 0.5315505963832243, + "flos": 509445657600.0, + "grad_norm": 0.029953313176207894, + "language_loss": 0.88601178, + "learning_rate": 0.0004732201814862727, + "loss": 0.89758694, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.79833984, + "step": 2763, + "time_per_iteration": 2.7555651664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156859, + "balance_loss_mlp": 1.0773226, + "epoch": 0.5317429780684879, + "flos": 627668694528.0, + "grad_norm": 0.030098925618691368, + "language_loss": 0.87074947, + "learning_rate": 0.0004729090915663373, + "loss": 0.88231808, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.79492188, + "step": 2764, + "time_per_iteration": 2.83986496925354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157289, + "balance_loss_mlp": 1.07751369, + "epoch": 0.5319353597537514, + "flos": 477698880000.0, + "grad_norm": 0.035256009305486516, + "language_loss": 0.9145658, + "learning_rate": 0.00047259801216402534, + "loss": 0.92613864, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.79736328, + "step": 2765, + "time_per_iteration": 2.49153208732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158926, + "balance_loss_mlp": 1.07934201, + "epoch": 0.532127741439015, + "flos": 502633420800.0, + "grad_norm": 0.031216360034414494, + "language_loss": 0.91137969, + "learning_rate": 0.00047228694340010845, + "loss": 0.92296898, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.79541016, + "step": 2766, + "time_per_iteration": 2.5491669178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08385968, + "epoch": 0.5323201231242786, + "flos": 1166482870272.0, + "grad_norm": 0.028947902109049614, + "language_loss": 0.91277415, + "learning_rate": 0.0004719758853953544, + "loss": 0.92440999, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.796875, + "step": 2767, + "time_per_iteration": 3.576573610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167049, + "balance_loss_mlp": 1.08694029, + "epoch": 0.5325125048095422, + "flos": 379541251584.0, + "grad_norm": 0.04259356627609034, + "language_loss": 0.91498351, + "learning_rate": 0.00047166483827052645, + "loss": 0.92665404, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.80078125, + "step": 2768, + "time_per_iteration": 2.3893725872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172249, + "balance_loss_mlp": 1.09423828, + "epoch": 0.5327048864948057, + "flos": 1544747211264.0, + "grad_norm": 0.007240897484727242, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78250736, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.77929688, + "step": 2769, + "time_per_iteration": 4.972010374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167053, + "balance_loss_mlp": 1.08737326, + "epoch": 0.5328972681800692, + "flos": 912861212160.0, + "grad_norm": 0.03027786850862354, + "language_loss": 0.8989411, + "learning_rate": 0.000471042777143682, + "loss": 0.91061163, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.79638672, + "step": 2770, + "time_per_iteration": 3.1992523670196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116085, + "balance_loss_mlp": 1.08126593, + "epoch": 0.5330896498653328, + "flos": 474850715136.0, + "grad_norm": 0.032478463467180745, + "language_loss": 0.85492694, + "learning_rate": 0.0004707317633831707, + "loss": 0.86653543, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.79541016, + "step": 2771, + "time_per_iteration": 2.636418342590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159417, + "balance_loss_mlp": 1.07983315, + "epoch": 0.5332820315505964, + "flos": 502633420800.0, + "grad_norm": 0.034509360784450445, + "language_loss": 0.84931278, + "learning_rate": 0.00047042076098559673, + "loss": 0.86090696, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.79541016, + "step": 2772, + "time_per_iteration": 2.587954521179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155982, + "balance_loss_mlp": 1.07615912, + "epoch": 0.53347441323586, + "flos": 926031791616.0, + "grad_norm": 0.036007721663536225, + "language_loss": 0.8042109, + "learning_rate": 0.00047010977007170174, + "loss": 0.81577075, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.79785156, + "step": 2773, + "time_per_iteration": 3.207517623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154797, + "balance_loss_mlp": 1.07497442, + "epoch": 0.5336667949211235, + "flos": 575539600896.0, + "grad_norm": 0.032460813123339774, + "language_loss": 0.88737571, + "learning_rate": 0.00046979879076222334, + "loss": 0.89892364, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.79785156, + "step": 2774, + "time_per_iteration": 2.711036443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154367, + "balance_loss_mlp": 1.07459235, + "epoch": 0.533859176606387, + "flos": 1066390869504.0, + "grad_norm": 0.02757600625184913, + "language_loss": 0.88843602, + "learning_rate": 0.0004694878231778939, + "loss": 0.89997971, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.79736328, + "step": 2775, + "time_per_iteration": 3.3735690116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154523, + "balance_loss_mlp": 1.07512975, + "epoch": 0.5340515582916506, + "flos": 747905968128.0, + "grad_norm": 0.025749810309272533, + "language_loss": 0.89188796, + "learning_rate": 0.0004691768674394423, + "loss": 0.9034332, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.79345703, + "step": 2776, + "time_per_iteration": 2.9947128295898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171234, + "balance_loss_mlp": 1.09341431, + "epoch": 0.5342439399769142, + "flos": 1448818669056.0, + "grad_norm": 0.018487467205991936, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85655242, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.77734375, + "step": 2777, + "time_per_iteration": 4.765547275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166931, + "balance_loss_mlp": 1.08872986, + "epoch": 0.5344363216621778, + "flos": 1430696365056.0, + "grad_norm": 0.01490962088780182, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77820462, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.77929688, + "step": 2778, + "time_per_iteration": 4.979669570922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156329, + "balance_loss_mlp": 1.07636368, + "epoch": 0.5346287033474413, + "flos": 528675136512.0, + "grad_norm": 0.028255812601682327, + "language_loss": 0.84707999, + "learning_rate": 0.00046824407250656676, + "loss": 0.85864329, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.79931641, + "step": 2779, + "time_per_iteration": 2.6169135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161852, + "balance_loss_mlp": 1.08183897, + "epoch": 0.5348210850327049, + "flos": 511755334656.0, + "grad_norm": 0.02960487915529887, + "language_loss": 0.89552319, + "learning_rate": 0.0004679331653588161, + "loss": 0.90714169, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.79980469, + "step": 2780, + "time_per_iteration": 2.651503562927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165567, + "balance_loss_mlp": 1.08536327, + "epoch": 0.5350134667179685, + "flos": 463625241600.0, + "grad_norm": 0.0331551624405392, + "language_loss": 0.91242051, + "learning_rate": 0.0004676222706605147, + "loss": 0.9240762, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.80175781, + "step": 2781, + "time_per_iteration": 2.609180450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171695, + "balance_loss_mlp": 1.09149086, + "epoch": 0.535205848403232, + "flos": 710117755392.0, + "grad_norm": 0.03114563748345981, + "language_loss": 0.9013232, + "learning_rate": 0.0004673113885323626, + "loss": 0.91304016, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.80175781, + "step": 2782, + "time_per_iteration": 2.889096736907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167285, + "balance_loss_mlp": 1.08708084, + "epoch": 0.5353982300884956, + "flos": 895791688704.0, + "grad_norm": 0.029628425021764316, + "language_loss": 0.840244, + "learning_rate": 0.00046700051909505494, + "loss": 0.85191679, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.80175781, + "step": 2783, + "time_per_iteration": 3.1921920776367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161558, + "balance_loss_mlp": 1.08130586, + "epoch": 0.5355906117737591, + "flos": 537024247296.0, + "grad_norm": 0.03383499561986932, + "language_loss": 0.89968938, + "learning_rate": 0.000466689662469282, + "loss": 0.91130495, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.80224609, + "step": 2784, + "time_per_iteration": 2.644693613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08009481, + "epoch": 0.5357829934590227, + "flos": 870327392256.0, + "grad_norm": 0.02956685166305249, + "language_loss": 0.89793074, + "learning_rate": 0.00046637881877572917, + "loss": 0.90953422, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.80224609, + "step": 2785, + "time_per_iteration": 3.134896755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159974, + "balance_loss_mlp": 1.0797224, + "epoch": 0.5359753751442863, + "flos": 554445606912.0, + "grad_norm": 0.027747995864539122, + "language_loss": 0.88820761, + "learning_rate": 0.0004660679881350764, + "loss": 0.89980739, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.80224609, + "step": 2786, + "time_per_iteration": 2.7258269786834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186935, + "balance_loss_mlp": 1.10682678, + "epoch": 0.5361677568295499, + "flos": 1483756715520.0, + "grad_norm": 0.018012162763561924, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76795077, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.80078125, + "step": 2787, + "time_per_iteration": 5.011500835418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163662, + "balance_loss_mlp": 1.08345807, + "epoch": 0.5363601385148133, + "flos": 807641568768.0, + "grad_norm": 0.03200093229385197, + "language_loss": 0.83718783, + "learning_rate": 0.0004654463664951667, + "loss": 0.84882444, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.80175781, + "step": 2788, + "time_per_iteration": 3.0044353008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162852, + "balance_loss_mlp": 1.08274364, + "epoch": 0.5365525202000769, + "flos": 508878971904.0, + "grad_norm": 0.03055357919616021, + "language_loss": 0.89048028, + "learning_rate": 0.0004651355757372447, + "loss": 0.90210879, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.80078125, + "step": 2789, + "time_per_iteration": 2.6024739742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011626, + "balance_loss_mlp": 1.08277702, + "epoch": 0.5367449018853405, + "flos": 530014625280.0, + "grad_norm": 0.03243837084279447, + "language_loss": 0.90724301, + "learning_rate": 0.00046482479851489274, + "loss": 0.91886902, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.79785156, + "step": 2790, + "time_per_iteration": 2.7023818492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168089, + "balance_loss_mlp": 1.08840978, + "epoch": 0.5369372835706041, + "flos": 651216082944.0, + "grad_norm": 0.035661652748611536, + "language_loss": 0.83603406, + "learning_rate": 0.00046451403494876525, + "loss": 0.84771496, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.79443359, + "step": 2791, + "time_per_iteration": 2.9009790420532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169917, + "balance_loss_mlp": 1.09033263, + "epoch": 0.5371296652558677, + "flos": 585627700224.0, + "grad_norm": 0.03267915449635738, + "language_loss": 0.90313196, + "learning_rate": 0.0004642032851595111, + "loss": 0.91483116, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.79345703, + "step": 2792, + "time_per_iteration": 2.743093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171549, + "balance_loss_mlp": 1.09196496, + "epoch": 0.5373220469411312, + "flos": 597083486208.0, + "grad_norm": 0.03226534649155799, + "language_loss": 0.89917493, + "learning_rate": 0.00046389254926777404, + "loss": 0.91089034, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.79345703, + "step": 2793, + "time_per_iteration": 2.816979169845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162732, + "balance_loss_mlp": 1.08319557, + "epoch": 0.5375144286263948, + "flos": 1116277415424.0, + "grad_norm": 0.030732828924726157, + "language_loss": 0.83480382, + "learning_rate": 0.0004635818273941926, + "loss": 0.84643114, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.79443359, + "step": 2794, + "time_per_iteration": 3.538351058959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156539, + "balance_loss_mlp": 1.07704997, + "epoch": 0.5377068103116583, + "flos": 596768580096.0, + "grad_norm": 0.03686105726392354, + "language_loss": 0.88212651, + "learning_rate": 0.0004632711196593997, + "loss": 0.8936919, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.79443359, + "step": 2795, + "time_per_iteration": 2.7304327487945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153271, + "balance_loss_mlp": 1.07383037, + "epoch": 0.5378991919969219, + "flos": 885649195008.0, + "grad_norm": 0.031821277780470766, + "language_loss": 0.90781128, + "learning_rate": 0.00046296042618402297, + "loss": 0.91934395, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.79394531, + "step": 2796, + "time_per_iteration": 3.117605447769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154822, + "balance_loss_mlp": 1.07523799, + "epoch": 0.5380915736821854, + "flos": 711950069760.0, + "grad_norm": 0.03181223121167454, + "language_loss": 0.84282267, + "learning_rate": 0.0004626497470886839, + "loss": 0.85437095, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.79541016, + "step": 2797, + "time_per_iteration": 2.943110704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154308, + "balance_loss_mlp": 1.07439017, + "epoch": 0.538283955367449, + "flos": 558114238464.0, + "grad_norm": 0.03131439333064892, + "language_loss": 0.87165904, + "learning_rate": 0.00046233908249399897, + "loss": 0.88320208, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.79882812, + "step": 2798, + "time_per_iteration": 2.753664970397949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156214, + "balance_loss_mlp": 1.0763911, + "epoch": 0.5384763370527126, + "flos": 514481975808.0, + "grad_norm": 0.02763164557850803, + "language_loss": 0.84223002, + "learning_rate": 0.00046202843252057905, + "loss": 0.85379213, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.79785156, + "step": 2799, + "time_per_iteration": 2.5850727558135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157398, + "balance_loss_mlp": 1.07767105, + "epoch": 0.5386687187379762, + "flos": 490719737856.0, + "grad_norm": 0.033199019667933, + "language_loss": 0.8910532, + "learning_rate": 0.00046171779728902896, + "loss": 0.90262723, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.796875, + "step": 2800, + "time_per_iteration": 2.54720139503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157318, + "balance_loss_mlp": 1.07730448, + "epoch": 0.5388611004232398, + "flos": 483627523584.0, + "grad_norm": 0.041719681603307614, + "language_loss": 0.92617553, + "learning_rate": 0.000461407176919948, + "loss": 0.93774867, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.79980469, + "step": 2801, + "time_per_iteration": 2.5201830863952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158799, + "balance_loss_mlp": 1.07868993, + "epoch": 0.5390534821085032, + "flos": 562089043968.0, + "grad_norm": 0.03196091571695152, + "language_loss": 0.90337479, + "learning_rate": 0.00046109657153392997, + "loss": 0.91496283, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.80078125, + "step": 2802, + "time_per_iteration": 2.694173574447632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160257, + "balance_loss_mlp": 1.08014798, + "epoch": 0.5392458637937668, + "flos": 489360783360.0, + "grad_norm": 0.039860159596143786, + "language_loss": 0.89760619, + "learning_rate": 0.0004607859812515622, + "loss": 0.90920877, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.80078125, + "step": 2803, + "time_per_iteration": 2.585549831390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164203, + "balance_loss_mlp": 1.08404684, + "epoch": 0.5394382454790304, + "flos": 513049161216.0, + "grad_norm": 0.03534563174473093, + "language_loss": 0.94152969, + "learning_rate": 0.00046047540619342667, + "loss": 0.95317167, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.80126953, + "step": 2804, + "time_per_iteration": 2.589845895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116808, + "balance_loss_mlp": 1.08835244, + "epoch": 0.539630627164294, + "flos": 568688432640.0, + "grad_norm": 0.02864783436473809, + "language_loss": 0.85705817, + "learning_rate": 0.00046016484648009933, + "loss": 0.86873901, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.796875, + "step": 2805, + "time_per_iteration": 2.687539577484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162339, + "balance_loss_mlp": 1.08246911, + "epoch": 0.5398230088495575, + "flos": 527502833664.0, + "grad_norm": 0.03312242512211549, + "language_loss": 0.8782742, + "learning_rate": 0.0004598543022321501, + "loss": 0.88989753, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.79833984, + "step": 2806, + "time_per_iteration": 2.6111719608306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159262, + "balance_loss_mlp": 1.07910562, + "epoch": 0.5400153905348211, + "flos": 539852946432.0, + "grad_norm": 0.03059923694994547, + "language_loss": 0.85068846, + "learning_rate": 0.0004595437735701433, + "loss": 0.86228108, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.80126953, + "step": 2807, + "time_per_iteration": 2.668133020401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158376, + "balance_loss_mlp": 1.07826769, + "epoch": 0.5402077722200846, + "flos": 514664624640.0, + "grad_norm": 0.03937747929323063, + "language_loss": 0.88849455, + "learning_rate": 0.00045923326061463623, + "loss": 0.90007836, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.80078125, + "step": 2808, + "time_per_iteration": 2.76680588722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152678, + "balance_loss_mlp": 1.07261717, + "epoch": 0.5404001539053482, + "flos": 677565974016.0, + "grad_norm": 0.030976456011377742, + "language_loss": 0.87454319, + "learning_rate": 0.00045892276348618113, + "loss": 0.88606995, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.80029297, + "step": 2809, + "time_per_iteration": 2.9939539432525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173447, + "balance_loss_mlp": 1.09410095, + "epoch": 0.5405925355906118, + "flos": 1558189036032.0, + "grad_norm": 0.015961767794208704, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79434276, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.79296875, + "step": 2810, + "time_per_iteration": 4.974013328552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157913, + "balance_loss_mlp": 1.07818568, + "epoch": 0.5407849172758753, + "flos": 648537105408.0, + "grad_norm": 0.02696900388574031, + "language_loss": 0.85372365, + "learning_rate": 0.000458301817192603, + "loss": 0.8653028, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.796875, + "step": 2811, + "time_per_iteration": 2.8575778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118454, + "balance_loss_mlp": 1.1057663, + "epoch": 0.5409772989611389, + "flos": 1410481234944.0, + "grad_norm": 0.012734794042181983, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.82026327, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.78710938, + "step": 2812, + "time_per_iteration": 4.809651613235474 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163077, + "balance_loss_mlp": 1.0835402, + "epoch": 0.5411696806464025, + "flos": 555544049664.0, + "grad_norm": 0.031759632467193835, + "language_loss": 0.91974443, + "learning_rate": 0.00045768093565369983, + "loss": 0.93137515, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.79492188, + "step": 2813, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164131, + "balance_loss_mlp": 1.0847373, + "epoch": 0.5413620623316661, + "flos": 529204892160.0, + "grad_norm": 0.03127565438509195, + "language_loss": 0.8788538, + "learning_rate": 0.0004573705194685646, + "loss": 0.89049512, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.79199219, + "step": 2814, + "time_per_iteration": 2.645961284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164839, + "balance_loss_mlp": 1.08544588, + "epoch": 0.5415544440169295, + "flos": 599851060224.0, + "grad_norm": 0.03485280634812332, + "language_loss": 0.91058564, + "learning_rate": 0.00045706011983366157, + "loss": 0.92223406, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.79199219, + "step": 2815, + "time_per_iteration": 2.6676552295684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161812, + "balance_loss_mlp": 1.08237088, + "epoch": 0.5417468257021931, + "flos": 471713840640.0, + "grad_norm": 0.03625185410953689, + "language_loss": 0.88930029, + "learning_rate": 0.00045674973686949847, + "loss": 0.90091836, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.79199219, + "step": 2816, + "time_per_iteration": 2.51118540763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116016, + "balance_loss_mlp": 1.08076715, + "epoch": 0.5419392073874567, + "flos": 682190057472.0, + "grad_norm": 0.02856526912727588, + "language_loss": 0.90316737, + "learning_rate": 0.0004564393706965766, + "loss": 0.91476899, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.79199219, + "step": 2817, + "time_per_iteration": 2.9563546180725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160311, + "balance_loss_mlp": 1.0809654, + "epoch": 0.5421315890727203, + "flos": 463336531968.0, + "grad_norm": 0.032507832188727104, + "language_loss": 0.87249088, + "learning_rate": 0.00045612902143539116, + "loss": 0.884094, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.79199219, + "step": 2818, + "time_per_iteration": 2.5383646488189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162479, + "balance_loss_mlp": 1.08294284, + "epoch": 0.5423239707579839, + "flos": 437889699840.0, + "grad_norm": 0.03622660962153638, + "language_loss": 0.8863132, + "learning_rate": 0.00045581868920642986, + "loss": 0.89793801, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.79296875, + "step": 2819, + "time_per_iteration": 2.4692800045013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163903, + "balance_loss_mlp": 1.08441401, + "epoch": 0.5425163524432474, + "flos": 459305330688.0, + "grad_norm": 0.036307438946012835, + "language_loss": 0.86308074, + "learning_rate": 0.00045550837413017457, + "loss": 0.8747198, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.79296875, + "step": 2820, + "time_per_iteration": 2.59252667427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160476, + "balance_loss_mlp": 1.08089161, + "epoch": 0.542708734128511, + "flos": 420409943040.0, + "grad_norm": 0.028561818537522772, + "language_loss": 0.89964175, + "learning_rate": 0.0004551980763271005, + "loss": 0.91124654, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.79394531, + "step": 2821, + "time_per_iteration": 2.64975643157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158342, + "balance_loss_mlp": 1.07880592, + "epoch": 0.5429011158137745, + "flos": 679708465152.0, + "grad_norm": 0.03014006642218495, + "language_loss": 0.89564693, + "learning_rate": 0.0004548877959176756, + "loss": 0.90723038, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.79345703, + "step": 2822, + "time_per_iteration": 2.881334066390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166233, + "balance_loss_mlp": 1.08693492, + "epoch": 0.5430934974990381, + "flos": 541967239680.0, + "grad_norm": 0.03201888254331298, + "language_loss": 0.91779578, + "learning_rate": 0.00045457753302236166, + "loss": 0.92945808, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.79150391, + "step": 2823, + "time_per_iteration": 2.615506887435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160293, + "balance_loss_mlp": 1.08075619, + "epoch": 0.5432858791843016, + "flos": 659643056640.0, + "grad_norm": 0.03397006228821556, + "language_loss": 0.93680996, + "learning_rate": 0.00045426728776161353, + "loss": 0.94841284, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.79443359, + "step": 2824, + "time_per_iteration": 2.815668821334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160478, + "balance_loss_mlp": 1.08084619, + "epoch": 0.5434782608695652, + "flos": 532966849536.0, + "grad_norm": 0.030340926449950675, + "language_loss": 0.86484039, + "learning_rate": 0.00045395706025587863, + "loss": 0.87644517, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.79589844, + "step": 2825, + "time_per_iteration": 2.677969455718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159818, + "balance_loss_mlp": 1.0802815, + "epoch": 0.5436706425548288, + "flos": 609632985600.0, + "grad_norm": 0.032758454025991736, + "language_loss": 0.88250875, + "learning_rate": 0.00045364685062559843, + "loss": 0.89410686, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.79492188, + "step": 2826, + "time_per_iteration": 2.7975664138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160655, + "balance_loss_mlp": 1.08111823, + "epoch": 0.5438630242400924, + "flos": 706772762112.0, + "grad_norm": 0.047560346967580276, + "language_loss": 0.96112239, + "learning_rate": 0.0004533366589912067, + "loss": 0.97272885, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.79492188, + "step": 2827, + "time_per_iteration": 2.9455690383911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161858, + "balance_loss_mlp": 1.08232152, + "epoch": 0.544055405925356, + "flos": 857838291456.0, + "grad_norm": 0.035082604549872, + "language_loss": 0.84527165, + "learning_rate": 0.0004530264854731306, + "loss": 0.8568902, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.79492188, + "step": 2828, + "time_per_iteration": 3.0149006843566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161186, + "balance_loss_mlp": 1.08160186, + "epoch": 0.5442477876106194, + "flos": 572967410688.0, + "grad_norm": 0.029506216108961765, + "language_loss": 0.89973861, + "learning_rate": 0.00045271633019179034, + "loss": 0.91135049, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.79541016, + "step": 2829, + "time_per_iteration": 2.7735414505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162764, + "balance_loss_mlp": 1.08313203, + "epoch": 0.544440169295883, + "flos": 626802565632.0, + "grad_norm": 0.028700635940731967, + "language_loss": 0.92908496, + "learning_rate": 0.0004524061932675986, + "loss": 0.94071257, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.79589844, + "step": 2830, + "time_per_iteration": 2.828461170196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116197, + "balance_loss_mlp": 1.08224237, + "epoch": 0.5446325509811466, + "flos": 837640625664.0, + "grad_norm": 0.03503891147687097, + "language_loss": 0.92219722, + "learning_rate": 0.00045209607482096125, + "loss": 0.93381691, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.79541016, + "step": 2831, + "time_per_iteration": 3.0058434009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162561, + "balance_loss_mlp": 1.08292878, + "epoch": 0.5448249326664102, + "flos": 484389593088.0, + "grad_norm": 0.03287703969217422, + "language_loss": 0.89665288, + "learning_rate": 0.0004517859749722772, + "loss": 0.90827847, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.79443359, + "step": 2832, + "time_per_iteration": 2.6527607440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116426, + "balance_loss_mlp": 1.08453321, + "epoch": 0.5450173143516738, + "flos": 562345552896.0, + "grad_norm": 0.03300449363670703, + "language_loss": 0.84396762, + "learning_rate": 0.0004514758938419376, + "loss": 0.85561025, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.79541016, + "step": 2833, + "time_per_iteration": 2.799923896789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176773, + "balance_loss_mlp": 1.09971619, + "epoch": 0.5452096960369373, + "flos": 1473586023936.0, + "grad_norm": 0.016868588983801922, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.78097355, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.76953125, + "step": 2834, + "time_per_iteration": 4.904434442520142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116397, + "balance_loss_mlp": 1.08414805, + "epoch": 0.5454020777222008, + "flos": 466017510912.0, + "grad_norm": 0.028290923396431526, + "language_loss": 0.88719809, + "learning_rate": 0.00045085578821782175, + "loss": 0.8988378, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.79589844, + "step": 2835, + "time_per_iteration": 2.5375516414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116069, + "balance_loss_mlp": 1.08325195, + "epoch": 0.5455944594074644, + "flos": 1472615109120.0, + "grad_norm": 0.00840245760684232, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77295429, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.7734375, + "step": 2836, + "time_per_iteration": 4.908621549606323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161179, + "balance_loss_mlp": 1.08121371, + "epoch": 0.545786841092728, + "flos": 534304336896.0, + "grad_norm": 0.026675001792915147, + "language_loss": 0.85451794, + "learning_rate": 0.00045023575891159866, + "loss": 0.86612976, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.79931641, + "step": 2837, + "time_per_iteration": 2.77382230758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167343, + "balance_loss_mlp": 1.08952332, + "epoch": 0.5459792227779915, + "flos": 1355426113536.0, + "grad_norm": 0.010026273514264956, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75931144, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.77734375, + "step": 2838, + "time_per_iteration": 4.8985395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163141, + "balance_loss_mlp": 1.08322346, + "epoch": 0.5461716044632551, + "flos": 639072087552.0, + "grad_norm": 0.03170534586871267, + "language_loss": 0.83100337, + "learning_rate": 0.0004496158068861354, + "loss": 0.8426348, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.79833984, + "step": 2839, + "time_per_iteration": 2.8032078742980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163887, + "balance_loss_mlp": 1.08396888, + "epoch": 0.5463639861485187, + "flos": 603925922304.0, + "grad_norm": 0.031486344316249366, + "language_loss": 0.85257053, + "learning_rate": 0.00044930586015455207, + "loss": 0.86420941, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.79833984, + "step": 2840, + "time_per_iteration": 2.780024290084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168265, + "balance_loss_mlp": 1.08834755, + "epoch": 0.5465563678337823, + "flos": 643752566784.0, + "grad_norm": 0.02832807598538896, + "language_loss": 0.93569458, + "learning_rate": 0.000448995933104179, + "loss": 0.9473772, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.79736328, + "step": 2841, + "time_per_iteration": 2.848741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168336, + "balance_loss_mlp": 1.08841801, + "epoch": 0.5467487495190458, + "flos": 615364243968.0, + "grad_norm": 0.03451251764660495, + "language_loss": 0.86641318, + "learning_rate": 0.00044868602585534077, + "loss": 0.87809658, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.796875, + "step": 2842, + "time_per_iteration": 2.8590362071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166404, + "balance_loss_mlp": 1.08677208, + "epoch": 0.5469411312043093, + "flos": 462127299072.0, + "grad_norm": 0.03329693034046033, + "language_loss": 0.9437651, + "learning_rate": 0.0004483761385283541, + "loss": 0.95542908, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.79443359, + "step": 2843, + "time_per_iteration": 2.523390769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116664, + "balance_loss_mlp": 1.08691323, + "epoch": 0.5471335128895729, + "flos": 562266963456.0, + "grad_norm": 0.03201679454384124, + "language_loss": 0.87509483, + "learning_rate": 0.0004480662712435281, + "loss": 0.88676119, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.79492188, + "step": 2844, + "time_per_iteration": 2.7186124324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162399, + "balance_loss_mlp": 1.08286297, + "epoch": 0.5473258945748365, + "flos": 519685479936.0, + "grad_norm": 0.032165214678065886, + "language_loss": 0.93768156, + "learning_rate": 0.0004477564241211635, + "loss": 0.94930553, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.79345703, + "step": 2845, + "time_per_iteration": 2.5637102127075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159503, + "balance_loss_mlp": 1.08034766, + "epoch": 0.5475182762601001, + "flos": 434744093184.0, + "grad_norm": 0.03138398317411523, + "language_loss": 0.92521811, + "learning_rate": 0.0004474465972815541, + "loss": 0.93681312, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.79101562, + "step": 2846, + "time_per_iteration": 2.470494508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162403, + "balance_loss_mlp": 1.08348668, + "epoch": 0.5477106579453636, + "flos": 512573799936.0, + "grad_norm": 0.02767233380819538, + "language_loss": 0.92665255, + "learning_rate": 0.000447136790844985, + "loss": 0.93827659, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.78759766, + "step": 2847, + "time_per_iteration": 2.7123520374298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164922, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5479030396306271, + "flos": 677140277760.0, + "grad_norm": 0.030326073882101023, + "language_loss": 0.85917926, + "learning_rate": 0.00044682700493173385, + "loss": 0.87082845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.78710938, + "step": 2848, + "time_per_iteration": 2.826556921005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166552, + "balance_loss_mlp": 1.08787405, + "epoch": 0.5480954213158907, + "flos": 877578060288.0, + "grad_norm": 0.033676298977630685, + "language_loss": 0.86673969, + "learning_rate": 0.00044651723966207004, + "loss": 0.87840521, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.78564453, + "step": 2849, + "time_per_iteration": 3.192443370819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164243, + "balance_loss_mlp": 1.08556521, + "epoch": 0.5482878030011543, + "flos": 623174866944.0, + "grad_norm": 0.03042847520175512, + "language_loss": 0.83109522, + "learning_rate": 0.00044620749515625536, + "loss": 0.84273762, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.78564453, + "step": 2850, + "time_per_iteration": 2.7753841876983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164392, + "balance_loss_mlp": 1.08528447, + "epoch": 0.5484801846864179, + "flos": 498257114112.0, + "grad_norm": 0.03264010932273605, + "language_loss": 0.90008557, + "learning_rate": 0.00044589777153454334, + "loss": 0.91172945, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.78857422, + "step": 2851, + "time_per_iteration": 2.7295939922332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162977, + "balance_loss_mlp": 1.08391714, + "epoch": 0.5486725663716814, + "flos": 443353715712.0, + "grad_norm": 0.029420479903708215, + "language_loss": 0.88820338, + "learning_rate": 0.00044558806891717895, + "loss": 0.8998332, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.78808594, + "step": 2852, + "time_per_iteration": 2.4784035682678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164311, + "balance_loss_mlp": 1.08548951, + "epoch": 0.548864948056945, + "flos": 656347728384.0, + "grad_norm": 0.02822438724303185, + "language_loss": 0.84744209, + "learning_rate": 0.0004452783874243998, + "loss": 0.8590852, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.78759766, + "step": 2853, + "time_per_iteration": 2.821592092514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159975, + "balance_loss_mlp": 1.08105898, + "epoch": 0.5490573297422086, + "flos": 547140544512.0, + "grad_norm": 0.03150495246723179, + "language_loss": 0.90787637, + "learning_rate": 0.00044496872717643475, + "loss": 0.91947615, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.78710938, + "step": 2854, + "time_per_iteration": 2.6908938884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011614, + "balance_loss_mlp": 1.08415222, + "epoch": 0.5492497114274721, + "flos": 1593760897536.0, + "grad_norm": 0.006862097523809848, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78250694, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.77148438, + "step": 2855, + "time_per_iteration": 4.92158579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159374, + "balance_loss_mlp": 1.08036256, + "epoch": 0.5494420931127356, + "flos": 752269539840.0, + "grad_norm": 0.030842116299214104, + "language_loss": 0.87009478, + "learning_rate": 0.0004443494708958217, + "loss": 0.88168848, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.78759766, + "step": 2856, + "time_per_iteration": 2.952693223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155384, + "balance_loss_mlp": 1.07627714, + "epoch": 0.5496344747979992, + "flos": 627304123392.0, + "grad_norm": 0.026887140123268247, + "language_loss": 0.85396117, + "learning_rate": 0.0004440398751035906, + "loss": 0.86551499, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.79052734, + "step": 2857, + "time_per_iteration": 2.8657121658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156313, + "balance_loss_mlp": 1.07691979, + "epoch": 0.5498268564832628, + "flos": 524124913152.0, + "grad_norm": 0.03681476772579859, + "language_loss": 0.90347362, + "learning_rate": 0.00044373030103700645, + "loss": 0.9150368, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.79248047, + "step": 2858, + "time_per_iteration": 2.6372759342193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161253, + "balance_loss_mlp": 1.08185947, + "epoch": 0.5500192381685264, + "flos": 605777702400.0, + "grad_norm": 0.027579474955625485, + "language_loss": 0.8405782, + "learning_rate": 0.000443420748816257, + "loss": 0.85219079, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.79248047, + "step": 2859, + "time_per_iteration": 2.832864999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163587, + "balance_loss_mlp": 1.08395553, + "epoch": 0.55021161985379, + "flos": 521654780928.0, + "grad_norm": 0.03409053016014856, + "language_loss": 0.84214079, + "learning_rate": 0.0004431112185615208, + "loss": 0.85377669, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.79443359, + "step": 2860, + "time_per_iteration": 2.7533481121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165675, + "balance_loss_mlp": 1.0862813, + "epoch": 0.5504040015390534, + "flos": 490654609920.0, + "grad_norm": 0.028251427239966796, + "language_loss": 0.84584463, + "learning_rate": 0.00044280171039296845, + "loss": 0.85750139, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.79296875, + "step": 2861, + "time_per_iteration": 2.6798369884490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116251, + "balance_loss_mlp": 1.08306909, + "epoch": 0.550596383224317, + "flos": 576861625344.0, + "grad_norm": 0.030462386563617952, + "language_loss": 0.93688512, + "learning_rate": 0.0004424922244307616, + "loss": 0.94851023, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.79296875, + "step": 2862, + "time_per_iteration": 2.7042698860168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164147, + "balance_loss_mlp": 1.08461094, + "epoch": 0.5507887649095806, + "flos": 643633044480.0, + "grad_norm": 0.03244616812289036, + "language_loss": 0.87943101, + "learning_rate": 0.00044218276079505315, + "loss": 0.89107251, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.79296875, + "step": 2863, + "time_per_iteration": 2.869657278060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116435, + "balance_loss_mlp": 1.08490932, + "epoch": 0.5509811465948442, + "flos": 532864791552.0, + "grad_norm": 0.03309127401700594, + "language_loss": 0.80069649, + "learning_rate": 0.0004418733196059876, + "loss": 0.81234002, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.79248047, + "step": 2864, + "time_per_iteration": 2.694439649581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164051, + "balance_loss_mlp": 1.08489633, + "epoch": 0.5511735282801077, + "flos": 655983157248.0, + "grad_norm": 0.031218908498787497, + "language_loss": 0.85167533, + "learning_rate": 0.0004415639009837008, + "loss": 0.86331582, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.79101562, + "step": 2865, + "time_per_iteration": 2.8214035034179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160959, + "balance_loss_mlp": 1.08175683, + "epoch": 0.5513659099653713, + "flos": 530609508864.0, + "grad_norm": 0.029306479659861318, + "language_loss": 0.87106019, + "learning_rate": 0.00044125450504831955, + "loss": 0.88266975, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.79150391, + "step": 2866, + "time_per_iteration": 2.7755370140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157699, + "balance_loss_mlp": 1.0782584, + "epoch": 0.5515582916506349, + "flos": 555973748736.0, + "grad_norm": 0.03358668454464356, + "language_loss": 0.88577026, + "learning_rate": 0.0004409451319199622, + "loss": 0.89734721, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.79248047, + "step": 2867, + "time_per_iteration": 2.700601577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160497, + "balance_loss_mlp": 1.08105552, + "epoch": 0.5517506733358984, + "flos": 736771819008.0, + "grad_norm": 0.033780629576782226, + "language_loss": 0.90037191, + "learning_rate": 0.0004406357817187381, + "loss": 0.91197693, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.79248047, + "step": 2868, + "time_per_iteration": 2.9809505939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160757, + "balance_loss_mlp": 1.0816493, + "epoch": 0.551943055021162, + "flos": 1117189206528.0, + "grad_norm": 0.02667902344135768, + "language_loss": 0.86254233, + "learning_rate": 0.0004403264545647474, + "loss": 0.87414992, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.79052734, + "step": 2869, + "time_per_iteration": 3.5932819843292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156378, + "balance_loss_mlp": 1.07727027, + "epoch": 0.5521354367064255, + "flos": 545501612544.0, + "grad_norm": 0.024843999573841903, + "language_loss": 0.89363241, + "learning_rate": 0.00044001715057808154, + "loss": 0.90519619, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.79052734, + "step": 2870, + "time_per_iteration": 2.7333626747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159059, + "balance_loss_mlp": 1.07999909, + "epoch": 0.5523278183916891, + "flos": 937871614464.0, + "grad_norm": 0.027996488517333572, + "language_loss": 0.86652702, + "learning_rate": 0.0004397078698788232, + "loss": 0.87811756, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.79003906, + "step": 2871, + "time_per_iteration": 3.199366807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168602, + "balance_loss_mlp": 1.0909729, + "epoch": 0.5525202000769527, + "flos": 1469098927104.0, + "grad_norm": 0.009568898658781464, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81610966, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.77539062, + "step": 2872, + "time_per_iteration": 4.912739515304565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163231, + "balance_loss_mlp": 1.08426642, + "epoch": 0.5527125817622163, + "flos": 490784865792.0, + "grad_norm": 0.03313805620558485, + "language_loss": 0.83656394, + "learning_rate": 0.00043908937882281343, + "loss": 0.84819627, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.78808594, + "step": 2873, + "time_per_iteration": 2.6517224311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163059, + "balance_loss_mlp": 1.08409429, + "epoch": 0.5529049634474797, + "flos": 636148061184.0, + "grad_norm": 0.033554896267230024, + "language_loss": 0.87775517, + "learning_rate": 0.0004387801687061814, + "loss": 0.88938576, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.78710938, + "step": 2874, + "time_per_iteration": 2.8159070014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159743, + "balance_loss_mlp": 1.08073115, + "epoch": 0.5530973451327433, + "flos": 582434429952.0, + "grad_norm": 0.02986403100144585, + "language_loss": 0.86760765, + "learning_rate": 0.0004384709823571958, + "loss": 0.87920505, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.78857422, + "step": 2875, + "time_per_iteration": 2.755831480026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158961, + "balance_loss_mlp": 1.08004439, + "epoch": 0.5532897268180069, + "flos": 1124329084416.0, + "grad_norm": 0.02992932493519035, + "language_loss": 0.88625169, + "learning_rate": 0.0004381618198958932, + "loss": 0.89784127, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.78662109, + "step": 2876, + "time_per_iteration": 3.504112720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157815, + "balance_loss_mlp": 1.0788027, + "epoch": 0.5534821085032705, + "flos": 638512132608.0, + "grad_norm": 0.032170459842753865, + "language_loss": 0.89321101, + "learning_rate": 0.00043785268144230137, + "loss": 0.90478921, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.78808594, + "step": 2877, + "time_per_iteration": 2.889683961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158076, + "balance_loss_mlp": 1.07911134, + "epoch": 0.5536744901885341, + "flos": 572216074752.0, + "grad_norm": 0.0339903958733494, + "language_loss": 0.87417912, + "learning_rate": 0.00043754356711643837, + "loss": 0.88575995, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.78759766, + "step": 2878, + "time_per_iteration": 2.6604373455047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115856, + "balance_loss_mlp": 1.07950056, + "epoch": 0.5538668718737976, + "flos": 596916300288.0, + "grad_norm": 0.029580626213001865, + "language_loss": 0.88473797, + "learning_rate": 0.0004372344770383132, + "loss": 0.89632356, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.78808594, + "step": 2879, + "time_per_iteration": 2.7906830310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011565, + "balance_loss_mlp": 1.07753599, + "epoch": 0.5540592535590612, + "flos": 533718185472.0, + "grad_norm": 0.030293675767491222, + "language_loss": 0.88174736, + "learning_rate": 0.00043692541132792507, + "loss": 0.89331234, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.78710938, + "step": 2880, + "time_per_iteration": 2.7152342796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156751, + "balance_loss_mlp": 1.07764363, + "epoch": 0.5542516352443247, + "flos": 413504380416.0, + "grad_norm": 0.03343546183057337, + "language_loss": 0.89203489, + "learning_rate": 0.00043661637010526384, + "loss": 0.90360242, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.78857422, + "step": 2881, + "time_per_iteration": 2.4759325981140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156889, + "balance_loss_mlp": 1.07792521, + "epoch": 0.5544440169295883, + "flos": 548677418496.0, + "grad_norm": 0.03944129006740139, + "language_loss": 0.89678496, + "learning_rate": 0.00043630735349031025, + "loss": 0.90835381, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.78759766, + "step": 2882, + "time_per_iteration": 2.6376428604125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157119, + "balance_loss_mlp": 1.07815528, + "epoch": 0.5546363986148518, + "flos": 623033877504.0, + "grad_norm": 0.025659357486645176, + "language_loss": 0.85712773, + "learning_rate": 0.00043599836160303495, + "loss": 0.86869895, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.78710938, + "step": 2883, + "time_per_iteration": 2.861966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155488, + "balance_loss_mlp": 1.07633352, + "epoch": 0.5548287803001154, + "flos": 706579379712.0, + "grad_norm": 0.03141972013571756, + "language_loss": 0.82934201, + "learning_rate": 0.0004356893945633995, + "loss": 0.8408969, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.7890625, + "step": 2884, + "time_per_iteration": 2.9471499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.07534921, + "epoch": 0.555021161985379, + "flos": 505184143872.0, + "grad_norm": 0.031430850490502316, + "language_loss": 0.85807753, + "learning_rate": 0.0004353804524913551, + "loss": 0.86962205, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.78857422, + "step": 2885, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154918, + "balance_loss_mlp": 1.07576323, + "epoch": 0.5552135436706426, + "flos": 617209293312.0, + "grad_norm": 0.033803824808406595, + "language_loss": 0.88278472, + "learning_rate": 0.0004350715355068441, + "loss": 0.89433384, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.7890625, + "step": 2886, + "time_per_iteration": 2.815993547439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154719, + "balance_loss_mlp": 1.07556415, + "epoch": 0.5554059253559062, + "flos": 464817010176.0, + "grad_norm": 0.03994579560883884, + "language_loss": 0.85848737, + "learning_rate": 0.00043476264372979847, + "loss": 0.87003452, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.7890625, + "step": 2887, + "time_per_iteration": 2.5898871421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154914, + "balance_loss_mlp": 1.07618785, + "epoch": 0.5555983070411696, + "flos": 1564874841600.0, + "grad_norm": 0.03588081892536478, + "language_loss": 0.85341823, + "learning_rate": 0.0004344537772801408, + "loss": 0.86496735, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.78613281, + "step": 2888, + "time_per_iteration": 3.880375385284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158798, + "balance_loss_mlp": 1.0821228, + "epoch": 0.5557906887264332, + "flos": 1471226681856.0, + "grad_norm": 0.005822600355857551, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74581254, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.76757812, + "step": 2889, + "time_per_iteration": 4.9117255210876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155617, + "balance_loss_mlp": 1.07670069, + "epoch": 0.5559830704116968, + "flos": 530863289856.0, + "grad_norm": 0.03666523888945824, + "language_loss": 0.89283395, + "learning_rate": 0.0004338361208426298, + "loss": 0.90439016, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.78710938, + "step": 2890, + "time_per_iteration": 2.6093485355377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155534, + "balance_loss_mlp": 1.07671309, + "epoch": 0.5561754520969604, + "flos": 652518641664.0, + "grad_norm": 0.027207956668339604, + "language_loss": 0.85981715, + "learning_rate": 0.00043352733109457164, + "loss": 0.87137252, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.78710938, + "step": 2891, + "time_per_iteration": 2.929133892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155522, + "balance_loss_mlp": 1.07670057, + "epoch": 0.556367833782224, + "flos": 735618981888.0, + "grad_norm": 0.028477777137297752, + "language_loss": 0.89055073, + "learning_rate": 0.00043321856715349244, + "loss": 0.90210593, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.78662109, + "step": 2892, + "time_per_iteration": 2.94014573097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154528, + "balance_loss_mlp": 1.0758971, + "epoch": 0.5565602154674875, + "flos": 673640833536.0, + "grad_norm": 0.028305708839331062, + "language_loss": 0.85380936, + "learning_rate": 0.00043290982913926466, + "loss": 0.8653546, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.78564453, + "step": 2893, + "time_per_iteration": 2.797816038131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.07449973, + "epoch": 0.556752597152751, + "flos": 587503675392.0, + "grad_norm": 0.03108865563447884, + "language_loss": 0.90100253, + "learning_rate": 0.0004326011171717514, + "loss": 0.91253483, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.78613281, + "step": 2894, + "time_per_iteration": 2.885183334350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153367, + "balance_loss_mlp": 1.07426023, + "epoch": 0.5569449788380146, + "flos": 438690700800.0, + "grad_norm": 0.03571349027789826, + "language_loss": 0.87187707, + "learning_rate": 0.0004322924313708051, + "loss": 0.88341075, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.78857422, + "step": 2895, + "time_per_iteration": 2.505321502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115508, + "balance_loss_mlp": 1.07635403, + "epoch": 0.5571373605232782, + "flos": 503247770112.0, + "grad_norm": 0.03410983593663488, + "language_loss": 0.90630054, + "learning_rate": 0.0004319837718562681, + "loss": 0.91785133, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.78613281, + "step": 2896, + "time_per_iteration": 2.6243269443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154122, + "balance_loss_mlp": 1.07530081, + "epoch": 0.5573297422085417, + "flos": 578589880320.0, + "grad_norm": 0.033933273128928194, + "language_loss": 0.88206899, + "learning_rate": 0.0004316751387479726, + "loss": 0.89361024, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.78662109, + "step": 2897, + "time_per_iteration": 2.7566635608673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.074579, + "epoch": 0.5575221238938053, + "flos": 1346047512576.0, + "grad_norm": 0.03456307454544867, + "language_loss": 0.88955474, + "learning_rate": 0.0004313665321657409, + "loss": 0.90108681, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.78564453, + "step": 2898, + "time_per_iteration": 3.766465187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155323, + "balance_loss_mlp": 1.07616794, + "epoch": 0.5577145055790689, + "flos": 603098724864.0, + "grad_norm": 0.03371138021934881, + "language_loss": 0.86232543, + "learning_rate": 0.00043105795222938436, + "loss": 0.8738786, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.7890625, + "step": 2899, + "time_per_iteration": 2.7334022521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155497, + "balance_loss_mlp": 1.07658088, + "epoch": 0.5579068872643325, + "flos": 563691045888.0, + "grad_norm": 0.045182395108838744, + "language_loss": 0.86075807, + "learning_rate": 0.00043074939905870467, + "loss": 0.87231296, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.78759766, + "step": 2900, + "time_per_iteration": 2.696669340133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155611, + "balance_loss_mlp": 1.0766468, + "epoch": 0.558099268949596, + "flos": 545588207616.0, + "grad_norm": 0.03640236345196184, + "language_loss": 0.86178941, + "learning_rate": 0.0004304408727734927, + "loss": 0.87334555, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.78759766, + "step": 2901, + "time_per_iteration": 2.62982439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115605, + "balance_loss_mlp": 1.07727695, + "epoch": 0.5582916506348595, + "flos": 553852724736.0, + "grad_norm": 0.027303392187282394, + "language_loss": 0.9274894, + "learning_rate": 0.0004301323734935288, + "loss": 0.93904984, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.78613281, + "step": 2902, + "time_per_iteration": 2.705291986465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164959, + "balance_loss_mlp": 1.08632815, + "epoch": 0.5584840323201231, + "flos": 544424636928.0, + "grad_norm": 0.032065850930778406, + "language_loss": 0.92794406, + "learning_rate": 0.000429823901338583, + "loss": 0.93959367, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.78564453, + "step": 2903, + "time_per_iteration": 2.620115041732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162843, + "balance_loss_mlp": 1.08421218, + "epoch": 0.5586764140053867, + "flos": 817021992960.0, + "grad_norm": 0.03266293414683286, + "language_loss": 0.92888266, + "learning_rate": 0.00042951545642841513, + "loss": 0.94051105, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.78564453, + "step": 2904, + "time_per_iteration": 3.066140651702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160381, + "balance_loss_mlp": 1.08165538, + "epoch": 0.5588687956906503, + "flos": 487415677440.0, + "grad_norm": 0.02932995016233391, + "language_loss": 0.91419339, + "learning_rate": 0.0004292070388827737, + "loss": 0.92579722, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.78613281, + "step": 2905, + "time_per_iteration": 2.5493688583374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153401, + "balance_loss_mlp": 1.07453251, + "epoch": 0.5590611773759138, + "flos": 453068511744.0, + "grad_norm": 0.02745082882239035, + "language_loss": 0.85835731, + "learning_rate": 0.00042889864882139753, + "loss": 0.86989129, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.78710938, + "step": 2906, + "time_per_iteration": 2.572270631790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115253, + "balance_loss_mlp": 1.07347012, + "epoch": 0.5592535590611774, + "flos": 521956225536.0, + "grad_norm": 0.03525028250709423, + "language_loss": 0.87143886, + "learning_rate": 0.0004285902863640139, + "loss": 0.88296419, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.78857422, + "step": 2907, + "time_per_iteration": 2.657799482345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153209, + "balance_loss_mlp": 1.07448292, + "epoch": 0.5594459407464409, + "flos": 553600945152.0, + "grad_norm": 0.02873947635122419, + "language_loss": 0.90871602, + "learning_rate": 0.00042828195163033966, + "loss": 0.92024809, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.78613281, + "step": 2908, + "time_per_iteration": 2.6421632766723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152251, + "balance_loss_mlp": 1.07323921, + "epoch": 0.5596383224317045, + "flos": 485787479040.0, + "grad_norm": 0.030747286656696786, + "language_loss": 0.84394485, + "learning_rate": 0.0004279736447400812, + "loss": 0.85546738, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.78808594, + "step": 2909, + "time_per_iteration": 2.571681022644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152122, + "balance_loss_mlp": 1.07344413, + "epoch": 0.5598307041169681, + "flos": 612379092480.0, + "grad_norm": 0.030942423142950287, + "language_loss": 0.83957374, + "learning_rate": 0.00042766536581293385, + "loss": 0.85109496, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.78613281, + "step": 2910, + "time_per_iteration": 2.7282116413116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155729, + "balance_loss_mlp": 1.07662177, + "epoch": 0.5600230858022316, + "flos": 489916735488.0, + "grad_norm": 0.03226747500803281, + "language_loss": 0.85277241, + "learning_rate": 0.0004273571149685819, + "loss": 0.86432964, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.78857422, + "step": 2911, + "time_per_iteration": 2.787032127380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154593, + "balance_loss_mlp": 1.0759151, + "epoch": 0.5602154674874952, + "flos": 599981316096.0, + "grad_norm": 0.03215276166374932, + "language_loss": 0.88704693, + "learning_rate": 0.00042704889232669937, + "loss": 0.89859283, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.78613281, + "step": 2912, + "time_per_iteration": 2.686586856842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.07611275, + "epoch": 0.5604078491727588, + "flos": 587062516224.0, + "grad_norm": 0.032254540051477425, + "language_loss": 0.9111523, + "learning_rate": 0.0004267406980069484, + "loss": 0.92269969, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.78466797, + "step": 2913, + "time_per_iteration": 2.6899847984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154041, + "balance_loss_mlp": 1.07545817, + "epoch": 0.5606002308580224, + "flos": 542327808000.0, + "grad_norm": 0.028324891167666608, + "language_loss": 0.8452785, + "learning_rate": 0.0004264325321289808, + "loss": 0.85681891, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.78515625, + "step": 2914, + "time_per_iteration": 2.770299196243286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151899, + "balance_loss_mlp": 1.07331622, + "epoch": 0.5607926125432858, + "flos": 585078478848.0, + "grad_norm": 0.03365993170310601, + "language_loss": 0.91764051, + "learning_rate": 0.00042612439481243736, + "loss": 0.92915952, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.78515625, + "step": 2915, + "time_per_iteration": 2.7451834678649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162837, + "balance_loss_mlp": 1.08406377, + "epoch": 0.5609849942285494, + "flos": 628630150656.0, + "grad_norm": 0.03395322139017605, + "language_loss": 0.95402431, + "learning_rate": 0.00042581628617694735, + "loss": 0.96565264, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.78613281, + "step": 2916, + "time_per_iteration": 2.7379772663116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157541, + "balance_loss_mlp": 1.07871938, + "epoch": 0.561177375913813, + "flos": 589454785536.0, + "grad_norm": 0.03197816551531196, + "language_loss": 0.86920869, + "learning_rate": 0.0004255082063421296, + "loss": 0.88078409, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.78759766, + "step": 2917, + "time_per_iteration": 2.7153422832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161631, + "balance_loss_mlp": 1.08285797, + "epoch": 0.5613697575990766, + "flos": 528143379456.0, + "grad_norm": 0.03128753614155992, + "language_loss": 0.89917612, + "learning_rate": 0.00042520015542759065, + "loss": 0.91079247, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.78710938, + "step": 2918, + "time_per_iteration": 2.8688042163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165136, + "balance_loss_mlp": 1.08636212, + "epoch": 0.5615621392843402, + "flos": 643874090496.0, + "grad_norm": 0.03249260096588731, + "language_loss": 0.93211949, + "learning_rate": 0.00042489213355292687, + "loss": 0.94377089, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.78613281, + "step": 2919, + "time_per_iteration": 2.8982832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167734, + "balance_loss_mlp": 1.08900821, + "epoch": 0.5617545209696037, + "flos": 428656995840.0, + "grad_norm": 0.034334958581954525, + "language_loss": 0.87036526, + "learning_rate": 0.00042458414083772276, + "loss": 0.88204259, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.78466797, + "step": 2920, + "time_per_iteration": 2.5067636966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164187, + "balance_loss_mlp": 1.08536625, + "epoch": 0.5619469026548672, + "flos": 569589490176.0, + "grad_norm": 0.025989129211014445, + "language_loss": 0.89547098, + "learning_rate": 0.000424276177401552, + "loss": 0.90711284, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.78710938, + "step": 2921, + "time_per_iteration": 2.810723304748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.07975173, + "epoch": 0.5621392843401308, + "flos": 506243655168.0, + "grad_norm": 0.03554030610259364, + "language_loss": 0.91916943, + "learning_rate": 0.0004239682433639763, + "loss": 0.93075705, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.7890625, + "step": 2922, + "time_per_iteration": 2.6607391834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159452, + "balance_loss_mlp": 1.08034527, + "epoch": 0.5623316660253944, + "flos": 518009617920.0, + "grad_norm": 0.03283867999662062, + "language_loss": 0.91225737, + "learning_rate": 0.0004236603388445467, + "loss": 0.92385185, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.78955078, + "step": 2923, + "time_per_iteration": 2.586524248123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159206, + "balance_loss_mlp": 1.08043242, + "epoch": 0.5625240477106579, + "flos": 607138658304.0, + "grad_norm": 0.07898356089021562, + "language_loss": 0.87176222, + "learning_rate": 0.00042335246396280166, + "loss": 0.88335431, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.78710938, + "step": 2924, + "time_per_iteration": 2.7597639560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115906, + "balance_loss_mlp": 1.08004844, + "epoch": 0.5627164293959215, + "flos": 451340256768.0, + "grad_norm": 0.0302800933285396, + "language_loss": 0.96241242, + "learning_rate": 0.0004230446188382693, + "loss": 0.97400308, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.7890625, + "step": 2925, + "time_per_iteration": 2.573899030685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158596, + "balance_loss_mlp": 1.07977474, + "epoch": 0.5629088110811851, + "flos": 743436335616.0, + "grad_norm": 0.03229142562201564, + "language_loss": 0.85888505, + "learning_rate": 0.0004227368035904654, + "loss": 0.87047106, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.78759766, + "step": 2926, + "time_per_iteration": 2.9811575412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161727, + "balance_loss_mlp": 1.08295333, + "epoch": 0.5631011927664487, + "flos": 497979138048.0, + "grad_norm": 0.030188812186764755, + "language_loss": 0.88692701, + "learning_rate": 0.00042242901833889474, + "loss": 0.89854425, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.78710938, + "step": 2927, + "time_per_iteration": 2.6326565742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160764, + "balance_loss_mlp": 1.08194327, + "epoch": 0.5632935744517122, + "flos": 887594300928.0, + "grad_norm": 0.033144673445412554, + "language_loss": 0.91819888, + "learning_rate": 0.0004221212632030501, + "loss": 0.92980659, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.78759766, + "step": 2928, + "time_per_iteration": 3.0669453144073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115887, + "balance_loss_mlp": 1.08014381, + "epoch": 0.5634859561369757, + "flos": 605901227520.0, + "grad_norm": 0.03167965641147859, + "language_loss": 0.85548306, + "learning_rate": 0.0004218135383024124, + "loss": 0.86707169, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.78662109, + "step": 2929, + "time_per_iteration": 2.704127788543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154152, + "balance_loss_mlp": 1.07542574, + "epoch": 0.5636783378222393, + "flos": 454902827520.0, + "grad_norm": 0.0331862396137692, + "language_loss": 0.91072655, + "learning_rate": 0.0004215058437564511, + "loss": 0.92226809, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.78662109, + "step": 2930, + "time_per_iteration": 2.5648486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07496285, + "epoch": 0.5638707195075029, + "flos": 519461898240.0, + "grad_norm": 0.030026295980520465, + "language_loss": 0.87243164, + "learning_rate": 0.00042119817968462397, + "loss": 0.88396895, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.78613281, + "step": 2931, + "time_per_iteration": 2.596165895462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.07572603, + "epoch": 0.5640631011927665, + "flos": 565844270592.0, + "grad_norm": 0.035813464167598875, + "language_loss": 0.92307299, + "learning_rate": 0.0004208905462063766, + "loss": 0.934618, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.78564453, + "step": 2932, + "time_per_iteration": 2.6596782207489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161524, + "balance_loss_mlp": 1.0827024, + "epoch": 0.56425548287803, + "flos": 518037815808.0, + "grad_norm": 0.03163601566095553, + "language_loss": 0.90576756, + "learning_rate": 0.00042058294344114315, + "loss": 0.91738278, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.78564453, + "step": 2933, + "time_per_iteration": 2.6681416034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160347, + "balance_loss_mlp": 1.08157361, + "epoch": 0.5644478645632935, + "flos": 855669603840.0, + "grad_norm": 0.031443670044009366, + "language_loss": 0.83703303, + "learning_rate": 0.0004202753715083456, + "loss": 0.84863651, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.78515625, + "step": 2934, + "time_per_iteration": 3.1047325134277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159543, + "balance_loss_mlp": 1.08081746, + "epoch": 0.5646402462485571, + "flos": 554495271936.0, + "grad_norm": 0.034946601892201584, + "language_loss": 0.87802339, + "learning_rate": 0.0004199678305273936, + "loss": 0.88961881, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.78613281, + "step": 2935, + "time_per_iteration": 2.649768352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159598, + "balance_loss_mlp": 1.08092046, + "epoch": 0.5648326279338207, + "flos": 687310969344.0, + "grad_norm": 0.04027660967531297, + "language_loss": 0.86366433, + "learning_rate": 0.0004196603206176854, + "loss": 0.87526035, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.78613281, + "step": 2936, + "time_per_iteration": 2.916745662689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158764, + "balance_loss_mlp": 1.08003819, + "epoch": 0.5650250096190843, + "flos": 804682613760.0, + "grad_norm": 0.03045212290633188, + "language_loss": 0.89034498, + "learning_rate": 0.000419352841898607, + "loss": 0.9019326, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.78662109, + "step": 2937, + "time_per_iteration": 3.019742250442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154573, + "balance_loss_mlp": 1.07541847, + "epoch": 0.5652173913043478, + "flos": 583144106496.0, + "grad_norm": 0.0352415717236192, + "language_loss": 0.82975399, + "learning_rate": 0.000419045394489532, + "loss": 0.84129971, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.79003906, + "step": 2938, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155775, + "balance_loss_mlp": 1.07661998, + "epoch": 0.5654097729896114, + "flos": 822167099904.0, + "grad_norm": 0.030545896529673648, + "language_loss": 0.81679785, + "learning_rate": 0.0004187379785098224, + "loss": 0.82835561, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.7890625, + "step": 2939, + "time_per_iteration": 3.125208854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155934, + "balance_loss_mlp": 1.07682657, + "epoch": 0.565602154674875, + "flos": 785481332736.0, + "grad_norm": 0.038076573598017076, + "language_loss": 0.89879513, + "learning_rate": 0.00041843059407882744, + "loss": 0.9103545, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.78857422, + "step": 2940, + "time_per_iteration": 2.9577417373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.07814884, + "epoch": 0.5657945363601385, + "flos": 550744048128.0, + "grad_norm": 0.03292975836505615, + "language_loss": 0.88439214, + "learning_rate": 0.0004181232413158842, + "loss": 0.89596379, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.78759766, + "step": 2941, + "time_per_iteration": 2.636016845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156819, + "balance_loss_mlp": 1.07771146, + "epoch": 0.5659869180454021, + "flos": 669331656192.0, + "grad_norm": 0.0384606105275957, + "language_loss": 0.88344961, + "learning_rate": 0.0004178159203403179, + "loss": 0.89501786, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.78857422, + "step": 2942, + "time_per_iteration": 2.873724937438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157408, + "balance_loss_mlp": 1.07839596, + "epoch": 0.5661792997306656, + "flos": 500948826624.0, + "grad_norm": 0.031907837289758996, + "language_loss": 0.86677325, + "learning_rate": 0.0004175086312714409, + "loss": 0.8783474, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.78808594, + "step": 2943, + "time_per_iteration": 2.553450107574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160396, + "balance_loss_mlp": 1.08138418, + "epoch": 0.5663716814159292, + "flos": 602362851840.0, + "grad_norm": 0.02897032807353051, + "language_loss": 0.8872959, + "learning_rate": 0.00041720137422855366, + "loss": 0.89889991, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.78759766, + "step": 2944, + "time_per_iteration": 2.7116591930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159876, + "balance_loss_mlp": 1.08095932, + "epoch": 0.5665640631011928, + "flos": 542032367616.0, + "grad_norm": 0.031139658556859174, + "language_loss": 0.83964241, + "learning_rate": 0.00041689414933094383, + "loss": 0.85124123, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.78710938, + "step": 2945, + "time_per_iteration": 2.638216495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158364, + "balance_loss_mlp": 1.07968628, + "epoch": 0.5667564447864564, + "flos": 603061794816.0, + "grad_norm": 0.037847476611961306, + "language_loss": 0.8757143, + "learning_rate": 0.00041658695669788653, + "loss": 0.88729787, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.78613281, + "step": 2946, + "time_per_iteration": 2.736724615097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159515, + "balance_loss_mlp": 1.08074152, + "epoch": 0.5669488264717198, + "flos": 660722033664.0, + "grad_norm": 0.03809672024086723, + "language_loss": 0.87564874, + "learning_rate": 0.00041627979644864453, + "loss": 0.88724387, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.78662109, + "step": 2947, + "time_per_iteration": 2.787102460861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160652, + "balance_loss_mlp": 1.08192623, + "epoch": 0.5671412081569834, + "flos": 486382362624.0, + "grad_norm": 0.028726289994514737, + "language_loss": 0.86769605, + "learning_rate": 0.0004159726687024683, + "loss": 0.87930262, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.78662109, + "step": 2948, + "time_per_iteration": 2.627268075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157953, + "balance_loss_mlp": 1.07917941, + "epoch": 0.567333589842247, + "flos": 731060026368.0, + "grad_norm": 0.031224685517340662, + "language_loss": 0.85094821, + "learning_rate": 0.00041566557357859506, + "loss": 0.86252779, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.78710938, + "step": 2949, + "time_per_iteration": 2.903480052947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115639, + "balance_loss_mlp": 1.07737851, + "epoch": 0.5675259715275106, + "flos": 970558381056.0, + "grad_norm": 0.02889906202993953, + "language_loss": 0.84761345, + "learning_rate": 0.0004153585111962502, + "loss": 0.85917735, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.78857422, + "step": 2950, + "time_per_iteration": 3.327157497406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155395, + "balance_loss_mlp": 1.07638264, + "epoch": 0.5677183532127742, + "flos": 566213571072.0, + "grad_norm": 0.036221800053715905, + "language_loss": 0.90357536, + "learning_rate": 0.0004150514816746453, + "loss": 0.9151293, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.78857422, + "step": 2951, + "time_per_iteration": 2.664881467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155178, + "balance_loss_mlp": 1.07640433, + "epoch": 0.5679107348980377, + "flos": 552745549824.0, + "grad_norm": 0.032718571293428464, + "language_loss": 0.90599716, + "learning_rate": 0.0004147444851329802, + "loss": 0.91754901, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.78710938, + "step": 2952, + "time_per_iteration": 2.659607410430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156293, + "balance_loss_mlp": 1.07752001, + "epoch": 0.5681031165833013, + "flos": 820840346112.0, + "grad_norm": 0.029462667986489877, + "language_loss": 0.91018391, + "learning_rate": 0.00041443752169044126, + "loss": 0.92174685, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.78710938, + "step": 2953, + "time_per_iteration": 3.0214719772338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115648, + "balance_loss_mlp": 1.07775402, + "epoch": 0.5682954982685648, + "flos": 619145667072.0, + "grad_norm": 0.03021657930021912, + "language_loss": 0.89565808, + "learning_rate": 0.0004141305914662025, + "loss": 0.90722287, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.78662109, + "step": 2954, + "time_per_iteration": 2.7215545177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154854, + "balance_loss_mlp": 1.07608008, + "epoch": 0.5684878799538284, + "flos": 649251511296.0, + "grad_norm": 0.03170231797387521, + "language_loss": 0.85884857, + "learning_rate": 0.0004138236945794246, + "loss": 0.87039715, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.78613281, + "step": 2955, + "time_per_iteration": 2.896960496902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154548, + "balance_loss_mlp": 1.07587004, + "epoch": 0.5686802616390919, + "flos": 807352859136.0, + "grad_norm": 0.03477888356704498, + "language_loss": 0.88849628, + "learning_rate": 0.00041351683114925576, + "loss": 0.90004176, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.78564453, + "step": 2956, + "time_per_iteration": 3.056138753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155475, + "balance_loss_mlp": 1.07698798, + "epoch": 0.5688726433243555, + "flos": 548175860736.0, + "grad_norm": 0.02988071875067647, + "language_loss": 0.91774637, + "learning_rate": 0.0004132100012948308, + "loss": 0.92930108, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.78320312, + "step": 2957, + "time_per_iteration": 2.620039701461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153148, + "balance_loss_mlp": 1.07475579, + "epoch": 0.5690650250096191, + "flos": 487545933312.0, + "grad_norm": 0.03388139796228596, + "language_loss": 0.90210378, + "learning_rate": 0.00041290320513527145, + "loss": 0.91363525, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.78222656, + "step": 2958, + "time_per_iteration": 2.5424137115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158065, + "balance_loss_mlp": 1.07953036, + "epoch": 0.5692574066948827, + "flos": 578554951680.0, + "grad_norm": 0.03065337308060062, + "language_loss": 0.9014492, + "learning_rate": 0.0004125964427896867, + "loss": 0.91302985, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.78369141, + "step": 2959, + "time_per_iteration": 2.6540746688842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157828, + "balance_loss_mlp": 1.07924569, + "epoch": 0.5694497883801463, + "flos": 455219735040.0, + "grad_norm": 0.03288997710459115, + "language_loss": 0.8486557, + "learning_rate": 0.0004122897143771723, + "loss": 0.86023396, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.78515625, + "step": 2960, + "time_per_iteration": 2.5677952766418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157581, + "balance_loss_mlp": 1.07899833, + "epoch": 0.5696421700654097, + "flos": 560582369280.0, + "grad_norm": 0.029260680521972587, + "language_loss": 0.86686659, + "learning_rate": 0.0004119830200168109, + "loss": 0.87844241, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.78515625, + "step": 2961, + "time_per_iteration": 2.661398410797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116102, + "balance_loss_mlp": 1.08243668, + "epoch": 0.5698345517506733, + "flos": 466501604352.0, + "grad_norm": 0.06131137217333051, + "language_loss": 0.93434393, + "learning_rate": 0.0004116763598276714, + "loss": 0.94595408, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.78515625, + "step": 2962, + "time_per_iteration": 2.5421509742736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161307, + "balance_loss_mlp": 1.08267653, + "epoch": 0.5700269334359369, + "flos": 607191051264.0, + "grad_norm": 0.033090735660708526, + "language_loss": 0.8645342, + "learning_rate": 0.00041136973392881017, + "loss": 0.87614727, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.78515625, + "step": 2963, + "time_per_iteration": 2.826312303543091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116111, + "balance_loss_mlp": 1.08233654, + "epoch": 0.5702193151212005, + "flos": 563856230400.0, + "grad_norm": 0.029371137494056676, + "language_loss": 0.87366056, + "learning_rate": 0.00041106314243926983, + "loss": 0.88527167, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.78613281, + "step": 2964, + "time_per_iteration": 2.729848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163001, + "balance_loss_mlp": 1.08432257, + "epoch": 0.570411696806464, + "flos": 524309563392.0, + "grad_norm": 0.030081020285570834, + "language_loss": 0.91922152, + "learning_rate": 0.0004107565854780798, + "loss": 0.93085158, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.78564453, + "step": 2965, + "time_per_iteration": 2.6243247985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162398, + "balance_loss_mlp": 1.08348167, + "epoch": 0.5706040784917276, + "flos": 719471983104.0, + "grad_norm": 0.03134673766290682, + "language_loss": 0.86833286, + "learning_rate": 0.000410450063164256, + "loss": 0.87995684, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.78710938, + "step": 2966, + "time_per_iteration": 2.8488268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160157, + "balance_loss_mlp": 1.08109784, + "epoch": 0.5707964601769911, + "flos": 477670682112.0, + "grad_norm": 0.03469711129941245, + "language_loss": 0.88420385, + "learning_rate": 0.00041014357561680115, + "loss": 0.89580548, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.78808594, + "step": 2967, + "time_per_iteration": 2.531399965286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158843, + "balance_loss_mlp": 1.07997382, + "epoch": 0.5709888418622547, + "flos": 581216464896.0, + "grad_norm": 0.0299141756983156, + "language_loss": 0.91230297, + "learning_rate": 0.0004098371229547039, + "loss": 0.92389137, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.78662109, + "step": 2968, + "time_per_iteration": 2.7010715007781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166153, + "balance_loss_mlp": 1.08947754, + "epoch": 0.5711812235475183, + "flos": 1583192707584.0, + "grad_norm": 0.007250174551889785, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8117696, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.76757812, + "step": 2969, + "time_per_iteration": 4.720959663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158975, + "balance_loss_mlp": 1.08001077, + "epoch": 0.5713736052327818, + "flos": 469497489408.0, + "grad_norm": 0.030927251593918268, + "language_loss": 0.85219097, + "learning_rate": 0.00040922432276247107, + "loss": 0.86378068, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.78710938, + "step": 2970, + "time_per_iteration": 2.5976855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155112, + "balance_loss_mlp": 1.07610035, + "epoch": 0.5715659869180454, + "flos": 538754503680.0, + "grad_norm": 0.02782082883725602, + "language_loss": 0.88734138, + "learning_rate": 0.0004089179754702457, + "loss": 0.89889252, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.78759766, + "step": 2971, + "time_per_iteration": 2.735511064529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155002, + "balance_loss_mlp": 1.07608509, + "epoch": 0.571758368603309, + "flos": 657250787328.0, + "grad_norm": 0.03021364085019089, + "language_loss": 0.86246514, + "learning_rate": 0.00040861166353919843, + "loss": 0.87401509, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.78710938, + "step": 2972, + "time_per_iteration": 2.784243583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156758, + "balance_loss_mlp": 1.07808018, + "epoch": 0.5719507502885726, + "flos": 669099342336.0, + "grad_norm": 0.04093131787913085, + "language_loss": 0.87037605, + "learning_rate": 0.00040830538708824983, + "loss": 0.8819437, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.78564453, + "step": 2973, + "time_per_iteration": 2.847334861755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156641, + "balance_loss_mlp": 1.07815385, + "epoch": 0.572143131973836, + "flos": 477279914496.0, + "grad_norm": 0.029260532033913305, + "language_loss": 0.87478364, + "learning_rate": 0.000407999146236307, + "loss": 0.88635004, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.78417969, + "step": 2974, + "time_per_iteration": 2.5809874534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156886, + "balance_loss_mlp": 1.07849395, + "epoch": 0.5723355136590996, + "flos": 540534425088.0, + "grad_norm": 0.03484414683288605, + "language_loss": 0.89636898, + "learning_rate": 0.0004076929411022634, + "loss": 0.90793782, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.78320312, + "step": 2975, + "time_per_iteration": 2.631016969680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156314, + "balance_loss_mlp": 1.07782686, + "epoch": 0.5725278953443632, + "flos": 825649079808.0, + "grad_norm": 0.03393435544828211, + "language_loss": 0.84972572, + "learning_rate": 0.0004073867718049982, + "loss": 0.86128891, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.78369141, + "step": 2976, + "time_per_iteration": 3.09523606300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158881, + "balance_loss_mlp": 1.08044088, + "epoch": 0.5727202770296268, + "flos": 588569190912.0, + "grad_norm": 0.031011693938846972, + "language_loss": 0.87586653, + "learning_rate": 0.00040708063846337704, + "loss": 0.88745534, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.78222656, + "step": 2977, + "time_per_iteration": 2.7148561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159545, + "balance_loss_mlp": 1.08100963, + "epoch": 0.5729126587148904, + "flos": 447940869120.0, + "grad_norm": 0.0318916011479424, + "language_loss": 0.87124234, + "learning_rate": 0.00040677454119625143, + "loss": 0.88283777, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.78320312, + "step": 2978, + "time_per_iteration": 2.6003363132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.0804776, + "epoch": 0.5731050404001539, + "flos": 520467015168.0, + "grad_norm": 0.03318988951179658, + "language_loss": 0.88396186, + "learning_rate": 0.0004064684801224587, + "loss": 0.89555109, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.78173828, + "step": 2979, + "time_per_iteration": 2.6103272438049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160698, + "balance_loss_mlp": 1.08225846, + "epoch": 0.5732974220854175, + "flos": 505770295296.0, + "grad_norm": 0.029710652762807207, + "language_loss": 0.85663891, + "learning_rate": 0.00040616245536082224, + "loss": 0.86824596, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.78222656, + "step": 2980, + "time_per_iteration": 2.5594868659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.08078313, + "epoch": 0.573489803770681, + "flos": 593677367808.0, + "grad_norm": 0.027966372317681742, + "language_loss": 0.86258745, + "learning_rate": 0.00040585646703015165, + "loss": 0.87417924, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.78320312, + "step": 2981, + "time_per_iteration": 2.789937734603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.07878125, + "epoch": 0.5736821854559446, + "flos": 490869459456.0, + "grad_norm": 0.031111464824263694, + "language_loss": 0.83780992, + "learning_rate": 0.0004055505152492419, + "loss": 0.84938312, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.78466797, + "step": 2982, + "time_per_iteration": 2.6471428871154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158265, + "balance_loss_mlp": 1.07963431, + "epoch": 0.5738745671412081, + "flos": 459201271296.0, + "grad_norm": 0.03311000411840089, + "language_loss": 0.79528159, + "learning_rate": 0.00040524460013687425, + "loss": 0.80686426, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.78564453, + "step": 2983, + "time_per_iteration": 2.708540678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155372, + "balance_loss_mlp": 1.07650268, + "epoch": 0.5740669488264717, + "flos": 581620694016.0, + "grad_norm": 0.028109694322635652, + "language_loss": 0.86855406, + "learning_rate": 0.0004049387218118155, + "loss": 0.88010776, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.78759766, + "step": 2984, + "time_per_iteration": 2.926750421524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155283, + "balance_loss_mlp": 1.07622325, + "epoch": 0.5742593305117353, + "flos": 525573190656.0, + "grad_norm": 0.03395381439898354, + "language_loss": 0.91635472, + "learning_rate": 0.00040463288039281777, + "loss": 0.92790747, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.78857422, + "step": 2985, + "time_per_iteration": 2.704287528991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162964, + "balance_loss_mlp": 1.08666992, + "epoch": 0.5744517121969989, + "flos": 1557266511360.0, + "grad_norm": 0.007878379047691413, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.79039383, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.76367188, + "step": 2986, + "time_per_iteration": 4.989194869995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155742, + "balance_loss_mlp": 1.07677734, + "epoch": 0.5746440938822625, + "flos": 753202798080.0, + "grad_norm": 0.03402997808137808, + "language_loss": 0.87620312, + "learning_rate": 0.0004040213087479444, + "loss": 0.88776052, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.78759766, + "step": 2987, + "time_per_iteration": 2.9275078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163311, + "balance_loss_mlp": 1.08453715, + "epoch": 0.5748364755675259, + "flos": 502857002496.0, + "grad_norm": 0.03361733343242669, + "language_loss": 0.90824878, + "learning_rate": 0.0004037155787595018, + "loss": 0.91988194, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.78710938, + "step": 2988, + "time_per_iteration": 2.576448440551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160011, + "balance_loss_mlp": 1.08109498, + "epoch": 0.5750288572527895, + "flos": 505197605376.0, + "grad_norm": 0.02880586923954642, + "language_loss": 0.85724807, + "learning_rate": 0.000403409886151987, + "loss": 0.86884815, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.78759766, + "step": 2989, + "time_per_iteration": 2.916322946548462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157013, + "balance_loss_mlp": 1.08033752, + "epoch": 0.5752212389380531, + "flos": 1544675352576.0, + "grad_norm": 0.005932241765552608, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83156121, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.765625, + "step": 2990, + "time_per_iteration": 4.758445978164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.08269501, + "epoch": 0.5754136206233167, + "flos": 1570671406080.0, + "grad_norm": 0.005822498768858246, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.7935797, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.765625, + "step": 2991, + "time_per_iteration": 4.785308122634888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163075, + "balance_loss_mlp": 1.08420658, + "epoch": 0.5756060023085803, + "flos": 799561701888.0, + "grad_norm": 0.0320241684810352, + "language_loss": 0.81581879, + "learning_rate": 0.00040249303380173807, + "loss": 0.82744956, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.78808594, + "step": 2992, + "time_per_iteration": 3.060910940170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160943, + "balance_loss_mlp": 1.08202648, + "epoch": 0.5757983839938438, + "flos": 589033818624.0, + "grad_norm": 0.033230938583522406, + "language_loss": 0.85061818, + "learning_rate": 0.00040218749190459126, + "loss": 0.86222756, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.78857422, + "step": 2993, + "time_per_iteration": 2.722538948059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159067, + "balance_loss_mlp": 1.08029306, + "epoch": 0.5759907656791073, + "flos": 517851164160.0, + "grad_norm": 0.036503805232005304, + "language_loss": 0.88598883, + "learning_rate": 0.00040188198798162775, + "loss": 0.89757949, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.78662109, + "step": 2994, + "time_per_iteration": 2.626763105392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157444, + "balance_loss_mlp": 1.078861, + "epoch": 0.5761831473643709, + "flos": 588289213440.0, + "grad_norm": 0.030677551313055676, + "language_loss": 0.90523088, + "learning_rate": 0.000401576522151455, + "loss": 0.91680533, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.78466797, + "step": 2995, + "time_per_iteration": 2.8290417194366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156979, + "balance_loss_mlp": 1.07839644, + "epoch": 0.5763755290496345, + "flos": 545008786944.0, + "grad_norm": 0.030026851509959627, + "language_loss": 0.87201327, + "learning_rate": 0.0004012710945326651, + "loss": 0.88358307, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.78515625, + "step": 2996, + "time_per_iteration": 2.78725004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156215, + "balance_loss_mlp": 1.07767999, + "epoch": 0.576567910734898, + "flos": 627427648512.0, + "grad_norm": 0.03065527687354923, + "language_loss": 0.86651611, + "learning_rate": 0.0004009657052438355, + "loss": 0.87807822, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.78271484, + "step": 2997, + "time_per_iteration": 2.8221359252929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156096, + "balance_loss_mlp": 1.07756102, + "epoch": 0.5767602924201616, + "flos": 539277528576.0, + "grad_norm": 0.032463443859892846, + "language_loss": 0.9117527, + "learning_rate": 0.00040066035440352904, + "loss": 0.92331362, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.78271484, + "step": 2998, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169762, + "balance_loss_mlp": 1.09403992, + "epoch": 0.5769526741054252, + "flos": 1563023239680.0, + "grad_norm": 0.012552051598097233, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80462897, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.7578125, + "step": 2999, + "time_per_iteration": 4.9131574630737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.0844425, + "epoch": 0.5771450557906888, + "flos": 469171849728.0, + "grad_norm": 0.03695219944655869, + "language_loss": 0.82297212, + "learning_rate": 0.00040004976854266145, + "loss": 0.83459759, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.78027344, + "step": 3000, + "time_per_iteration": 2.599562406539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161323, + "balance_loss_mlp": 1.08321714, + "epoch": 0.5773374374759523, + "flos": 575632926720.0, + "grad_norm": 0.03253250172707863, + "language_loss": 0.86701882, + "learning_rate": 0.0003997445337591505, + "loss": 0.87863207, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.78027344, + "step": 3001, + "time_per_iteration": 2.651052951812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161912, + "balance_loss_mlp": 1.08380568, + "epoch": 0.5775298191612158, + "flos": 529504335360.0, + "grad_norm": 0.030455172240490772, + "language_loss": 0.78589356, + "learning_rate": 0.0003994393378982635, + "loss": 0.79751271, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.78027344, + "step": 3002, + "time_per_iteration": 2.6081488132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162445, + "balance_loss_mlp": 1.08576965, + "epoch": 0.5777222008464794, + "flos": 1306896520704.0, + "grad_norm": 0.00976162227486582, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80700445, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.765625, + "step": 3003, + "time_per_iteration": 4.794616460800171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154088, + "balance_loss_mlp": 1.07550502, + "epoch": 0.577914582531743, + "flos": 604792051200.0, + "grad_norm": 0.035927509548420514, + "language_loss": 0.93844306, + "learning_rate": 0.0003988290634182961, + "loss": 0.94998395, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.78417969, + "step": 3004, + "time_per_iteration": 2.7580206394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152956, + "balance_loss_mlp": 1.07465923, + "epoch": 0.5781069642170066, + "flos": 487832641536.0, + "grad_norm": 0.03166140659951907, + "language_loss": 0.85788441, + "learning_rate": 0.0003985239850361453, + "loss": 0.86941397, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.78173828, + "step": 3005, + "time_per_iteration": 2.5811102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148402, + "balance_loss_mlp": 1.0700103, + "epoch": 0.5782993459022701, + "flos": 507413956608.0, + "grad_norm": 0.03361154868402879, + "language_loss": 0.90845788, + "learning_rate": 0.0003982189460504777, + "loss": 0.9199419, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.78271484, + "step": 3006, + "time_per_iteration": 2.701486349105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.07208133, + "epoch": 0.5784917275875336, + "flos": 603294108672.0, + "grad_norm": 0.03266847587020217, + "language_loss": 0.84488243, + "learning_rate": 0.00039791394657971935, + "loss": 0.85638666, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.78222656, + "step": 3007, + "time_per_iteration": 2.7029902935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114812, + "balance_loss_mlp": 1.06953716, + "epoch": 0.5786841092727972, + "flos": 522588039168.0, + "grad_norm": 0.03327041662205967, + "language_loss": 0.89717233, + "learning_rate": 0.00039760898674228205, + "loss": 0.90865356, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.78466797, + "step": 3008, + "time_per_iteration": 2.6650431156158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163782, + "balance_loss_mlp": 1.08510339, + "epoch": 0.5788764909580608, + "flos": 768835504128.0, + "grad_norm": 0.02880825356575122, + "language_loss": 0.85863519, + "learning_rate": 0.0003973040666565613, + "loss": 0.87027305, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.78515625, + "step": 3009, + "time_per_iteration": 3.0480079650878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165461, + "balance_loss_mlp": 1.08668745, + "epoch": 0.5790688726433244, + "flos": 600331150848.0, + "grad_norm": 0.03153140111016463, + "language_loss": 0.87491179, + "learning_rate": 0.000396999186440938, + "loss": 0.8865664, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.78515625, + "step": 3010, + "time_per_iteration": 2.866971254348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163905, + "balance_loss_mlp": 1.08517945, + "epoch": 0.5792612543285879, + "flos": 524105447424.0, + "grad_norm": 0.03493307290908607, + "language_loss": 0.90569246, + "learning_rate": 0.000396694346213777, + "loss": 0.91733146, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.78564453, + "step": 3011, + "time_per_iteration": 2.6576690673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160972, + "balance_loss_mlp": 1.08234167, + "epoch": 0.5794536360138515, + "flos": 878079618048.0, + "grad_norm": 0.028681737588389107, + "language_loss": 0.88734698, + "learning_rate": 0.0003963895460934276, + "loss": 0.89895672, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.78369141, + "step": 3012, + "time_per_iteration": 3.1439104080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159333, + "balance_loss_mlp": 1.08065438, + "epoch": 0.5796460176991151, + "flos": 402298372608.0, + "grad_norm": 0.038884721414284784, + "language_loss": 0.92029333, + "learning_rate": 0.00039608478619822376, + "loss": 0.93188667, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.78613281, + "step": 3013, + "time_per_iteration": 2.4331459999084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115895, + "balance_loss_mlp": 1.08032, + "epoch": 0.5798383993843786, + "flos": 619675422720.0, + "grad_norm": 0.029275699876953817, + "language_loss": 0.87518513, + "learning_rate": 0.00039578006664648394, + "loss": 0.88677466, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.78417969, + "step": 3014, + "time_per_iteration": 2.770930290222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157872, + "balance_loss_mlp": 1.07928884, + "epoch": 0.5800307810696421, + "flos": 845792351232.0, + "grad_norm": 0.03304881172222658, + "language_loss": 0.8676393, + "learning_rate": 0.0003954753875565105, + "loss": 0.87921804, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.78320312, + "step": 3015, + "time_per_iteration": 3.08627986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155618, + "balance_loss_mlp": 1.0769875, + "epoch": 0.5802231627549057, + "flos": 570364294656.0, + "grad_norm": 0.02949140039649942, + "language_loss": 0.86755216, + "learning_rate": 0.00039517074904659057, + "loss": 0.87910825, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.78369141, + "step": 3016, + "time_per_iteration": 2.685842990875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155954, + "balance_loss_mlp": 1.07732403, + "epoch": 0.5804155444401693, + "flos": 661662022656.0, + "grad_norm": 0.030068480846806175, + "language_loss": 0.90490985, + "learning_rate": 0.00039486615123499535, + "loss": 0.91646945, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.78369141, + "step": 3017, + "time_per_iteration": 2.8422367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158277, + "balance_loss_mlp": 1.07950318, + "epoch": 0.5806079261254329, + "flos": 515057393664.0, + "grad_norm": 0.0339975061302382, + "language_loss": 0.90716887, + "learning_rate": 0.00039456159423997996, + "loss": 0.91875166, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.78515625, + "step": 3018, + "time_per_iteration": 2.6301286220550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159604, + "balance_loss_mlp": 1.08116388, + "epoch": 0.5808003078106965, + "flos": 529717183488.0, + "grad_norm": 0.035522237622510534, + "language_loss": 0.94178265, + "learning_rate": 0.00039425707817978406, + "loss": 0.95337874, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.78320312, + "step": 3019, + "time_per_iteration": 2.6516103744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159065, + "balance_loss_mlp": 1.08033943, + "epoch": 0.58099268949596, + "flos": 477996321792.0, + "grad_norm": 0.033660479575399194, + "language_loss": 0.88736534, + "learning_rate": 0.00039395260317263124, + "loss": 0.89895594, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.78466797, + "step": 3020, + "time_per_iteration": 2.5736000537872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158964, + "balance_loss_mlp": 1.08033383, + "epoch": 0.5811850711812235, + "flos": 518687093760.0, + "grad_norm": 0.032372571582398105, + "language_loss": 0.90171605, + "learning_rate": 0.0003936481693367291, + "loss": 0.9133057, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.78417969, + "step": 3021, + "time_per_iteration": 2.655585289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152938, + "balance_loss_mlp": 1.07416463, + "epoch": 0.5813774528664871, + "flos": 617626257408.0, + "grad_norm": 0.037353178472421755, + "language_loss": 0.94038713, + "learning_rate": 0.0003933437767902697, + "loss": 0.95191658, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.78564453, + "step": 3022, + "time_per_iteration": 2.7785356044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155749, + "balance_loss_mlp": 1.07707083, + "epoch": 0.5815698345517507, + "flos": 568603838976.0, + "grad_norm": 0.03237494754713459, + "language_loss": 0.83540273, + "learning_rate": 0.00039303942565142825, + "loss": 0.84696019, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.78466797, + "step": 3023, + "time_per_iteration": 2.8082921504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115966, + "balance_loss_mlp": 1.08122075, + "epoch": 0.5817622162370142, + "flos": 564303393792.0, + "grad_norm": 0.030406133972166762, + "language_loss": 0.81602162, + "learning_rate": 0.0003927351160383644, + "loss": 0.82761824, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.78369141, + "step": 3024, + "time_per_iteration": 2.8258216381073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115841, + "balance_loss_mlp": 1.07992303, + "epoch": 0.5819545979222778, + "flos": 460153995264.0, + "grad_norm": 0.0330231934286986, + "language_loss": 0.82985759, + "learning_rate": 0.000392430848069222, + "loss": 0.84144175, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.78369141, + "step": 3025, + "time_per_iteration": 2.552351713180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155737, + "balance_loss_mlp": 1.0769639, + "epoch": 0.5821469796075414, + "flos": 542516461056.0, + "grad_norm": 0.03445814315346002, + "language_loss": 0.88443869, + "learning_rate": 0.00039212662186212795, + "loss": 0.89599597, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.78515625, + "step": 3026, + "time_per_iteration": 2.6369402408599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157395, + "balance_loss_mlp": 1.07890785, + "epoch": 0.582339361292805, + "flos": 553340433408.0, + "grad_norm": 0.029462079730168216, + "language_loss": 0.82325065, + "learning_rate": 0.0003918224375351934, + "loss": 0.83482456, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.78369141, + "step": 3027, + "time_per_iteration": 2.698915958404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116249, + "balance_loss_mlp": 1.08386004, + "epoch": 0.5825317429780685, + "flos": 497447380992.0, + "grad_norm": 0.03190253080273137, + "language_loss": 0.83360291, + "learning_rate": 0.0003915182952065135, + "loss": 0.84522784, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.78417969, + "step": 3028, + "time_per_iteration": 2.6572346687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160994, + "balance_loss_mlp": 1.08265007, + "epoch": 0.582724124663332, + "flos": 565254116352.0, + "grad_norm": 0.030478660984130428, + "language_loss": 0.92836106, + "learning_rate": 0.0003912141949941664, + "loss": 0.93997103, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.78271484, + "step": 3029, + "time_per_iteration": 2.683072090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153282, + "balance_loss_mlp": 1.07484198, + "epoch": 0.5829165063485956, + "flos": 493112007168.0, + "grad_norm": 0.03294557051603365, + "language_loss": 0.89173961, + "learning_rate": 0.0003909101370162143, + "loss": 0.90327239, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.78369141, + "step": 3030, + "time_per_iteration": 2.575670003890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160767, + "balance_loss_mlp": 1.08370972, + "epoch": 0.5831088880338592, + "flos": 1531877349888.0, + "grad_norm": 0.012849020092446796, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.7359466, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.76953125, + "step": 3031, + "time_per_iteration": 4.9284889698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152337, + "balance_loss_mlp": 1.07370639, + "epoch": 0.5833012697191228, + "flos": 619208793600.0, + "grad_norm": 0.02929875839371022, + "language_loss": 0.87939668, + "learning_rate": 0.0003903021482356622, + "loss": 0.89092004, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.78466797, + "step": 3032, + "time_per_iteration": 2.8254482746124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152205, + "balance_loss_mlp": 1.07362223, + "epoch": 0.5834936514043862, + "flos": 769293401088.0, + "grad_norm": 0.02695668391828596, + "language_loss": 0.87565535, + "learning_rate": 0.00038999821766910465, + "loss": 0.88717741, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.78417969, + "step": 3033, + "time_per_iteration": 3.006687641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156362, + "balance_loss_mlp": 1.07796979, + "epoch": 0.5836860330896498, + "flos": 459316064256.0, + "grad_norm": 0.030677066462792797, + "language_loss": 0.91205192, + "learning_rate": 0.00038969432980902606, + "loss": 0.92361552, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.78320312, + "step": 3034, + "time_per_iteration": 2.550684690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011586, + "balance_loss_mlp": 1.08192444, + "epoch": 0.5838784147749134, + "flos": 1364196191232.0, + "grad_norm": 0.008170267563240248, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80943102, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.765625, + "step": 3035, + "time_per_iteration": 4.859564304351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154223, + "balance_loss_mlp": 1.07592607, + "epoch": 0.584070796460177, + "flos": 568288932864.0, + "grad_norm": 0.030253680936045732, + "language_loss": 0.87217242, + "learning_rate": 0.00038908668268020953, + "loss": 0.88371468, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.78222656, + "step": 3036, + "time_per_iteration": 2.7140538692474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154776, + "balance_loss_mlp": 1.07624114, + "epoch": 0.5842631781454406, + "flos": 612665800704.0, + "grad_norm": 0.02904438680956131, + "language_loss": 0.90014827, + "learning_rate": 0.00038878292364738097, + "loss": 0.91169608, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.78271484, + "step": 3037, + "time_per_iteration": 2.787289619445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157819, + "balance_loss_mlp": 1.07923615, + "epoch": 0.5844555598307041, + "flos": 464332916736.0, + "grad_norm": 0.03338514659593435, + "language_loss": 0.93144816, + "learning_rate": 0.0003884792077928508, + "loss": 0.94302636, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.78320312, + "step": 3038, + "time_per_iteration": 2.513655185699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155243, + "balance_loss_mlp": 1.07666051, + "epoch": 0.5846479415159677, + "flos": 411057716736.0, + "grad_norm": 0.039769663121131886, + "language_loss": 0.82121253, + "learning_rate": 0.0003881755352345322, + "loss": 0.83276498, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.78320312, + "step": 3039, + "time_per_iteration": 2.5270330905914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154802, + "balance_loss_mlp": 1.07641041, + "epoch": 0.5848403232012312, + "flos": 492265344000.0, + "grad_norm": 0.02801571871014385, + "language_loss": 0.90901846, + "learning_rate": 0.0003878719060903207, + "loss": 0.9205665, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.78222656, + "step": 3040, + "time_per_iteration": 2.5588507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154937, + "balance_loss_mlp": 1.07644928, + "epoch": 0.5850327048864948, + "flos": 585508177920.0, + "grad_norm": 0.037771067006053156, + "language_loss": 0.89005375, + "learning_rate": 0.0003875683204780961, + "loss": 0.90160316, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.78271484, + "step": 3041, + "time_per_iteration": 2.668827533721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152572, + "balance_loss_mlp": 1.07408428, + "epoch": 0.5852250865717584, + "flos": 652718028288.0, + "grad_norm": 0.037622145269810676, + "language_loss": 0.92115968, + "learning_rate": 0.00038726477851572043, + "loss": 0.93268543, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.78271484, + "step": 3042, + "time_per_iteration": 2.813145160675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152742, + "balance_loss_mlp": 1.07434952, + "epoch": 0.5854174682570219, + "flos": 535619630592.0, + "grad_norm": 0.034632487357399135, + "language_loss": 0.85911977, + "learning_rate": 0.0003869612803210395, + "loss": 0.87064719, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.78222656, + "step": 3043, + "time_per_iteration": 2.6411526203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150207, + "balance_loss_mlp": 1.07176721, + "epoch": 0.5856098499422855, + "flos": 510758949888.0, + "grad_norm": 0.03364322076393535, + "language_loss": 0.8838582, + "learning_rate": 0.0003866578260118817, + "loss": 0.89536023, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.78271484, + "step": 3044, + "time_per_iteration": 2.59216570854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160339, + "balance_loss_mlp": 1.08228123, + "epoch": 0.5858022316275491, + "flos": 594992661504.0, + "grad_norm": 0.03592243508466687, + "language_loss": 0.87963545, + "learning_rate": 0.0003863544157060581, + "loss": 0.89123881, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.77978516, + "step": 3045, + "time_per_iteration": 2.6693618297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159373, + "balance_loss_mlp": 1.08131468, + "epoch": 0.5859946133128127, + "flos": 560317854720.0, + "grad_norm": 0.029657376615259006, + "language_loss": 0.86909235, + "learning_rate": 0.0003860510495213634, + "loss": 0.88068604, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.77978516, + "step": 3046, + "time_per_iteration": 2.799967050552368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159061, + "balance_loss_mlp": 1.08085966, + "epoch": 0.5861869949980761, + "flos": 554755783680.0, + "grad_norm": 0.03663253930872626, + "language_loss": 0.84493214, + "learning_rate": 0.0003857477275755746, + "loss": 0.85652274, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.78125, + "step": 3047, + "time_per_iteration": 2.6989481449127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116382, + "balance_loss_mlp": 1.08566678, + "epoch": 0.5863793766833397, + "flos": 720054131712.0, + "grad_norm": 0.029238524404730352, + "language_loss": 0.89394152, + "learning_rate": 0.00038544444998645167, + "loss": 0.90557969, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.78076172, + "step": 3048, + "time_per_iteration": 3.0829827785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162492, + "balance_loss_mlp": 1.0843389, + "epoch": 0.5865717583686033, + "flos": 473285643264.0, + "grad_norm": 0.03316519352776713, + "language_loss": 0.8619799, + "learning_rate": 0.00038514121687173767, + "loss": 0.87360477, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.78076172, + "step": 3049, + "time_per_iteration": 2.575395107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157324, + "balance_loss_mlp": 1.07897997, + "epoch": 0.5867641400538669, + "flos": 814846574592.0, + "grad_norm": 0.0318856413902076, + "language_loss": 0.87874395, + "learning_rate": 0.00038483802834915807, + "loss": 0.8903172, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.78271484, + "step": 3050, + "time_per_iteration": 2.973144292831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153006, + "balance_loss_mlp": 1.07461429, + "epoch": 0.5869565217391305, + "flos": 487517735424.0, + "grad_norm": 0.034960474960603255, + "language_loss": 0.8386789, + "learning_rate": 0.00038453488453642074, + "loss": 0.85020894, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.78320312, + "step": 3051, + "time_per_iteration": 2.7100586891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152299, + "balance_loss_mlp": 1.0736686, + "epoch": 0.587148903424394, + "flos": 570512014848.0, + "grad_norm": 0.03111841936731719, + "language_loss": 0.91899282, + "learning_rate": 0.00038423178555121697, + "loss": 0.93051583, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.78466797, + "step": 3052, + "time_per_iteration": 2.713294744491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151746, + "balance_loss_mlp": 1.07316351, + "epoch": 0.5873412851096576, + "flos": 748694234112.0, + "grad_norm": 0.039836143626506074, + "language_loss": 0.90698159, + "learning_rate": 0.00038392873151121994, + "loss": 0.91849899, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.78466797, + "step": 3053, + "time_per_iteration": 3.0334441661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151743, + "balance_loss_mlp": 1.07320774, + "epoch": 0.5875336667949211, + "flos": 529187427840.0, + "grad_norm": 0.03304313685691396, + "language_loss": 0.89048851, + "learning_rate": 0.0003836257225340859, + "loss": 0.90200597, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.78417969, + "step": 3054, + "time_per_iteration": 2.612002372741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152089, + "balance_loss_mlp": 1.07360125, + "epoch": 0.5877260484801847, + "flos": 825640347648.0, + "grad_norm": 0.04168388263761463, + "language_loss": 0.87033945, + "learning_rate": 0.00038332275873745336, + "loss": 0.88186038, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.78369141, + "step": 3055, + "time_per_iteration": 3.0469071865081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07472539, + "epoch": 0.5879184301654482, + "flos": 592693718016.0, + "grad_norm": 0.028534237237830384, + "language_loss": 0.87091875, + "learning_rate": 0.0003830198402389431, + "loss": 0.88245273, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.78466797, + "step": 3056, + "time_per_iteration": 2.7129743099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116227, + "balance_loss_mlp": 1.08635712, + "epoch": 0.5881108118507118, + "flos": 1549223574528.0, + "grad_norm": 0.013735077759529469, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78511202, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.75976562, + "step": 3057, + "time_per_iteration": 4.971419334411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155251, + "balance_loss_mlp": 1.0767163, + "epoch": 0.5883031935359754, + "flos": 490598214144.0, + "grad_norm": 0.03703880470659913, + "language_loss": 0.88891268, + "learning_rate": 0.0003824141396066855, + "loss": 0.90046519, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.78417969, + "step": 3058, + "time_per_iteration": 2.5657668113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153431, + "balance_loss_mlp": 1.0749433, + "epoch": 0.588495575221239, + "flos": 583980036096.0, + "grad_norm": 0.04132288833299083, + "language_loss": 0.89364433, + "learning_rate": 0.000382111357708092, + "loss": 0.90517867, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.78417969, + "step": 3059, + "time_per_iteration": 2.7690227031707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152167, + "balance_loss_mlp": 1.07377541, + "epoch": 0.5886879569065026, + "flos": 662239441920.0, + "grad_norm": 0.03195995960407152, + "language_loss": 0.89352429, + "learning_rate": 0.00038180862157792864, + "loss": 0.90504599, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.78320312, + "step": 3060, + "time_per_iteration": 2.797255039215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149069, + "balance_loss_mlp": 1.07048619, + "epoch": 0.588880338591766, + "flos": 563719243776.0, + "grad_norm": 0.031223560866560994, + "language_loss": 0.86781317, + "learning_rate": 0.0003815059313337279, + "loss": 0.87930381, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.78369141, + "step": 3061, + "time_per_iteration": 2.6690454483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149002, + "balance_loss_mlp": 1.07056284, + "epoch": 0.5890727202770296, + "flos": 555852225024.0, + "grad_norm": 0.029451906852367885, + "language_loss": 0.83063936, + "learning_rate": 0.00038120328709300436, + "loss": 0.84212935, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.78271484, + "step": 3062, + "time_per_iteration": 2.902662515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149399, + "balance_loss_mlp": 1.07095897, + "epoch": 0.5892651019622932, + "flos": 656701565952.0, + "grad_norm": 0.028569643240873292, + "language_loss": 0.89099294, + "learning_rate": 0.0003809006889732549, + "loss": 0.90248692, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.78320312, + "step": 3063, + "time_per_iteration": 2.8155622482299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150242, + "balance_loss_mlp": 1.07185006, + "epoch": 0.5894574836475568, + "flos": 454132025856.0, + "grad_norm": 0.03219128848339896, + "language_loss": 0.93056011, + "learning_rate": 0.0003805981370919589, + "loss": 0.9420625, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.78173828, + "step": 3064, + "time_per_iteration": 2.533978223800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156603, + "balance_loss_mlp": 1.07840204, + "epoch": 0.5896498653328203, + "flos": 520111176192.0, + "grad_norm": 0.0315116121131164, + "language_loss": 0.89031386, + "learning_rate": 0.0003802956315665771, + "loss": 0.90187985, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.78125, + "step": 3065, + "time_per_iteration": 2.6914567947387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151617, + "balance_loss_mlp": 1.07341576, + "epoch": 0.5898422470180839, + "flos": 550084036608.0, + "grad_norm": 0.037269486879405754, + "language_loss": 0.87739515, + "learning_rate": 0.0003799931725145529, + "loss": 0.88891131, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.78125, + "step": 3066, + "time_per_iteration": 2.6040141582489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151797, + "balance_loss_mlp": 1.07359576, + "epoch": 0.5900346287033474, + "flos": 525379808256.0, + "grad_norm": 0.03210441330274425, + "language_loss": 0.90831029, + "learning_rate": 0.00037969076005331083, + "loss": 0.9198283, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.78125, + "step": 3067, + "time_per_iteration": 2.773045301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151142, + "balance_loss_mlp": 1.07298875, + "epoch": 0.590227010388611, + "flos": 568215072768.0, + "grad_norm": 0.03944068050463326, + "language_loss": 0.93933421, + "learning_rate": 0.00037938839430025817, + "loss": 0.9508456, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.78076172, + "step": 3068, + "time_per_iteration": 2.6502816677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149777, + "balance_loss_mlp": 1.07148039, + "epoch": 0.5904193920738746, + "flos": 584455397376.0, + "grad_norm": 0.029602074998044806, + "language_loss": 0.90136111, + "learning_rate": 0.0003790860753727835, + "loss": 0.91285884, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.78173828, + "step": 3069, + "time_per_iteration": 2.8173305988311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148373, + "balance_loss_mlp": 1.07007682, + "epoch": 0.5906117737591381, + "flos": 530796160512.0, + "grad_norm": 0.03761421694137887, + "language_loss": 0.88493633, + "learning_rate": 0.00037878380338825766, + "loss": 0.89642012, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.78173828, + "step": 3070, + "time_per_iteration": 2.6682841777801514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148557, + "balance_loss_mlp": 1.07059419, + "epoch": 0.5908041554444017, + "flos": 685515585024.0, + "grad_norm": 0.029847469423829834, + "language_loss": 0.85616612, + "learning_rate": 0.00037848157846403287, + "loss": 0.86765176, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.77880859, + "step": 3071, + "time_per_iteration": 2.942607879638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148313, + "balance_loss_mlp": 1.07015908, + "epoch": 0.5909965371296653, + "flos": 551132814336.0, + "grad_norm": 0.030659229377642858, + "language_loss": 0.88636756, + "learning_rate": 0.0003781794007174435, + "loss": 0.89785063, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.78076172, + "step": 3072, + "time_per_iteration": 2.7619588375091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159439, + "balance_loss_mlp": 1.08276367, + "epoch": 0.5911889188149289, + "flos": 1495642200576.0, + "grad_norm": 0.009662354088300913, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75233972, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.765625, + "step": 3073, + "time_per_iteration": 4.855187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115096, + "balance_loss_mlp": 1.07275867, + "epoch": 0.5913813005001923, + "flos": 488885422080.0, + "grad_norm": 0.030913240812320716, + "language_loss": 0.86239564, + "learning_rate": 0.0003775751872264152, + "loss": 0.87390518, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.78125, + "step": 3074, + "time_per_iteration": 2.7676284313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150595, + "balance_loss_mlp": 1.0724895, + "epoch": 0.5915736821854559, + "flos": 574521748992.0, + "grad_norm": 0.02774902568268271, + "language_loss": 0.91979122, + "learning_rate": 0.0003772731517165527, + "loss": 0.93129718, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.78027344, + "step": 3075, + "time_per_iteration": 2.7969858646392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146907, + "balance_loss_mlp": 1.06884861, + "epoch": 0.5917660638707195, + "flos": 790860754944.0, + "grad_norm": 0.032083383212934545, + "language_loss": 0.88416231, + "learning_rate": 0.0003769711638534784, + "loss": 0.89563137, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.77978516, + "step": 3076, + "time_per_iteration": 2.966887950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147265, + "balance_loss_mlp": 1.06915915, + "epoch": 0.5919584455559831, + "flos": 529756114944.0, + "grad_norm": 0.039188776409307895, + "language_loss": 0.84855187, + "learning_rate": 0.00037666922375443446, + "loss": 0.86002445, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.78027344, + "step": 3077, + "time_per_iteration": 2.6466495990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146857, + "balance_loss_mlp": 1.06889355, + "epoch": 0.5921508272412467, + "flos": 561752670720.0, + "grad_norm": 0.03396925526876144, + "language_loss": 0.87058771, + "learning_rate": 0.00037636733153664396, + "loss": 0.88205624, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.77880859, + "step": 3078, + "time_per_iteration": 2.868244171142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147912, + "balance_loss_mlp": 1.06980658, + "epoch": 0.5923432089265102, + "flos": 564333593088.0, + "grad_norm": 0.03405949699736924, + "language_loss": 0.86518288, + "learning_rate": 0.0003760654873173124, + "loss": 0.87666202, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.78027344, + "step": 3079, + "time_per_iteration": 2.665978193283081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148871, + "balance_loss_mlp": 1.07095611, + "epoch": 0.5925355906117737, + "flos": 496750439424.0, + "grad_norm": 0.031078530741144403, + "language_loss": 0.87091482, + "learning_rate": 0.00037576369121362566, + "loss": 0.88240349, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.77832031, + "step": 3080, + "time_per_iteration": 2.5879437923431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.07483089, + "epoch": 0.5927279722970373, + "flos": 567492661248.0, + "grad_norm": 0.029886004026783125, + "language_loss": 0.86116624, + "learning_rate": 0.0003754619433427516, + "loss": 0.87269318, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.77783203, + "step": 3081, + "time_per_iteration": 2.911530017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149482, + "balance_loss_mlp": 1.07156706, + "epoch": 0.5929203539823009, + "flos": 668159353344.0, + "grad_norm": 0.03611880785888225, + "language_loss": 0.84511012, + "learning_rate": 0.0003751602438218392, + "loss": 0.85660493, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.77832031, + "step": 3082, + "time_per_iteration": 2.767104148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148924, + "balance_loss_mlp": 1.07105672, + "epoch": 0.5931127356675644, + "flos": 556785483264.0, + "grad_norm": 0.03271098535749721, + "language_loss": 0.89783478, + "learning_rate": 0.0003748585927680186, + "loss": 0.90932405, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.77783203, + "step": 3083, + "time_per_iteration": 2.6630167961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148966, + "balance_loss_mlp": 1.07100332, + "epoch": 0.593305117352828, + "flos": 536242712064.0, + "grad_norm": 0.03028975884774044, + "language_loss": 0.88271487, + "learning_rate": 0.00037455699029840086, + "loss": 0.89420456, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.77880859, + "step": 3084, + "time_per_iteration": 2.647643566131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148168, + "balance_loss_mlp": 1.07020473, + "epoch": 0.5934974990380916, + "flos": 595057789440.0, + "grad_norm": 0.028668930156423956, + "language_loss": 0.89615595, + "learning_rate": 0.0003742554365300787, + "loss": 0.9076376, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.77880859, + "step": 3085, + "time_per_iteration": 2.743479013442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148026, + "balance_loss_mlp": 1.07015836, + "epoch": 0.5936898807233552, + "flos": 714014697984.0, + "grad_norm": 0.030266517596009415, + "language_loss": 0.84002471, + "learning_rate": 0.0003739539315801255, + "loss": 0.85150492, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.77783203, + "step": 3086, + "time_per_iteration": 2.9327478408813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147359, + "balance_loss_mlp": 1.06944346, + "epoch": 0.5938822624086187, + "flos": 392748761088.0, + "grad_norm": 0.030603721844952317, + "language_loss": 0.96139234, + "learning_rate": 0.000373652475565596, + "loss": 0.97286594, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.77832031, + "step": 3087, + "time_per_iteration": 2.471726417541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146572, + "balance_loss_mlp": 1.06860876, + "epoch": 0.5940746440938822, + "flos": 481335310848.0, + "grad_norm": 0.033612762678092996, + "language_loss": 0.86454874, + "learning_rate": 0.00037335106860352587, + "loss": 0.87601447, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.77880859, + "step": 3088, + "time_per_iteration": 2.692692756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148512, + "balance_loss_mlp": 1.07045376, + "epoch": 0.5942670257791458, + "flos": 484307000832.0, + "grad_norm": 0.031191733120893732, + "language_loss": 0.87924445, + "learning_rate": 0.00037304971081093146, + "loss": 0.89072955, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.77978516, + "step": 3089, + "time_per_iteration": 2.568676710128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149298, + "balance_loss_mlp": 1.071383, + "epoch": 0.5944594074644094, + "flos": 549057452544.0, + "grad_norm": 0.027833968511861495, + "language_loss": 0.85559821, + "learning_rate": 0.00037274840230481024, + "loss": 0.86709118, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.77832031, + "step": 3090, + "time_per_iteration": 2.7224090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148103, + "balance_loss_mlp": 1.07009256, + "epoch": 0.594651789149673, + "flos": 450129022464.0, + "grad_norm": 0.03399265003555819, + "language_loss": 0.85464221, + "learning_rate": 0.00037244714320214077, + "loss": 0.86612326, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.77929688, + "step": 3091, + "time_per_iteration": 2.545518398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07034016, + "epoch": 0.5948441708349365, + "flos": 597465521664.0, + "grad_norm": 0.029759995876706483, + "language_loss": 0.88336015, + "learning_rate": 0.000372145933619882, + "loss": 0.89484322, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.77880859, + "step": 3092, + "time_per_iteration": 2.8612496852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147536, + "balance_loss_mlp": 1.06952572, + "epoch": 0.5950365525202, + "flos": 549580477440.0, + "grad_norm": 0.03567164883764641, + "language_loss": 0.87935793, + "learning_rate": 0.000371844773674974, + "loss": 0.89083326, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.77929688, + "step": 3093, + "time_per_iteration": 2.6431939601898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147858, + "balance_loss_mlp": 1.06980002, + "epoch": 0.5952289342054636, + "flos": 655963691520.0, + "grad_norm": 0.03489323159702664, + "language_loss": 0.87669003, + "learning_rate": 0.0003715436634843375, + "loss": 0.88816857, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.77978516, + "step": 3094, + "time_per_iteration": 2.889326572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115074, + "balance_loss_mlp": 1.07268155, + "epoch": 0.5954213158907272, + "flos": 604603398144.0, + "grad_norm": 0.02937888511977547, + "language_loss": 0.85120195, + "learning_rate": 0.00037124260316487355, + "loss": 0.86270934, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.77978516, + "step": 3095, + "time_per_iteration": 2.8256890773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011487, + "balance_loss_mlp": 1.07064188, + "epoch": 0.5956136975759908, + "flos": 487267957248.0, + "grad_norm": 0.03289727477229571, + "language_loss": 0.94411993, + "learning_rate": 0.0003709415928334643, + "loss": 0.95560694, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.77978516, + "step": 3096, + "time_per_iteration": 2.587526559829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148362, + "balance_loss_mlp": 1.07025576, + "epoch": 0.5958060792612543, + "flos": 660040555008.0, + "grad_norm": 0.03760653483237211, + "language_loss": 0.8629458, + "learning_rate": 0.00037064063260697233, + "loss": 0.8744294, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.78027344, + "step": 3097, + "time_per_iteration": 2.8921737670898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149624, + "balance_loss_mlp": 1.07170904, + "epoch": 0.5959984609465179, + "flos": 724995122688.0, + "grad_norm": 0.02933465569925715, + "language_loss": 0.84228349, + "learning_rate": 0.0003703397226022407, + "loss": 0.85377973, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.77832031, + "step": 3098, + "time_per_iteration": 3.0898213386535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115181, + "balance_loss_mlp": 1.07627869, + "epoch": 0.5961908426317815, + "flos": 1523218788864.0, + "grad_norm": 0.004520881067607934, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7665168, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.75585938, + "step": 3099, + "time_per_iteration": 4.9205827713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148148, + "balance_loss_mlp": 1.07023323, + "epoch": 0.596383224317045, + "flos": 533646326784.0, + "grad_norm": 0.03064762726337019, + "language_loss": 0.87394881, + "learning_rate": 0.0003697380537253339, + "loss": 0.88543034, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.77832031, + "step": 3100, + "time_per_iteration": 2.6238889694213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07065213, + "epoch": 0.5965756060023086, + "flos": 592366076928.0, + "grad_norm": 0.03279417600266174, + "language_loss": 0.87095284, + "learning_rate": 0.0003694372950867471, + "loss": 0.88243759, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.77734375, + "step": 3101, + "time_per_iteration": 2.754004955291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.0715313, + "epoch": 0.5967679876875721, + "flos": 863469493248.0, + "grad_norm": 0.096940863219985, + "language_loss": 0.82642257, + "learning_rate": 0.0003691365871370976, + "loss": 0.83791614, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.77734375, + "step": 3102, + "time_per_iteration": 3.027898073196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148471, + "balance_loss_mlp": 1.07065165, + "epoch": 0.5969603693728357, + "flos": 554877307392.0, + "grad_norm": 0.03194116769832037, + "language_loss": 0.90513253, + "learning_rate": 0.00036883592999313093, + "loss": 0.91661727, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.77734375, + "step": 3103, + "time_per_iteration": 2.6555323600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114931, + "balance_loss_mlp": 1.07158601, + "epoch": 0.5971527510580993, + "flos": 719936610816.0, + "grad_norm": 0.037867869271097296, + "language_loss": 0.85018742, + "learning_rate": 0.0003685353237715722, + "loss": 0.86168051, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.77636719, + "step": 3104, + "time_per_iteration": 2.88739013671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115032, + "balance_loss_mlp": 1.07245219, + "epoch": 0.5973451327433629, + "flos": 648862745088.0, + "grad_norm": 0.032062315519195535, + "language_loss": 0.86408043, + "learning_rate": 0.0003682347685891274, + "loss": 0.87558353, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.77783203, + "step": 3105, + "time_per_iteration": 2.8420920372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149351, + "balance_loss_mlp": 1.07162631, + "epoch": 0.5975375144286263, + "flos": 723088948224.0, + "grad_norm": 0.03318206210872103, + "language_loss": 0.86870039, + "learning_rate": 0.0003679342645624822, + "loss": 0.88019389, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.77636719, + "step": 3106, + "time_per_iteration": 2.995124578475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150114, + "balance_loss_mlp": 1.07248521, + "epoch": 0.5977298961138899, + "flos": 752343399936.0, + "grad_norm": 0.029134934835651077, + "language_loss": 0.86725187, + "learning_rate": 0.0003676338118083025, + "loss": 0.87875295, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.77539062, + "step": 3107, + "time_per_iteration": 2.972302198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150139, + "balance_loss_mlp": 1.07251036, + "epoch": 0.5979222777991535, + "flos": 531998662656.0, + "grad_norm": 0.035100601373903646, + "language_loss": 0.857481, + "learning_rate": 0.0003673334104432347, + "loss": 0.86898237, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.77539062, + "step": 3108, + "time_per_iteration": 2.6626758575439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149837, + "balance_loss_mlp": 1.07230318, + "epoch": 0.5981146594844171, + "flos": 622914355200.0, + "grad_norm": 0.0316193314504938, + "language_loss": 0.88024735, + "learning_rate": 0.0003670330605839048, + "loss": 0.89174569, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.77441406, + "step": 3109, + "time_per_iteration": 2.8445565700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149651, + "balance_loss_mlp": 1.07216513, + "epoch": 0.5983070411696807, + "flos": 604709458944.0, + "grad_norm": 0.030685816325192888, + "language_loss": 0.81470084, + "learning_rate": 0.0003667327623469191, + "loss": 0.82619739, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.77392578, + "step": 3110, + "time_per_iteration": 2.7507362365722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151646, + "balance_loss_mlp": 1.07406473, + "epoch": 0.5984994228549442, + "flos": 634669584384.0, + "grad_norm": 0.03251456811802211, + "language_loss": 0.83321273, + "learning_rate": 0.00036643251584886333, + "loss": 0.84472924, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.77490234, + "step": 3111, + "time_per_iteration": 2.816390037536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156112, + "balance_loss_mlp": 1.07848299, + "epoch": 0.5986918045402078, + "flos": 526293600768.0, + "grad_norm": 0.03439308421341756, + "language_loss": 0.88026524, + "learning_rate": 0.00036613232120630393, + "loss": 0.89182639, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.77539062, + "step": 3112, + "time_per_iteration": 2.610931396484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151048, + "balance_loss_mlp": 1.07332325, + "epoch": 0.5988841862254713, + "flos": 484139814912.0, + "grad_norm": 0.040537518995664656, + "language_loss": 0.85835981, + "learning_rate": 0.00036583217853578643, + "loss": 0.86987036, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.77636719, + "step": 3113, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.07369328, + "epoch": 0.5990765679107349, + "flos": 1142121745920.0, + "grad_norm": 0.03045218931470109, + "language_loss": 0.82758361, + "learning_rate": 0.000365532087953837, + "loss": 0.83909732, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.77587891, + "step": 3114, + "time_per_iteration": 3.635089159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150692, + "balance_loss_mlp": 1.07282436, + "epoch": 0.5992689495959984, + "flos": 518018350080.0, + "grad_norm": 0.03475345450765353, + "language_loss": 0.94564217, + "learning_rate": 0.00036523204957696065, + "loss": 0.95714909, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.77783203, + "step": 3115, + "time_per_iteration": 2.6130504608154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150235, + "balance_loss_mlp": 1.07231951, + "epoch": 0.599461331281262, + "flos": 745941396480.0, + "grad_norm": 0.03954805443520273, + "language_loss": 0.86356986, + "learning_rate": 0.00036493206352164324, + "loss": 0.87507224, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.77832031, + "step": 3116, + "time_per_iteration": 2.902606964111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115079, + "balance_loss_mlp": 1.07282686, + "epoch": 0.5996537129665256, + "flos": 593483985408.0, + "grad_norm": 0.030263025154964335, + "language_loss": 0.90265405, + "learning_rate": 0.000364632129904349, + "loss": 0.91416192, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.77880859, + "step": 3117, + "time_per_iteration": 2.728739023208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148018, + "balance_loss_mlp": 1.0701983, + "epoch": 0.5998460946517892, + "flos": 560115740160.0, + "grad_norm": 0.03726043771871862, + "language_loss": 0.8256759, + "learning_rate": 0.00036433224884152283, + "loss": 0.83715606, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.77734375, + "step": 3118, + "time_per_iteration": 2.7763798236846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146842, + "balance_loss_mlp": 1.06897449, + "epoch": 0.6000384763370528, + "flos": 485535699456.0, + "grad_norm": 0.03789921911219481, + "language_loss": 0.83006287, + "learning_rate": 0.00036403242044958875, + "loss": 0.84153128, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.77783203, + "step": 3119, + "time_per_iteration": 2.549102783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156248, + "balance_loss_mlp": 1.07842839, + "epoch": 0.6002308580223162, + "flos": 597877756416.0, + "grad_norm": 0.03490542571663494, + "language_loss": 0.96794367, + "learning_rate": 0.0003637326448449507, + "loss": 0.97950613, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.77734375, + "step": 3120, + "time_per_iteration": 2.7004034519195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153861, + "balance_loss_mlp": 1.07608855, + "epoch": 0.6004232397075798, + "flos": 546220021248.0, + "grad_norm": 0.03097014244858331, + "language_loss": 0.90828121, + "learning_rate": 0.00036343292214399177, + "loss": 0.91981983, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.77685547, + "step": 3121, + "time_per_iteration": 2.7137558460235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149916, + "balance_loss_mlp": 1.07195354, + "epoch": 0.6006156213928434, + "flos": 631150674432.0, + "grad_norm": 0.035271472923777164, + "language_loss": 0.82629979, + "learning_rate": 0.00036313325246307456, + "loss": 0.83779889, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.77880859, + "step": 3122, + "time_per_iteration": 2.7764761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149471, + "balance_loss_mlp": 1.07179451, + "epoch": 0.600808003078107, + "flos": 583404618240.0, + "grad_norm": 0.03572948741638757, + "language_loss": 0.92888528, + "learning_rate": 0.0003628336359185411, + "loss": 0.94037998, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.77587891, + "step": 3123, + "time_per_iteration": 2.658597707748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149832, + "balance_loss_mlp": 1.07215571, + "epoch": 0.6010003847633705, + "flos": 636438772224.0, + "grad_norm": 0.033415641646833916, + "language_loss": 0.81693363, + "learning_rate": 0.000362534072626713, + "loss": 0.8284319, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.77587891, + "step": 3124, + "time_per_iteration": 2.7385804653167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.06857181, + "epoch": 0.6011927664486341, + "flos": 720029936640.0, + "grad_norm": 0.0314556326919405, + "language_loss": 0.85929549, + "learning_rate": 0.00036223456270389093, + "loss": 0.87075609, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.77392578, + "step": 3125, + "time_per_iteration": 2.9184412956237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148457, + "balance_loss_mlp": 1.0710187, + "epoch": 0.6013851481338977, + "flos": 500054499840.0, + "grad_norm": 0.03211121673376429, + "language_loss": 0.85866034, + "learning_rate": 0.00036193510626635517, + "loss": 0.87014484, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.7734375, + "step": 3126, + "time_per_iteration": 2.6580941677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151789, + "balance_loss_mlp": 1.07439816, + "epoch": 0.6015775298191612, + "flos": 750875656704.0, + "grad_norm": 0.03289877663507899, + "language_loss": 0.86000574, + "learning_rate": 0.0003616357034303649, + "loss": 0.87152362, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.77294922, + "step": 3127, + "time_per_iteration": 2.925900459289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154509, + "balance_loss_mlp": 1.07730949, + "epoch": 0.6017699115044248, + "flos": 594263519232.0, + "grad_norm": 0.026386451784686567, + "language_loss": 0.83912927, + "learning_rate": 0.0003613363543121584, + "loss": 0.85067433, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.77099609, + "step": 3128, + "time_per_iteration": 2.8285086154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149104, + "balance_loss_mlp": 1.07185686, + "epoch": 0.6019622931896883, + "flos": 516201498624.0, + "grad_norm": 0.032335523729292034, + "language_loss": 0.89489174, + "learning_rate": 0.00036103705902795357, + "loss": 0.90638286, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.77148438, + "step": 3129, + "time_per_iteration": 2.7369625568389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153149, + "balance_loss_mlp": 1.0759964, + "epoch": 0.6021546748749519, + "flos": 491473075200.0, + "grad_norm": 0.037053521707819316, + "language_loss": 0.86282051, + "learning_rate": 0.0003607378176939471, + "loss": 0.87435198, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.77050781, + "step": 3130, + "time_per_iteration": 2.6015982627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155234, + "balance_loss_mlp": 1.07832015, + "epoch": 0.6023470565602155, + "flos": 542114959872.0, + "grad_norm": 0.03769359789833061, + "language_loss": 0.87922359, + "learning_rate": 0.00036043863042631465, + "loss": 0.89077592, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.76806641, + "step": 3131, + "time_per_iteration": 2.870999813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151659, + "balance_loss_mlp": 1.07436335, + "epoch": 0.6025394382454791, + "flos": 846463096320.0, + "grad_norm": 0.03206429015818981, + "language_loss": 0.81416667, + "learning_rate": 0.00036013949734121133, + "loss": 0.82568324, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.77197266, + "step": 3132, + "time_per_iteration": 3.1543962955474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115191, + "balance_loss_mlp": 1.0745194, + "epoch": 0.6027318199307425, + "flos": 578257509888.0, + "grad_norm": 0.03267549496137676, + "language_loss": 0.87371534, + "learning_rate": 0.00035984041855477043, + "loss": 0.88523442, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.77294922, + "step": 3133, + "time_per_iteration": 2.7443673610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143524, + "balance_loss_mlp": 1.06837463, + "epoch": 0.6029242016160061, + "flos": 1474252766208.0, + "grad_norm": 0.006811691070041734, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79853421, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.75195312, + "step": 3134, + "time_per_iteration": 4.92242431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145401, + "balance_loss_mlp": 1.06810546, + "epoch": 0.6031165833012697, + "flos": 481782474240.0, + "grad_norm": 0.029444679170183622, + "language_loss": 0.84435833, + "learning_rate": 0.00035924242434230637, + "loss": 0.85581231, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.77197266, + "step": 3135, + "time_per_iteration": 2.6391186714172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154302, + "balance_loss_mlp": 1.07700658, + "epoch": 0.6033089649865333, + "flos": 500464733184.0, + "grad_norm": 0.036345783287305373, + "language_loss": 0.85093319, + "learning_rate": 0.00035894350914844516, + "loss": 0.86247623, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.77197266, + "step": 3136, + "time_per_iteration": 2.6352477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150224, + "balance_loss_mlp": 1.07259464, + "epoch": 0.6035013466717969, + "flos": 557723470848.0, + "grad_norm": 0.0365408898732846, + "language_loss": 0.89268684, + "learning_rate": 0.0003586446487175703, + "loss": 0.90418905, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.77539062, + "step": 3137, + "time_per_iteration": 2.693071126937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149547, + "balance_loss_mlp": 1.07215679, + "epoch": 0.6036937283570604, + "flos": 595995777024.0, + "grad_norm": 0.02904364912520073, + "language_loss": 0.90167797, + "learning_rate": 0.0003583458431657099, + "loss": 0.91317338, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.77294922, + "step": 3138, + "time_per_iteration": 2.738223075866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114932, + "balance_loss_mlp": 1.07178628, + "epoch": 0.603886110042324, + "flos": 542058564096.0, + "grad_norm": 0.037255533971674665, + "language_loss": 0.87546921, + "learning_rate": 0.00035804709260887056, + "loss": 0.88696241, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.77441406, + "step": 3139, + "time_per_iteration": 2.6814053058624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_mlp": 1.07072818, + "epoch": 0.6040784917275875, + "flos": 519655280640.0, + "grad_norm": 0.02881429249122551, + "language_loss": 0.93902391, + "learning_rate": 0.0003577483971630373, + "loss": 0.95050937, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.77734375, + "step": 3140, + "time_per_iteration": 2.6691088676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011483, + "balance_loss_mlp": 1.07052839, + "epoch": 0.6042708734128511, + "flos": 662013858816.0, + "grad_norm": 0.0304544298908833, + "language_loss": 0.89555264, + "learning_rate": 0.00035744975694417414, + "loss": 0.90703559, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.77685547, + "step": 3141, + "time_per_iteration": 2.872135877609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148852, + "balance_loss_mlp": 1.07107973, + "epoch": 0.6044632550981146, + "flos": 573516632064.0, + "grad_norm": 0.03378277324120908, + "language_loss": 0.88105464, + "learning_rate": 0.00035715117206822344, + "loss": 0.89254314, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.77685547, + "step": 3142, + "time_per_iteration": 2.790640354156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150783, + "balance_loss_mlp": 1.07315397, + "epoch": 0.6046556367833782, + "flos": 547728697344.0, + "grad_norm": 0.0341385163456541, + "language_loss": 0.86351824, + "learning_rate": 0.0003568526426511065, + "loss": 0.87502599, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.77539062, + "step": 3143, + "time_per_iteration": 2.622870683670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150768, + "balance_loss_mlp": 1.07318711, + "epoch": 0.6048480184686418, + "flos": 778174268928.0, + "grad_norm": 0.03443143260722225, + "language_loss": 0.88285363, + "learning_rate": 0.000356554168808722, + "loss": 0.89436138, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.77490234, + "step": 3144, + "time_per_iteration": 2.9785499572753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151515, + "balance_loss_mlp": 1.07393324, + "epoch": 0.6050404001539054, + "flos": 658375426560.0, + "grad_norm": 0.03050523278027174, + "language_loss": 0.89547616, + "learning_rate": 0.00035625575065694837, + "loss": 0.9069913, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.77490234, + "step": 3145, + "time_per_iteration": 2.893160343170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151106, + "balance_loss_mlp": 1.07347679, + "epoch": 0.605232781839169, + "flos": 550082035200.0, + "grad_norm": 0.03434592875619572, + "language_loss": 0.82820475, + "learning_rate": 0.0003559573883116415, + "loss": 0.83971578, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.77539062, + "step": 3146, + "time_per_iteration": 2.703378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152026, + "balance_loss_mlp": 1.07434905, + "epoch": 0.6054251635244324, + "flos": 606641829888.0, + "grad_norm": 0.028306929425565355, + "language_loss": 0.90180922, + "learning_rate": 0.00035565908188863604, + "loss": 0.91332948, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.77587891, + "step": 3147, + "time_per_iteration": 2.8178632259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149693, + "balance_loss_mlp": 1.07201612, + "epoch": 0.605617545209696, + "flos": 614808291840.0, + "grad_norm": 0.03167283444801755, + "language_loss": 0.85591269, + "learning_rate": 0.00035536083150374464, + "loss": 0.86740971, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.77587891, + "step": 3148, + "time_per_iteration": 2.7630088329315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151665, + "balance_loss_mlp": 1.07613373, + "epoch": 0.6058099268949596, + "flos": 1501607774208.0, + "grad_norm": 0.006039709216806875, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75899613, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.75585938, + "step": 3149, + "time_per_iteration": 4.826624870300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148811, + "balance_loss_mlp": 1.07108641, + "epoch": 0.6060023085802232, + "flos": 671704459776.0, + "grad_norm": 0.03325996872858785, + "language_loss": 0.90532559, + "learning_rate": 0.0003547644993114475, + "loss": 0.91681373, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.77636719, + "step": 3150, + "time_per_iteration": 2.802644729614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149051, + "balance_loss_mlp": 1.07127893, + "epoch": 0.6061946902654868, + "flos": 607305844224.0, + "grad_norm": 0.03277875295758358, + "language_loss": 0.85509253, + "learning_rate": 0.00035446641773555806, + "loss": 0.86658305, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.77685547, + "step": 3151, + "time_per_iteration": 2.7055504322052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148261, + "balance_loss_mlp": 1.07082272, + "epoch": 0.6063870719507503, + "flos": 558952169472.0, + "grad_norm": 0.029065175404624204, + "language_loss": 0.91512465, + "learning_rate": 0.000354168392660816, + "loss": 0.92660725, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.7734375, + "step": 3152, + "time_per_iteration": 2.7494730949401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145734, + "balance_loss_mlp": 1.06829596, + "epoch": 0.6065794536360138, + "flos": 558281424384.0, + "grad_norm": 0.03244852665251002, + "language_loss": 0.88397223, + "learning_rate": 0.0003538704242029252, + "loss": 0.89542961, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.7734375, + "step": 3153, + "time_per_iteration": 2.675692558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146957, + "balance_loss_mlp": 1.06932831, + "epoch": 0.6067718353212774, + "flos": 691381102080.0, + "grad_norm": 0.033220307719005866, + "language_loss": 0.83031321, + "learning_rate": 0.0003535725124775672, + "loss": 0.84178281, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.77539062, + "step": 3154, + "time_per_iteration": 2.843881607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156617, + "balance_loss_mlp": 1.07903516, + "epoch": 0.606964217006541, + "flos": 522902945280.0, + "grad_norm": 0.035561743978846455, + "language_loss": 0.91791475, + "learning_rate": 0.00035327465760040126, + "loss": 0.92948091, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.77490234, + "step": 3155, + "time_per_iteration": 2.684056043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158513, + "balance_loss_mlp": 1.08112192, + "epoch": 0.6071565986918045, + "flos": 642712521216.0, + "grad_norm": 0.03594986649837803, + "language_loss": 0.89308429, + "learning_rate": 0.00035297685968706526, + "loss": 0.9046694, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.77294922, + "step": 3156, + "time_per_iteration": 2.7834246158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160171, + "balance_loss_mlp": 1.08278084, + "epoch": 0.6073489803770681, + "flos": 561652614144.0, + "grad_norm": 0.034893913409009325, + "language_loss": 0.88205332, + "learning_rate": 0.00035267911885317454, + "loss": 0.89365506, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.77294922, + "step": 3157, + "time_per_iteration": 2.669710397720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158828, + "balance_loss_mlp": 1.08143747, + "epoch": 0.6075413620623317, + "flos": 587201504256.0, + "grad_norm": 0.030643892610273542, + "language_loss": 0.86383843, + "learning_rate": 0.0003523814352143222, + "loss": 0.87542671, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.77294922, + "step": 3158, + "time_per_iteration": 2.822089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154763, + "balance_loss_mlp": 1.07741952, + "epoch": 0.6077337437475953, + "flos": 631971141120.0, + "grad_norm": 0.03639599054768475, + "language_loss": 0.96294606, + "learning_rate": 0.00035208380888607937, + "loss": 0.97449374, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.77246094, + "step": 3159, + "time_per_iteration": 2.7675912380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156998, + "balance_loss_mlp": 1.08184814, + "epoch": 0.6079261254328588, + "flos": 1471623453696.0, + "grad_norm": 0.01008994969394602, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80618984, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.75195312, + "step": 3160, + "time_per_iteration": 4.839691638946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155136, + "balance_loss_mlp": 1.07998657, + "epoch": 0.6081185071181223, + "flos": 1526203213824.0, + "grad_norm": 0.005930182573689796, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76847368, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.75195312, + "step": 3161, + "time_per_iteration": 4.991135835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154947, + "balance_loss_mlp": 1.07746089, + "epoch": 0.6083108888033859, + "flos": 557434761216.0, + "grad_norm": 0.030736279817991784, + "language_loss": 0.86955488, + "learning_rate": 0.00035119127492038446, + "loss": 0.88110441, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.77392578, + "step": 3162, + "time_per_iteration": 2.8129284381866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115361, + "balance_loss_mlp": 1.07631505, + "epoch": 0.6085032704886495, + "flos": 842555420160.0, + "grad_norm": 0.033332341835850446, + "language_loss": 0.88169372, + "learning_rate": 0.00035089387898984436, + "loss": 0.89322984, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.77197266, + "step": 3163, + "time_per_iteration": 3.0287744998931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151661, + "balance_loss_mlp": 1.07412744, + "epoch": 0.6086956521739131, + "flos": 685992947712.0, + "grad_norm": 0.03500074735075155, + "language_loss": 0.87286401, + "learning_rate": 0.0003505965409474343, + "loss": 0.88438058, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.77441406, + "step": 3164, + "time_per_iteration": 2.8668415546417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155467, + "balance_loss_mlp": 1.07802904, + "epoch": 0.6088880338591766, + "flos": 536865793536.0, + "grad_norm": 0.03207560682458212, + "language_loss": 0.90936065, + "learning_rate": 0.0003502992609085913, + "loss": 0.92091525, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.7734375, + "step": 3165, + "time_per_iteration": 2.6344704627990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152911, + "balance_loss_mlp": 1.07552052, + "epoch": 0.6090804155444401, + "flos": 732881607168.0, + "grad_norm": 0.03068132972373785, + "language_loss": 0.86756754, + "learning_rate": 0.00035000203898872954, + "loss": 0.87909669, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.77294922, + "step": 3166, + "time_per_iteration": 3.007883071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151975, + "balance_loss_mlp": 1.07458472, + "epoch": 0.6092727972297037, + "flos": 700242504192.0, + "grad_norm": 0.033743959402083586, + "language_loss": 0.89530504, + "learning_rate": 0.0003497048753032406, + "loss": 0.90682483, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.77294922, + "step": 3167, + "time_per_iteration": 2.903841018676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150053, + "balance_loss_mlp": 1.07285297, + "epoch": 0.6094651789149673, + "flos": 1053676185600.0, + "grad_norm": 0.029535454603069295, + "language_loss": 0.85045445, + "learning_rate": 0.000349407769967494, + "loss": 0.86195493, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.77099609, + "step": 3168, + "time_per_iteration": 3.4178872108459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155901, + "balance_loss_mlp": 1.07860577, + "epoch": 0.6096575606002309, + "flos": 504094433280.0, + "grad_norm": 0.02941914211290898, + "language_loss": 0.89039332, + "learning_rate": 0.0003491107230968361, + "loss": 0.90195233, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.77197266, + "step": 3169, + "time_per_iteration": 2.6551673412323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156463, + "balance_loss_mlp": 1.07921588, + "epoch": 0.6098499422854944, + "flos": 586863129600.0, + "grad_norm": 0.02719917666416643, + "language_loss": 0.85504711, + "learning_rate": 0.00034881373480659085, + "loss": 0.86661172, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.77148438, + "step": 3170, + "time_per_iteration": 2.851252317428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157705, + "balance_loss_mlp": 1.08040965, + "epoch": 0.610042323970758, + "flos": 470159502336.0, + "grad_norm": 0.06140035445399593, + "language_loss": 0.85159725, + "learning_rate": 0.0003485168052120594, + "loss": 0.86317426, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.77197266, + "step": 3171, + "time_per_iteration": 2.5498504638671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156725, + "balance_loss_mlp": 1.07938242, + "epoch": 0.6102347056560216, + "flos": 515198383104.0, + "grad_norm": 0.03549166492948706, + "language_loss": 0.85369307, + "learning_rate": 0.00034821993442851973, + "loss": 0.86526036, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.77246094, + "step": 3172, + "time_per_iteration": 2.571030378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153351, + "balance_loss_mlp": 1.07600832, + "epoch": 0.6104270873412851, + "flos": 469964118528.0, + "grad_norm": 0.03723847696421654, + "language_loss": 0.87251568, + "learning_rate": 0.00034792312257122735, + "loss": 0.88404918, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.77246094, + "step": 3173, + "time_per_iteration": 2.601289987564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153196, + "balance_loss_mlp": 1.07580578, + "epoch": 0.6106194690265486, + "flos": 550939431936.0, + "grad_norm": 0.03428989424028707, + "language_loss": 0.85585618, + "learning_rate": 0.00034762636975541506, + "loss": 0.86738813, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.77294922, + "step": 3174, + "time_per_iteration": 2.623203754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155286, + "balance_loss_mlp": 1.07784736, + "epoch": 0.6108118507118122, + "flos": 473880526848.0, + "grad_norm": 0.03492975408157665, + "language_loss": 0.85685778, + "learning_rate": 0.0003473296760962923, + "loss": 0.86841059, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.7734375, + "step": 3175, + "time_per_iteration": 2.6674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157349, + "balance_loss_mlp": 1.08181763, + "epoch": 0.6110042323970758, + "flos": 1448180124672.0, + "grad_norm": 0.011972836775056764, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79691088, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.75585938, + "step": 3176, + "time_per_iteration": 4.719567060470581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150349, + "balance_loss_mlp": 1.07286298, + "epoch": 0.6111966140823394, + "flos": 795541234176.0, + "grad_norm": 0.03714406101939167, + "language_loss": 0.87063801, + "learning_rate": 0.00034673646670883976, + "loss": 0.88214147, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.77392578, + "step": 3177, + "time_per_iteration": 2.973940134048462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155601, + "balance_loss_mlp": 1.0800705, + "epoch": 0.611388995767603, + "flos": 1561063397376.0, + "grad_norm": 0.00949552405530534, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76870626, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.75585938, + "step": 3178, + "time_per_iteration": 5.061004400253296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152416, + "balance_loss_mlp": 1.07488239, + "epoch": 0.6115813774528664, + "flos": 713484942336.0, + "grad_norm": 0.03541902083866898, + "language_loss": 0.87553525, + "learning_rate": 0.0003461434953300865, + "loss": 0.88705945, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.77441406, + "step": 3179, + "time_per_iteration": 2.916708469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153239, + "balance_loss_mlp": 1.07556212, + "epoch": 0.61177375913813, + "flos": 685689501696.0, + "grad_norm": 0.028499371872006348, + "language_loss": 0.85970306, + "learning_rate": 0.0003458470991817515, + "loss": 0.87123549, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.77587891, + "step": 3180, + "time_per_iteration": 2.9950902462005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115431, + "balance_loss_mlp": 1.07677627, + "epoch": 0.6119661408233936, + "flos": 512667125760.0, + "grad_norm": 0.035557395139189776, + "language_loss": 0.89999539, + "learning_rate": 0.0003455507628808802, + "loss": 0.91153848, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.77441406, + "step": 3181, + "time_per_iteration": 2.5897092819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153736, + "balance_loss_mlp": 1.07629788, + "epoch": 0.6121585225086572, + "flos": 557855728128.0, + "grad_norm": 0.03617294918278912, + "language_loss": 0.90379083, + "learning_rate": 0.00034525448654252076, + "loss": 0.9153282, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.7734375, + "step": 3182, + "time_per_iteration": 2.636446714401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157583, + "balance_loss_mlp": 1.08047891, + "epoch": 0.6123509041939207, + "flos": 562909510656.0, + "grad_norm": 0.037973624968581914, + "language_loss": 0.88617527, + "learning_rate": 0.0003449582702816976, + "loss": 0.89775109, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.77001953, + "step": 3183, + "time_per_iteration": 2.6636195182800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155826, + "balance_loss_mlp": 1.0786258, + "epoch": 0.6125432858791843, + "flos": 559130088960.0, + "grad_norm": 0.03254272947638904, + "language_loss": 0.87538117, + "learning_rate": 0.0003446621142134122, + "loss": 0.88693941, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.77099609, + "step": 3184, + "time_per_iteration": 2.6456782817840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154505, + "balance_loss_mlp": 1.07711458, + "epoch": 0.6127356675644479, + "flos": 415896649728.0, + "grad_norm": 0.03534541862410296, + "language_loss": 0.89029509, + "learning_rate": 0.0003443660184526424, + "loss": 0.90184009, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.77294922, + "step": 3185, + "time_per_iteration": 2.4446170330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153301, + "balance_loss_mlp": 1.07586265, + "epoch": 0.6129280492497114, + "flos": 605033097216.0, + "grad_norm": 0.03004060948026975, + "language_loss": 0.92148149, + "learning_rate": 0.0003440699831143429, + "loss": 0.93301451, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.7734375, + "step": 3186, + "time_per_iteration": 2.738818407058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114756, + "balance_loss_mlp": 1.07007372, + "epoch": 0.613120430934975, + "flos": 520864513536.0, + "grad_norm": 0.031842648163895024, + "language_loss": 0.87123644, + "learning_rate": 0.0003437740083134449, + "loss": 0.88271207, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.77392578, + "step": 3187, + "time_per_iteration": 0.013826608657836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145943, + "balance_loss_mlp": 1.06850421, + "epoch": 0.6133128126202385, + "flos": 512080974336.0, + "grad_norm": 0.03697103993803325, + "language_loss": 0.8916111, + "learning_rate": 0.00034347809416485574, + "loss": 0.90307051, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.7734375, + "step": 3188, + "time_per_iteration": 2.626657724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152707, + "balance_loss_mlp": 1.07517374, + "epoch": 0.6135051943055021, + "flos": 608756123136.0, + "grad_norm": 0.032275068446110486, + "language_loss": 0.8676489, + "learning_rate": 0.0003431822407834597, + "loss": 0.87917596, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.77441406, + "step": 3189, + "time_per_iteration": 2.784728765487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153516, + "balance_loss_mlp": 1.07588649, + "epoch": 0.6136975759907657, + "flos": 1162008508416.0, + "grad_norm": 0.035345487562752465, + "language_loss": 0.90027606, + "learning_rate": 0.00034288644828411706, + "loss": 0.91181111, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.77539062, + "step": 3190, + "time_per_iteration": 3.453296661376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147033, + "balance_loss_mlp": 1.06959414, + "epoch": 0.6138899576760293, + "flos": 708172649472.0, + "grad_norm": 0.033974370465757506, + "language_loss": 0.80322051, + "learning_rate": 0.0003425907167816649, + "loss": 0.81469083, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.7734375, + "step": 3191, + "time_per_iteration": 2.9247496128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147023, + "balance_loss_mlp": 1.0697751, + "epoch": 0.6140823393612928, + "flos": 587618468352.0, + "grad_norm": 0.031154822121678163, + "language_loss": 0.89756465, + "learning_rate": 0.00034229504639091623, + "loss": 0.90903485, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.77148438, + "step": 3192, + "time_per_iteration": 2.772437810897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150486, + "balance_loss_mlp": 1.07342911, + "epoch": 0.6142747210465563, + "flos": 805618599936.0, + "grad_norm": 0.03412621705623903, + "language_loss": 0.84789693, + "learning_rate": 0.0003419994372266606, + "loss": 0.85940182, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.76953125, + "step": 3193, + "time_per_iteration": 3.096266984939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148388, + "balance_loss_mlp": 1.07094979, + "epoch": 0.6144671027318199, + "flos": 530544380928.0, + "grad_norm": 0.028061755795717326, + "language_loss": 0.86464483, + "learning_rate": 0.00034170388940366335, + "loss": 0.87612873, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.7734375, + "step": 3194, + "time_per_iteration": 2.6779158115386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152877, + "balance_loss_mlp": 1.07539093, + "epoch": 0.6146594844170835, + "flos": 806912426496.0, + "grad_norm": 0.030674949388275172, + "language_loss": 0.8474896, + "learning_rate": 0.0003414084030366667, + "loss": 0.85901833, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.77392578, + "step": 3195, + "time_per_iteration": 3.106736898422241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153391, + "balance_loss_mlp": 1.07590497, + "epoch": 0.6148518661023471, + "flos": 502761675264.0, + "grad_norm": 0.03337820573482111, + "language_loss": 0.87897015, + "learning_rate": 0.0003411129782403883, + "loss": 0.89050412, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.77392578, + "step": 3196, + "time_per_iteration": 2.643308639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154002, + "balance_loss_mlp": 1.07642102, + "epoch": 0.6150442477876106, + "flos": 511698938880.0, + "grad_norm": 0.038534572595061774, + "language_loss": 0.91158688, + "learning_rate": 0.0003408176151295225, + "loss": 0.92312694, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.77490234, + "step": 3197, + "time_per_iteration": 2.5714070796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157848, + "balance_loss_mlp": 1.08040917, + "epoch": 0.6152366294728742, + "flos": 527997660672.0, + "grad_norm": 0.045085971427018416, + "language_loss": 0.83155811, + "learning_rate": 0.00034052231381873944, + "loss": 0.84313661, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.7734375, + "step": 3198, + "time_per_iteration": 2.607335329055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158567, + "balance_loss_mlp": 1.0808903, + "epoch": 0.6154290111581378, + "flos": 474282028032.0, + "grad_norm": 0.03501094506345523, + "language_loss": 0.90176225, + "learning_rate": 0.00034022707442268494, + "loss": 0.91334796, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.77587891, + "step": 3199, + "time_per_iteration": 2.541625499725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160121, + "balance_loss_mlp": 1.08244419, + "epoch": 0.6156213928434013, + "flos": 551933815296.0, + "grad_norm": 0.028863713644250544, + "language_loss": 0.85985374, + "learning_rate": 0.0003399318970559813, + "loss": 0.87145495, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.77587891, + "step": 3200, + "time_per_iteration": 2.796062707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156186, + "balance_loss_mlp": 1.07850885, + "epoch": 0.6158137745286649, + "flos": 752360864256.0, + "grad_norm": 0.02911689008620782, + "language_loss": 0.8882643, + "learning_rate": 0.00033963678183322656, + "loss": 0.89982617, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.77587891, + "step": 3201, + "time_per_iteration": 3.0142765045166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150573, + "balance_loss_mlp": 1.07313454, + "epoch": 0.6160061562139284, + "flos": 556905005568.0, + "grad_norm": 0.026867696213324778, + "language_loss": 0.87175548, + "learning_rate": 0.0003393417288689945, + "loss": 0.8832612, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.7734375, + "step": 3202, + "time_per_iteration": 2.655984401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149993, + "balance_loss_mlp": 1.07250667, + "epoch": 0.616198537899192, + "flos": 743466534912.0, + "grad_norm": 0.03671255454087467, + "language_loss": 0.83013773, + "learning_rate": 0.00033904673827783504, + "loss": 0.84163767, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.77392578, + "step": 3203, + "time_per_iteration": 2.937826633453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148812, + "balance_loss_mlp": 1.07142162, + "epoch": 0.6163909195844556, + "flos": 479774241792.0, + "grad_norm": 0.030568222552849134, + "language_loss": 0.8708697, + "learning_rate": 0.00033875181017427357, + "loss": 0.88235784, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.77294922, + "step": 3204, + "time_per_iteration": 2.6731438636779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.07325339, + "epoch": 0.6165833012697192, + "flos": 532665404928.0, + "grad_norm": 0.031792873085422224, + "language_loss": 0.85750729, + "learning_rate": 0.00033845694467281133, + "loss": 0.86901325, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.77246094, + "step": 3205, + "time_per_iteration": 2.876248598098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149976, + "balance_loss_mlp": 1.07268083, + "epoch": 0.6167756829549826, + "flos": 809293962240.0, + "grad_norm": 0.03236962907615372, + "language_loss": 0.88327932, + "learning_rate": 0.00033816214188792516, + "loss": 0.89477909, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.77197266, + "step": 3206, + "time_per_iteration": 3.1564157009124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151124, + "balance_loss_mlp": 1.07378113, + "epoch": 0.6169680646402462, + "flos": 489910004736.0, + "grad_norm": 0.03290410688193805, + "language_loss": 0.91087395, + "learning_rate": 0.00033786740193406784, + "loss": 0.92238522, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.77246094, + "step": 3207, + "time_per_iteration": 2.614291191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149659, + "balance_loss_mlp": 1.07236373, + "epoch": 0.6171604463255098, + "flos": 620203176960.0, + "grad_norm": 0.032558146678985676, + "language_loss": 0.86120403, + "learning_rate": 0.00033757272492566736, + "loss": 0.87270063, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.77197266, + "step": 3208, + "time_per_iteration": 2.915374994277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150778, + "balance_loss_mlp": 1.07333994, + "epoch": 0.6173528280107734, + "flos": 529895102976.0, + "grad_norm": 0.029217733611236158, + "language_loss": 0.91618085, + "learning_rate": 0.0003372781109771278, + "loss": 0.9276886, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.7734375, + "step": 3209, + "time_per_iteration": 2.7093894481658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158751, + "balance_loss_mlp": 1.08155119, + "epoch": 0.617545209696037, + "flos": 597736766976.0, + "grad_norm": 0.03128870869992161, + "language_loss": 0.81418395, + "learning_rate": 0.0003369835602028281, + "loss": 0.82577139, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.77099609, + "step": 3210, + "time_per_iteration": 2.7591042518615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156243, + "balance_loss_mlp": 1.07885218, + "epoch": 0.6177375913813005, + "flos": 476105610240.0, + "grad_norm": 0.03246928186554176, + "language_loss": 0.85136282, + "learning_rate": 0.0003366890727171232, + "loss": 0.86292523, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.77294922, + "step": 3211, + "time_per_iteration": 2.663344144821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155968, + "balance_loss_mlp": 1.07881546, + "epoch": 0.617929973066564, + "flos": 530880754176.0, + "grad_norm": 0.03620138157042922, + "language_loss": 0.83830607, + "learning_rate": 0.00033639464863434313, + "loss": 0.84986579, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.77050781, + "step": 3212, + "time_per_iteration": 2.6296675205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117601, + "balance_loss_mlp": 1.10105133, + "epoch": 0.6181223547518276, + "flos": 1422832622592.0, + "grad_norm": 0.023588472816246354, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79618478, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.75, + "step": 3213, + "time_per_iteration": 4.6863789558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148522, + "balance_loss_mlp": 1.07122719, + "epoch": 0.6183147364370912, + "flos": 741695345664.0, + "grad_norm": 0.0331085707194938, + "language_loss": 0.84652448, + "learning_rate": 0.00033580599113475543, + "loss": 0.8580097, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.77197266, + "step": 3214, + "time_per_iteration": 2.9692540168762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148587, + "balance_loss_mlp": 1.07138717, + "epoch": 0.6185071181223547, + "flos": 382482742272.0, + "grad_norm": 0.030292285906144818, + "language_loss": 0.9191429, + "learning_rate": 0.00033551175794648507, + "loss": 0.93062878, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.77099609, + "step": 3215, + "time_per_iteration": 2.4922029972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157146, + "balance_loss_mlp": 1.07970774, + "epoch": 0.6186994998076183, + "flos": 464304718848.0, + "grad_norm": 0.029842780568851025, + "language_loss": 0.8691783, + "learning_rate": 0.00033521758861821365, + "loss": 0.88074982, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.7734375, + "step": 3216, + "time_per_iteration": 2.599022150039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152488, + "balance_loss_mlp": 1.07485938, + "epoch": 0.6188918814928819, + "flos": 486252106752.0, + "grad_norm": 0.03103316495727489, + "language_loss": 0.9338237, + "learning_rate": 0.0003349234832641479, + "loss": 0.94534856, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.77539062, + "step": 3217, + "time_per_iteration": 2.602800130844116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152031, + "balance_loss_mlp": 1.0744493, + "epoch": 0.6190842631781455, + "flos": 658597006848.0, + "grad_norm": 0.03734469861973323, + "language_loss": 0.85810769, + "learning_rate": 0.00033462944199846975, + "loss": 0.86962795, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.77490234, + "step": 3218, + "time_per_iteration": 3.070335626602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151884, + "balance_loss_mlp": 1.07425499, + "epoch": 0.619276644863409, + "flos": 404467060224.0, + "grad_norm": 0.03666199268188377, + "language_loss": 0.91774654, + "learning_rate": 0.00033433546493533606, + "loss": 0.92926538, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.77539062, + "step": 3219, + "time_per_iteration": 2.468400716781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149767, + "balance_loss_mlp": 1.07223368, + "epoch": 0.6194690265486725, + "flos": 584240547840.0, + "grad_norm": 0.03534009375651296, + "language_loss": 0.89686239, + "learning_rate": 0.00033404155218887897, + "loss": 0.90836006, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.77441406, + "step": 3220, + "time_per_iteration": 2.695805788040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150834, + "balance_loss_mlp": 1.07329988, + "epoch": 0.6196614082339361, + "flos": 505384257024.0, + "grad_norm": 0.028059763946118966, + "language_loss": 0.91884506, + "learning_rate": 0.00033374770387320534, + "loss": 0.93035334, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.77441406, + "step": 3221, + "time_per_iteration": 2.7483606338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151577, + "balance_loss_mlp": 1.07409084, + "epoch": 0.6198537899191997, + "flos": 576525252096.0, + "grad_norm": 0.031050662157407424, + "language_loss": 0.90087008, + "learning_rate": 0.00033345392010239737, + "loss": 0.91238588, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.77392578, + "step": 3222, + "time_per_iteration": 2.714914560317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114984, + "balance_loss_mlp": 1.07249725, + "epoch": 0.6200461716044633, + "flos": 594302450688.0, + "grad_norm": 0.03255490958660124, + "language_loss": 0.88128847, + "learning_rate": 0.0003331602009905118, + "loss": 0.89278692, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.77246094, + "step": 3223, + "time_per_iteration": 2.7981505393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148595, + "balance_loss_mlp": 1.0711087, + "epoch": 0.6202385532897268, + "flos": 667410745344.0, + "grad_norm": 0.028478674888367996, + "language_loss": 0.88510197, + "learning_rate": 0.00033286654665158085, + "loss": 0.89658791, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.77392578, + "step": 3224, + "time_per_iteration": 2.950357437133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147753, + "balance_loss_mlp": 1.07045746, + "epoch": 0.6204309349749904, + "flos": 485926467072.0, + "grad_norm": 0.03296106773090735, + "language_loss": 0.92470849, + "learning_rate": 0.0003325729571996109, + "loss": 0.93618602, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.77197266, + "step": 3225, + "time_per_iteration": 2.632589340209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150501, + "balance_loss_mlp": 1.07325304, + "epoch": 0.6206233166602539, + "flos": 585217466880.0, + "grad_norm": 0.0318626759985495, + "language_loss": 0.89139777, + "learning_rate": 0.000332279432748584, + "loss": 0.90290284, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.77148438, + "step": 3226, + "time_per_iteration": 2.704615592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149408, + "balance_loss_mlp": 1.07235157, + "epoch": 0.6208156983455175, + "flos": 477911728128.0, + "grad_norm": 0.029634304247413663, + "language_loss": 0.91940343, + "learning_rate": 0.00033198597341245576, + "loss": 0.93089747, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.76953125, + "step": 3227, + "time_per_iteration": 2.582554340362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149434, + "balance_loss_mlp": 1.07228148, + "epoch": 0.6210080800307811, + "flos": 790467985920.0, + "grad_norm": 0.031063189419047472, + "language_loss": 0.86885202, + "learning_rate": 0.00033169257930515763, + "loss": 0.88034642, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.77050781, + "step": 3228, + "time_per_iteration": 3.0251591205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152373, + "balance_loss_mlp": 1.07526827, + "epoch": 0.6212004617160446, + "flos": 608916578304.0, + "grad_norm": 0.037247869916732776, + "language_loss": 0.87339175, + "learning_rate": 0.0003313992505405951, + "loss": 0.88491547, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.77001953, + "step": 3229, + "time_per_iteration": 2.697026014328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149107, + "balance_loss_mlp": 1.07209802, + "epoch": 0.6213928434013082, + "flos": 587611737600.0, + "grad_norm": 0.03555615318912057, + "language_loss": 0.87367719, + "learning_rate": 0.0003311059872326487, + "loss": 0.88516825, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.76904297, + "step": 3230, + "time_per_iteration": 2.7712976932525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.08017004, + "epoch": 0.6215852250865718, + "flos": 537108840960.0, + "grad_norm": 0.03130868556859839, + "language_loss": 0.84262764, + "learning_rate": 0.0003308127894951734, + "loss": 0.85419852, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.76806641, + "step": 3231, + "time_per_iteration": 2.6406192779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114749, + "balance_loss_mlp": 1.07038534, + "epoch": 0.6217776067718354, + "flos": 619312852992.0, + "grad_norm": 0.034917389789924605, + "language_loss": 0.91667497, + "learning_rate": 0.00033051965744199834, + "loss": 0.92814988, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.77001953, + "step": 3232, + "time_per_iteration": 2.750717878341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147898, + "balance_loss_mlp": 1.07084131, + "epoch": 0.6219699884570988, + "flos": 547099611648.0, + "grad_norm": 0.02871355385068571, + "language_loss": 0.9457683, + "learning_rate": 0.0003302265911869276, + "loss": 0.95724726, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.76953125, + "step": 3233, + "time_per_iteration": 2.930553436279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147703, + "balance_loss_mlp": 1.07059801, + "epoch": 0.6221623701423624, + "flos": 482155777536.0, + "grad_norm": 0.03278824818574476, + "language_loss": 0.89681149, + "learning_rate": 0.0003299335908437397, + "loss": 0.90828854, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.77001953, + "step": 3234, + "time_per_iteration": 2.5631237030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_mlp": 1.07211912, + "epoch": 0.622354751827626, + "flos": 380872008192.0, + "grad_norm": 0.04189689360611541, + "language_loss": 0.86520332, + "learning_rate": 0.0003296406565261873, + "loss": 0.8766942, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.76855469, + "step": 3235, + "time_per_iteration": 2.457258701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.07129693, + "epoch": 0.6225471335128896, + "flos": 669071144448.0, + "grad_norm": 0.03023362442836584, + "language_loss": 0.89682841, + "learning_rate": 0.0003293477883479978, + "loss": 0.90831059, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.76806641, + "step": 3236, + "time_per_iteration": 2.8200809955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148687, + "balance_loss_mlp": 1.07172537, + "epoch": 0.6227395151981532, + "flos": 772627660800.0, + "grad_norm": 0.038353629459733245, + "language_loss": 0.85627455, + "learning_rate": 0.0003290549864228727, + "loss": 0.86776143, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.76855469, + "step": 3237, + "time_per_iteration": 2.9402804374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151344, + "balance_loss_mlp": 1.07419205, + "epoch": 0.6229318968834167, + "flos": 485357779968.0, + "grad_norm": 0.030356371486713406, + "language_loss": 0.91371596, + "learning_rate": 0.0003287622508644875, + "loss": 0.92522943, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.77050781, + "step": 3238, + "time_per_iteration": 2.761613368988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152584, + "balance_loss_mlp": 1.07543159, + "epoch": 0.6231242785686802, + "flos": 463877021184.0, + "grad_norm": 0.03773116735562404, + "language_loss": 0.92044532, + "learning_rate": 0.0003284695817864923, + "loss": 0.93197119, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.77050781, + "step": 3239, + "time_per_iteration": 2.496115207672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152871, + "balance_loss_mlp": 1.07562304, + "epoch": 0.6233166602539438, + "flos": 610210404864.0, + "grad_norm": 0.04001521730964561, + "language_loss": 0.91216815, + "learning_rate": 0.0003281769793025116, + "loss": 0.92369688, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.77148438, + "step": 3240, + "time_per_iteration": 2.737149953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153333, + "balance_loss_mlp": 1.07613325, + "epoch": 0.6235090419392074, + "flos": 440114783232.0, + "grad_norm": 0.039001077055099004, + "language_loss": 0.95066154, + "learning_rate": 0.00032788444352614346, + "loss": 0.9621948, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.77099609, + "step": 3241, + "time_per_iteration": 2.5000274181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152362, + "balance_loss_mlp": 1.07520986, + "epoch": 0.6237014236244709, + "flos": 505900551168.0, + "grad_norm": 0.03351386174888394, + "language_loss": 0.86000109, + "learning_rate": 0.0003275919745709606, + "loss": 0.87152469, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.77050781, + "step": 3242, + "time_per_iteration": 2.5560779571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150845, + "balance_loss_mlp": 1.07359755, + "epoch": 0.6238938053097345, + "flos": 513995880960.0, + "grad_norm": 0.02989991495254077, + "language_loss": 0.86827087, + "learning_rate": 0.00032729957255050936, + "loss": 0.87977934, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.77148438, + "step": 3243, + "time_per_iteration": 2.7240655422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151606, + "balance_loss_mlp": 1.07440567, + "epoch": 0.6240861869949981, + "flos": 738021984768.0, + "grad_norm": 0.03287270457650662, + "language_loss": 0.87638962, + "learning_rate": 0.0003270072375783102, + "loss": 0.88790572, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.77099609, + "step": 3244, + "time_per_iteration": 2.9896130561828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151989, + "balance_loss_mlp": 1.07469356, + "epoch": 0.6242785686802617, + "flos": 495708392448.0, + "grad_norm": 0.032661081616998364, + "language_loss": 0.84373832, + "learning_rate": 0.00032671496976785774, + "loss": 0.85525823, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.77197266, + "step": 3245, + "time_per_iteration": 2.635254144668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152307, + "balance_loss_mlp": 1.0751549, + "epoch": 0.6244709503655252, + "flos": 747233221632.0, + "grad_norm": 0.0292375931838659, + "language_loss": 0.80339247, + "learning_rate": 0.0003264227692326205, + "loss": 0.81491554, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.77050781, + "step": 3246, + "time_per_iteration": 3.037773609161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152481, + "balance_loss_mlp": 1.07523346, + "epoch": 0.6246633320507887, + "flos": 493550438400.0, + "grad_norm": 0.03477244782189641, + "language_loss": 0.90644753, + "learning_rate": 0.00032613063608604055, + "loss": 0.91797233, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.77148438, + "step": 3247, + "time_per_iteration": 2.537938117980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151924, + "balance_loss_mlp": 1.07462883, + "epoch": 0.6248557137360523, + "flos": 518391653376.0, + "grad_norm": 0.03220304016525991, + "language_loss": 0.89104807, + "learning_rate": 0.0003258385704415343, + "loss": 0.90256733, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.77197266, + "step": 3248, + "time_per_iteration": 2.6050169467926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.08005083, + "epoch": 0.6250480954213159, + "flos": 520428083712.0, + "grad_norm": 0.030644735245645434, + "language_loss": 0.87455463, + "learning_rate": 0.0003255465724124915, + "loss": 0.88612568, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.76953125, + "step": 3249, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152527, + "balance_loss_mlp": 1.07532752, + "epoch": 0.6252404771065795, + "flos": 517069628928.0, + "grad_norm": 0.031780137669166014, + "language_loss": 0.87919134, + "learning_rate": 0.00032525464211227587, + "loss": 0.89071667, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.77099609, + "step": 3250, + "time_per_iteration": 2.601846933364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150948, + "balance_loss_mlp": 1.07403469, + "epoch": 0.6254328587918431, + "flos": 577996998144.0, + "grad_norm": 0.033725560308058275, + "language_loss": 0.90909386, + "learning_rate": 0.0003249627796542249, + "loss": 0.92060328, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.76806641, + "step": 3251, + "time_per_iteration": 2.653550148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152607, + "balance_loss_mlp": 1.07578814, + "epoch": 0.6256252404771065, + "flos": 599104453632.0, + "grad_norm": 0.030197281894512866, + "language_loss": 0.89177507, + "learning_rate": 0.00032467098515164943, + "loss": 0.90330118, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.76708984, + "step": 3252, + "time_per_iteration": 2.896319627761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153096, + "balance_loss_mlp": 1.07622945, + "epoch": 0.6258176221623701, + "flos": 509361063936.0, + "grad_norm": 0.03670659852857571, + "language_loss": 0.90126091, + "learning_rate": 0.00032437925871783456, + "loss": 0.91279185, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.76757812, + "step": 3253, + "time_per_iteration": 2.6326792240142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151923, + "balance_loss_mlp": 1.07500935, + "epoch": 0.6260100038476337, + "flos": 640804345344.0, + "grad_norm": 0.03617334498196145, + "language_loss": 0.90267026, + "learning_rate": 0.00032408760046603803, + "loss": 0.91418946, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.76806641, + "step": 3254, + "time_per_iteration": 2.803849697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151596, + "balance_loss_mlp": 1.07458711, + "epoch": 0.6262023855328973, + "flos": 842451360768.0, + "grad_norm": 0.034269487661108974, + "language_loss": 0.82522523, + "learning_rate": 0.00032379601050949193, + "loss": 0.83674121, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.76904297, + "step": 3255, + "time_per_iteration": 3.1005427837371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150946, + "balance_loss_mlp": 1.07422304, + "epoch": 0.6263947672181608, + "flos": 523156726272.0, + "grad_norm": 0.032816276182318284, + "language_loss": 0.93856758, + "learning_rate": 0.0003235044889614013, + "loss": 0.950077, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.76611328, + "step": 3256, + "time_per_iteration": 2.6180245876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151059, + "balance_loss_mlp": 1.07419276, + "epoch": 0.6265871489034244, + "flos": 608289494016.0, + "grad_norm": 0.03305761610211967, + "language_loss": 0.8896969, + "learning_rate": 0.0003232130359349451, + "loss": 0.90120745, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.76757812, + "step": 3257, + "time_per_iteration": 2.845158576965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152664, + "balance_loss_mlp": 1.07579827, + "epoch": 0.626779530588688, + "flos": 589593773568.0, + "grad_norm": 0.030590175923720698, + "language_loss": 0.86119747, + "learning_rate": 0.0003229216515432751, + "loss": 0.87272418, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.76757812, + "step": 3258, + "time_per_iteration": 2.776336193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151745, + "balance_loss_mlp": 1.07473612, + "epoch": 0.6269719122739515, + "flos": 439537363968.0, + "grad_norm": 0.03493081590414929, + "language_loss": 0.86540627, + "learning_rate": 0.0003226303358995174, + "loss": 0.87692368, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.76904297, + "step": 3259, + "time_per_iteration": 2.589393377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151325, + "balance_loss_mlp": 1.07431602, + "epoch": 0.6271642939592151, + "flos": 564014684160.0, + "grad_norm": 0.02751327310294224, + "language_loss": 0.92896867, + "learning_rate": 0.00032233908911677, + "loss": 0.9404819, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.76904297, + "step": 3260, + "time_per_iteration": 2.834845781326294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_mlp": 1.07153916, + "epoch": 0.6273566756444786, + "flos": 515652277248.0, + "grad_norm": 0.03305165048168085, + "language_loss": 0.86257023, + "learning_rate": 0.0003220479113081053, + "loss": 0.87405574, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.76904297, + "step": 3261, + "time_per_iteration": 2.7153472900390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151278, + "balance_loss_mlp": 1.07431674, + "epoch": 0.6275490573297422, + "flos": 586587154944.0, + "grad_norm": 0.03255760599660819, + "language_loss": 0.84347677, + "learning_rate": 0.00032175680258656836, + "loss": 0.85498953, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.76855469, + "step": 3262, + "time_per_iteration": 2.7178304195404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153298, + "balance_loss_mlp": 1.07638431, + "epoch": 0.6277414390150058, + "flos": 560543437824.0, + "grad_norm": 0.03084786969473793, + "language_loss": 0.84701777, + "learning_rate": 0.00032146576306517794, + "loss": 0.85855073, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.76806641, + "step": 3263, + "time_per_iteration": 2.730602502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153924, + "balance_loss_mlp": 1.07686687, + "epoch": 0.6279338207002694, + "flos": 613840104960.0, + "grad_norm": 0.03145910939226107, + "language_loss": 0.86918247, + "learning_rate": 0.0003211747928569255, + "loss": 0.88072169, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.76953125, + "step": 3264, + "time_per_iteration": 2.724712371826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155736, + "balance_loss_mlp": 1.07882273, + "epoch": 0.6281262023855329, + "flos": 626932821504.0, + "grad_norm": 0.028624354652689574, + "language_loss": 0.87177598, + "learning_rate": 0.0003208838920747754, + "loss": 0.88333333, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.76806641, + "step": 3265, + "time_per_iteration": 2.830962896347046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.07405066, + "epoch": 0.6283185840707964, + "flos": 1125418795008.0, + "grad_norm": 0.03154411123335471, + "language_loss": 0.82117403, + "learning_rate": 0.0003205930608316656, + "loss": 0.83268464, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.76904297, + "step": 3266, + "time_per_iteration": 3.4846274852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152648, + "balance_loss_mlp": 1.07573402, + "epoch": 0.62851096575606, + "flos": 516331754496.0, + "grad_norm": 0.032694316072136534, + "language_loss": 0.89774895, + "learning_rate": 0.00032030229924050673, + "loss": 0.90927541, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.76806641, + "step": 3267, + "time_per_iteration": 2.6537904739379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150261, + "balance_loss_mlp": 1.07320464, + "epoch": 0.6287033474413236, + "flos": 405061943808.0, + "grad_norm": 0.03610764341116815, + "language_loss": 0.86379248, + "learning_rate": 0.00032001160741418247, + "loss": 0.8752951, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.76953125, + "step": 3268, + "time_per_iteration": 2.6072278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149975, + "balance_loss_mlp": 1.0729655, + "epoch": 0.6288957291265872, + "flos": 526758228480.0, + "grad_norm": 0.03519251125136882, + "language_loss": 0.87577492, + "learning_rate": 0.0003197209854655494, + "loss": 0.88727468, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.76904297, + "step": 3269, + "time_per_iteration": 2.624221086502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151498, + "balance_loss_mlp": 1.07458413, + "epoch": 0.6290881108118507, + "flos": 604957235712.0, + "grad_norm": 0.03303529236450534, + "language_loss": 0.79662859, + "learning_rate": 0.0003194304335074371, + "loss": 0.80814356, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.76806641, + "step": 3270, + "time_per_iteration": 2.842299461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153904, + "balance_loss_mlp": 1.07703781, + "epoch": 0.6292804924971143, + "flos": 438597374976.0, + "grad_norm": 0.03323676651467279, + "language_loss": 0.93520898, + "learning_rate": 0.0003191399516526475, + "loss": 0.94674796, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.76757812, + "step": 3271, + "time_per_iteration": 2.534921169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151771, + "balance_loss_mlp": 1.07500029, + "epoch": 0.6294728741823779, + "flos": 607844332032.0, + "grad_norm": 0.029188592887849887, + "language_loss": 0.84005713, + "learning_rate": 0.0003188495400139559, + "loss": 0.8515749, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.76660156, + "step": 3272, + "time_per_iteration": 2.783825397491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149208, + "balance_loss_mlp": 1.07229424, + "epoch": 0.6296652558676414, + "flos": 702773761536.0, + "grad_norm": 0.03427526038841549, + "language_loss": 0.89267194, + "learning_rate": 0.00031855919870411013, + "loss": 0.90416408, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.76806641, + "step": 3273, + "time_per_iteration": 2.8276174068450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148805, + "balance_loss_mlp": 1.07189095, + "epoch": 0.6298576375529049, + "flos": 524943378432.0, + "grad_norm": 0.029237647029809653, + "language_loss": 0.89991713, + "learning_rate": 0.0003182689278358305, + "loss": 0.91140521, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.76806641, + "step": 3274, + "time_per_iteration": 2.706908941268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148394, + "balance_loss_mlp": 1.07143247, + "epoch": 0.6300500192381685, + "flos": 476926076928.0, + "grad_norm": 0.034587260543346605, + "language_loss": 0.85421312, + "learning_rate": 0.0003179787275218105, + "loss": 0.86569709, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.76855469, + "step": 3275, + "time_per_iteration": 2.537382125854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147946, + "balance_loss_mlp": 1.07117569, + "epoch": 0.6302424009234321, + "flos": 521891097600.0, + "grad_norm": 0.02794771765960627, + "language_loss": 0.8894403, + "learning_rate": 0.0003176885978747155, + "loss": 0.9009198, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.76660156, + "step": 3276, + "time_per_iteration": 2.6045258045196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148975, + "balance_loss_mlp": 1.07225204, + "epoch": 0.6304347826086957, + "flos": 695857465344.0, + "grad_norm": 0.03251661514625025, + "language_loss": 0.87684363, + "learning_rate": 0.0003173985390071839, + "loss": 0.88833332, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.76611328, + "step": 3277, + "time_per_iteration": 2.858759641647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167786, + "balance_loss_mlp": 1.09187317, + "epoch": 0.6306271642939593, + "flos": 1470030183936.0, + "grad_norm": 0.015221211739027024, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.79068244, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.7578125, + "step": 3278, + "time_per_iteration": 4.767859220504761 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148, + "balance_loss_mlp": 1.07122958, + "epoch": 0.6308195459792227, + "flos": 602929537536.0, + "grad_norm": 0.03309702536338572, + "language_loss": 0.87110293, + "learning_rate": 0.00031681863406122704, + "loss": 0.8825829, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.76660156, + "step": 3279, + "time_per_iteration": 2.7526352405548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151009, + "balance_loss_mlp": 1.0742383, + "epoch": 0.6310119276644863, + "flos": 728236056576.0, + "grad_norm": 0.03127249771985471, + "language_loss": 0.90830934, + "learning_rate": 0.00031652878820794087, + "loss": 0.91981947, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.76660156, + "step": 3280, + "time_per_iteration": 2.980374813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152955, + "balance_loss_mlp": 1.07623196, + "epoch": 0.6312043093497499, + "flos": 520818851328.0, + "grad_norm": 0.035871108010903825, + "language_loss": 0.91415131, + "learning_rate": 0.00031623901358449627, + "loss": 0.92568088, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.76611328, + "step": 3281, + "time_per_iteration": 2.6661479473114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153626, + "balance_loss_mlp": 1.07685518, + "epoch": 0.6313966910350135, + "flos": 532222244352.0, + "grad_norm": 0.03104696980992861, + "language_loss": 0.93473637, + "learning_rate": 0.0003159493103033936, + "loss": 0.94627267, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.76660156, + "step": 3282, + "time_per_iteration": 2.7015254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156189, + "balance_loss_mlp": 1.08065796, + "epoch": 0.631589072720277, + "flos": 1382993969664.0, + "grad_norm": 0.006807831796281711, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.81075245, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.75585938, + "step": 3283, + "time_per_iteration": 4.893282890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153518, + "balance_loss_mlp": 1.07674742, + "epoch": 0.6317814544055406, + "flos": 625873310208.0, + "grad_norm": 0.03000778283215098, + "language_loss": 0.87091964, + "learning_rate": 0.0003153701182180776, + "loss": 0.88245487, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.76660156, + "step": 3284, + "time_per_iteration": 2.785921335220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153113, + "balance_loss_mlp": 1.07643747, + "epoch": 0.6319738360908042, + "flos": 499097046528.0, + "grad_norm": 0.030580966863201303, + "language_loss": 0.86424339, + "learning_rate": 0.00031508062963872655, + "loss": 0.8757745, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.765625, + "step": 3285, + "time_per_iteration": 2.6083192825317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152348, + "balance_loss_mlp": 1.07567286, + "epoch": 0.6321662177760677, + "flos": 580908289536.0, + "grad_norm": 0.03249956938477427, + "language_loss": 0.84091449, + "learning_rate": 0.0003147912128514423, + "loss": 0.85243797, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.765625, + "step": 3286, + "time_per_iteration": 2.7065303325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114775, + "balance_loss_mlp": 1.07107508, + "epoch": 0.6323585994613313, + "flos": 602605899264.0, + "grad_norm": 0.03060189068927108, + "language_loss": 0.92241961, + "learning_rate": 0.0003145018679685859, + "loss": 0.93389714, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.765625, + "step": 3287, + "time_per_iteration": 2.724647045135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147567, + "balance_loss_mlp": 1.07093954, + "epoch": 0.6325509811465948, + "flos": 529632589824.0, + "grad_norm": 0.026442764297463384, + "language_loss": 0.9133988, + "learning_rate": 0.00031421259510249134, + "loss": 0.92487442, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.76513672, + "step": 3288, + "time_per_iteration": 2.7890970706939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146868, + "balance_loss_mlp": 1.07019234, + "epoch": 0.6327433628318584, + "flos": 575344217088.0, + "grad_norm": 0.03165563146125425, + "language_loss": 0.8638919, + "learning_rate": 0.00031392339436546414, + "loss": 0.87536061, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.765625, + "step": 3289, + "time_per_iteration": 2.8359181880950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147105, + "balance_loss_mlp": 1.07042992, + "epoch": 0.632935744517122, + "flos": 518111675904.0, + "grad_norm": 0.040669622782204255, + "language_loss": 0.87612778, + "learning_rate": 0.00031363426586978205, + "loss": 0.88759887, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.765625, + "step": 3290, + "time_per_iteration": 2.755444288253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148795, + "balance_loss_mlp": 1.07216728, + "epoch": 0.6331281262023856, + "flos": 618596445696.0, + "grad_norm": 0.029293061792341625, + "language_loss": 0.89532119, + "learning_rate": 0.0003133452097276947, + "loss": 0.90680915, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.76513672, + "step": 3291, + "time_per_iteration": 2.731522560119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153422, + "balance_loss_mlp": 1.07674634, + "epoch": 0.633320507887649, + "flos": 594115799040.0, + "grad_norm": 0.032525593419921936, + "language_loss": 0.88528687, + "learning_rate": 0.0003130562260514238, + "loss": 0.89682108, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.765625, + "step": 3292, + "time_per_iteration": 2.7816312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150685, + "balance_loss_mlp": 1.07396197, + "epoch": 0.6335128895729126, + "flos": 583495942656.0, + "grad_norm": 0.0277750610234457, + "language_loss": 0.86754191, + "learning_rate": 0.0003127673149531626, + "loss": 0.87904876, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.76611328, + "step": 3293, + "time_per_iteration": 2.7256717681884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151215, + "balance_loss_mlp": 1.0744915, + "epoch": 0.6337052712581762, + "flos": 453973572096.0, + "grad_norm": 0.0366063114700609, + "language_loss": 0.89718056, + "learning_rate": 0.0003124784765450762, + "loss": 0.90869272, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.76611328, + "step": 3294, + "time_per_iteration": 2.557979106903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152531, + "balance_loss_mlp": 1.07585573, + "epoch": 0.6338976529434398, + "flos": 574515018240.0, + "grad_norm": 0.03914872981780459, + "language_loss": 0.86348414, + "learning_rate": 0.0003121897109393017, + "loss": 0.87500942, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.765625, + "step": 3295, + "time_per_iteration": 2.7648093700408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150647, + "balance_loss_mlp": 1.0738759, + "epoch": 0.6340900346287034, + "flos": 509808227328.0, + "grad_norm": 0.03170073477682662, + "language_loss": 0.93116355, + "learning_rate": 0.0003119010182479481, + "loss": 0.94267005, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.76660156, + "step": 3296, + "time_per_iteration": 2.6290597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152399, + "balance_loss_mlp": 1.07562852, + "epoch": 0.6342824163139669, + "flos": 480714230784.0, + "grad_norm": 0.034261076448020254, + "language_loss": 0.8817153, + "learning_rate": 0.00031161239858309563, + "loss": 0.89323932, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.76660156, + "step": 3297, + "time_per_iteration": 2.5535776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152393, + "balance_loss_mlp": 1.07571757, + "epoch": 0.6344747979992305, + "flos": 573110401536.0, + "grad_norm": 0.038934995330749234, + "language_loss": 0.89182544, + "learning_rate": 0.0003113238520567964, + "loss": 0.9033494, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.765625, + "step": 3298, + "time_per_iteration": 2.6296586990356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151588, + "balance_loss_mlp": 1.07486486, + "epoch": 0.634667179684494, + "flos": 607045332480.0, + "grad_norm": 0.035281643877612956, + "language_loss": 0.86709571, + "learning_rate": 0.00031103537878107403, + "loss": 0.87861156, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.76611328, + "step": 3299, + "time_per_iteration": 2.7374937534332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156066, + "balance_loss_mlp": 1.07934332, + "epoch": 0.6348595613697576, + "flos": 648128873472.0, + "grad_norm": 0.04012685096431152, + "language_loss": 0.85757369, + "learning_rate": 0.0003107469788679238, + "loss": 0.86913437, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.76611328, + "step": 3300, + "time_per_iteration": 2.763896942138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150162, + "balance_loss_mlp": 1.07329571, + "epoch": 0.6350519430550212, + "flos": 640272588288.0, + "grad_norm": 0.03353321054785192, + "language_loss": 0.91748559, + "learning_rate": 0.00031045865242931267, + "loss": 0.92898715, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.76757812, + "step": 3301, + "time_per_iteration": 2.775559186935425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115092, + "balance_loss_mlp": 1.07405412, + "epoch": 0.6352443247402847, + "flos": 687829991424.0, + "grad_norm": 0.033769350364135475, + "language_loss": 0.89046073, + "learning_rate": 0.00031017039957717877, + "loss": 0.90196997, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.76757812, + "step": 3302, + "time_per_iteration": 2.9990227222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150719, + "balance_loss_mlp": 1.07399607, + "epoch": 0.6354367064255483, + "flos": 560525973504.0, + "grad_norm": 0.03207500130867294, + "language_loss": 0.93455017, + "learning_rate": 0.0003098822204234318, + "loss": 0.94605732, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.76611328, + "step": 3303, + "time_per_iteration": 2.6589555740356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149713, + "balance_loss_mlp": 1.07294202, + "epoch": 0.6356290881108119, + "flos": 981060716544.0, + "grad_norm": 0.03119033938257745, + "language_loss": 0.92425978, + "learning_rate": 0.00030959411507995273, + "loss": 0.93575692, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.76660156, + "step": 3304, + "time_per_iteration": 3.2027275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156004, + "balance_loss_mlp": 1.07932901, + "epoch": 0.6358214697960755, + "flos": 529372078080.0, + "grad_norm": 0.037691107664773085, + "language_loss": 0.88209277, + "learning_rate": 0.00030930608365859407, + "loss": 0.8936528, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.765625, + "step": 3305, + "time_per_iteration": 2.672909736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153167, + "balance_loss_mlp": 1.07663476, + "epoch": 0.6360138514813389, + "flos": 517868628480.0, + "grad_norm": 0.0314628318508628, + "language_loss": 0.93278992, + "learning_rate": 0.00030901812627117943, + "loss": 0.94432157, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.76416016, + "step": 3306, + "time_per_iteration": 2.6096842288970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152004, + "balance_loss_mlp": 1.07556736, + "epoch": 0.6362062331666025, + "flos": 467469791232.0, + "grad_norm": 0.03698857716885425, + "language_loss": 0.90082693, + "learning_rate": 0.000308730243029504, + "loss": 0.91234696, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.76318359, + "step": 3307, + "time_per_iteration": 2.625368595123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148049, + "balance_loss_mlp": 1.07151699, + "epoch": 0.6363986148518661, + "flos": 550772246016.0, + "grad_norm": 0.03499213724407888, + "language_loss": 0.85284883, + "learning_rate": 0.0003084424340453339, + "loss": 0.86432934, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.76416016, + "step": 3308, + "time_per_iteration": 2.79801082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154866, + "balance_loss_mlp": 1.07842863, + "epoch": 0.6365909965371297, + "flos": 584157955584.0, + "grad_norm": 0.034280921655294554, + "language_loss": 0.87936795, + "learning_rate": 0.0003081546994304064, + "loss": 0.89091659, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.76318359, + "step": 3309, + "time_per_iteration": 2.805798053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151839, + "balance_loss_mlp": 1.0753541, + "epoch": 0.6367833782223933, + "flos": 532287372288.0, + "grad_norm": 0.031184654205402413, + "language_loss": 0.87230557, + "learning_rate": 0.0003078670392964298, + "loss": 0.88382399, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.76367188, + "step": 3310, + "time_per_iteration": 2.637089729309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114879, + "balance_loss_mlp": 1.07211447, + "epoch": 0.6369757599076568, + "flos": 570587876352.0, + "grad_norm": 0.03249753882493018, + "language_loss": 0.8737638, + "learning_rate": 0.00030757945375508406, + "loss": 0.88525176, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.765625, + "step": 3311, + "time_per_iteration": 2.6652672290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157923, + "balance_loss_mlp": 1.08139026, + "epoch": 0.6371681415929203, + "flos": 541053447168.0, + "grad_norm": 0.03561310839394214, + "language_loss": 0.86446404, + "learning_rate": 0.00030729194291801944, + "loss": 0.8760432, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.76416016, + "step": 3312, + "time_per_iteration": 2.685426712036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152462, + "balance_loss_mlp": 1.07588232, + "epoch": 0.6373605232781839, + "flos": 484530582528.0, + "grad_norm": 0.03615999538834489, + "language_loss": 0.82315236, + "learning_rate": 0.00030700450689685787, + "loss": 0.83467698, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.76464844, + "step": 3313, + "time_per_iteration": 2.5285892486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115278, + "balance_loss_mlp": 1.07629561, + "epoch": 0.6375529049634475, + "flos": 579816577536.0, + "grad_norm": 0.031570559387627636, + "language_loss": 0.90687287, + "learning_rate": 0.00030671714580319186, + "loss": 0.91840065, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.76367188, + "step": 3314, + "time_per_iteration": 2.7918403148651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149531, + "balance_loss_mlp": 1.07290328, + "epoch": 0.637745286648711, + "flos": 683479154688.0, + "grad_norm": 0.03649458581150707, + "language_loss": 0.8839801, + "learning_rate": 0.0003064298597485846, + "loss": 0.89547539, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.76513672, + "step": 3315, + "time_per_iteration": 2.8336853981018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157103, + "balance_loss_mlp": 1.08066618, + "epoch": 0.6379376683339746, + "flos": 505648771584.0, + "grad_norm": 0.03434060192765891, + "language_loss": 0.89178324, + "learning_rate": 0.00030614264884457054, + "loss": 0.90335435, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.76318359, + "step": 3316, + "time_per_iteration": 2.610029697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156868, + "balance_loss_mlp": 1.08038342, + "epoch": 0.6381300500192382, + "flos": 503024188416.0, + "grad_norm": 0.037738287263273475, + "language_loss": 0.83208811, + "learning_rate": 0.000305855513202655, + "loss": 0.8436569, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.76367188, + "step": 3317, + "time_per_iteration": 2.56390118598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115293, + "balance_loss_mlp": 1.07663572, + "epoch": 0.6383224317045018, + "flos": 401367115776.0, + "grad_norm": 0.03934464683594442, + "language_loss": 0.83537889, + "learning_rate": 0.0003055684529343138, + "loss": 0.84690815, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.76171875, + "step": 3318, + "time_per_iteration": 2.4260315895080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011523, + "balance_loss_mlp": 1.07600558, + "epoch": 0.6385148133897653, + "flos": 500362675200.0, + "grad_norm": 0.03558980854731561, + "language_loss": 0.8376438, + "learning_rate": 0.00030528146815099374, + "loss": 0.84916675, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.76171875, + "step": 3319, + "time_per_iteration": 2.6329188346862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151029, + "balance_loss_mlp": 1.07468724, + "epoch": 0.6387071950750288, + "flos": 528694602240.0, + "grad_norm": 0.0315122399919932, + "language_loss": 0.76854849, + "learning_rate": 0.00030499455896411203, + "loss": 0.78005874, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.76220703, + "step": 3320, + "time_per_iteration": 2.6750285625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156959, + "balance_loss_mlp": 1.0823822, + "epoch": 0.6388995767602924, + "flos": 1459104153600.0, + "grad_norm": 0.009844305017815533, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77457774, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.74609375, + "step": 3321, + "time_per_iteration": 4.953099489212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151078, + "balance_loss_mlp": 1.07459378, + "epoch": 0.639091958445556, + "flos": 605170083840.0, + "grad_norm": 0.03456514545296231, + "language_loss": 0.8206768, + "learning_rate": 0.0003044209678251865, + "loss": 0.83218759, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.76367188, + "step": 3322, + "time_per_iteration": 2.8895435333251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149613, + "balance_loss_mlp": 1.07312858, + "epoch": 0.6392843401308196, + "flos": 585664630272.0, + "grad_norm": 0.030325412861609304, + "language_loss": 0.89598596, + "learning_rate": 0.0003041342860958306, + "loss": 0.90748215, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.76367188, + "step": 3323, + "time_per_iteration": 2.8267457485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115059, + "balance_loss_mlp": 1.07401037, + "epoch": 0.6394767218160831, + "flos": 515728138752.0, + "grad_norm": 0.035461056589808096, + "language_loss": 0.97089493, + "learning_rate": 0.00030384768040828857, + "loss": 0.98240083, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.76464844, + "step": 3324, + "time_per_iteration": 2.6604483127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147614, + "balance_loss_mlp": 1.07127237, + "epoch": 0.6396691035013466, + "flos": 542776972800.0, + "grad_norm": 0.029879293671496117, + "language_loss": 0.90136957, + "learning_rate": 0.00030356115087383094, + "loss": 0.91284573, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.76220703, + "step": 3325, + "time_per_iteration": 2.61624813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151957, + "balance_loss_mlp": 1.07561517, + "epoch": 0.6398614851866102, + "flos": 526554112512.0, + "grad_norm": 0.03633717350328365, + "language_loss": 0.8974539, + "learning_rate": 0.00030327469760369803, + "loss": 0.90897352, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.76220703, + "step": 3326, + "time_per_iteration": 2.5705959796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.0753485, + "epoch": 0.6400538668718738, + "flos": 624134321664.0, + "grad_norm": 0.04101147906430089, + "language_loss": 0.90274537, + "learning_rate": 0.0003029883207091009, + "loss": 0.91426039, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.76025391, + "step": 3327, + "time_per_iteration": 2.710705280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153226, + "balance_loss_mlp": 1.07712281, + "epoch": 0.6402462485571374, + "flos": 504455001600.0, + "grad_norm": 0.03565756181750687, + "language_loss": 0.8369143, + "learning_rate": 0.00030270202030122095, + "loss": 0.84844655, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.75976562, + "step": 3328, + "time_per_iteration": 2.6669437885284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.07706201, + "epoch": 0.6404386302424009, + "flos": 820662426624.0, + "grad_norm": 0.035758844093176624, + "language_loss": 0.90348649, + "learning_rate": 0.00030241579649121, + "loss": 0.91501862, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.76025391, + "step": 3329, + "time_per_iteration": 2.9946744441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153577, + "balance_loss_mlp": 1.07747424, + "epoch": 0.6406310119276645, + "flos": 472792817664.0, + "grad_norm": 0.031682669944134774, + "language_loss": 0.84166616, + "learning_rate": 0.00030212964939018994, + "loss": 0.85320187, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.75976562, + "step": 3330, + "time_per_iteration": 2.529780864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153483, + "balance_loss_mlp": 1.07738006, + "epoch": 0.6408233936129281, + "flos": 426488308224.0, + "grad_norm": 0.0317787576762172, + "language_loss": 0.90697497, + "learning_rate": 0.0003018435791092527, + "loss": 0.91850984, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.75976562, + "step": 3331, + "time_per_iteration": 2.482226848602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154163, + "balance_loss_mlp": 1.07810771, + "epoch": 0.6410157752981916, + "flos": 550837373952.0, + "grad_norm": 0.03245017993162029, + "language_loss": 0.86073428, + "learning_rate": 0.00030155758575946083, + "loss": 0.87227595, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.75927734, + "step": 3332, + "time_per_iteration": 2.7268691062927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154009, + "balance_loss_mlp": 1.07785761, + "epoch": 0.6412081569834551, + "flos": 476860948992.0, + "grad_norm": 0.03331397331841687, + "language_loss": 0.88895929, + "learning_rate": 0.0003012716694518467, + "loss": 0.9004994, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.76025391, + "step": 3333, + "time_per_iteration": 2.5955138206481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154559, + "balance_loss_mlp": 1.07845628, + "epoch": 0.6414005386687187, + "flos": 542030366208.0, + "grad_norm": 0.03145594160852774, + "language_loss": 0.89824158, + "learning_rate": 0.000300985830297413, + "loss": 0.90978718, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.75976562, + "step": 3334, + "time_per_iteration": 2.675809144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151014, + "balance_loss_mlp": 1.07476771, + "epoch": 0.6415929203539823, + "flos": 1042956272640.0, + "grad_norm": 0.03442120912103133, + "language_loss": 0.92276573, + "learning_rate": 0.00030070006840713205, + "loss": 0.93427593, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.76123047, + "step": 3335, + "time_per_iteration": 3.3598873615264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153261, + "balance_loss_mlp": 1.07696736, + "epoch": 0.6417853020392459, + "flos": 649579152384.0, + "grad_norm": 0.03234716357342597, + "language_loss": 0.78466761, + "learning_rate": 0.000300414383891947, + "loss": 0.79620028, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.76171875, + "step": 3336, + "time_per_iteration": 2.8177781105041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153044, + "balance_loss_mlp": 1.07679784, + "epoch": 0.6419776837245095, + "flos": 501943209984.0, + "grad_norm": 0.029578655992370296, + "language_loss": 0.93100476, + "learning_rate": 0.00030012877686276973, + "loss": 0.94253522, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.76123047, + "step": 3337, + "time_per_iteration": 2.6656994819641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153274, + "balance_loss_mlp": 1.07688439, + "epoch": 0.642170065409773, + "flos": 621778982400.0, + "grad_norm": 0.030467733780945628, + "language_loss": 0.91408634, + "learning_rate": 0.0002998432474304832, + "loss": 0.92561901, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.76269531, + "step": 3338, + "time_per_iteration": 2.7804837226867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156387, + "balance_loss_mlp": 1.08161926, + "epoch": 0.6423624470950365, + "flos": 1426638967296.0, + "grad_norm": 0.010632522477168303, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80393732, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.74804688, + "step": 3339, + "time_per_iteration": 4.905744791030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151035, + "balance_loss_mlp": 1.07493174, + "epoch": 0.6425548287803001, + "flos": 563439266304.0, + "grad_norm": 0.028877045256785867, + "language_loss": 0.92764187, + "learning_rate": 0.00029927242179996107, + "loss": 0.93915224, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.75976562, + "step": 3340, + "time_per_iteration": 2.6661758422851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145869, + "balance_loss_mlp": 1.0697186, + "epoch": 0.6427472104655637, + "flos": 586613351424.0, + "grad_norm": 0.0300822513158231, + "language_loss": 0.88234377, + "learning_rate": 0.0002989871258233398, + "loss": 0.8938024, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.76025391, + "step": 3341, + "time_per_iteration": 2.7374660968780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144033, + "balance_loss_mlp": 1.06773865, + "epoch": 0.6429395921508272, + "flos": 405146537472.0, + "grad_norm": 0.038389287644004705, + "language_loss": 0.88664877, + "learning_rate": 0.0002987019078868373, + "loss": 0.89808905, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.76171875, + "step": 3342, + "time_per_iteration": 2.4243760108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140022, + "balance_loss_mlp": 1.06377542, + "epoch": 0.6431319738360908, + "flos": 549832257024.0, + "grad_norm": 0.03024016811094423, + "language_loss": 0.8722378, + "learning_rate": 0.00029841676810118484, + "loss": 0.88363802, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.76123047, + "step": 3343, + "time_per_iteration": 2.6617236137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147432, + "balance_loss_mlp": 1.07118535, + "epoch": 0.6433243555213544, + "flos": 794705304576.0, + "grad_norm": 0.037506118612829445, + "language_loss": 0.92627275, + "learning_rate": 0.0002981317065770839, + "loss": 0.93774706, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.76123047, + "step": 3344, + "time_per_iteration": 3.082211494445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149276, + "balance_loss_mlp": 1.07288682, + "epoch": 0.643516737206618, + "flos": 584112293376.0, + "grad_norm": 0.03767314060719249, + "language_loss": 0.87199879, + "learning_rate": 0.00029784672342520493, + "loss": 0.88349158, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.76269531, + "step": 3345, + "time_per_iteration": 2.7258007526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114915, + "balance_loss_mlp": 1.07276022, + "epoch": 0.6437091188918815, + "flos": 519750607872.0, + "grad_norm": 0.03533085288020931, + "language_loss": 0.88640958, + "learning_rate": 0.00029756181875618834, + "loss": 0.89790106, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.76269531, + "step": 3346, + "time_per_iteration": 2.569779634475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144811, + "balance_loss_mlp": 1.06846941, + "epoch": 0.643901500577145, + "flos": 385786802688.0, + "grad_norm": 0.034542585210818905, + "language_loss": 0.89738131, + "learning_rate": 0.0002972769926806439, + "loss": 0.90882939, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.76220703, + "step": 3347, + "time_per_iteration": 2.497853994369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147673, + "balance_loss_mlp": 1.07128322, + "epoch": 0.6440938822624086, + "flos": 484697768448.0, + "grad_norm": 0.03553288196721846, + "language_loss": 0.94382805, + "learning_rate": 0.0002969922453091508, + "loss": 0.95530474, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.76269531, + "step": 3348, + "time_per_iteration": 2.5491795539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147538, + "balance_loss_mlp": 1.07124412, + "epoch": 0.6442862639476722, + "flos": 541637597184.0, + "grad_norm": 0.03037104728594501, + "language_loss": 0.89609063, + "learning_rate": 0.00029670757675225777, + "loss": 0.90756601, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.76171875, + "step": 3349, + "time_per_iteration": 2.721752882003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148396, + "balance_loss_mlp": 1.07234049, + "epoch": 0.6444786456329358, + "flos": 527958729216.0, + "grad_norm": 0.03079951019721412, + "language_loss": 0.85068369, + "learning_rate": 0.0002964229871204831, + "loss": 0.8621676, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.75927734, + "step": 3350, + "time_per_iteration": 2.6219635009765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146722, + "balance_loss_mlp": 1.07076228, + "epoch": 0.6446710273181993, + "flos": 699161525760.0, + "grad_norm": 0.03075522523020309, + "language_loss": 0.88979256, + "learning_rate": 0.00029613847652431403, + "loss": 0.90125972, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.75830078, + "step": 3351, + "time_per_iteration": 2.8463754653930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143843, + "balance_loss_mlp": 1.06778741, + "epoch": 0.6448634090034628, + "flos": 626299006464.0, + "grad_norm": 0.030404862420189395, + "language_loss": 0.8409062, + "learning_rate": 0.0002958540450742078, + "loss": 0.85234463, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.75927734, + "step": 3352, + "time_per_iteration": 2.9119668006896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145662, + "balance_loss_mlp": 1.0695591, + "epoch": 0.6450557906887264, + "flos": 602165466624.0, + "grad_norm": 0.030375965559079645, + "language_loss": 0.81268156, + "learning_rate": 0.0002955696928805901, + "loss": 0.82413822, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.75976562, + "step": 3353, + "time_per_iteration": 2.8792967796325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146049, + "balance_loss_mlp": 1.06989837, + "epoch": 0.64524817237399, + "flos": 647384268288.0, + "grad_norm": 0.032745807535614124, + "language_loss": 0.90629518, + "learning_rate": 0.0002952854200538563, + "loss": 0.91775572, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.76025391, + "step": 3354, + "time_per_iteration": 2.7729763984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144907, + "balance_loss_mlp": 1.06870866, + "epoch": 0.6454405540592536, + "flos": 474366621696.0, + "grad_norm": 0.04216820116254093, + "language_loss": 0.87584448, + "learning_rate": 0.000295001226704371, + "loss": 0.88729358, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.76074219, + "step": 3355, + "time_per_iteration": 2.5655300617218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146195, + "balance_loss_mlp": 1.06994879, + "epoch": 0.6456329357445171, + "flos": 613019638272.0, + "grad_norm": 0.03469469169647009, + "language_loss": 0.88972664, + "learning_rate": 0.00029471711294246783, + "loss": 0.90118861, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.76123047, + "step": 3356, + "time_per_iteration": 2.7737839221954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149475, + "balance_loss_mlp": 1.07322907, + "epoch": 0.6458253174297807, + "flos": 732931272192.0, + "grad_norm": 0.03845226629357448, + "language_loss": 0.87651891, + "learning_rate": 0.0002944330788784494, + "loss": 0.88801372, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.76123047, + "step": 3357, + "time_per_iteration": 2.9011571407318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151964, + "balance_loss_mlp": 1.07552743, + "epoch": 0.6460176991150443, + "flos": 571554061824.0, + "grad_norm": 0.03220756952294772, + "language_loss": 0.89507246, + "learning_rate": 0.00029414912462258786, + "loss": 0.90659207, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.76318359, + "step": 3358, + "time_per_iteration": 2.87532901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150464, + "balance_loss_mlp": 1.07397914, + "epoch": 0.6462100808003078, + "flos": 584242549248.0, + "grad_norm": 0.034688747990618336, + "language_loss": 0.87649322, + "learning_rate": 0.00029386525028512366, + "loss": 0.88799781, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.76367188, + "step": 3359, + "time_per_iteration": 2.701509714126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115358, + "balance_loss_mlp": 1.07709527, + "epoch": 0.6464024624855714, + "flos": 485010673152.0, + "grad_norm": 0.035268388031257245, + "language_loss": 0.92228907, + "learning_rate": 0.0002935814559762666, + "loss": 0.9338249, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.76367188, + "step": 3360, + "time_per_iteration": 2.7698283195495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149719, + "balance_loss_mlp": 1.07332945, + "epoch": 0.6465948441708349, + "flos": 528842322432.0, + "grad_norm": 0.029604921797993008, + "language_loss": 0.84675246, + "learning_rate": 0.0002932977418061957, + "loss": 0.85824966, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.76269531, + "step": 3361, + "time_per_iteration": 2.637636661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148892, + "balance_loss_mlp": 1.07245517, + "epoch": 0.6467872258560985, + "flos": 670625482752.0, + "grad_norm": 0.035318648220588056, + "language_loss": 0.86576068, + "learning_rate": 0.00029301410788505833, + "loss": 0.8772496, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.76318359, + "step": 3362, + "time_per_iteration": 2.7763969898223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144507, + "balance_loss_mlp": 1.06826067, + "epoch": 0.6469796075413621, + "flos": 433040033280.0, + "grad_norm": 0.03731380273504302, + "language_loss": 0.87366712, + "learning_rate": 0.00029273055432297126, + "loss": 0.88511223, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.76123047, + "step": 3363, + "time_per_iteration": 2.5110268592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.06842613, + "epoch": 0.6471719892266257, + "flos": 805101579264.0, + "grad_norm": 0.03447928292768335, + "language_loss": 0.85973775, + "learning_rate": 0.00029244708123001917, + "loss": 0.87118536, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.76220703, + "step": 3364, + "time_per_iteration": 2.9464926719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145161, + "balance_loss_mlp": 1.06896257, + "epoch": 0.6473643709118891, + "flos": 578348834304.0, + "grad_norm": 0.03376367371908884, + "language_loss": 0.88996613, + "learning_rate": 0.0002921636887162565, + "loss": 0.90141773, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.76074219, + "step": 3365, + "time_per_iteration": 2.7177810668945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144996, + "balance_loss_mlp": 1.06879795, + "epoch": 0.6475567525971527, + "flos": 762787338240.0, + "grad_norm": 0.03409968089483679, + "language_loss": 0.89139444, + "learning_rate": 0.00029188037689170595, + "loss": 0.90284443, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.76074219, + "step": 3366, + "time_per_iteration": 2.94266676902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144468, + "balance_loss_mlp": 1.06817389, + "epoch": 0.6477491342824163, + "flos": 844500526080.0, + "grad_norm": 0.03525364957484555, + "language_loss": 0.88880944, + "learning_rate": 0.0002915971458663586, + "loss": 0.90025413, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.76171875, + "step": 3367, + "time_per_iteration": 3.037111282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144688, + "balance_loss_mlp": 1.06844163, + "epoch": 0.6479415159676799, + "flos": 886381065216.0, + "grad_norm": 0.02613941789873103, + "language_loss": 0.85508728, + "learning_rate": 0.00029131399575017494, + "loss": 0.86653411, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.76123047, + "step": 3368, + "time_per_iteration": 3.1630287170410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144843, + "balance_loss_mlp": 1.06859708, + "epoch": 0.6481338976529435, + "flos": 616723198464.0, + "grad_norm": 0.02777106453890135, + "language_loss": 0.9063583, + "learning_rate": 0.0002910309266530836, + "loss": 0.91780674, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.76123047, + "step": 3369, + "time_per_iteration": 2.7928354740142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154935, + "balance_loss_mlp": 1.07859313, + "epoch": 0.648326279338207, + "flos": 511019461632.0, + "grad_norm": 0.03366950054230419, + "language_loss": 0.90075457, + "learning_rate": 0.0002907479386849814, + "loss": 0.91230392, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.76220703, + "step": 3370, + "time_per_iteration": 2.673582077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154894, + "balance_loss_mlp": 1.07869589, + "epoch": 0.6485186610234706, + "flos": 703868201472.0, + "grad_norm": 0.031297921332288904, + "language_loss": 0.8459866, + "learning_rate": 0.0002904650319557339, + "loss": 0.8575356, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.76074219, + "step": 3371, + "time_per_iteration": 2.984816789627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149879, + "balance_loss_mlp": 1.07358491, + "epoch": 0.6487110427087341, + "flos": 561745939968.0, + "grad_norm": 0.03993640989964456, + "language_loss": 0.8677696, + "learning_rate": 0.0002901822065751758, + "loss": 0.87926841, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.76171875, + "step": 3372, + "time_per_iteration": 2.642890691757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149196, + "balance_loss_mlp": 1.0729022, + "epoch": 0.6489034243939977, + "flos": 681301734912.0, + "grad_norm": 0.03031559078625196, + "language_loss": 0.90163612, + "learning_rate": 0.0002898994626531093, + "loss": 0.91312808, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.76171875, + "step": 3373, + "time_per_iteration": 2.838804006576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149133, + "balance_loss_mlp": 1.07303011, + "epoch": 0.6490958060792612, + "flos": 475371738624.0, + "grad_norm": 0.03229066647304318, + "language_loss": 0.92974752, + "learning_rate": 0.00028961680029930526, + "loss": 0.94123888, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.75976562, + "step": 3374, + "time_per_iteration": 2.5095248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149591, + "balance_loss_mlp": 1.07339203, + "epoch": 0.6492881877645248, + "flos": 590002005504.0, + "grad_norm": 0.03422977569034653, + "language_loss": 0.8249414, + "learning_rate": 0.00028933421962350317, + "loss": 0.83643734, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.76074219, + "step": 3375, + "time_per_iteration": 2.733698606491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149303, + "balance_loss_mlp": 1.07310462, + "epoch": 0.6494805694497884, + "flos": 643587382272.0, + "grad_norm": 0.03276895180859608, + "language_loss": 0.88882941, + "learning_rate": 0.0002890517207354104, + "loss": 0.90032244, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.76074219, + "step": 3376, + "time_per_iteration": 2.8495798110961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149673, + "balance_loss_mlp": 1.07347465, + "epoch": 0.649672951135052, + "flos": 532836593664.0, + "grad_norm": 0.031246089180930747, + "language_loss": 0.86472917, + "learning_rate": 0.0002887693037447029, + "loss": 0.87622589, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.76074219, + "step": 3377, + "time_per_iteration": 2.588364601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147339, + "balance_loss_mlp": 1.07109332, + "epoch": 0.6498653328203156, + "flos": 548445104640.0, + "grad_norm": 0.03311172972858422, + "language_loss": 0.87447202, + "learning_rate": 0.00028848696876102443, + "loss": 0.88594544, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.76123047, + "step": 3378, + "time_per_iteration": 2.6357853412628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114593, + "balance_loss_mlp": 1.06977868, + "epoch": 0.650057714505579, + "flos": 463160613888.0, + "grad_norm": 0.0392849096276736, + "language_loss": 0.89328945, + "learning_rate": 0.00028820471589398723, + "loss": 0.90474874, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.76025391, + "step": 3379, + "time_per_iteration": 2.530264139175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161056, + "balance_loss_mlp": 1.08519137, + "epoch": 0.6502500961908426, + "flos": 511241041920.0, + "grad_norm": 0.03964181246795499, + "language_loss": 0.82806408, + "learning_rate": 0.00028792254525317196, + "loss": 0.83967471, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.75732422, + "step": 3380, + "time_per_iteration": 2.677969217300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158758, + "balance_loss_mlp": 1.08279765, + "epoch": 0.6504424778761062, + "flos": 580910290944.0, + "grad_norm": 0.031350821569318954, + "language_loss": 0.8659088, + "learning_rate": 0.00028764045694812645, + "loss": 0.87749636, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.75830078, + "step": 3381, + "time_per_iteration": 2.7509915828704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157316, + "balance_loss_mlp": 1.0813086, + "epoch": 0.6506348595613698, + "flos": 520467015168.0, + "grad_norm": 0.04066104102632486, + "language_loss": 0.82166147, + "learning_rate": 0.0002873584510883671, + "loss": 0.83323467, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.75878906, + "step": 3382, + "time_per_iteration": 2.5591564178466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153945, + "balance_loss_mlp": 1.07769895, + "epoch": 0.6508272412466333, + "flos": 511362565632.0, + "grad_norm": 0.02912056326895262, + "language_loss": 0.91856563, + "learning_rate": 0.0002870765277833788, + "loss": 0.93010509, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.76123047, + "step": 3383, + "time_per_iteration": 2.7396798133850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150716, + "balance_loss_mlp": 1.07461333, + "epoch": 0.6510196229318969, + "flos": 626804567040.0, + "grad_norm": 0.032638591105191926, + "language_loss": 0.86156708, + "learning_rate": 0.00028679468714261347, + "loss": 0.87307423, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.75976562, + "step": 3384, + "time_per_iteration": 2.762810230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148501, + "balance_loss_mlp": 1.07239771, + "epoch": 0.6512120046171604, + "flos": 475669180416.0, + "grad_norm": 0.033246821782095315, + "language_loss": 0.80913359, + "learning_rate": 0.0002865129292754918, + "loss": 0.82061851, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.75976562, + "step": 3385, + "time_per_iteration": 2.6017582416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151379, + "balance_loss_mlp": 1.07513273, + "epoch": 0.651404386302424, + "flos": 553030256640.0, + "grad_norm": 0.0304228647826632, + "language_loss": 0.86788058, + "learning_rate": 0.00028623125429140105, + "loss": 0.87939441, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.76123047, + "step": 3386, + "time_per_iteration": 2.8177084922790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114874, + "balance_loss_mlp": 1.07230258, + "epoch": 0.6515967679876876, + "flos": 524374691328.0, + "grad_norm": 0.03154749952631653, + "language_loss": 0.92443657, + "learning_rate": 0.00028594966229969785, + "loss": 0.93592393, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.76318359, + "step": 3387, + "time_per_iteration": 2.654865264892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145456, + "balance_loss_mlp": 1.06925726, + "epoch": 0.6517891496729511, + "flos": 575016576000.0, + "grad_norm": 0.03711897249096357, + "language_loss": 0.87118483, + "learning_rate": 0.00028566815340970577, + "loss": 0.88263941, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.76074219, + "step": 3388, + "time_per_iteration": 2.724337339401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148749, + "balance_loss_mlp": 1.07240736, + "epoch": 0.6519815313582147, + "flos": 556989599232.0, + "grad_norm": 0.03038600941725792, + "language_loss": 0.85638821, + "learning_rate": 0.0002853867277307162, + "loss": 0.8678757, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.76220703, + "step": 3389, + "time_per_iteration": 2.6384835243225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.0695653, + "epoch": 0.6521739130434783, + "flos": 481521962496.0, + "grad_norm": 0.03095245810395829, + "language_loss": 0.87876832, + "learning_rate": 0.00028510538537198824, + "loss": 0.89022881, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.76367188, + "step": 3390, + "time_per_iteration": 2.6401560306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143269, + "balance_loss_mlp": 1.06664157, + "epoch": 0.6523662947287419, + "flos": 667019977728.0, + "grad_norm": 0.029103127011675372, + "language_loss": 0.90833724, + "learning_rate": 0.00028482412644274867, + "loss": 0.91976994, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.76513672, + "step": 3391, + "time_per_iteration": 2.914109945297241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143322, + "balance_loss_mlp": 1.06645572, + "epoch": 0.6525586764140053, + "flos": 549702001152.0, + "grad_norm": 0.036601963047289736, + "language_loss": 0.80285096, + "learning_rate": 0.00028454295105219207, + "loss": 0.81428421, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.76757812, + "step": 3392, + "time_per_iteration": 2.6647682189941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142157, + "balance_loss_mlp": 1.06557703, + "epoch": 0.6527510580992689, + "flos": 804389901312.0, + "grad_norm": 0.025027747425113815, + "language_loss": 0.83011138, + "learning_rate": 0.0002842618593094802, + "loss": 0.84153295, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.76464844, + "step": 3393, + "time_per_iteration": 3.116758108139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144173, + "balance_loss_mlp": 1.06744993, + "epoch": 0.6529434397845325, + "flos": 672375204864.0, + "grad_norm": 0.042372987357860006, + "language_loss": 0.85526049, + "learning_rate": 0.00028398085132374243, + "loss": 0.8667022, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.76611328, + "step": 3394, + "time_per_iteration": 2.7683980464935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142459, + "balance_loss_mlp": 1.06592691, + "epoch": 0.6531358214697961, + "flos": 829875664896.0, + "grad_norm": 0.03113385731669579, + "language_loss": 0.89394134, + "learning_rate": 0.0002836999272040761, + "loss": 0.90536594, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.76416016, + "step": 3395, + "time_per_iteration": 3.102487087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140812, + "balance_loss_mlp": 1.06432748, + "epoch": 0.6533282031550597, + "flos": 488392596480.0, + "grad_norm": 0.0404739719167322, + "language_loss": 0.89987487, + "learning_rate": 0.00028341908705954575, + "loss": 0.91128296, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.76367188, + "step": 3396, + "time_per_iteration": 2.692906618118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146225, + "balance_loss_mlp": 1.07183838, + "epoch": 0.6535205848403232, + "flos": 1561102328832.0, + "grad_norm": 0.005117457515533169, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82908034, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.74414062, + "step": 3397, + "time_per_iteration": 4.795916557312012 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144233, + "balance_loss_mlp": 1.06793857, + "epoch": 0.6537129665255867, + "flos": 494703275520.0, + "grad_norm": 0.03597932641299946, + "language_loss": 0.82677722, + "learning_rate": 0.00028285765913198604, + "loss": 0.83821958, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.76171875, + "step": 3398, + "time_per_iteration": 2.5658674240112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114427, + "balance_loss_mlp": 1.06788087, + "epoch": 0.6539053482108503, + "flos": 606142273536.0, + "grad_norm": 0.0350820826110483, + "language_loss": 0.88009775, + "learning_rate": 0.0002825770715669227, + "loss": 0.89154047, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.76269531, + "step": 3399, + "time_per_iteration": 2.7702410221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145417, + "balance_loss_mlp": 1.06902778, + "epoch": 0.6540977298961139, + "flos": 578880591360.0, + "grad_norm": 0.0325786381033819, + "language_loss": 0.8578831, + "learning_rate": 0.00028229656841292634, + "loss": 0.86933732, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.76269531, + "step": 3400, + "time_per_iteration": 2.6832401752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145045, + "balance_loss_mlp": 1.06865597, + "epoch": 0.6542901115813774, + "flos": 512769183744.0, + "grad_norm": 0.039852870614421367, + "language_loss": 0.82027632, + "learning_rate": 0.0002820161497788979, + "loss": 0.83172679, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.76269531, + "step": 3401, + "time_per_iteration": 2.5679121017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149696, + "balance_loss_mlp": 1.07330704, + "epoch": 0.654482493266641, + "flos": 626674311168.0, + "grad_norm": 0.030416914651843395, + "language_loss": 0.91325247, + "learning_rate": 0.00028173581577370545, + "loss": 0.92474937, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.76269531, + "step": 3402, + "time_per_iteration": 2.7601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150076, + "balance_loss_mlp": 1.07368624, + "epoch": 0.6546748749519046, + "flos": 525062900736.0, + "grad_norm": 0.030820927894649717, + "language_loss": 0.83866602, + "learning_rate": 0.0002814555665061844, + "loss": 0.8501668, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.76269531, + "step": 3403, + "time_per_iteration": 2.688485860824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153093, + "balance_loss_mlp": 1.07641792, + "epoch": 0.6548672566371682, + "flos": 480273798144.0, + "grad_norm": 0.03553217015928594, + "language_loss": 0.82424521, + "learning_rate": 0.00028117540208513715, + "loss": 0.83577615, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.765625, + "step": 3404, + "time_per_iteration": 2.6906890869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150931, + "balance_loss_mlp": 1.07425523, + "epoch": 0.6550596383224317, + "flos": 617135433216.0, + "grad_norm": 0.03288416711071717, + "language_loss": 0.89287072, + "learning_rate": 0.00028089532261933313, + "loss": 0.90438002, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.765625, + "step": 3405, + "time_per_iteration": 2.718001127243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147316, + "balance_loss_mlp": 1.07078385, + "epoch": 0.6552520200076952, + "flos": 489807946752.0, + "grad_norm": 0.040144975574141664, + "language_loss": 0.91147745, + "learning_rate": 0.0002806153282175087, + "loss": 0.92295063, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.76416016, + "step": 3406, + "time_per_iteration": 2.5618858337402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114632, + "balance_loss_mlp": 1.06983495, + "epoch": 0.6554444016929588, + "flos": 688858576896.0, + "grad_norm": 0.034942224339764696, + "language_loss": 0.88083732, + "learning_rate": 0.0002803354189883679, + "loss": 0.89230049, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.76367188, + "step": 3407, + "time_per_iteration": 2.893331527709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114697, + "balance_loss_mlp": 1.07039022, + "epoch": 0.6556367833782224, + "flos": 544170855936.0, + "grad_norm": 0.02881485242285111, + "language_loss": 0.89870715, + "learning_rate": 0.00028005559504058053, + "loss": 0.91017687, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.76464844, + "step": 3408, + "time_per_iteration": 2.750748634338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146342, + "balance_loss_mlp": 1.06980956, + "epoch": 0.655829165063486, + "flos": 674730544128.0, + "grad_norm": 0.03409829385099465, + "language_loss": 0.82774001, + "learning_rate": 0.0002797758564827838, + "loss": 0.83920342, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.76416016, + "step": 3409, + "time_per_iteration": 2.7883474826812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114755, + "balance_loss_mlp": 1.07111335, + "epoch": 0.6560215467487496, + "flos": 532836593664.0, + "grad_norm": 0.03847218102070899, + "language_loss": 0.89379394, + "learning_rate": 0.0002794962034235824, + "loss": 0.9052695, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.76318359, + "step": 3410, + "time_per_iteration": 2.6389691829681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147509, + "balance_loss_mlp": 1.07102418, + "epoch": 0.656213928434013, + "flos": 592459402752.0, + "grad_norm": 0.035948217838460056, + "language_loss": 0.79690081, + "learning_rate": 0.00027921663597154695, + "loss": 0.80837584, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.76367188, + "step": 3411, + "time_per_iteration": 2.8345415592193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146787, + "balance_loss_mlp": 1.07030261, + "epoch": 0.6564063101192766, + "flos": 416678184960.0, + "grad_norm": 0.038637742097161205, + "language_loss": 0.87214196, + "learning_rate": 0.00027893715423521525, + "loss": 0.88360977, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.76367188, + "step": 3412, + "time_per_iteration": 2.4819529056549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146018, + "balance_loss_mlp": 1.06953347, + "epoch": 0.6565986918045402, + "flos": 454271013888.0, + "grad_norm": 0.03334091944582967, + "language_loss": 0.89441139, + "learning_rate": 0.00027865775832309163, + "loss": 0.90587157, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.76367188, + "step": 3413, + "time_per_iteration": 2.728583335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145861, + "balance_loss_mlp": 1.06956708, + "epoch": 0.6567910734898038, + "flos": 548798942208.0, + "grad_norm": 0.03367441290021015, + "language_loss": 0.91664404, + "learning_rate": 0.00027837844834364733, + "loss": 0.92810267, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.76171875, + "step": 3414, + "time_per_iteration": 2.6371517181396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145504, + "balance_loss_mlp": 1.06925821, + "epoch": 0.6569834551750673, + "flos": 656764692480.0, + "grad_norm": 0.030804659012074204, + "language_loss": 0.9116472, + "learning_rate": 0.00027809922440532, + "loss": 0.92310226, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.76123047, + "step": 3415, + "time_per_iteration": 2.8265881538391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148906, + "balance_loss_mlp": 1.07265973, + "epoch": 0.6571758368603309, + "flos": 540810399744.0, + "grad_norm": 0.030022936132040084, + "language_loss": 0.8532089, + "learning_rate": 0.00027782008661651406, + "loss": 0.86469799, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.76123047, + "step": 3416, + "time_per_iteration": 2.7672157287597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149293, + "balance_loss_mlp": 1.07314205, + "epoch": 0.6573682185455945, + "flos": 498378637824.0, + "grad_norm": 0.029653574310281386, + "language_loss": 0.91551638, + "learning_rate": 0.00027754103508560013, + "loss": 0.92700928, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.76025391, + "step": 3417, + "time_per_iteration": 2.6405131816864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114713, + "balance_loss_mlp": 1.07088423, + "epoch": 0.657560600230858, + "flos": 448353103872.0, + "grad_norm": 0.03576987566134107, + "language_loss": 0.87917447, + "learning_rate": 0.0002772620699209163, + "loss": 0.89064574, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.76123047, + "step": 3418, + "time_per_iteration": 2.5418612957000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145939, + "balance_loss_mlp": 1.06983602, + "epoch": 0.6577529819161216, + "flos": 482919848448.0, + "grad_norm": 0.03527260419864515, + "language_loss": 0.85359573, + "learning_rate": 0.0002769831912307658, + "loss": 0.86505508, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.75976562, + "step": 3419, + "time_per_iteration": 2.604675054550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147112, + "balance_loss_mlp": 1.07081771, + "epoch": 0.6579453636013851, + "flos": 531859674624.0, + "grad_norm": 0.03824872762512091, + "language_loss": 0.86228991, + "learning_rate": 0.00027670439912341917, + "loss": 0.87376106, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.76171875, + "step": 3420, + "time_per_iteration": 2.6483054161071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_mlp": 1.06975985, + "epoch": 0.6581377452866487, + "flos": 629242498560.0, + "grad_norm": 0.03412485031630486, + "language_loss": 0.89059192, + "learning_rate": 0.0002764256937071129, + "loss": 0.90205252, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.76171875, + "step": 3421, + "time_per_iteration": 2.839137077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146563, + "balance_loss_mlp": 1.07031691, + "epoch": 0.6583301269719123, + "flos": 549673803264.0, + "grad_norm": 0.030144943579318143, + "language_loss": 0.91856694, + "learning_rate": 0.00027614707509005036, + "loss": 0.93003255, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.76123047, + "step": 3422, + "time_per_iteration": 2.680708408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114715, + "balance_loss_mlp": 1.07095134, + "epoch": 0.6585225086571759, + "flos": 428396484096.0, + "grad_norm": 0.04026315039628517, + "language_loss": 0.84251142, + "learning_rate": 0.0002758685433804008, + "loss": 0.85398293, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.76074219, + "step": 3423, + "time_per_iteration": 2.5081021785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146911, + "balance_loss_mlp": 1.07052183, + "epoch": 0.6587148903424394, + "flos": 861049026048.0, + "grad_norm": 0.03441249575164818, + "language_loss": 0.84824026, + "learning_rate": 0.00027559009868630005, + "loss": 0.85970938, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.76269531, + "step": 3424, + "time_per_iteration": 3.1415717601776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114833, + "balance_loss_mlp": 1.07213128, + "epoch": 0.6589072720277029, + "flos": 807035951616.0, + "grad_norm": 0.03717672501292478, + "language_loss": 0.86237669, + "learning_rate": 0.0002753117411158491, + "loss": 0.87386, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.76074219, + "step": 3425, + "time_per_iteration": 3.041346788406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148297, + "balance_loss_mlp": 1.07195568, + "epoch": 0.6590996537129665, + "flos": 549673803264.0, + "grad_norm": 0.03250683157775158, + "language_loss": 0.94800514, + "learning_rate": 0.0002750334707771168, + "loss": 0.95948815, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.76220703, + "step": 3426, + "time_per_iteration": 2.6350677013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149524, + "balance_loss_mlp": 1.07318223, + "epoch": 0.6592920353982301, + "flos": 455108944896.0, + "grad_norm": 0.0355046198758662, + "language_loss": 0.86040199, + "learning_rate": 0.0002747552877781369, + "loss": 0.87189716, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.76220703, + "step": 3427, + "time_per_iteration": 2.5129551887512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114868, + "balance_loss_mlp": 1.07233834, + "epoch": 0.6594844170834937, + "flos": 568260734976.0, + "grad_norm": 0.034595379074033504, + "language_loss": 0.88492763, + "learning_rate": 0.0002744771922269097, + "loss": 0.8964144, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.76220703, + "step": 3428, + "time_per_iteration": 2.694378137588501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147393, + "balance_loss_mlp": 1.07114637, + "epoch": 0.6596767987687572, + "flos": 1189754284032.0, + "grad_norm": 0.030854411324183387, + "language_loss": 0.86799264, + "learning_rate": 0.0002741991842314015, + "loss": 0.87946653, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.76123047, + "step": 3429, + "time_per_iteration": 3.48809552192688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145028, + "balance_loss_mlp": 1.0686388, + "epoch": 0.6598691804540208, + "flos": 504467736576.0, + "grad_norm": 0.03376941001539595, + "language_loss": 0.89963281, + "learning_rate": 0.0002739212638995445, + "loss": 0.9110831, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.76269531, + "step": 3430, + "time_per_iteration": 2.532970428466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114449, + "balance_loss_mlp": 1.06814861, + "epoch": 0.6600615621392844, + "flos": 532398162432.0, + "grad_norm": 0.038613055067671744, + "language_loss": 0.88853264, + "learning_rate": 0.00027364343133923696, + "loss": 0.89997756, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.76220703, + "step": 3431, + "time_per_iteration": 2.6269612312316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144024, + "balance_loss_mlp": 1.06768203, + "epoch": 0.6602539438245479, + "flos": 566556675072.0, + "grad_norm": 0.03520560530434118, + "language_loss": 0.8882376, + "learning_rate": 0.0002733656866583431, + "loss": 0.89967781, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.76220703, + "step": 3432, + "time_per_iteration": 2.682663679122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156009, + "balance_loss_mlp": 1.07995379, + "epoch": 0.6604463255098114, + "flos": 858591628800.0, + "grad_norm": 0.04099855509153074, + "language_loss": 0.88963896, + "learning_rate": 0.0002730880299646927, + "loss": 0.90119904, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.75927734, + "step": 3433, + "time_per_iteration": 3.050039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157407, + "balance_loss_mlp": 1.08149505, + "epoch": 0.660638707195075, + "flos": 675679265280.0, + "grad_norm": 0.03297285173612762, + "language_loss": 0.89854127, + "learning_rate": 0.0002728104613660821, + "loss": 0.91011536, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.7578125, + "step": 3434, + "time_per_iteration": 2.8358242511749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148511, + "balance_loss_mlp": 1.07236028, + "epoch": 0.6608310888803386, + "flos": 890523056640.0, + "grad_norm": 0.03459988631627961, + "language_loss": 0.88072419, + "learning_rate": 0.0002725329809702729, + "loss": 0.89220929, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.76025391, + "step": 3435, + "time_per_iteration": 3.181201457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146577, + "balance_loss_mlp": 1.07033134, + "epoch": 0.6610234705656022, + "flos": 1138107282432.0, + "grad_norm": 0.04279733621824939, + "language_loss": 0.82982898, + "learning_rate": 0.0002722555888849921, + "loss": 0.84129477, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.76123047, + "step": 3436, + "time_per_iteration": 3.423975706100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147052, + "balance_loss_mlp": 1.07099605, + "epoch": 0.6612158522508658, + "flos": 468959001600.0, + "grad_norm": 0.03231258951929261, + "language_loss": 0.84970325, + "learning_rate": 0.00027197828521793334, + "loss": 0.86117375, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.75927734, + "step": 3437, + "time_per_iteration": 2.5456013679504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147897, + "balance_loss_mlp": 1.07179344, + "epoch": 0.6614082339361292, + "flos": 572774028288.0, + "grad_norm": 0.03152032613188321, + "language_loss": 0.8887009, + "learning_rate": 0.0002717010700767552, + "loss": 0.90017986, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.75976562, + "step": 3438, + "time_per_iteration": 2.6809959411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149118, + "balance_loss_mlp": 1.07306218, + "epoch": 0.6616006156213928, + "flos": 499459616256.0, + "grad_norm": 0.039698826906756704, + "language_loss": 0.82129598, + "learning_rate": 0.00027142394356908226, + "loss": 0.8327871, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.75927734, + "step": 3439, + "time_per_iteration": 2.5949456691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148646, + "balance_loss_mlp": 1.07254267, + "epoch": 0.6617929973066564, + "flos": 603609014784.0, + "grad_norm": 0.030441774907891187, + "language_loss": 0.8967098, + "learning_rate": 0.00027114690580250456, + "loss": 0.90819627, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.75976562, + "step": 3440, + "time_per_iteration": 2.749826431274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147854, + "balance_loss_mlp": 1.07175064, + "epoch": 0.66198537899192, + "flos": 523994657280.0, + "grad_norm": 0.033263511323201614, + "language_loss": 0.91719675, + "learning_rate": 0.0002708699568845776, + "loss": 0.92867529, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.75976562, + "step": 3441, + "time_per_iteration": 2.65191912651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162537, + "balance_loss_mlp": 1.08815002, + "epoch": 0.6621777606771835, + "flos": 1569609893376.0, + "grad_norm": 0.01497403906155291, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.8045032, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.74414062, + "step": 3442, + "time_per_iteration": 4.957901239395142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154184, + "balance_loss_mlp": 1.07817662, + "epoch": 0.6623701423624471, + "flos": 527689485312.0, + "grad_norm": 0.03191394261297454, + "language_loss": 0.8795507, + "learning_rate": 0.0002703163260247261, + "loss": 0.89109254, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.75878906, + "step": 3443, + "time_per_iteration": 2.6025161743164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151931, + "balance_loss_mlp": 1.07601833, + "epoch": 0.6625625240477107, + "flos": 529215625728.0, + "grad_norm": 0.035865829187726836, + "language_loss": 0.87189507, + "learning_rate": 0.0002700396442977399, + "loss": 0.88341439, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.7578125, + "step": 3444, + "time_per_iteration": 2.624119758605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152482, + "balance_loss_mlp": 1.07652199, + "epoch": 0.6627549057329742, + "flos": 474195432960.0, + "grad_norm": 0.03160775147122319, + "language_loss": 0.890499, + "learning_rate": 0.0002697630518492817, + "loss": 0.90202379, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.75830078, + "step": 3445, + "time_per_iteration": 2.7382802963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151051, + "balance_loss_mlp": 1.07494795, + "epoch": 0.6629472874182378, + "flos": 529011509760.0, + "grad_norm": 0.03595555935138165, + "language_loss": 0.89779699, + "learning_rate": 0.0002694865487867343, + "loss": 0.90930748, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.75976562, + "step": 3446, + "time_per_iteration": 2.704895257949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150785, + "balance_loss_mlp": 1.0749681, + "epoch": 0.6631396691035013, + "flos": 614378592768.0, + "grad_norm": 0.031003429121565652, + "language_loss": 0.8906312, + "learning_rate": 0.0002692101352174453, + "loss": 0.90213907, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.75683594, + "step": 3447, + "time_per_iteration": 2.8165597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.07207584, + "epoch": 0.6633320507887649, + "flos": 610433986560.0, + "grad_norm": 0.03537124525005162, + "language_loss": 0.89763427, + "learning_rate": 0.00026893381124872787, + "loss": 0.90911466, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.75830078, + "step": 3448, + "time_per_iteration": 2.698657512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146339, + "balance_loss_mlp": 1.07033098, + "epoch": 0.6635244324740285, + "flos": 751140897792.0, + "grad_norm": 0.037519042250439116, + "language_loss": 0.85281086, + "learning_rate": 0.00026865757698786097, + "loss": 0.86427426, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.75878906, + "step": 3449, + "time_per_iteration": 3.055635452270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145472, + "balance_loss_mlp": 1.06932163, + "epoch": 0.6637168141592921, + "flos": 665747618304.0, + "grad_norm": 0.03493094826481752, + "language_loss": 0.85618043, + "learning_rate": 0.000268381432542088, + "loss": 0.86763519, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.76025391, + "step": 3450, + "time_per_iteration": 2.8057384490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145353, + "balance_loss_mlp": 1.06934512, + "epoch": 0.6639091958445555, + "flos": 607920193536.0, + "grad_norm": 0.03317215274134995, + "language_loss": 0.85111237, + "learning_rate": 0.00026810537801861807, + "loss": 0.86256593, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.75878906, + "step": 3451, + "time_per_iteration": 2.7435052394866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149606, + "balance_loss_mlp": 1.0735507, + "epoch": 0.6641015775298191, + "flos": 477679414272.0, + "grad_norm": 0.03227894360580252, + "language_loss": 0.85315323, + "learning_rate": 0.0002678294135246243, + "loss": 0.8646493, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.75927734, + "step": 3452, + "time_per_iteration": 2.7193186283111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147818, + "balance_loss_mlp": 1.07171512, + "epoch": 0.6642939592150827, + "flos": 905595081216.0, + "grad_norm": 0.03357369585289791, + "language_loss": 0.91588908, + "learning_rate": 0.0002675535391672463, + "loss": 0.92736733, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.75976562, + "step": 3453, + "time_per_iteration": 3.0945043563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148472, + "balance_loss_mlp": 1.07236886, + "epoch": 0.6644863409003463, + "flos": 582937989120.0, + "grad_norm": 0.030535675570776123, + "language_loss": 0.90264779, + "learning_rate": 0.0002672777550535877, + "loss": 0.91413254, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.75976562, + "step": 3454, + "time_per_iteration": 2.7741284370422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150279, + "balance_loss_mlp": 1.07398534, + "epoch": 0.6646787225856099, + "flos": 479969625600.0, + "grad_norm": 0.03106835211233169, + "language_loss": 0.89111888, + "learning_rate": 0.00026700206129071747, + "loss": 0.90262163, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.76171875, + "step": 3455, + "time_per_iteration": 2.5455679893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149274, + "balance_loss_mlp": 1.07302773, + "epoch": 0.6648711042708734, + "flos": 450827965440.0, + "grad_norm": 0.034343549963822835, + "language_loss": 0.92980659, + "learning_rate": 0.00026672645798566925, + "loss": 0.94129932, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.76123047, + "step": 3456, + "time_per_iteration": 2.5500409603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149485, + "balance_loss_mlp": 1.07319152, + "epoch": 0.665063485956137, + "flos": 860595858432.0, + "grad_norm": 0.03429824706439816, + "language_loss": 0.85038483, + "learning_rate": 0.00026645094524544225, + "loss": 0.86187971, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.76171875, + "step": 3457, + "time_per_iteration": 3.2861030101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149344, + "balance_loss_mlp": 1.07290661, + "epoch": 0.6652558676414005, + "flos": 605471528448.0, + "grad_norm": 0.02726612159362192, + "language_loss": 0.79581773, + "learning_rate": 0.00026617552317699945, + "loss": 0.80731118, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.76318359, + "step": 3458, + "time_per_iteration": 2.8133809566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149341, + "balance_loss_mlp": 1.07299888, + "epoch": 0.6654482493266641, + "flos": 511410229248.0, + "grad_norm": 0.030741900207522484, + "language_loss": 0.92019296, + "learning_rate": 0.0002659001918872693, + "loss": 0.9316864, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.76220703, + "step": 3459, + "time_per_iteration": 2.719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_mlp": 1.07302606, + "epoch": 0.6656406310119277, + "flos": 566660734464.0, + "grad_norm": 0.03268721915470487, + "language_loss": 0.8501879, + "learning_rate": 0.0002656249514831449, + "loss": 0.86168158, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.76220703, + "step": 3460, + "time_per_iteration": 2.7105963230133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150225, + "balance_loss_mlp": 1.07383597, + "epoch": 0.6658330126971912, + "flos": 1026058664448.0, + "grad_norm": 0.029696729072264432, + "language_loss": 0.91355968, + "learning_rate": 0.00026534980207148416, + "loss": 0.92506194, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.76269531, + "step": 3461, + "time_per_iteration": 3.3982574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145433, + "balance_loss_mlp": 1.06894886, + "epoch": 0.6660253943824548, + "flos": 818233227264.0, + "grad_norm": 0.03528061567962845, + "language_loss": 0.78412712, + "learning_rate": 0.0002650747437591097, + "loss": 0.79558146, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.76367188, + "step": 3462, + "time_per_iteration": 2.9878056049346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149719, + "balance_loss_mlp": 1.07533264, + "epoch": 0.6662177760677184, + "flos": 1499530411008.0, + "grad_norm": 0.00830594189347842, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.83029294, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.74414062, + "step": 3463, + "time_per_iteration": 6.524547815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145921, + "balance_loss_mlp": 1.06953192, + "epoch": 0.666410157752982, + "flos": 501107280384.0, + "grad_norm": 0.03076087992809579, + "language_loss": 0.91384947, + "learning_rate": 0.00026452490085933155, + "loss": 0.9253087, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.76269531, + "step": 3464, + "time_per_iteration": 2.598808765411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145896, + "balance_loss_mlp": 1.06955457, + "epoch": 0.6666025394382454, + "flos": 482138313216.0, + "grad_norm": 0.03618588438682257, + "language_loss": 0.95199478, + "learning_rate": 0.00026425011648539614, + "loss": 0.96345377, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.76220703, + "step": 3465, + "time_per_iteration": 2.5265092849731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145501, + "balance_loss_mlp": 1.06906354, + "epoch": 0.666794921123509, + "flos": 547691767296.0, + "grad_norm": 0.03394030373238319, + "language_loss": 0.87548077, + "learning_rate": 0.00026397542363768267, + "loss": 0.88693571, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.76318359, + "step": 3466, + "time_per_iteration": 2.645876407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145746, + "balance_loss_mlp": 1.06935704, + "epoch": 0.6669873028087726, + "flos": 472942539264.0, + "grad_norm": 0.0340202515012301, + "language_loss": 0.87299979, + "learning_rate": 0.0002637008224228362, + "loss": 0.88445723, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.76269531, + "step": 3467, + "time_per_iteration": 2.5271472930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147048, + "balance_loss_mlp": 1.07070661, + "epoch": 0.6671796844940362, + "flos": 548499499008.0, + "grad_norm": 0.029468894408270302, + "language_loss": 0.89176929, + "learning_rate": 0.00026342631294746653, + "loss": 0.90323979, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.76220703, + "step": 3468, + "time_per_iteration": 2.694568395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146376, + "balance_loss_mlp": 1.07008207, + "epoch": 0.6673720661792998, + "flos": 1072122127872.0, + "grad_norm": 0.03284045124327485, + "language_loss": 0.85731959, + "learning_rate": 0.0002631518953181476, + "loss": 0.86878335, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.76171875, + "step": 3469, + "time_per_iteration": 3.4704368114471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148972, + "balance_loss_mlp": 1.07458496, + "epoch": 0.6675644478645633, + "flos": 1527111002112.0, + "grad_norm": 0.004792795584487496, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.7747426, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.74414062, + "step": 3470, + "time_per_iteration": 4.929240465164185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146143, + "balance_loss_mlp": 1.06989694, + "epoch": 0.6677568295498268, + "flos": 580843161600.0, + "grad_norm": 0.032107654736022645, + "language_loss": 0.84914112, + "learning_rate": 0.00026260333602377985, + "loss": 0.86060262, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.76123047, + "step": 3471, + "time_per_iteration": 2.740605592727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146874, + "balance_loss_mlp": 1.07072294, + "epoch": 0.6679492112350904, + "flos": 384790417920.0, + "grad_norm": 0.036226919771653675, + "language_loss": 0.91317421, + "learning_rate": 0.0002623291945717007, + "loss": 0.92464286, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.76025391, + "step": 3472, + "time_per_iteration": 2.4707448482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146308, + "balance_loss_mlp": 1.07015693, + "epoch": 0.668141592920354, + "flos": 1152615349248.0, + "grad_norm": 0.02851459994850691, + "language_loss": 0.88269627, + "learning_rate": 0.00026205514539161175, + "loss": 0.89415932, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.76025391, + "step": 3473, + "time_per_iteration": 3.5094759464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146143, + "balance_loss_mlp": 1.07008779, + "epoch": 0.6683339746056175, + "flos": 562291158528.0, + "grad_norm": 0.030234261038109174, + "language_loss": 0.88653791, + "learning_rate": 0.00026178118858990773, + "loss": 0.89799941, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.75927734, + "step": 3474, + "time_per_iteration": 2.8636863231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.07096648, + "epoch": 0.6685263562908811, + "flos": 515328638976.0, + "grad_norm": 0.030631239249789746, + "language_loss": 0.89337111, + "learning_rate": 0.0002615073242729483, + "loss": 0.9048413, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.75927734, + "step": 3475, + "time_per_iteration": 2.6223714351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148783, + "balance_loss_mlp": 1.07267952, + "epoch": 0.6687187379761447, + "flos": 631000952832.0, + "grad_norm": 0.03058857090132586, + "language_loss": 0.88941103, + "learning_rate": 0.0002612335525470573, + "loss": 0.90089881, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.75976562, + "step": 3476, + "time_per_iteration": 2.8004729747772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148335, + "balance_loss_mlp": 1.07242274, + "epoch": 0.6689111196614083, + "flos": 536687874048.0, + "grad_norm": 0.03636459478392294, + "language_loss": 0.82775843, + "learning_rate": 0.0002609598735185221, + "loss": 0.8392418, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.7578125, + "step": 3477, + "time_per_iteration": 2.668614149093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148386, + "balance_loss_mlp": 1.0723784, + "epoch": 0.6691035013466718, + "flos": 604160237568.0, + "grad_norm": 0.03359617144199284, + "language_loss": 0.87902224, + "learning_rate": 0.00026068628729359445, + "loss": 0.89050609, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.75878906, + "step": 3478, + "time_per_iteration": 2.7584378719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147866, + "balance_loss_mlp": 1.07185841, + "epoch": 0.6692958830319353, + "flos": 634127093760.0, + "grad_norm": 0.030871112113608438, + "language_loss": 0.80438709, + "learning_rate": 0.00026041279397848996, + "loss": 0.81586581, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.75878906, + "step": 3479, + "time_per_iteration": 2.8838839530944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011474, + "balance_loss_mlp": 1.07143939, + "epoch": 0.6694882647171989, + "flos": 646748451840.0, + "grad_norm": 0.03180979016390224, + "language_loss": 0.87201416, + "learning_rate": 0.00026013939367938797, + "loss": 0.88348818, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.75830078, + "step": 3480, + "time_per_iteration": 2.908734083175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148147, + "balance_loss_mlp": 1.07213914, + "epoch": 0.6696806464024625, + "flos": 570761793024.0, + "grad_norm": 0.030473361279484277, + "language_loss": 0.85594642, + "learning_rate": 0.00025986608650243204, + "loss": 0.86742783, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.75878906, + "step": 3481, + "time_per_iteration": 2.85624098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147434, + "balance_loss_mlp": 1.07137847, + "epoch": 0.6698730280877261, + "flos": 623963132928.0, + "grad_norm": 0.033030030502012045, + "language_loss": 0.84301388, + "learning_rate": 0.0002595928725537293, + "loss": 0.85448819, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.75927734, + "step": 3482, + "time_per_iteration": 2.9488890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147834, + "balance_loss_mlp": 1.07177854, + "epoch": 0.6700654097729896, + "flos": 503508281856.0, + "grad_norm": 0.03256709943741325, + "language_loss": 0.93030363, + "learning_rate": 0.0002593197519393509, + "loss": 0.941782, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.75927734, + "step": 3483, + "time_per_iteration": 2.6505393981933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146862, + "balance_loss_mlp": 1.07085407, + "epoch": 0.6702577914582531, + "flos": 625117971456.0, + "grad_norm": 0.031176357525406213, + "language_loss": 0.83921826, + "learning_rate": 0.00025904672476533165, + "loss": 0.85068691, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.75878906, + "step": 3484, + "time_per_iteration": 2.859121084213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147102, + "balance_loss_mlp": 1.07109404, + "epoch": 0.6704501731435167, + "flos": 457212504576.0, + "grad_norm": 0.03137206075835519, + "language_loss": 0.87799835, + "learning_rate": 0.0002587737911376704, + "loss": 0.88946939, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.75878906, + "step": 3485, + "time_per_iteration": 2.599365711212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147337, + "balance_loss_mlp": 1.07137716, + "epoch": 0.6706425548287803, + "flos": 544257451008.0, + "grad_norm": 0.033540892991266884, + "language_loss": 0.88788569, + "learning_rate": 0.00025850095116232885, + "loss": 0.89935905, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.75830078, + "step": 3486, + "time_per_iteration": 2.6457767486572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143978, + "balance_loss_mlp": 1.06787491, + "epoch": 0.6708349365140439, + "flos": 635179874304.0, + "grad_norm": 0.030051375529732832, + "language_loss": 0.82181835, + "learning_rate": 0.000258228204945233, + "loss": 0.83325815, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.75976562, + "step": 3487, + "time_per_iteration": 2.8957583904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147088, + "balance_loss_mlp": 1.07117581, + "epoch": 0.6710273181993074, + "flos": 641902788096.0, + "grad_norm": 0.03500138254568088, + "language_loss": 0.89155853, + "learning_rate": 0.00025795555259227254, + "loss": 0.90302938, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.7578125, + "step": 3488, + "time_per_iteration": 2.814859628677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147178, + "balance_loss_mlp": 1.0712657, + "epoch": 0.671219699884571, + "flos": 555025027584.0, + "grad_norm": 0.029480168700917284, + "language_loss": 0.88153946, + "learning_rate": 0.00025768299420930046, + "loss": 0.89301121, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.7578125, + "step": 3489, + "time_per_iteration": 2.723747491836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146316, + "balance_loss_mlp": 1.07045078, + "epoch": 0.6714120815698346, + "flos": 732781550592.0, + "grad_norm": 0.031857153656531974, + "language_loss": 0.87735152, + "learning_rate": 0.0002574105299021332, + "loss": 0.88881469, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.75732422, + "step": 3490, + "time_per_iteration": 2.8996829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145484, + "balance_loss_mlp": 1.06957209, + "epoch": 0.6716044632550981, + "flos": 689946286080.0, + "grad_norm": 0.030584806240151117, + "language_loss": 0.88189107, + "learning_rate": 0.00025713815977655084, + "loss": 0.89334595, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.7578125, + "step": 3491, + "time_per_iteration": 2.8675849437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161081, + "balance_loss_mlp": 1.08545506, + "epoch": 0.6717968449403616, + "flos": 461586809856.0, + "grad_norm": 0.035565643494579496, + "language_loss": 0.89158142, + "learning_rate": 0.0002568658839382969, + "loss": 0.90319222, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.75488281, + "step": 3492, + "time_per_iteration": 2.542618989944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161108, + "balance_loss_mlp": 1.08538604, + "epoch": 0.6719892266256252, + "flos": 502596490752.0, + "grad_norm": 0.03871127770917694, + "language_loss": 0.90369606, + "learning_rate": 0.00025659370249307814, + "loss": 0.91530716, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.75585938, + "step": 3493, + "time_per_iteration": 2.617976665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155204, + "balance_loss_mlp": 1.07938695, + "epoch": 0.6721816083108888, + "flos": 684736051200.0, + "grad_norm": 0.030709352042026482, + "language_loss": 0.89865196, + "learning_rate": 0.00025632161554656473, + "loss": 0.91020399, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.75683594, + "step": 3494, + "time_per_iteration": 2.9416136741638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153333, + "balance_loss_mlp": 1.07742059, + "epoch": 0.6723739899961524, + "flos": 586895330304.0, + "grad_norm": 0.035401445630926676, + "language_loss": 0.86814046, + "learning_rate": 0.00025604962320439017, + "loss": 0.87967384, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.7578125, + "step": 3495, + "time_per_iteration": 2.709865093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152259, + "balance_loss_mlp": 1.07639432, + "epoch": 0.672566371681416, + "flos": 507739596288.0, + "grad_norm": 0.03037394710394358, + "language_loss": 0.86663043, + "learning_rate": 0.0002557777255721516, + "loss": 0.87815297, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.75732422, + "step": 3496, + "time_per_iteration": 2.7064080238342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144717, + "balance_loss_mlp": 1.06870878, + "epoch": 0.6727587533666795, + "flos": 536735537664.0, + "grad_norm": 0.03895269185794194, + "language_loss": 0.8665306, + "learning_rate": 0.0002555059227554087, + "loss": 0.87797779, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.75878906, + "step": 3497, + "time_per_iteration": 2.725748062133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144891, + "balance_loss_mlp": 1.06897879, + "epoch": 0.672951135051943, + "flos": 604036712448.0, + "grad_norm": 0.03298671193976436, + "language_loss": 0.82722509, + "learning_rate": 0.00025523421485968453, + "loss": 0.83867407, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.7578125, + "step": 3498, + "time_per_iteration": 2.7769460678100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143713, + "balance_loss_mlp": 1.06780005, + "epoch": 0.6731435167372066, + "flos": 812677886976.0, + "grad_norm": 0.03548022480956623, + "language_loss": 0.90755463, + "learning_rate": 0.00025496260199046585, + "loss": 0.91899168, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.7578125, + "step": 3499, + "time_per_iteration": 2.952929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143579, + "balance_loss_mlp": 1.06766629, + "epoch": 0.6733358984224702, + "flos": 612750394368.0, + "grad_norm": 0.030145588081223078, + "language_loss": 0.89167559, + "learning_rate": 0.000254691084253202, + "loss": 0.90311134, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.7578125, + "step": 3500, + "time_per_iteration": 2.798442840576172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144185, + "balance_loss_mlp": 1.06827235, + "epoch": 0.6735282801077337, + "flos": 559968019968.0, + "grad_norm": 0.034844314373587704, + "language_loss": 0.83049423, + "learning_rate": 0.00025441966175330567, + "loss": 0.84193599, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.7578125, + "step": 3501, + "time_per_iteration": 2.712158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143612, + "balance_loss_mlp": 1.06769979, + "epoch": 0.6737206617929973, + "flos": 673632101376.0, + "grad_norm": 0.033990412363220264, + "language_loss": 0.84750879, + "learning_rate": 0.00025414833459615183, + "loss": 0.85894495, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.7578125, + "step": 3502, + "time_per_iteration": 2.801419973373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143927, + "balance_loss_mlp": 1.06801498, + "epoch": 0.6739130434782609, + "flos": 634641386496.0, + "grad_norm": 0.0329145119302939, + "language_loss": 0.85179496, + "learning_rate": 0.0002538771028870796, + "loss": 0.86323422, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.7578125, + "step": 3503, + "time_per_iteration": 2.775928497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143743, + "balance_loss_mlp": 1.06783044, + "epoch": 0.6741054251635245, + "flos": 532545882624.0, + "grad_norm": 0.03235573519036691, + "language_loss": 0.85924655, + "learning_rate": 0.0002536059667313903, + "loss": 0.87068391, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.7578125, + "step": 3504, + "time_per_iteration": 2.7243404388427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142972, + "balance_loss_mlp": 1.06705964, + "epoch": 0.674297806848788, + "flos": 543651833856.0, + "grad_norm": 0.0371245910075902, + "language_loss": 0.94068909, + "learning_rate": 0.0002533349262343483, + "loss": 0.95211881, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.7578125, + "step": 3505, + "time_per_iteration": 2.672279119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144049, + "balance_loss_mlp": 1.06818378, + "epoch": 0.6744901885340515, + "flos": 464454440448.0, + "grad_norm": 0.03655603062575672, + "language_loss": 0.87737519, + "learning_rate": 0.0002530639815011807, + "loss": 0.88881564, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.75732422, + "step": 3506, + "time_per_iteration": 2.4994444847106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147432, + "balance_loss_mlp": 1.07156682, + "epoch": 0.6746825702193151, + "flos": 633021920256.0, + "grad_norm": 0.03414682593561894, + "language_loss": 0.89147329, + "learning_rate": 0.0002527931326370781, + "loss": 0.90294766, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.75732422, + "step": 3507, + "time_per_iteration": 2.8101861476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147739, + "balance_loss_mlp": 1.07201719, + "epoch": 0.6748749519045787, + "flos": 672392669184.0, + "grad_norm": 0.03604109956687097, + "language_loss": 0.87794244, + "learning_rate": 0.00025252237974719276, + "loss": 0.88941985, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.75585938, + "step": 3508, + "time_per_iteration": 2.8684208393096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147125, + "balance_loss_mlp": 1.07140362, + "epoch": 0.6750673335898423, + "flos": 768492400128.0, + "grad_norm": 0.03252394082616114, + "language_loss": 0.85605073, + "learning_rate": 0.00025225172293664056, + "loss": 0.867522, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.75585938, + "step": 3509, + "time_per_iteration": 2.979069232940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161552, + "balance_loss_mlp": 1.08716583, + "epoch": 0.6752597152751059, + "flos": 1515904994304.0, + "grad_norm": 0.012789123044337823, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.78094685, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.74414062, + "step": 3510, + "time_per_iteration": 4.922729015350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115617, + "balance_loss_mlp": 1.0805434, + "epoch": 0.6754520969603693, + "flos": 688532937216.0, + "grad_norm": 0.03719909461445286, + "language_loss": 0.8963424, + "learning_rate": 0.00025171069797381106, + "loss": 0.90790415, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.75488281, + "step": 3511, + "time_per_iteration": 2.8566861152648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151796, + "balance_loss_mlp": 1.07621729, + "epoch": 0.6756444786456329, + "flos": 501617570304.0, + "grad_norm": 0.03363675466936639, + "language_loss": 0.85946679, + "learning_rate": 0.00025144033003157864, + "loss": 0.87098479, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.75439453, + "step": 3512, + "time_per_iteration": 2.579599142074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152227, + "balance_loss_mlp": 1.07650506, + "epoch": 0.6758368603308965, + "flos": 493659227136.0, + "grad_norm": 0.044346995690068114, + "language_loss": 0.8418451, + "learning_rate": 0.00025117005858876806, + "loss": 0.85336733, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.75585938, + "step": 3513, + "time_per_iteration": 2.694627285003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115062, + "balance_loss_mlp": 1.07485056, + "epoch": 0.6760292420161601, + "flos": 557043993600.0, + "grad_norm": 0.034337257206957794, + "language_loss": 0.90733004, + "learning_rate": 0.000250899883750308, + "loss": 0.91883624, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.75634766, + "step": 3514, + "time_per_iteration": 2.6701719760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150513, + "balance_loss_mlp": 1.07474315, + "epoch": 0.6762216237014236, + "flos": 608721194496.0, + "grad_norm": 0.03416515328617874, + "language_loss": 0.87787104, + "learning_rate": 0.00025062980562109006, + "loss": 0.8893761, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.75634766, + "step": 3515, + "time_per_iteration": 2.7225759029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150722, + "balance_loss_mlp": 1.07499993, + "epoch": 0.6764140053866872, + "flos": 534927418368.0, + "grad_norm": 0.03854621654418095, + "language_loss": 0.89246118, + "learning_rate": 0.0002503598243059677, + "loss": 0.90396839, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.75585938, + "step": 3516, + "time_per_iteration": 2.808784008026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143883, + "balance_loss_mlp": 1.06797004, + "epoch": 0.6766063870719508, + "flos": 505861619712.0, + "grad_norm": 0.034298651238093614, + "language_loss": 0.84964311, + "learning_rate": 0.0002500899399097568, + "loss": 0.86108196, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.7578125, + "step": 3517, + "time_per_iteration": 2.713134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142698, + "balance_loss_mlp": 1.0667851, + "epoch": 0.6767987687572143, + "flos": 514193266176.0, + "grad_norm": 0.03865641767048317, + "language_loss": 0.91341412, + "learning_rate": 0.0002498201525372359, + "loss": 0.92484111, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.7578125, + "step": 3518, + "time_per_iteration": 2.5997681617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141854, + "balance_loss_mlp": 1.0659889, + "epoch": 0.6769911504424779, + "flos": 526078751232.0, + "grad_norm": 0.04161600440053586, + "language_loss": 0.877231, + "learning_rate": 0.00024955046229314584, + "loss": 0.88864952, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.75732422, + "step": 3519, + "time_per_iteration": 2.6678366661071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114153, + "balance_loss_mlp": 1.06576014, + "epoch": 0.6771835321277414, + "flos": 450836697600.0, + "grad_norm": 0.03317329770903154, + "language_loss": 0.91456813, + "learning_rate": 0.00024928086928218947, + "loss": 0.92598343, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.75634766, + "step": 3520, + "time_per_iteration": 2.599364995956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142519, + "balance_loss_mlp": 1.06689274, + "epoch": 0.677375913813005, + "flos": 710673707520.0, + "grad_norm": 0.03540178465545925, + "language_loss": 0.81423402, + "learning_rate": 0.00024901137360903216, + "loss": 0.82565916, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.75488281, + "step": 3521, + "time_per_iteration": 2.9810547828674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114229, + "balance_loss_mlp": 1.06671166, + "epoch": 0.6775682954982686, + "flos": 429345205248.0, + "grad_norm": 0.03804572823020318, + "language_loss": 0.86387855, + "learning_rate": 0.00024874197537830115, + "loss": 0.87530142, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.75439453, + "step": 3522, + "time_per_iteration": 2.5273780822753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148684, + "balance_loss_mlp": 1.07281935, + "epoch": 0.6777606771835322, + "flos": 438820956672.0, + "grad_norm": 0.03795067145757124, + "language_loss": 0.88304371, + "learning_rate": 0.00024847267469458684, + "loss": 0.89453053, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.75732422, + "step": 3523, + "time_per_iteration": 2.5473203659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151175, + "balance_loss_mlp": 1.07516694, + "epoch": 0.6779530588687956, + "flos": 776787116544.0, + "grad_norm": 0.03277402838986502, + "language_loss": 0.82546473, + "learning_rate": 0.00024820347166244034, + "loss": 0.83697653, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.75878906, + "step": 3524, + "time_per_iteration": 3.006762742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151614, + "balance_loss_mlp": 1.07551062, + "epoch": 0.6781454405540592, + "flos": 572904284160.0, + "grad_norm": 0.03398425592449901, + "language_loss": 0.89193916, + "learning_rate": 0.0002479343663863755, + "loss": 0.90345526, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.75976562, + "step": 3525, + "time_per_iteration": 2.7708120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149362, + "balance_loss_mlp": 1.07325864, + "epoch": 0.6783378222393228, + "flos": 485982862848.0, + "grad_norm": 0.03421790564553063, + "language_loss": 0.81340361, + "learning_rate": 0.00024766535897086876, + "loss": 0.82489729, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.75976562, + "step": 3526, + "time_per_iteration": 2.5445010662078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149936, + "balance_loss_mlp": 1.07383275, + "epoch": 0.6785302039245864, + "flos": 483831639552.0, + "grad_norm": 0.03533862611113949, + "language_loss": 0.84491217, + "learning_rate": 0.0002473964495203578, + "loss": 0.85641158, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.75976562, + "step": 3527, + "time_per_iteration": 2.6606431007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151402, + "balance_loss_mlp": 1.07525146, + "epoch": 0.67872258560985, + "flos": 525861900288.0, + "grad_norm": 0.03371892559640898, + "language_loss": 0.90057969, + "learning_rate": 0.0002471276381392425, + "loss": 0.9120937, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.76025391, + "step": 3528, + "time_per_iteration": 2.782986640930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156944, + "balance_loss_mlp": 1.08255768, + "epoch": 0.6789149672951135, + "flos": 1555892093952.0, + "grad_norm": 0.008577357919530966, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79345584, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.74414062, + "step": 3529, + "time_per_iteration": 4.9733335971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.07594013, + "epoch": 0.6791073489803771, + "flos": 742684999680.0, + "grad_norm": 0.033404033149465266, + "language_loss": 0.89312834, + "learning_rate": 0.00024659031000260826, + "loss": 0.90464872, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.75976562, + "step": 3530, + "time_per_iteration": 2.901157855987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145514, + "balance_loss_mlp": 1.06936264, + "epoch": 0.6792997306656406, + "flos": 577447776768.0, + "grad_norm": 0.04256917362285044, + "language_loss": 0.86884272, + "learning_rate": 0.0002463217934556985, + "loss": 0.8802979, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.76025391, + "step": 3531, + "time_per_iteration": 2.6534667015075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153702, + "balance_loss_mlp": 1.07931519, + "epoch": 0.6794921123509042, + "flos": 1506544035840.0, + "grad_norm": 0.006337226155731696, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77685791, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.74414062, + "step": 3532, + "time_per_iteration": 4.827699899673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147095, + "balance_loss_mlp": 1.07089639, + "epoch": 0.6796844940361677, + "flos": 700140446208.0, + "grad_norm": 0.038428315777117805, + "language_loss": 0.89542228, + "learning_rate": 0.0002457850559259306, + "loss": 0.90689325, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.76074219, + "step": 3533, + "time_per_iteration": 2.827556610107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147708, + "balance_loss_mlp": 1.07160449, + "epoch": 0.6798768757214313, + "flos": 553815794688.0, + "grad_norm": 0.03257941751207101, + "language_loss": 0.86952329, + "learning_rate": 0.00024551683515145275, + "loss": 0.88100034, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.75976562, + "step": 3534, + "time_per_iteration": 2.664051055908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146574, + "balance_loss_mlp": 1.07051849, + "epoch": 0.6800692574066949, + "flos": 523975191552.0, + "grad_norm": 0.03399690480422162, + "language_loss": 0.91393268, + "learning_rate": 0.0002452487131761014, + "loss": 0.92539847, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.75927734, + "step": 3535, + "time_per_iteration": 2.733736276626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146041, + "balance_loss_mlp": 1.06993783, + "epoch": 0.6802616390919585, + "flos": 575129367552.0, + "grad_norm": 0.03256850712762242, + "language_loss": 0.84912848, + "learning_rate": 0.00024498069010397093, + "loss": 0.86058891, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.75976562, + "step": 3536, + "time_per_iteration": 2.687980890274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144058, + "balance_loss_mlp": 1.06805015, + "epoch": 0.6804540207772221, + "flos": 489128469504.0, + "grad_norm": 0.03259916802392139, + "language_loss": 0.89844334, + "learning_rate": 0.00024471276603911697, + "loss": 0.90988398, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.75878906, + "step": 3537, + "time_per_iteration": 2.5977725982666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144351, + "balance_loss_mlp": 1.06834352, + "epoch": 0.6806464024624855, + "flos": 579744718848.0, + "grad_norm": 0.031208373438408543, + "language_loss": 0.83636969, + "learning_rate": 0.0002444449410855572, + "loss": 0.84781325, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.75878906, + "step": 3538, + "time_per_iteration": 2.806182384490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151082, + "balance_loss_mlp": 1.0752176, + "epoch": 0.6808387841477491, + "flos": 554792713728.0, + "grad_norm": 0.02619955396666995, + "language_loss": 0.88271046, + "learning_rate": 0.00024417721534727033, + "loss": 0.89422125, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.75732422, + "step": 3539, + "time_per_iteration": 2.6672027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153254, + "balance_loss_mlp": 1.07753205, + "epoch": 0.6810311658330127, + "flos": 427753936896.0, + "grad_norm": 0.03954259059998535, + "language_loss": 0.8817929, + "learning_rate": 0.00024390958892819687, + "loss": 0.89332551, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.75585938, + "step": 3540, + "time_per_iteration": 2.4914028644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152621, + "balance_loss_mlp": 1.07685137, + "epoch": 0.6812235475182763, + "flos": 573460236288.0, + "grad_norm": 0.03041439482605579, + "language_loss": 0.85729158, + "learning_rate": 0.0002436420619322381, + "loss": 0.86881781, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.75634766, + "step": 3541, + "time_per_iteration": 2.8284380435943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152525, + "balance_loss_mlp": 1.07675517, + "epoch": 0.6814159292035398, + "flos": 502993989120.0, + "grad_norm": 0.031050490172735493, + "language_loss": 0.87018108, + "learning_rate": 0.0002433746344632577, + "loss": 0.88170624, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.75634766, + "step": 3542, + "time_per_iteration": 2.6791961193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155107, + "balance_loss_mlp": 1.07919419, + "epoch": 0.6816083108888034, + "flos": 766955526144.0, + "grad_norm": 0.032327379337262395, + "language_loss": 0.85101521, + "learning_rate": 0.00024310730662508006, + "loss": 0.86256623, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.7578125, + "step": 3543, + "time_per_iteration": 3.091520309448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154554, + "balance_loss_mlp": 1.07854629, + "epoch": 0.681800692574067, + "flos": 480479915520.0, + "grad_norm": 0.03033872617251452, + "language_loss": 0.91889656, + "learning_rate": 0.0002428400785214911, + "loss": 0.93044209, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.75878906, + "step": 3544, + "time_per_iteration": 2.6075758934020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148266, + "balance_loss_mlp": 1.07216299, + "epoch": 0.6819930742593305, + "flos": 692833382400.0, + "grad_norm": 0.035894178949101116, + "language_loss": 0.8798629, + "learning_rate": 0.00024257295025623794, + "loss": 0.89134556, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.75976562, + "step": 3545, + "time_per_iteration": 2.835088014602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148628, + "balance_loss_mlp": 1.07257295, + "epoch": 0.6821854559445941, + "flos": 679354627584.0, + "grad_norm": 0.03140204473065851, + "language_loss": 0.85909534, + "learning_rate": 0.00024230592193302892, + "loss": 0.87058157, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.75927734, + "step": 3546, + "time_per_iteration": 2.8806655406951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115113, + "balance_loss_mlp": 1.07517004, + "epoch": 0.6823778376298576, + "flos": 463132416000.0, + "grad_norm": 0.035932436170819634, + "language_loss": 0.89696717, + "learning_rate": 0.00024203899365553372, + "loss": 0.9084785, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.75830078, + "step": 3547, + "time_per_iteration": 2.538266897201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147926, + "balance_loss_mlp": 1.07411194, + "epoch": 0.6825702193151212, + "flos": 1478174452224.0, + "grad_norm": 0.007345057771589815, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77882284, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.73828125, + "step": 3548, + "time_per_iteration": 4.545760154724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143597, + "balance_loss_mlp": 1.06768405, + "epoch": 0.6827626010003848, + "flos": 724412974080.0, + "grad_norm": 0.035220397583358556, + "language_loss": 0.88068932, + "learning_rate": 0.00024150543765216848, + "loss": 0.89212525, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.7578125, + "step": 3549, + "time_per_iteration": 2.9486939907073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.06718683, + "epoch": 0.6829549826856484, + "flos": 559939822080.0, + "grad_norm": 0.03492974535391861, + "language_loss": 0.89375067, + "learning_rate": 0.00024123881013344352, + "loss": 0.90518171, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.7578125, + "step": 3550, + "time_per_iteration": 2.651604413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150936, + "balance_loss_mlp": 1.07502353, + "epoch": 0.6831473643709118, + "flos": 626133821952.0, + "grad_norm": 0.03217647010825034, + "language_loss": 0.83963066, + "learning_rate": 0.00024097228307472202, + "loss": 0.85114002, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.7578125, + "step": 3551, + "time_per_iteration": 2.7857072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011508, + "balance_loss_mlp": 1.07479274, + "epoch": 0.6833397460561754, + "flos": 715097677824.0, + "grad_norm": 0.03621401947072565, + "language_loss": 0.87106031, + "learning_rate": 0.00024070585657947846, + "loss": 0.88256836, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.75878906, + "step": 3552, + "time_per_iteration": 2.8683760166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114886, + "balance_loss_mlp": 1.07299471, + "epoch": 0.683532127741439, + "flos": 465726799872.0, + "grad_norm": 0.03128688144219445, + "language_loss": 0.89219671, + "learning_rate": 0.00024043953075114934, + "loss": 0.90368527, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.75732422, + "step": 3553, + "time_per_iteration": 2.704216241836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114847, + "balance_loss_mlp": 1.07251036, + "epoch": 0.6837245094267026, + "flos": 583339490304.0, + "grad_norm": 0.0349442822995555, + "language_loss": 0.93869305, + "learning_rate": 0.00024017330569313128, + "loss": 0.95017779, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.75830078, + "step": 3554, + "time_per_iteration": 2.691981554031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148981, + "balance_loss_mlp": 1.07287753, + "epoch": 0.6839168911119662, + "flos": 795523769856.0, + "grad_norm": 0.0402217191104916, + "language_loss": 0.80629432, + "learning_rate": 0.0002399071815087821, + "loss": 0.81778413, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.75976562, + "step": 3555, + "time_per_iteration": 2.984731912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148889, + "balance_loss_mlp": 1.07302415, + "epoch": 0.6841092727972297, + "flos": 581114406912.0, + "grad_norm": 0.035602777463953614, + "language_loss": 0.89145899, + "learning_rate": 0.00023964115830142025, + "loss": 0.9029479, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.75732422, + "step": 3556, + "time_per_iteration": 2.7377610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148814, + "balance_loss_mlp": 1.07294965, + "epoch": 0.6843016544824932, + "flos": 384595034112.0, + "grad_norm": 0.03918339808288278, + "language_loss": 0.92691845, + "learning_rate": 0.00023937523617432522, + "loss": 0.93840659, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.75732422, + "step": 3557, + "time_per_iteration": 2.571953535079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148872, + "balance_loss_mlp": 1.07305455, + "epoch": 0.6844940361677568, + "flos": 1441287845376.0, + "grad_norm": 0.033291217727089636, + "language_loss": 0.91850209, + "learning_rate": 0.00023910941523073705, + "loss": 0.92999083, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.75683594, + "step": 3558, + "time_per_iteration": 3.910876512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148946, + "balance_loss_mlp": 1.07317698, + "epoch": 0.6846864178530204, + "flos": 521899829760.0, + "grad_norm": 0.03402610589420279, + "language_loss": 0.9203999, + "learning_rate": 0.0002388436955738566, + "loss": 0.93188941, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.75634766, + "step": 3559, + "time_per_iteration": 2.6723177433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148516, + "balance_loss_mlp": 1.07279444, + "epoch": 0.6848787995382839, + "flos": 719228935680.0, + "grad_norm": 0.031030975541128533, + "language_loss": 0.86168528, + "learning_rate": 0.00023857807730684523, + "loss": 0.87317038, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.75585938, + "step": 3560, + "time_per_iteration": 2.90830135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114827, + "balance_loss_mlp": 1.07254827, + "epoch": 0.6850711812235475, + "flos": 512161565184.0, + "grad_norm": 0.040096201780059196, + "language_loss": 0.88262463, + "learning_rate": 0.00023831256053282547, + "loss": 0.89410734, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.75585938, + "step": 3561, + "time_per_iteration": 2.671116352081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148991, + "balance_loss_mlp": 1.07336485, + "epoch": 0.6852635629088111, + "flos": 669431712768.0, + "grad_norm": 0.03641568128756266, + "language_loss": 0.83697838, + "learning_rate": 0.00023804714535488003, + "loss": 0.8484683, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.75488281, + "step": 3562, + "time_per_iteration": 2.861722946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149231, + "balance_loss_mlp": 1.0756073, + "epoch": 0.6854559445940747, + "flos": 1526364395520.0, + "grad_norm": 0.005446048976110769, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80958861, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.73632812, + "step": 3563, + "time_per_iteration": 5.001219272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145764, + "balance_loss_mlp": 1.07037604, + "epoch": 0.6856483262793382, + "flos": 455137142784.0, + "grad_norm": 0.035220734339555373, + "language_loss": 0.86132681, + "learning_rate": 0.00023751662019934488, + "loss": 0.8727845, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.75244141, + "step": 3564, + "time_per_iteration": 2.4870924949645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146111, + "balance_loss_mlp": 1.07077074, + "epoch": 0.6858407079646017, + "flos": 616688269824.0, + "grad_norm": 0.032854756712223265, + "language_loss": 0.84736019, + "learning_rate": 0.00023725151042772364, + "loss": 0.85882127, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.75195312, + "step": 3565, + "time_per_iteration": 2.7391157150268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146, + "balance_loss_mlp": 1.07056403, + "epoch": 0.6860330896498653, + "flos": 467094486528.0, + "grad_norm": 0.03197662147757374, + "language_loss": 0.88051426, + "learning_rate": 0.00023698650266411276, + "loss": 0.89197421, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.75292969, + "step": 3566, + "time_per_iteration": 2.6070899963378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114589, + "balance_loss_mlp": 1.07054949, + "epoch": 0.6862254713351289, + "flos": 865838294016.0, + "grad_norm": 0.03137777844297811, + "language_loss": 0.88001108, + "learning_rate": 0.00023672159701139755, + "loss": 0.89146996, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.75195312, + "step": 3567, + "time_per_iteration": 3.252197504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145133, + "balance_loss_mlp": 1.06979275, + "epoch": 0.6864178530203925, + "flos": 448090590720.0, + "grad_norm": 0.03718741839919542, + "language_loss": 0.90576816, + "learning_rate": 0.00023645679357242296, + "loss": 0.91721952, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.75195312, + "step": 3568, + "time_per_iteration": 2.551252841949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146099, + "balance_loss_mlp": 1.07052052, + "epoch": 0.6866102347056561, + "flos": 425211945984.0, + "grad_norm": 0.041154591725143186, + "language_loss": 0.89051086, + "learning_rate": 0.00023619209244999534, + "loss": 0.90197182, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.75439453, + "step": 3569, + "time_per_iteration": 2.5833351612091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148567, + "balance_loss_mlp": 1.07289267, + "epoch": 0.6868026163909196, + "flos": 473333306880.0, + "grad_norm": 0.045387721995194655, + "language_loss": 0.91211587, + "learning_rate": 0.0002359274937468806, + "loss": 0.92360151, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.75537109, + "step": 3570, + "time_per_iteration": 2.5472187995910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148303, + "balance_loss_mlp": 1.07258165, + "epoch": 0.6869949980761831, + "flos": 465205776384.0, + "grad_norm": 0.03150793163610154, + "language_loss": 0.82095093, + "learning_rate": 0.00023566299756580512, + "loss": 0.83243394, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.75585938, + "step": 3571, + "time_per_iteration": 2.65720534324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149363, + "balance_loss_mlp": 1.07364154, + "epoch": 0.6871873797614467, + "flos": 427130855424.0, + "grad_norm": 0.03812414034627887, + "language_loss": 0.83773518, + "learning_rate": 0.0002353986040094551, + "loss": 0.84922886, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.75585938, + "step": 3572, + "time_per_iteration": 2.5081918239593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150153, + "balance_loss_mlp": 1.07443094, + "epoch": 0.6873797614467103, + "flos": 444554216448.0, + "grad_norm": 0.03780966347325107, + "language_loss": 0.84840351, + "learning_rate": 0.00023513431318047796, + "loss": 0.859905, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.75585938, + "step": 3573, + "time_per_iteration": 2.5093369483947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151367, + "balance_loss_mlp": 1.07564497, + "epoch": 0.6875721431319738, + "flos": 993914388480.0, + "grad_norm": 0.03609225050037203, + "language_loss": 0.82789201, + "learning_rate": 0.00023487012518147977, + "loss": 0.83940566, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.75585938, + "step": 3574, + "time_per_iteration": 3.209183692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147663, + "balance_loss_mlp": 1.07194114, + "epoch": 0.6877645248172374, + "flos": 1287447284736.0, + "grad_norm": 0.03474054925627609, + "language_loss": 0.8951385, + "learning_rate": 0.00023460604011502772, + "loss": 0.90661514, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.75585938, + "step": 3575, + "time_per_iteration": 3.6102471351623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147349, + "balance_loss_mlp": 1.07162762, + "epoch": 0.687956906502501, + "flos": 878229339648.0, + "grad_norm": 0.03667268861696713, + "language_loss": 0.90602195, + "learning_rate": 0.00023434205808364845, + "loss": 0.91749543, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.75585938, + "step": 3576, + "time_per_iteration": 3.1072838306427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145679, + "balance_loss_mlp": 1.07014775, + "epoch": 0.6881492881877646, + "flos": 564470579712.0, + "grad_norm": 0.03470071742143998, + "language_loss": 0.90143359, + "learning_rate": 0.00023407817918982932, + "loss": 0.91289037, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.75390625, + "step": 3577, + "time_per_iteration": 2.7108538150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144131, + "balance_loss_mlp": 1.06869566, + "epoch": 0.6883416698730281, + "flos": 796509421056.0, + "grad_norm": 0.03216167904462723, + "language_loss": 0.83329225, + "learning_rate": 0.00023381440353601718, + "loss": 0.84473354, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.75292969, + "step": 3578, + "time_per_iteration": 3.00079345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144719, + "balance_loss_mlp": 1.06933129, + "epoch": 0.6885340515582916, + "flos": 724879603200.0, + "grad_norm": 0.03602954458915834, + "language_loss": 0.91766059, + "learning_rate": 0.00023355073122461822, + "loss": 0.92910779, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.75244141, + "step": 3579, + "time_per_iteration": 2.8793976306915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144346, + "balance_loss_mlp": 1.06891012, + "epoch": 0.6887264332435552, + "flos": 1012520785920.0, + "grad_norm": 0.032157968991135766, + "language_loss": 0.87754709, + "learning_rate": 0.00023328716235799973, + "loss": 0.88899052, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.75292969, + "step": 3580, + "time_per_iteration": 3.262232780456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145628, + "balance_loss_mlp": 1.07028747, + "epoch": 0.6889188149288188, + "flos": 586346108928.0, + "grad_norm": 0.030956213624598772, + "language_loss": 0.88613558, + "learning_rate": 0.00023302369703848803, + "loss": 0.89759183, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.75195312, + "step": 3581, + "time_per_iteration": 2.6781458854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155772, + "balance_loss_mlp": 1.08043158, + "epoch": 0.6891111966140824, + "flos": 637276703232.0, + "grad_norm": 0.03960885447101306, + "language_loss": 0.85706222, + "learning_rate": 0.00023276033536836937, + "loss": 0.86861998, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.75195312, + "step": 3582, + "time_per_iteration": 2.8019070625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155352, + "balance_loss_mlp": 1.08005941, + "epoch": 0.6893035782993459, + "flos": 496312008192.0, + "grad_norm": 0.03332092041619006, + "language_loss": 0.89310157, + "learning_rate": 0.00023249707744988984, + "loss": 0.9046551, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.75146484, + "step": 3583, + "time_per_iteration": 2.6462185382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.07421863, + "epoch": 0.6894959599846094, + "flos": 459148878336.0, + "grad_norm": 0.037983425016063846, + "language_loss": 0.88022619, + "learning_rate": 0.00023223392338525529, + "loss": 0.89172179, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.75195312, + "step": 3584, + "time_per_iteration": 2.493164539337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149866, + "balance_loss_mlp": 1.07457304, + "epoch": 0.689688341669873, + "flos": 506057003520.0, + "grad_norm": 0.03394886477629218, + "language_loss": 0.83439797, + "learning_rate": 0.00023197087327663107, + "loss": 0.84589666, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.75146484, + "step": 3585, + "time_per_iteration": 2.6373069286346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149128, + "balance_loss_mlp": 1.0738833, + "epoch": 0.6898807233551366, + "flos": 765218539008.0, + "grad_norm": 0.04715187460336584, + "language_loss": 0.87040132, + "learning_rate": 0.00023170792722614243, + "loss": 0.88189256, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.75097656, + "step": 3586, + "time_per_iteration": 2.9102606773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147386, + "balance_loss_mlp": 1.07218862, + "epoch": 0.6900731050404002, + "flos": 584572918272.0, + "grad_norm": 0.029046800456262803, + "language_loss": 0.87808621, + "learning_rate": 0.00023144508533587377, + "loss": 0.88955998, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.75048828, + "step": 3587, + "time_per_iteration": 2.8061466217041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146464, + "balance_loss_mlp": 1.07112408, + "epoch": 0.6902654867256637, + "flos": 713204964864.0, + "grad_norm": 0.038780286956444227, + "language_loss": 0.83763909, + "learning_rate": 0.0002311823477078698, + "loss": 0.84910375, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.75195312, + "step": 3588, + "time_per_iteration": 2.943735122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145799, + "balance_loss_mlp": 1.0705539, + "epoch": 0.6904578684109273, + "flos": 598303452672.0, + "grad_norm": 0.03424930843273271, + "language_loss": 0.89383221, + "learning_rate": 0.00023091971444413428, + "loss": 0.90529013, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.75097656, + "step": 3589, + "time_per_iteration": 2.8112401962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144927, + "balance_loss_mlp": 1.06958711, + "epoch": 0.6906502500961909, + "flos": 586176921600.0, + "grad_norm": 0.03337983464568353, + "language_loss": 0.87353265, + "learning_rate": 0.00023065718564663012, + "loss": 0.88498187, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.75195312, + "step": 3590, + "time_per_iteration": 2.712702512741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148972, + "balance_loss_mlp": 1.0753479, + "epoch": 0.6908426317814544, + "flos": 1591140317184.0, + "grad_norm": 0.007217245787203084, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.75060558, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.73632812, + "step": 3591, + "time_per_iteration": 4.975476980209351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011435, + "balance_loss_mlp": 1.06830287, + "epoch": 0.6910350134667179, + "flos": 501804221952.0, + "grad_norm": 0.03486357436652247, + "language_loss": 0.85128838, + "learning_rate": 0.0002301324418579666, + "loss": 0.86272335, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.75048828, + "step": 3592, + "time_per_iteration": 2.6776154041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144028, + "balance_loss_mlp": 1.07040405, + "epoch": 0.6912273951519815, + "flos": 1412132901888.0, + "grad_norm": 0.003146877221363815, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.798325, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.73632812, + "step": 3593, + "time_per_iteration": 4.794835567474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143307, + "balance_loss_mlp": 1.06806242, + "epoch": 0.6914197768372451, + "flos": 636556293120.0, + "grad_norm": 0.03715032708342992, + "language_loss": 0.8555156, + "learning_rate": 0.00022960811715677415, + "loss": 0.86694872, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.75097656, + "step": 3594, + "time_per_iteration": 2.8951711654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147289, + "balance_loss_mlp": 1.07213938, + "epoch": 0.6916121585225087, + "flos": 559201947648.0, + "grad_norm": 0.03507172785049161, + "language_loss": 0.86282074, + "learning_rate": 0.00022934611221845608, + "loss": 0.87429363, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.75, + "step": 3595, + "time_per_iteration": 2.8272645473480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145219, + "balance_loss_mlp": 1.0699265, + "epoch": 0.6918045402077723, + "flos": 530292601344.0, + "grad_norm": 0.04349078621871699, + "language_loss": 0.82568008, + "learning_rate": 0.00022908421235729609, + "loss": 0.83713228, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.75146484, + "step": 3596, + "time_per_iteration": 2.7838826179504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146035, + "balance_loss_mlp": 1.07074213, + "epoch": 0.6919969218930357, + "flos": 571425807360.0, + "grad_norm": 0.03178884209281711, + "language_loss": 0.89899623, + "learning_rate": 0.0002288224176749728, + "loss": 0.9104566, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.75146484, + "step": 3597, + "time_per_iteration": 2.6271378993988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114544, + "balance_loss_mlp": 1.07009995, + "epoch": 0.6921893035782993, + "flos": 684503737344.0, + "grad_norm": 0.040516365330590415, + "language_loss": 0.84238005, + "learning_rate": 0.00022856072827312385, + "loss": 0.85383451, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.75195312, + "step": 3598, + "time_per_iteration": 2.8102614879608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145578, + "balance_loss_mlp": 1.07028556, + "epoch": 0.6923816852635629, + "flos": 547793825280.0, + "grad_norm": 0.038084466235788844, + "language_loss": 0.82715267, + "learning_rate": 0.00022829914425334598, + "loss": 0.83860844, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.75146484, + "step": 3599, + "time_per_iteration": 2.6669743061065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.06852686, + "epoch": 0.6925740669488265, + "flos": 511056391680.0, + "grad_norm": 0.034117111871926384, + "language_loss": 0.85557401, + "learning_rate": 0.0002280376657171956, + "loss": 0.86701274, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.75195312, + "step": 3600, + "time_per_iteration": 2.655038356781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144019, + "balance_loss_mlp": 1.0685358, + "epoch": 0.69276644863409, + "flos": 870913543680.0, + "grad_norm": 0.03423377398605859, + "language_loss": 0.81733924, + "learning_rate": 0.00022777629276618706, + "loss": 0.82877946, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.75341797, + "step": 3601, + "time_per_iteration": 3.1143221855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114446, + "balance_loss_mlp": 1.06897676, + "epoch": 0.6929588303193536, + "flos": 626917358592.0, + "grad_norm": 0.03471097371374876, + "language_loss": 0.82267404, + "learning_rate": 0.0002275150255017947, + "loss": 0.8341186, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.75341797, + "step": 3602, + "time_per_iteration": 2.7638230323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149361, + "balance_loss_mlp": 1.07592773, + "epoch": 0.6931512120046172, + "flos": 1548804609024.0, + "grad_norm": 0.009029231118545568, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76882035, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.734375, + "step": 3603, + "time_per_iteration": 5.028877019882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.09183502, + "epoch": 0.6933435936898807, + "flos": 1451323729920.0, + "grad_norm": 0.01657275533774484, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76292562, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.73632812, + "step": 3604, + "time_per_iteration": 4.7287609577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157905, + "balance_loss_mlp": 1.08204055, + "epoch": 0.6935359753751443, + "flos": 541930309632.0, + "grad_norm": 0.03919534439322985, + "language_loss": 0.90026039, + "learning_rate": 0.0002267318588424379, + "loss": 0.91183943, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.75732422, + "step": 3605, + "time_per_iteration": 2.6615920066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150618, + "balance_loss_mlp": 1.07484841, + "epoch": 0.6937283570604078, + "flos": 720689948160.0, + "grad_norm": 0.03558950704948247, + "language_loss": 0.91988891, + "learning_rate": 0.00022647101533842845, + "loss": 0.93139505, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.75634766, + "step": 3606, + "time_per_iteration": 2.875670909881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152588, + "balance_loss_mlp": 1.07658041, + "epoch": 0.6939207387456714, + "flos": 523193656320.0, + "grad_norm": 0.041224980702036104, + "language_loss": 0.83253193, + "learning_rate": 0.00022621027802778872, + "loss": 0.84405786, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.75878906, + "step": 3607, + "time_per_iteration": 2.6125805377960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151349, + "balance_loss_mlp": 1.07519805, + "epoch": 0.694113120430935, + "flos": 536401165824.0, + "grad_norm": 0.03463828866617186, + "language_loss": 0.85144913, + "learning_rate": 0.00022594964701174586, + "loss": 0.86296266, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.76025391, + "step": 3608, + "time_per_iteration": 2.6021461486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150974, + "balance_loss_mlp": 1.07496643, + "epoch": 0.6943055021161986, + "flos": 524394157056.0, + "grad_norm": 0.03515633419070769, + "language_loss": 0.89070058, + "learning_rate": 0.00022568912239148586, + "loss": 0.9022103, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.75878906, + "step": 3609, + "time_per_iteration": 2.636577844619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145151, + "balance_loss_mlp": 1.06904817, + "epoch": 0.694497883801462, + "flos": 485970127872.0, + "grad_norm": 0.037176872987451946, + "language_loss": 0.86671317, + "learning_rate": 0.00022542870426815344, + "loss": 0.87816465, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.75976562, + "step": 3610, + "time_per_iteration": 2.6800506114959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114419, + "balance_loss_mlp": 1.06818187, + "epoch": 0.6946902654867256, + "flos": 462424740864.0, + "grad_norm": 0.03708376402785258, + "language_loss": 0.9062373, + "learning_rate": 0.00022516839274285173, + "loss": 0.91767919, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.75878906, + "step": 3611, + "time_per_iteration": 2.516231060028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144878, + "balance_loss_mlp": 1.06906128, + "epoch": 0.6948826471719892, + "flos": 513867626496.0, + "grad_norm": 0.032040517416043905, + "language_loss": 0.80424583, + "learning_rate": 0.00022490818791664265, + "loss": 0.81569457, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.75683594, + "step": 3612, + "time_per_iteration": 2.5825564861297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.07768571, + "epoch": 0.6950750288572528, + "flos": 558255227904.0, + "grad_norm": 0.03220148028893399, + "language_loss": 0.90256339, + "learning_rate": 0.00022464808989054676, + "loss": 0.91409791, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.75634766, + "step": 3613, + "time_per_iteration": 2.673570394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_mlp": 1.07763827, + "epoch": 0.6952674105425164, + "flos": 543521577984.0, + "grad_norm": 0.03708971382778387, + "language_loss": 0.80475914, + "learning_rate": 0.00022438809876554284, + "loss": 0.81629372, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.75683594, + "step": 3614, + "time_per_iteration": 2.6276586055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114766, + "balance_loss_mlp": 1.07179534, + "epoch": 0.6954597922277799, + "flos": 547856951808.0, + "grad_norm": 0.035809532178513556, + "language_loss": 0.85295904, + "learning_rate": 0.00022412821464256873, + "loss": 0.86443567, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.75732422, + "step": 3615, + "time_per_iteration": 2.675262689590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144404, + "balance_loss_mlp": 1.06887305, + "epoch": 0.6956521739130435, + "flos": 520540875264.0, + "grad_norm": 0.03660154684653836, + "language_loss": 0.87111717, + "learning_rate": 0.00022386843762252023, + "loss": 0.88256121, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.75390625, + "step": 3616, + "time_per_iteration": 2.601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145995, + "balance_loss_mlp": 1.07055974, + "epoch": 0.695844555598307, + "flos": 467263673856.0, + "grad_norm": 0.03600236468041408, + "language_loss": 0.85243946, + "learning_rate": 0.00022360876780625193, + "loss": 0.86389947, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.75292969, + "step": 3617, + "time_per_iteration": 2.6009066104888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146046, + "balance_loss_mlp": 1.0705148, + "epoch": 0.6960369372835706, + "flos": 601931151360.0, + "grad_norm": 0.03135963801145649, + "language_loss": 0.84376919, + "learning_rate": 0.00022334920529457604, + "loss": 0.85522962, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.75390625, + "step": 3618, + "time_per_iteration": 2.919830322265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152209, + "balance_loss_mlp": 1.07662988, + "epoch": 0.6962293189688342, + "flos": 645465358848.0, + "grad_norm": 0.03118514394285757, + "language_loss": 0.91862655, + "learning_rate": 0.00022308975018826423, + "loss": 0.9301486, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.75439453, + "step": 3619, + "time_per_iteration": 2.8989925384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152559, + "balance_loss_mlp": 1.07688463, + "epoch": 0.6964217006540977, + "flos": 639957682176.0, + "grad_norm": 0.03812258215137557, + "language_loss": 0.9018597, + "learning_rate": 0.00022283040258804564, + "loss": 0.91338527, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.75537109, + "step": 3620, + "time_per_iteration": 2.74235200881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115248, + "balance_loss_mlp": 1.07680559, + "epoch": 0.6966140823393613, + "flos": 653386771968.0, + "grad_norm": 0.03521446946003712, + "language_loss": 0.88482189, + "learning_rate": 0.00022257116259460802, + "loss": 0.89634669, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.75537109, + "step": 3621, + "time_per_iteration": 2.819164991378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152109, + "balance_loss_mlp": 1.07657778, + "epoch": 0.6968064640246249, + "flos": 705824040960.0, + "grad_norm": 0.033483575769838334, + "language_loss": 0.86131644, + "learning_rate": 0.00022231203030859725, + "loss": 0.87283748, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.75390625, + "step": 3622, + "time_per_iteration": 2.9764678478240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.07596815, + "epoch": 0.6969988457098885, + "flos": 493530972672.0, + "grad_norm": 0.03689827849321225, + "language_loss": 0.88673711, + "learning_rate": 0.00022205300583061737, + "loss": 0.89825207, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.75390625, + "step": 3623, + "time_per_iteration": 2.56077241897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160957, + "balance_loss_mlp": 1.08676147, + "epoch": 0.6971912273951519, + "flos": 1355612765184.0, + "grad_norm": 0.01051210233646139, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83999157, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.7421875, + "step": 3624, + "time_per_iteration": 4.901975393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_mlp": 1.07529247, + "epoch": 0.6973836090804155, + "flos": 603574086144.0, + "grad_norm": 0.03562483559578549, + "language_loss": 0.82784301, + "learning_rate": 0.00022153528070095735, + "loss": 0.83934939, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.75195312, + "step": 3625, + "time_per_iteration": 2.6827454566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147557, + "balance_loss_mlp": 1.07226419, + "epoch": 0.6975759907656791, + "flos": 525110564352.0, + "grad_norm": 0.03740891525888632, + "language_loss": 0.94177675, + "learning_rate": 0.00022127658025027568, + "loss": 0.95325232, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.75146484, + "step": 3626, + "time_per_iteration": 2.6243293285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145673, + "balance_loss_mlp": 1.07014167, + "epoch": 0.6977683724509427, + "flos": 481877801472.0, + "grad_norm": 0.03606674013608827, + "language_loss": 0.91052938, + "learning_rate": 0.00022101798800962258, + "loss": 0.92198616, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.75390625, + "step": 3627, + "time_per_iteration": 2.585353374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145537, + "balance_loss_mlp": 1.07005322, + "epoch": 0.6979607541362063, + "flos": 523640819712.0, + "grad_norm": 0.043695073898502274, + "language_loss": 0.852063, + "learning_rate": 0.00022075950407939227, + "loss": 0.86351836, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.75341797, + "step": 3628, + "time_per_iteration": 2.6018002033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145624, + "balance_loss_mlp": 1.07023609, + "epoch": 0.6981531358214698, + "flos": 549115849728.0, + "grad_norm": 0.039500919644618576, + "language_loss": 0.87787813, + "learning_rate": 0.0002205011285599367, + "loss": 0.88933432, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.75244141, + "step": 3629, + "time_per_iteration": 2.6909217834472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114526, + "balance_loss_mlp": 1.06991994, + "epoch": 0.6983455175067333, + "flos": 701275819008.0, + "grad_norm": 0.03293425746388738, + "language_loss": 0.8505758, + "learning_rate": 0.00022024286155156658, + "loss": 0.86202836, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.75195312, + "step": 3630, + "time_per_iteration": 2.8668339252471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145235, + "balance_loss_mlp": 1.07008553, + "epoch": 0.6985378991919969, + "flos": 486119849472.0, + "grad_norm": 0.03293145354984791, + "language_loss": 0.9093079, + "learning_rate": 0.00021998470315454994, + "loss": 0.92076027, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.75, + "step": 3631, + "time_per_iteration": 2.6536853313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145252, + "balance_loss_mlp": 1.07010257, + "epoch": 0.6987302808772605, + "flos": 559892158464.0, + "grad_norm": 0.03487739632649299, + "language_loss": 0.90976024, + "learning_rate": 0.00021972665346911275, + "loss": 0.92121279, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.75, + "step": 3632, + "time_per_iteration": 2.705947160720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145801, + "balance_loss_mlp": 1.07046092, + "epoch": 0.698922662562524, + "flos": 484567512576.0, + "grad_norm": 0.03530100295621196, + "language_loss": 0.84786582, + "learning_rate": 0.00021946871259543877, + "loss": 0.85932386, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.75195312, + "step": 3633, + "time_per_iteration": 2.585474729537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146213, + "balance_loss_mlp": 1.07106328, + "epoch": 0.6991150442477876, + "flos": 720205854720.0, + "grad_norm": 0.031838987726816204, + "language_loss": 0.87710065, + "learning_rate": 0.00021921088063366957, + "loss": 0.88856274, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.75, + "step": 3634, + "time_per_iteration": 2.9367825984954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150014, + "balance_loss_mlp": 1.0748167, + "epoch": 0.6993074259330512, + "flos": 490159782912.0, + "grad_norm": 0.031688179497796835, + "language_loss": 0.86258936, + "learning_rate": 0.00021895315768390435, + "loss": 0.87408948, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.75048828, + "step": 3635, + "time_per_iteration": 2.6028146743774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150052, + "balance_loss_mlp": 1.07490218, + "epoch": 0.6994998076183148, + "flos": 719467980288.0, + "grad_norm": 0.03153013749596923, + "language_loss": 0.92548811, + "learning_rate": 0.00021869554384619999, + "loss": 0.93698871, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.75, + "step": 3636, + "time_per_iteration": 2.998966932296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146553, + "balance_loss_mlp": 1.07126021, + "epoch": 0.6996921893035783, + "flos": 580163684352.0, + "grad_norm": 0.03271766083883028, + "language_loss": 0.86055148, + "learning_rate": 0.00021843803922057115, + "loss": 0.87201703, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.75146484, + "step": 3637, + "time_per_iteration": 2.745859384536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145131, + "balance_loss_mlp": 1.06983805, + "epoch": 0.6998845709888418, + "flos": 519674746368.0, + "grad_norm": 0.033737468180216806, + "language_loss": 0.86839747, + "learning_rate": 0.00021818064390698977, + "loss": 0.87984878, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.75146484, + "step": 3638, + "time_per_iteration": 2.632795810699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146648, + "balance_loss_mlp": 1.07130754, + "epoch": 0.7000769526741054, + "flos": 622095889920.0, + "grad_norm": 0.03373596031982573, + "language_loss": 0.91870159, + "learning_rate": 0.0002179233580053861, + "loss": 0.93016809, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.75195312, + "step": 3639, + "time_per_iteration": 2.753880023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115047, + "balance_loss_mlp": 1.07512987, + "epoch": 0.700269334359369, + "flos": 561055729152.0, + "grad_norm": 0.03325206970104953, + "language_loss": 0.90108448, + "learning_rate": 0.00021766618161564688, + "loss": 0.91258919, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.75195312, + "step": 3640, + "time_per_iteration": 2.724479913711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114817, + "balance_loss_mlp": 1.07273436, + "epoch": 0.7004617160446326, + "flos": 484361395200.0, + "grad_norm": 0.03152672477913245, + "language_loss": 0.91440845, + "learning_rate": 0.00021740911483761677, + "loss": 0.92589015, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.75292969, + "step": 3641, + "time_per_iteration": 2.5502066612243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146714, + "balance_loss_mlp": 1.07137418, + "epoch": 0.7006540977298961, + "flos": 698321593344.0, + "grad_norm": 0.030766047541437955, + "language_loss": 0.95812565, + "learning_rate": 0.00021715215777109837, + "loss": 0.96959281, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.75195312, + "step": 3642, + "time_per_iteration": 2.9363698959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150063, + "balance_loss_mlp": 1.07477081, + "epoch": 0.7008464794151597, + "flos": 505770295296.0, + "grad_norm": 0.03557511475331178, + "language_loss": 0.88907003, + "learning_rate": 0.00021689531051585103, + "loss": 0.90057063, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.75146484, + "step": 3643, + "time_per_iteration": 2.6452667713165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150173, + "balance_loss_mlp": 1.07483232, + "epoch": 0.7010388611004232, + "flos": 538272411648.0, + "grad_norm": 0.036527368416016295, + "language_loss": 0.85649168, + "learning_rate": 0.00021663857317159196, + "loss": 0.86799347, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.75195312, + "step": 3644, + "time_per_iteration": 2.661463499069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149991, + "balance_loss_mlp": 1.07465088, + "epoch": 0.7012312427856868, + "flos": 548314848768.0, + "grad_norm": 0.031074257387366924, + "language_loss": 0.86441541, + "learning_rate": 0.00021638194583799487, + "loss": 0.87591535, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.75195312, + "step": 3645, + "time_per_iteration": 2.6630945205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114701, + "balance_loss_mlp": 1.07166946, + "epoch": 0.7014236244709504, + "flos": 942973060608.0, + "grad_norm": 0.03710031332944713, + "language_loss": 0.87637782, + "learning_rate": 0.00021612542861469176, + "loss": 0.8878479, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.75195312, + "step": 3646, + "time_per_iteration": 3.1664998531341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146595, + "balance_loss_mlp": 1.07120693, + "epoch": 0.7016160061562139, + "flos": 526209007104.0, + "grad_norm": 0.036568631884181475, + "language_loss": 0.87361133, + "learning_rate": 0.00021586902160127135, + "loss": 0.88507724, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.75244141, + "step": 3647, + "time_per_iteration": 2.588329792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145686, + "balance_loss_mlp": 1.07029808, + "epoch": 0.7018083878414775, + "flos": 374244421632.0, + "grad_norm": 0.046770994216465425, + "language_loss": 0.81241143, + "learning_rate": 0.00021561272489727974, + "loss": 0.82386827, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.75244141, + "step": 3648, + "time_per_iteration": 2.4180006980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145708, + "balance_loss_mlp": 1.07036817, + "epoch": 0.7020007695267411, + "flos": 528833590272.0, + "grad_norm": 0.03433939193961528, + "language_loss": 0.86265445, + "learning_rate": 0.0002153565386022199, + "loss": 0.87411153, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.75195312, + "step": 3649, + "time_per_iteration": 2.6287925243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146273, + "balance_loss_mlp": 1.07093239, + "epoch": 0.7021931512120047, + "flos": 691372369920.0, + "grad_norm": 0.0338942783378883, + "language_loss": 0.87374359, + "learning_rate": 0.00021510046281555262, + "loss": 0.88520634, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.75195312, + "step": 3650, + "time_per_iteration": 2.8249292373657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.0704776, + "epoch": 0.7023855328972681, + "flos": 640925869056.0, + "grad_norm": 0.04142301274986203, + "language_loss": 0.87215114, + "learning_rate": 0.0002148444976366949, + "loss": 0.88360929, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.75195312, + "step": 3651, + "time_per_iteration": 2.7713325023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148886, + "balance_loss_mlp": 1.07368851, + "epoch": 0.7025779145825317, + "flos": 562006451712.0, + "grad_norm": 0.03240472166532918, + "language_loss": 0.87441784, + "learning_rate": 0.00021458864316502136, + "loss": 0.8859067, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.75048828, + "step": 3652, + "time_per_iteration": 2.729938268661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147969, + "balance_loss_mlp": 1.07267606, + "epoch": 0.7027702962677953, + "flos": 448370568192.0, + "grad_norm": 0.03662771353243768, + "language_loss": 0.92350411, + "learning_rate": 0.0002143328994998634, + "loss": 0.93498379, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.75146484, + "step": 3653, + "time_per_iteration": 2.4846644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147539, + "balance_loss_mlp": 1.07210338, + "epoch": 0.7029626779530589, + "flos": 623713354752.0, + "grad_norm": 0.03664764199554111, + "language_loss": 0.83479095, + "learning_rate": 0.00021407726674050982, + "loss": 0.84626639, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.75292969, + "step": 3654, + "time_per_iteration": 2.850576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145188, + "balance_loss_mlp": 1.07003856, + "epoch": 0.7031550596383225, + "flos": 630733710336.0, + "grad_norm": 0.030002783226809063, + "language_loss": 0.91781414, + "learning_rate": 0.0002138217449862061, + "loss": 0.92926598, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.75, + "step": 3655, + "time_per_iteration": 2.7412569522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145204, + "balance_loss_mlp": 1.07000697, + "epoch": 0.703347441323586, + "flos": 531859674624.0, + "grad_norm": 0.03278089952227313, + "language_loss": 0.82951868, + "learning_rate": 0.00021356633433615403, + "loss": 0.84097064, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.75048828, + "step": 3656, + "time_per_iteration": 2.6387276649475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144911, + "balance_loss_mlp": 1.06971395, + "epoch": 0.7035398230088495, + "flos": 694915474944.0, + "grad_norm": 0.029068288031651398, + "language_loss": 0.87720138, + "learning_rate": 0.0002133110348895133, + "loss": 0.88865048, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.75048828, + "step": 3657, + "time_per_iteration": 2.993046998977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146816, + "balance_loss_mlp": 1.07152295, + "epoch": 0.7037322046941131, + "flos": 969666055680.0, + "grad_norm": 0.030671197457474774, + "language_loss": 0.89195395, + "learning_rate": 0.0002130558467453999, + "loss": 0.90342212, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.75146484, + "step": 3658, + "time_per_iteration": 3.3705010414123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146747, + "balance_loss_mlp": 1.07131183, + "epoch": 0.7039245863793767, + "flos": 503925245952.0, + "grad_norm": 0.03300080382210099, + "language_loss": 0.88645768, + "learning_rate": 0.0002128007700028865, + "loss": 0.89792514, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.75292969, + "step": 3659, + "time_per_iteration": 2.734318256378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148886, + "balance_loss_mlp": 1.07368839, + "epoch": 0.7041169680646402, + "flos": 466938034176.0, + "grad_norm": 0.036833825821468186, + "language_loss": 0.89132273, + "learning_rate": 0.00021254580476100276, + "loss": 0.90281165, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.75048828, + "step": 3660, + "time_per_iteration": 2.5174009799957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149342, + "balance_loss_mlp": 1.07409692, + "epoch": 0.7043093497499038, + "flos": 633321363456.0, + "grad_norm": 0.04007789586728335, + "language_loss": 0.83207953, + "learning_rate": 0.00021229095111873497, + "loss": 0.84357297, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.75097656, + "step": 3661, + "time_per_iteration": 2.739220142364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150007, + "balance_loss_mlp": 1.07466638, + "epoch": 0.7045017314351674, + "flos": 544094994432.0, + "grad_norm": 0.03298817995700549, + "language_loss": 0.90804625, + "learning_rate": 0.0002120362091750261, + "loss": 0.91954637, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.75195312, + "step": 3662, + "time_per_iteration": 2.7960565090179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146981, + "balance_loss_mlp": 1.07149768, + "epoch": 0.704694113120431, + "flos": 429141089280.0, + "grad_norm": 0.039212871672660514, + "language_loss": 0.92362261, + "learning_rate": 0.00021178157902877566, + "loss": 0.93509239, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.75341797, + "step": 3663, + "time_per_iteration": 2.4680960178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147972, + "balance_loss_mlp": 1.07263219, + "epoch": 0.7048864948056945, + "flos": 651712911360.0, + "grad_norm": 0.034682408130930084, + "language_loss": 0.9230448, + "learning_rate": 0.0002115270607788397, + "loss": 0.93452454, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.75195312, + "step": 3664, + "time_per_iteration": 2.775634288787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149534, + "balance_loss_mlp": 1.07414639, + "epoch": 0.705078876490958, + "flos": 413493646848.0, + "grad_norm": 0.03365445853786745, + "language_loss": 0.90348285, + "learning_rate": 0.00021127265452403133, + "loss": 0.91497815, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.75244141, + "step": 3665, + "time_per_iteration": 2.4944612979888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153404, + "balance_loss_mlp": 1.07958984, + "epoch": 0.7052712581762216, + "flos": 1423148255232.0, + "grad_norm": 0.008450912797082885, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85245037, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.73828125, + "step": 3666, + "time_per_iteration": 4.8742945194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147318, + "balance_loss_mlp": 1.07188284, + "epoch": 0.7054636398614852, + "flos": 494069460480.0, + "grad_norm": 0.03621564888049926, + "language_loss": 0.8791604, + "learning_rate": 0.00021076417839483065, + "loss": 0.89063358, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.75292969, + "step": 3667, + "time_per_iteration": 2.8080356121063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145091, + "balance_loss_mlp": 1.06965578, + "epoch": 0.7056560215467488, + "flos": 451377186816.0, + "grad_norm": 0.031611332246536214, + "language_loss": 0.89408493, + "learning_rate": 0.00021051010871784589, + "loss": 0.90553588, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.75292969, + "step": 3668, + "time_per_iteration": 2.57733154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145739, + "balance_loss_mlp": 1.07039869, + "epoch": 0.7058484032320124, + "flos": 566817186816.0, + "grad_norm": 0.030127652842763482, + "language_loss": 0.83471566, + "learning_rate": 0.0002102561514308045, + "loss": 0.84617305, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.75195312, + "step": 3669, + "time_per_iteration": 2.742791175842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144151, + "balance_loss_mlp": 1.06881058, + "epoch": 0.7060407849172758, + "flos": 568102281216.0, + "grad_norm": 0.033895396428982545, + "language_loss": 0.87930894, + "learning_rate": 0.00021000230663230135, + "loss": 0.89075041, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.75195312, + "step": 3670, + "time_per_iteration": 2.667344331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143185, + "balance_loss_mlp": 1.06779695, + "epoch": 0.7062331666025394, + "flos": 469712338944.0, + "grad_norm": 0.03501215574939966, + "language_loss": 0.88139564, + "learning_rate": 0.00020974857442088762, + "loss": 0.89282751, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.75244141, + "step": 3671, + "time_per_iteration": 2.6410346031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143999, + "balance_loss_mlp": 1.06861079, + "epoch": 0.706425548287803, + "flos": 596416743936.0, + "grad_norm": 0.033800210787899305, + "language_loss": 0.93517375, + "learning_rate": 0.00020949495489507104, + "loss": 0.94661367, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.75244141, + "step": 3672, + "time_per_iteration": 2.750444173812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143883, + "balance_loss_mlp": 1.0685432, + "epoch": 0.7066179299730666, + "flos": 476813285376.0, + "grad_norm": 0.035802140613359776, + "language_loss": 0.90171611, + "learning_rate": 0.00020924144815331525, + "loss": 0.91315496, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.75195312, + "step": 3673, + "time_per_iteration": 2.553835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.0689013, + "epoch": 0.7068103116583301, + "flos": 507435423744.0, + "grad_norm": 0.037241628897294654, + "language_loss": 0.87898988, + "learning_rate": 0.00020898805429404044, + "loss": 0.8904314, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.75097656, + "step": 3674, + "time_per_iteration": 2.586620330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114411, + "balance_loss_mlp": 1.06905568, + "epoch": 0.7070026933435937, + "flos": 680574594048.0, + "grad_norm": 0.03737000823174173, + "language_loss": 0.83904374, + "learning_rate": 0.0002087347734156228, + "loss": 0.85048485, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.74902344, + "step": 3675, + "time_per_iteration": 2.882800579071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144169, + "balance_loss_mlp": 1.06906736, + "epoch": 0.7071950750288573, + "flos": 473166120960.0, + "grad_norm": 0.03475094948464188, + "language_loss": 0.84385908, + "learning_rate": 0.00020848160561639452, + "loss": 0.85530072, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.74951172, + "step": 3676, + "time_per_iteration": 2.6969666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149349, + "balance_loss_mlp": 1.07429469, + "epoch": 0.7073874567141208, + "flos": 474683529216.0, + "grad_norm": 0.03052777669540167, + "language_loss": 0.90233761, + "learning_rate": 0.0002082285509946445, + "loss": 0.91383111, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.74902344, + "step": 3677, + "time_per_iteration": 2.546494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152207, + "balance_loss_mlp": 1.07710516, + "epoch": 0.7075798383993844, + "flos": 547036485120.0, + "grad_norm": 0.03113462016358252, + "language_loss": 0.87627769, + "learning_rate": 0.00020797560964861683, + "loss": 0.88779974, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.74951172, + "step": 3678, + "time_per_iteration": 2.745973587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150766, + "balance_loss_mlp": 1.07585537, + "epoch": 0.7077722200846479, + "flos": 663390277632.0, + "grad_norm": 0.06964386826372344, + "language_loss": 0.85110044, + "learning_rate": 0.0002077227816765122, + "loss": 0.86260808, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.74755859, + "step": 3679, + "time_per_iteration": 2.982367753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115432, + "balance_loss_mlp": 1.08107758, + "epoch": 0.7079646017699115, + "flos": 1533300157440.0, + "grad_norm": 0.007004763795919161, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77602041, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.73242188, + "step": 3680, + "time_per_iteration": 4.8018670082092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147209, + "balance_loss_mlp": 1.07224989, + "epoch": 0.7081569834551751, + "flos": 622645111296.0, + "grad_norm": 0.030610109660701587, + "language_loss": 0.83047998, + "learning_rate": 0.00020721746624665383, + "loss": 0.84195209, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.74804688, + "step": 3681, + "time_per_iteration": 2.782902717590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147174, + "balance_loss_mlp": 1.07207251, + "epoch": 0.7083493651404387, + "flos": 796034059776.0, + "grad_norm": 0.03164783844829979, + "language_loss": 0.84436798, + "learning_rate": 0.00020696497898508114, + "loss": 0.85583979, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.74951172, + "step": 3682, + "time_per_iteration": 3.0583677291870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143785, + "balance_loss_mlp": 1.06882644, + "epoch": 0.7085417468257021, + "flos": 815161480704.0, + "grad_norm": 0.03682994028404894, + "language_loss": 0.82170761, + "learning_rate": 0.00020671260548979316, + "loss": 0.83314544, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.74804688, + "step": 3683, + "time_per_iteration": 2.987361192703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144169, + "balance_loss_mlp": 1.06911492, + "epoch": 0.7087341285109657, + "flos": 701796842496.0, + "grad_norm": 0.03866478361298153, + "language_loss": 0.90972751, + "learning_rate": 0.00020646034585876982, + "loss": 0.92116916, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.74902344, + "step": 3684, + "time_per_iteration": 2.810547351837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144506, + "balance_loss_mlp": 1.06954765, + "epoch": 0.7089265101962293, + "flos": 597734765568.0, + "grad_norm": 0.031076054714904006, + "language_loss": 0.88290167, + "learning_rate": 0.00020620820018994718, + "loss": 0.89434671, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.74804688, + "step": 3685, + "time_per_iteration": 2.822174310684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147475, + "balance_loss_mlp": 1.07246852, + "epoch": 0.7091188918814929, + "flos": 488167013376.0, + "grad_norm": 0.047855359590775554, + "language_loss": 0.88914609, + "learning_rate": 0.00020595616858121675, + "loss": 0.90062082, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.74853516, + "step": 3686, + "time_per_iteration": 2.7043378353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149664, + "balance_loss_mlp": 1.07470512, + "epoch": 0.7093112735667565, + "flos": 601255676928.0, + "grad_norm": 0.0443498852923524, + "language_loss": 0.85199845, + "learning_rate": 0.00020570425113042586, + "loss": 0.86349511, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.74804688, + "step": 3687, + "time_per_iteration": 2.702566623687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152357, + "balance_loss_mlp": 1.07754159, + "epoch": 0.70950365525202, + "flos": 506849272320.0, + "grad_norm": 0.040092967224601664, + "language_loss": 0.90721941, + "learning_rate": 0.0002054524479353776, + "loss": 0.91874295, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.74707031, + "step": 3688, + "time_per_iteration": 2.667358636856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147763, + "balance_loss_mlp": 1.07294738, + "epoch": 0.7096960369372836, + "flos": 733424097792.0, + "grad_norm": 0.04032937797632071, + "language_loss": 0.86300701, + "learning_rate": 0.00020520075909383063, + "loss": 0.87448466, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.74707031, + "step": 3689, + "time_per_iteration": 2.829561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145291, + "balance_loss_mlp": 1.07033193, + "epoch": 0.7098884186225471, + "flos": 973651594752.0, + "grad_norm": 0.03422835744235037, + "language_loss": 0.85456049, + "learning_rate": 0.00020494918470349916, + "loss": 0.86601341, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.74804688, + "step": 3690, + "time_per_iteration": 3.2887604236602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147533, + "balance_loss_mlp": 1.0725745, + "epoch": 0.7100808003078107, + "flos": 505258003968.0, + "grad_norm": 0.040153245329332135, + "language_loss": 0.91447139, + "learning_rate": 0.00020469772486205297, + "loss": 0.92594671, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.74804688, + "step": 3691, + "time_per_iteration": 2.7245473861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148215, + "balance_loss_mlp": 1.07344735, + "epoch": 0.7102731819930742, + "flos": 541389820416.0, + "grad_norm": 0.03217926950478085, + "language_loss": 0.86047411, + "learning_rate": 0.0002044463796671177, + "loss": 0.87195623, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.74609375, + "step": 3692, + "time_per_iteration": 2.651794910430908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148314, + "balance_loss_mlp": 1.07330716, + "epoch": 0.7104655636783378, + "flos": 621627259392.0, + "grad_norm": 0.03360219211678542, + "language_loss": 0.85673523, + "learning_rate": 0.00020419514921627408, + "loss": 0.86821842, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.74853516, + "step": 3693, + "time_per_iteration": 2.933528184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147632, + "balance_loss_mlp": 1.07267368, + "epoch": 0.7106579453636014, + "flos": 558376751616.0, + "grad_norm": 0.03878231917046877, + "language_loss": 0.82689238, + "learning_rate": 0.00020394403360705855, + "loss": 0.83836865, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.74804688, + "step": 3694, + "time_per_iteration": 2.717163324356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114284, + "balance_loss_mlp": 1.06788099, + "epoch": 0.710850327048865, + "flos": 514063010304.0, + "grad_norm": 0.03670457803793717, + "language_loss": 0.93433875, + "learning_rate": 0.00020369303293696228, + "loss": 0.9457671, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.74804688, + "step": 3695, + "time_per_iteration": 2.591191053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144917, + "balance_loss_mlp": 1.06995821, + "epoch": 0.7110427087341286, + "flos": 424506272256.0, + "grad_norm": 0.04020330353774376, + "language_loss": 0.83559984, + "learning_rate": 0.00020344214730343304, + "loss": 0.847049, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.74804688, + "step": 3696, + "time_per_iteration": 2.591609001159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145099, + "balance_loss_mlp": 1.07014048, + "epoch": 0.711235090419392, + "flos": 578653006848.0, + "grad_norm": 0.02808433050647353, + "language_loss": 0.83313894, + "learning_rate": 0.00020319137680387296, + "loss": 0.84458989, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.74804688, + "step": 3697, + "time_per_iteration": 2.950737953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.07063317, + "epoch": 0.7114274721046556, + "flos": 448984917504.0, + "grad_norm": 0.03843897473466325, + "language_loss": 0.86332655, + "learning_rate": 0.0002029407215356398, + "loss": 0.8747834, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.74902344, + "step": 3698, + "time_per_iteration": 2.578458309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145595, + "balance_loss_mlp": 1.07063591, + "epoch": 0.7116198537899192, + "flos": 623092274688.0, + "grad_norm": 0.03606756354447633, + "language_loss": 0.88161683, + "learning_rate": 0.00020269018159604663, + "loss": 0.89307278, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.74804688, + "step": 3699, + "time_per_iteration": 2.7380590438842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145077, + "balance_loss_mlp": 1.07007015, + "epoch": 0.7118122354751828, + "flos": 499720128000.0, + "grad_norm": 0.030764308679153148, + "language_loss": 0.86152577, + "learning_rate": 0.00020243975708236162, + "loss": 0.87297654, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.74853516, + "step": 3700, + "time_per_iteration": 2.5728888511657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146347, + "balance_loss_mlp": 1.07134008, + "epoch": 0.7120046171604463, + "flos": 573844273152.0, + "grad_norm": 0.03285972243825597, + "language_loss": 0.90220731, + "learning_rate": 0.00020218944809180818, + "loss": 0.91367078, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.74853516, + "step": 3701, + "time_per_iteration": 2.684532880783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146223, + "balance_loss_mlp": 1.07116926, + "epoch": 0.7121969988457099, + "flos": 573770413056.0, + "grad_norm": 0.03115747571146437, + "language_loss": 0.89376664, + "learning_rate": 0.00020193925472156493, + "loss": 0.90522885, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.74902344, + "step": 3702, + "time_per_iteration": 2.6705996990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152153, + "balance_loss_mlp": 1.07910156, + "epoch": 0.7123893805309734, + "flos": 1526820291072.0, + "grad_norm": 0.004701938060017763, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75441325, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.73046875, + "step": 3703, + "time_per_iteration": 4.916099309921265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154733, + "balance_loss_mlp": 1.07958353, + "epoch": 0.712581762216237, + "flos": 616413021696.0, + "grad_norm": 0.031775345220902064, + "language_loss": 0.87929761, + "learning_rate": 0.00020143921523049863, + "loss": 0.89084488, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.75, + "step": 3704, + "time_per_iteration": 2.913417339324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115426, + "balance_loss_mlp": 1.07915783, + "epoch": 0.7127741439015006, + "flos": 598874141184.0, + "grad_norm": 0.035207007977916, + "language_loss": 0.88667476, + "learning_rate": 0.00020118936930380837, + "loss": 0.89821732, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.74951172, + "step": 3705, + "time_per_iteration": 2.7526493072509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144386, + "balance_loss_mlp": 1.06928408, + "epoch": 0.7129665255867641, + "flos": 538439597568.0, + "grad_norm": 0.036308279292938186, + "language_loss": 0.86138499, + "learning_rate": 0.0002009396393856932, + "loss": 0.87282884, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.74951172, + "step": 3706, + "time_per_iteration": 2.6750972270965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147628, + "balance_loss_mlp": 1.07243121, + "epoch": 0.7131589072720277, + "flos": 527520297984.0, + "grad_norm": 0.03563284623765711, + "language_loss": 0.87550783, + "learning_rate": 0.00020069002557310673, + "loss": 0.88698411, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.75048828, + "step": 3707, + "time_per_iteration": 2.6487066745758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149229, + "balance_loss_mlp": 1.0741272, + "epoch": 0.7133512889572913, + "flos": 532096717824.0, + "grad_norm": 0.031192275434881008, + "language_loss": 0.81347728, + "learning_rate": 0.00020044052796295807, + "loss": 0.82496965, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.74951172, + "step": 3708, + "time_per_iteration": 2.7782645225524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148063, + "balance_loss_mlp": 1.0728184, + "epoch": 0.7135436706425549, + "flos": 504550328832.0, + "grad_norm": 0.03157354031682846, + "language_loss": 0.86940277, + "learning_rate": 0.00020019114665211063, + "loss": 0.8808834, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.75097656, + "step": 3709, + "time_per_iteration": 2.6009671688079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147572, + "balance_loss_mlp": 1.07242227, + "epoch": 0.7137360523278183, + "flos": 516967570944.0, + "grad_norm": 0.03487007754085134, + "language_loss": 0.85992116, + "learning_rate": 0.00019994188173738276, + "loss": 0.8713969, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.75, + "step": 3710, + "time_per_iteration": 2.5438315868377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142507, + "balance_loss_mlp": 1.0673095, + "epoch": 0.7139284340130819, + "flos": 511536482304.0, + "grad_norm": 0.03607772040837418, + "language_loss": 0.85274506, + "learning_rate": 0.0001996927333155477, + "loss": 0.86417007, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.75048828, + "step": 3711, + "time_per_iteration": 2.7427854537963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139044, + "balance_loss_mlp": 1.06389427, + "epoch": 0.7141208156983455, + "flos": 891799418880.0, + "grad_norm": 0.0340111276626949, + "language_loss": 0.9025712, + "learning_rate": 0.00019944370148333346, + "loss": 0.91396165, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.75, + "step": 3712, + "time_per_iteration": 3.1386330127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113928, + "balance_loss_mlp": 1.0641309, + "epoch": 0.7143131973836091, + "flos": 536883257856.0, + "grad_norm": 0.03639718620252856, + "language_loss": 0.8407408, + "learning_rate": 0.00019919478633742278, + "loss": 0.85213363, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.75, + "step": 3713, + "time_per_iteration": 2.6460351943969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139962, + "balance_loss_mlp": 1.06486058, + "epoch": 0.7145055790688727, + "flos": 474627133440.0, + "grad_norm": 0.03673935987195594, + "language_loss": 0.91008997, + "learning_rate": 0.00019894598797445302, + "loss": 0.9214896, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.74951172, + "step": 3714, + "time_per_iteration": 2.5253968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139941, + "balance_loss_mlp": 1.06498206, + "epoch": 0.7146979607541362, + "flos": 571701782016.0, + "grad_norm": 0.032359519554933665, + "language_loss": 0.85796106, + "learning_rate": 0.00019869730649101615, + "loss": 0.86936045, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.74804688, + "step": 3715, + "time_per_iteration": 2.765871047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139754, + "balance_loss_mlp": 1.06489098, + "epoch": 0.7148903424393998, + "flos": 841138068480.0, + "grad_norm": 0.0393709778481749, + "language_loss": 0.77344263, + "learning_rate": 0.00019844874198365943, + "loss": 0.78484023, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.74707031, + "step": 3716, + "time_per_iteration": 3.0865817070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140443, + "balance_loss_mlp": 1.06562734, + "epoch": 0.7150827241246633, + "flos": 542879030784.0, + "grad_norm": 0.03442327137938287, + "language_loss": 0.88300014, + "learning_rate": 0.00019820029454888362, + "loss": 0.89440459, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.74658203, + "step": 3717, + "time_per_iteration": 2.7028956413269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145981, + "balance_loss_mlp": 1.07312012, + "epoch": 0.7152751058099269, + "flos": 1587187705344.0, + "grad_norm": 0.009338560105867444, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.7566725, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.72851562, + "step": 3718, + "time_per_iteration": 5.078125715255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142063, + "balance_loss_mlp": 1.06729496, + "epoch": 0.7154674874951905, + "flos": 518428583424.0, + "grad_norm": 0.038346473430325045, + "language_loss": 0.86008942, + "learning_rate": 0.0001977037512828529, + "loss": 0.87151003, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.74609375, + "step": 3719, + "time_per_iteration": 2.6236274242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141984, + "balance_loss_mlp": 1.0672158, + "epoch": 0.715659869180454, + "flos": 603639214080.0, + "grad_norm": 0.03183829156169413, + "language_loss": 0.90619719, + "learning_rate": 0.0001974556556443734, + "loss": 0.91761708, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.74609375, + "step": 3720, + "time_per_iteration": 2.7261006832122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143131, + "balance_loss_mlp": 1.06836271, + "epoch": 0.7158522508657176, + "flos": 532769464320.0, + "grad_norm": 0.029220712652752532, + "language_loss": 0.93066287, + "learning_rate": 0.00019720767746402547, + "loss": 0.94209415, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.74609375, + "step": 3721, + "time_per_iteration": 2.730018377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144996, + "balance_loss_mlp": 1.06989455, + "epoch": 0.7160446325509812, + "flos": 558645995520.0, + "grad_norm": 0.03469516261194285, + "language_loss": 0.85035664, + "learning_rate": 0.00019695981683808222, + "loss": 0.86180663, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.74951172, + "step": 3722, + "time_per_iteration": 2.7371633052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152889, + "balance_loss_mlp": 1.07792997, + "epoch": 0.7162370142362448, + "flos": 692282159616.0, + "grad_norm": 0.032260484298275306, + "language_loss": 0.89382893, + "learning_rate": 0.00019671207386277225, + "loss": 0.90535784, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.74804688, + "step": 3723, + "time_per_iteration": 2.9425265789031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114829, + "balance_loss_mlp": 1.07333136, + "epoch": 0.7164293959215082, + "flos": 795458641920.0, + "grad_norm": 0.035931768652590186, + "language_loss": 0.83636975, + "learning_rate": 0.0001964644486342777, + "loss": 0.84785259, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.74804688, + "step": 3724, + "time_per_iteration": 2.9537875652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147875, + "balance_loss_mlp": 1.07291591, + "epoch": 0.7166217776067718, + "flos": 495204833280.0, + "grad_norm": 0.03617438678608554, + "language_loss": 0.91026467, + "learning_rate": 0.00019621694124873524, + "loss": 0.92174339, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.74804688, + "step": 3725, + "time_per_iteration": 2.6945693492889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146446, + "balance_loss_mlp": 1.07339478, + "epoch": 0.7168141592920354, + "flos": 1403961710592.0, + "grad_norm": 0.00968138139852001, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77686524, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.73046875, + "step": 3726, + "time_per_iteration": 4.849448919296265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142124, + "balance_loss_mlp": 1.06716549, + "epoch": 0.717006540977299, + "flos": 794599243776.0, + "grad_norm": 0.04056704618834382, + "language_loss": 0.81872368, + "learning_rate": 0.00019572228039082428, + "loss": 0.83014494, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.74804688, + "step": 3727, + "time_per_iteration": 3.045783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146498, + "balance_loss_mlp": 1.07153964, + "epoch": 0.7171989226625626, + "flos": 555963015168.0, + "grad_norm": 0.02715897729892971, + "language_loss": 0.87954736, + "learning_rate": 0.0001954751271105002, + "loss": 0.89101231, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.74804688, + "step": 3728, + "time_per_iteration": 2.7890095710754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145282, + "balance_loss_mlp": 1.07027578, + "epoch": 0.717391304347826, + "flos": 557061457920.0, + "grad_norm": 0.03346658539414039, + "language_loss": 0.86323428, + "learning_rate": 0.00019522809205721687, + "loss": 0.87468708, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.74853516, + "step": 3729, + "time_per_iteration": 2.7522380352020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140607, + "balance_loss_mlp": 1.06579113, + "epoch": 0.7175836860330896, + "flos": 539955004416.0, + "grad_norm": 0.0354578224226226, + "language_loss": 0.87126923, + "learning_rate": 0.0001949811753268816, + "loss": 0.88267529, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.74658203, + "step": 3730, + "time_per_iteration": 2.707690477371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141683, + "balance_loss_mlp": 1.06686759, + "epoch": 0.7177760677183532, + "flos": 516650663424.0, + "grad_norm": 0.04023163535665124, + "language_loss": 0.88339722, + "learning_rate": 0.00019473437701535634, + "loss": 0.89481401, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.74658203, + "step": 3731, + "time_per_iteration": 2.570448637008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114196, + "balance_loss_mlp": 1.06714427, + "epoch": 0.7179684494036168, + "flos": 675939777024.0, + "grad_norm": 0.03444896194332825, + "language_loss": 0.95062304, + "learning_rate": 0.00019448769721845677, + "loss": 0.96204257, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.74658203, + "step": 3732, + "time_per_iteration": 2.838884115219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141126, + "balance_loss_mlp": 1.06635737, + "epoch": 0.7181608310888803, + "flos": 470875909632.0, + "grad_norm": 0.032659655773852006, + "language_loss": 0.9114489, + "learning_rate": 0.00019424113603195203, + "loss": 0.92286015, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.74609375, + "step": 3733, + "time_per_iteration": 2.540231704711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142755, + "balance_loss_mlp": 1.06803441, + "epoch": 0.7183532127741439, + "flos": 595184042496.0, + "grad_norm": 0.0393108175728225, + "language_loss": 0.85483897, + "learning_rate": 0.0001939946935515657, + "loss": 0.86626649, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.74560547, + "step": 3734, + "time_per_iteration": 2.867018461227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.06774652, + "epoch": 0.7185455944594075, + "flos": 499915511808.0, + "grad_norm": 0.04034729202871447, + "language_loss": 0.85582328, + "learning_rate": 0.0001937483698729755, + "loss": 0.86724842, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.74609375, + "step": 3735, + "time_per_iteration": 2.5829944610595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142145, + "balance_loss_mlp": 1.06737685, + "epoch": 0.718737976144671, + "flos": 816307587072.0, + "grad_norm": 0.03271819913976636, + "language_loss": 0.86010873, + "learning_rate": 0.0001935021650918128, + "loss": 0.87153018, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.74609375, + "step": 3736, + "time_per_iteration": 3.0105531215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_mlp": 1.06795025, + "epoch": 0.7189303578299346, + "flos": 439239922176.0, + "grad_norm": 0.03678550720791007, + "language_loss": 0.92134023, + "learning_rate": 0.0001932560793036625, + "loss": 0.93276739, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.74609375, + "step": 3737, + "time_per_iteration": 2.4854748249053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142524, + "balance_loss_mlp": 1.06775641, + "epoch": 0.7191227395151981, + "flos": 550446606336.0, + "grad_norm": 0.04145641408022902, + "language_loss": 0.92745817, + "learning_rate": 0.00019301011260406382, + "loss": 0.93888342, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.74609375, + "step": 3738, + "time_per_iteration": 2.6645443439483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114754, + "balance_loss_mlp": 1.07258117, + "epoch": 0.7193151212004617, + "flos": 628080929280.0, + "grad_norm": 0.039328087285967164, + "language_loss": 0.84679413, + "learning_rate": 0.00019276426508850936, + "loss": 0.85826951, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.74804688, + "step": 3739, + "time_per_iteration": 2.7071337699890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148446, + "balance_loss_mlp": 1.07343948, + "epoch": 0.7195075028857253, + "flos": 742439950848.0, + "grad_norm": 0.030419377075742837, + "language_loss": 0.84898889, + "learning_rate": 0.00019251853685244564, + "loss": 0.86047333, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.74853516, + "step": 3740, + "time_per_iteration": 3.0168538093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114834, + "balance_loss_mlp": 1.07328558, + "epoch": 0.7196998845709889, + "flos": 804289844736.0, + "grad_norm": 0.05763766751245881, + "language_loss": 0.86089444, + "learning_rate": 0.00019227292799127283, + "loss": 0.87237775, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.74902344, + "step": 3741, + "time_per_iteration": 3.0083675384521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144489, + "balance_loss_mlp": 1.06957746, + "epoch": 0.7198922662562524, + "flos": 926776396800.0, + "grad_norm": 0.03639396960725551, + "language_loss": 0.83974087, + "learning_rate": 0.00019202743860034454, + "loss": 0.8511858, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.74755859, + "step": 3742, + "time_per_iteration": 3.2506234645843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144029, + "balance_loss_mlp": 1.06907046, + "epoch": 0.7200846479415159, + "flos": 581207732736.0, + "grad_norm": 0.03405610584059509, + "language_loss": 0.88730514, + "learning_rate": 0.00019178206877496873, + "loss": 0.89874554, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.74804688, + "step": 3743, + "time_per_iteration": 2.6837918758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144783, + "balance_loss_mlp": 1.0700146, + "epoch": 0.7202770296267795, + "flos": 558839377920.0, + "grad_norm": 0.02830338825493349, + "language_loss": 0.89031184, + "learning_rate": 0.0001915368186104059, + "loss": 0.90175974, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.74609375, + "step": 3744, + "time_per_iteration": 2.7329940795898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143898, + "balance_loss_mlp": 1.06912982, + "epoch": 0.7204694113120431, + "flos": 673771089408.0, + "grad_norm": 0.03331544271841085, + "language_loss": 0.85722578, + "learning_rate": 0.0001912916882018706, + "loss": 0.86866474, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.74609375, + "step": 3745, + "time_per_iteration": 2.7906653881073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145353, + "balance_loss_mlp": 1.0706327, + "epoch": 0.7206617929973067, + "flos": 800595016704.0, + "grad_norm": 0.03936960108018568, + "language_loss": 0.85040343, + "learning_rate": 0.00019104667764453125, + "loss": 0.861857, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.74560547, + "step": 3746, + "time_per_iteration": 3.025996685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149239, + "balance_loss_mlp": 1.07437599, + "epoch": 0.7208541746825702, + "flos": 532938651648.0, + "grad_norm": 0.0387374733160612, + "language_loss": 0.85314423, + "learning_rate": 0.00019080178703350926, + "loss": 0.86463666, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.74707031, + "step": 3747, + "time_per_iteration": 2.640810251235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149934, + "balance_loss_mlp": 1.07502282, + "epoch": 0.7210465563678338, + "flos": 536168851968.0, + "grad_norm": 0.035199314592541234, + "language_loss": 0.8746413, + "learning_rate": 0.00019055701646387952, + "loss": 0.88614064, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.74755859, + "step": 3748, + "time_per_iteration": 2.6518776416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155716, + "balance_loss_mlp": 1.08266449, + "epoch": 0.7212389380530974, + "flos": 1537246765056.0, + "grad_norm": 0.009534270530490536, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81628406, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.73046875, + "step": 3749, + "time_per_iteration": 4.76072096824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151416, + "balance_loss_mlp": 1.07664847, + "epoch": 0.7214313197383609, + "flos": 462452938752.0, + "grad_norm": 0.03323767151214544, + "language_loss": 0.92055959, + "learning_rate": 0.00019006783582886368, + "loss": 0.93207377, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.74609375, + "step": 3750, + "time_per_iteration": 2.536107301712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147507, + "balance_loss_mlp": 1.0724529, + "epoch": 0.7216237014236244, + "flos": 1038912336384.0, + "grad_norm": 0.03471978227212596, + "language_loss": 0.8780399, + "learning_rate": 0.00018982342595339437, + "loss": 0.88951492, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.74902344, + "step": 3751, + "time_per_iteration": 3.496842622756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146824, + "balance_loss_mlp": 1.07181787, + "epoch": 0.721816083108888, + "flos": 897450086400.0, + "grad_norm": 0.03786430970431107, + "language_loss": 0.87491071, + "learning_rate": 0.00018957913649915076, + "loss": 0.88637894, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.74853516, + "step": 3752, + "time_per_iteration": 3.1817660331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145034, + "balance_loss_mlp": 1.07002771, + "epoch": 0.7220084647941516, + "flos": 524311564800.0, + "grad_norm": 0.03715970514443419, + "language_loss": 0.85220444, + "learning_rate": 0.00018933496756097428, + "loss": 0.86365485, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.74853516, + "step": 3753, + "time_per_iteration": 2.6647567749023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147456, + "balance_loss_mlp": 1.07244956, + "epoch": 0.7222008464794152, + "flos": 817471157760.0, + "grad_norm": 0.038995714903637436, + "language_loss": 0.86141288, + "learning_rate": 0.0001890909192336603, + "loss": 0.87288737, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.74853516, + "step": 3754, + "time_per_iteration": 3.0344350337982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146781, + "balance_loss_mlp": 1.07172728, + "epoch": 0.7223932281646788, + "flos": 750372097536.0, + "grad_norm": 0.03457656786821505, + "language_loss": 0.74980754, + "learning_rate": 0.00018884699161195623, + "loss": 0.76127535, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.74902344, + "step": 3755, + "time_per_iteration": 2.9410288333892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146383, + "balance_loss_mlp": 1.07137632, + "epoch": 0.7225856098499422, + "flos": 746988172800.0, + "grad_norm": 0.03312890727657128, + "language_loss": 0.82509679, + "learning_rate": 0.00018860318479056327, + "loss": 0.83656067, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.74853516, + "step": 3756, + "time_per_iteration": 3.1337335109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144277, + "balance_loss_mlp": 1.0693661, + "epoch": 0.7227779915352058, + "flos": 548434371072.0, + "grad_norm": 0.030530532653655316, + "language_loss": 0.88339114, + "learning_rate": 0.00018835949886413555, + "loss": 0.89483386, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.74755859, + "step": 3757, + "time_per_iteration": 2.6933181285858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146399, + "balance_loss_mlp": 1.07158351, + "epoch": 0.7229703732204694, + "flos": 531505837056.0, + "grad_norm": 0.03838754790834608, + "language_loss": 0.84470987, + "learning_rate": 0.0001881159339272806, + "loss": 0.85617381, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.74658203, + "step": 3758, + "time_per_iteration": 2.6401891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147602, + "balance_loss_mlp": 1.07273877, + "epoch": 0.723162754905733, + "flos": 529365347328.0, + "grad_norm": 0.035007648752716856, + "language_loss": 0.83889484, + "learning_rate": 0.00018787249007455858, + "loss": 0.85037082, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.74707031, + "step": 3759, + "time_per_iteration": 2.605527400970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147364, + "balance_loss_mlp": 1.07250082, + "epoch": 0.7233551365909965, + "flos": 656059018752.0, + "grad_norm": 0.034978512511305425, + "language_loss": 0.76976448, + "learning_rate": 0.00018762916740048302, + "loss": 0.78123814, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.74707031, + "step": 3760, + "time_per_iteration": 2.8233485221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_mlp": 1.081882, + "epoch": 0.7235475182762601, + "flos": 523443434496.0, + "grad_norm": 0.03185291769452338, + "language_loss": 0.9024173, + "learning_rate": 0.0001873859659995195, + "loss": 0.91398567, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.74804688, + "step": 3761, + "time_per_iteration": 2.7312240600585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159221, + "balance_loss_mlp": 1.08440578, + "epoch": 0.7237398999615237, + "flos": 610321195008.0, + "grad_norm": 0.03629534298697415, + "language_loss": 0.88241446, + "learning_rate": 0.0001871428859660878, + "loss": 0.89400673, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.74658203, + "step": 3762, + "time_per_iteration": 2.7550981044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158067, + "balance_loss_mlp": 1.08329916, + "epoch": 0.7239322816467872, + "flos": 660281601024.0, + "grad_norm": 0.02929996085025788, + "language_loss": 0.86564827, + "learning_rate": 0.00018689992739455975, + "loss": 0.87722898, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.74609375, + "step": 3763, + "time_per_iteration": 2.925534963607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152585, + "balance_loss_mlp": 1.07767427, + "epoch": 0.7241246633320508, + "flos": 970940416512.0, + "grad_norm": 0.028975317515326986, + "language_loss": 0.89523166, + "learning_rate": 0.00018665709037926027, + "loss": 0.90675747, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.74755859, + "step": 3764, + "time_per_iteration": 3.3454575538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149589, + "balance_loss_mlp": 1.0751071, + "epoch": 0.7243170450173143, + "flos": 515999384064.0, + "grad_norm": 0.03578449562727673, + "language_loss": 0.88854849, + "learning_rate": 0.00018641437501446694, + "loss": 0.90004438, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.74414062, + "step": 3765, + "time_per_iteration": 2.5862903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149356, + "balance_loss_mlp": 1.07463598, + "epoch": 0.7245094267025779, + "flos": 560805950976.0, + "grad_norm": 0.04055976430378051, + "language_loss": 0.87262148, + "learning_rate": 0.0001861717813944104, + "loss": 0.88411504, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.74560547, + "step": 3766, + "time_per_iteration": 2.6999149322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145931, + "balance_loss_mlp": 1.07111502, + "epoch": 0.7247018083878415, + "flos": 613774977024.0, + "grad_norm": 0.03434162187139979, + "language_loss": 0.84787124, + "learning_rate": 0.00018592930961327365, + "loss": 0.85933053, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.74658203, + "step": 3767, + "time_per_iteration": 2.7380406856536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145503, + "balance_loss_mlp": 1.07068777, + "epoch": 0.7248941900731051, + "flos": 635870085120.0, + "grad_norm": 0.03338829446413619, + "language_loss": 0.92739952, + "learning_rate": 0.00018568695976519273, + "loss": 0.93885458, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.74658203, + "step": 3768, + "time_per_iteration": 2.7908759117126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145327, + "balance_loss_mlp": 1.07036865, + "epoch": 0.7250865717583687, + "flos": 425837028864.0, + "grad_norm": 0.039339840772426415, + "language_loss": 0.85823148, + "learning_rate": 0.00018544473194425593, + "loss": 0.86968476, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.74804688, + "step": 3769, + "time_per_iteration": 2.493539810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114564, + "balance_loss_mlp": 1.0706811, + "epoch": 0.7252789534436321, + "flos": 636397839360.0, + "grad_norm": 0.0351272666064589, + "language_loss": 0.83947301, + "learning_rate": 0.00018520262624450485, + "loss": 0.85092938, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.74804688, + "step": 3770, + "time_per_iteration": 2.8556978702545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145039, + "balance_loss_mlp": 1.07017529, + "epoch": 0.7254713351288957, + "flos": 618353398272.0, + "grad_norm": 0.031209053717976155, + "language_loss": 0.91200709, + "learning_rate": 0.00018496064275993324, + "loss": 0.9234575, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.74707031, + "step": 3771, + "time_per_iteration": 2.7326061725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114546, + "balance_loss_mlp": 1.07050157, + "epoch": 0.7256637168141593, + "flos": 768290285568.0, + "grad_norm": 0.04607963634377255, + "language_loss": 0.87999386, + "learning_rate": 0.00018471878158448686, + "loss": 0.89144844, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.74804688, + "step": 3772, + "time_per_iteration": 2.945519208908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011453, + "balance_loss_mlp": 1.07038903, + "epoch": 0.7258560984994229, + "flos": 496726970880.0, + "grad_norm": 0.029552123260588873, + "language_loss": 0.88148075, + "learning_rate": 0.00018447704281206512, + "loss": 0.89293379, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.74755859, + "step": 3773, + "time_per_iteration": 2.8680005073547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114455, + "balance_loss_mlp": 1.06963933, + "epoch": 0.7260484801846864, + "flos": 531141265920.0, + "grad_norm": 0.03674222243829071, + "language_loss": 0.87786865, + "learning_rate": 0.0001842354265365191, + "loss": 0.88931417, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.74755859, + "step": 3774, + "time_per_iteration": 2.724771499633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114502, + "balance_loss_mlp": 1.0701561, + "epoch": 0.72624086186995, + "flos": 626107625472.0, + "grad_norm": 0.03805272317803873, + "language_loss": 0.85790277, + "learning_rate": 0.0001839939328516526, + "loss": 0.869353, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.74707031, + "step": 3775, + "time_per_iteration": 2.7149298191070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114524, + "balance_loss_mlp": 1.07037675, + "epoch": 0.7264332435552135, + "flos": 717804853248.0, + "grad_norm": 0.035296918768569004, + "language_loss": 0.86455274, + "learning_rate": 0.0001837525618512218, + "loss": 0.87600511, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.74707031, + "step": 3776, + "time_per_iteration": 2.8749477863311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145925, + "balance_loss_mlp": 1.07129955, + "epoch": 0.7266256252404771, + "flos": 682241723904.0, + "grad_norm": 0.03797985367726647, + "language_loss": 0.88141412, + "learning_rate": 0.00018351131362893519, + "loss": 0.89287341, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.74462891, + "step": 3777, + "time_per_iteration": 2.7961273193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146331, + "balance_loss_mlp": 1.07156312, + "epoch": 0.7268180069257407, + "flos": 519917793792.0, + "grad_norm": 0.04046507418804878, + "language_loss": 0.86727178, + "learning_rate": 0.00018327018827845364, + "loss": 0.87873513, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.74609375, + "step": 3778, + "time_per_iteration": 2.6734490394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147275, + "balance_loss_mlp": 1.07265031, + "epoch": 0.7270103886110042, + "flos": 513672242688.0, + "grad_norm": 0.03480448253150256, + "language_loss": 0.91087776, + "learning_rate": 0.00018302918589339036, + "loss": 0.92235053, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.74462891, + "step": 3779, + "time_per_iteration": 2.693053722381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144842, + "balance_loss_mlp": 1.07012212, + "epoch": 0.7272027702962678, + "flos": 547691767296.0, + "grad_norm": 0.037628889327950436, + "language_loss": 0.94755363, + "learning_rate": 0.00018278830656731054, + "loss": 0.95900208, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.74560547, + "step": 3780, + "time_per_iteration": 2.7247214317321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143177, + "balance_loss_mlp": 1.06831324, + "epoch": 0.7273951519815314, + "flos": 594154730496.0, + "grad_norm": 0.032307622186086855, + "language_loss": 0.90543699, + "learning_rate": 0.00018254755039373222, + "loss": 0.91686875, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.74707031, + "step": 3781, + "time_per_iteration": 2.7543249130249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139617, + "balance_loss_mlp": 1.06480122, + "epoch": 0.727587533666795, + "flos": 607138658304.0, + "grad_norm": 0.037695022521252085, + "language_loss": 0.89343524, + "learning_rate": 0.0001823069174661252, + "loss": 0.90483147, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.74658203, + "step": 3782, + "time_per_iteration": 2.7875726222991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140015, + "balance_loss_mlp": 1.06524646, + "epoch": 0.7277799153520584, + "flos": 514026080256.0, + "grad_norm": 0.034513244238831585, + "language_loss": 0.83396327, + "learning_rate": 0.00018206640787791112, + "loss": 0.84536338, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.74609375, + "step": 3783, + "time_per_iteration": 2.672685146331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142732, + "balance_loss_mlp": 1.06782138, + "epoch": 0.727972297037322, + "flos": 538793435136.0, + "grad_norm": 0.03888167743908025, + "language_loss": 0.90142006, + "learning_rate": 0.00018182602172246416, + "loss": 0.9128474, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.74755859, + "step": 3784, + "time_per_iteration": 2.637195110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142638, + "balance_loss_mlp": 1.06767881, + "epoch": 0.7281646787225856, + "flos": 536075526144.0, + "grad_norm": 0.03379285978086118, + "language_loss": 0.81641448, + "learning_rate": 0.00018158575909311075, + "loss": 0.82784092, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.74804688, + "step": 3785, + "time_per_iteration": 2.6302285194396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143773, + "balance_loss_mlp": 1.0688144, + "epoch": 0.7283570604078492, + "flos": 626209683456.0, + "grad_norm": 0.034294613815109176, + "language_loss": 0.84919262, + "learning_rate": 0.000181345620083129, + "loss": 0.86063033, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.74804688, + "step": 3786, + "time_per_iteration": 2.826655626296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143839, + "balance_loss_mlp": 1.06887996, + "epoch": 0.7285494420931128, + "flos": 535255059456.0, + "grad_norm": 0.03289848846312583, + "language_loss": 0.91744298, + "learning_rate": 0.00018110560478574927, + "loss": 0.92888141, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.74804688, + "step": 3787, + "time_per_iteration": 2.6760616302490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011439, + "balance_loss_mlp": 1.06889331, + "epoch": 0.7287418237783763, + "flos": 667740387840.0, + "grad_norm": 0.04379753934602124, + "language_loss": 0.86934447, + "learning_rate": 0.0001808657132941533, + "loss": 0.88078344, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.74853516, + "step": 3788, + "time_per_iteration": 2.8172109127044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143441, + "balance_loss_mlp": 1.0684824, + "epoch": 0.7289342054636399, + "flos": 551638374912.0, + "grad_norm": 0.03930499856080985, + "language_loss": 0.87319398, + "learning_rate": 0.00018062594570147572, + "loss": 0.88462842, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.74804688, + "step": 3789, + "time_per_iteration": 2.6159238815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146043, + "balance_loss_mlp": 1.07103622, + "epoch": 0.7291265871489034, + "flos": 689138554368.0, + "grad_norm": 0.030589467753511134, + "language_loss": 0.89662123, + "learning_rate": 0.00018038630210080243, + "loss": 0.90808165, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.74853516, + "step": 3790, + "time_per_iteration": 2.8022711277008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147306, + "balance_loss_mlp": 1.07234764, + "epoch": 0.729318968834167, + "flos": 573770413056.0, + "grad_norm": 0.03374595172498584, + "language_loss": 0.89270401, + "learning_rate": 0.0001801467825851712, + "loss": 0.90417707, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.74804688, + "step": 3791, + "time_per_iteration": 2.724628210067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147876, + "balance_loss_mlp": 1.07310832, + "epoch": 0.7295113505194305, + "flos": 587164574208.0, + "grad_norm": 0.035766234040923994, + "language_loss": 0.83940732, + "learning_rate": 0.00017990738724757172, + "loss": 0.85088611, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.74609375, + "step": 3792, + "time_per_iteration": 2.842078924179077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161985, + "balance_loss_mlp": 1.08716917, + "epoch": 0.7297037322046941, + "flos": 708441893376.0, + "grad_norm": 0.03365089778951548, + "language_loss": 0.86588967, + "learning_rate": 0.00017966811618094598, + "loss": 0.87750953, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.74658203, + "step": 3793, + "time_per_iteration": 2.9457900524139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151295, + "balance_loss_mlp": 1.07643151, + "epoch": 0.7298961138899577, + "flos": 488308002816.0, + "grad_norm": 0.03933165170986372, + "language_loss": 0.90208626, + "learning_rate": 0.00017942896947818664, + "loss": 0.91359925, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.74707031, + "step": 3794, + "time_per_iteration": 2.5673389434814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155838, + "balance_loss_mlp": 1.08297729, + "epoch": 0.7300884955752213, + "flos": 1368622162944.0, + "grad_norm": 0.012202680830239692, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.7598089, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.72851562, + "step": 3795, + "time_per_iteration": 4.860522985458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150098, + "balance_loss_mlp": 1.07523441, + "epoch": 0.7302808772604849, + "flos": 532836593664.0, + "grad_norm": 0.03730166344512247, + "language_loss": 0.91110396, + "learning_rate": 0.00017895104953559947, + "loss": 0.92260492, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.74707031, + "step": 3796, + "time_per_iteration": 2.58555269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148597, + "balance_loss_mlp": 1.07378125, + "epoch": 0.7304732589457483, + "flos": 437062502400.0, + "grad_norm": 0.03959489131470051, + "language_loss": 0.95557475, + "learning_rate": 0.00017871227648131672, + "loss": 0.96706069, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.74658203, + "step": 3797, + "time_per_iteration": 2.464853048324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148137, + "balance_loss_mlp": 1.07332122, + "epoch": 0.7306656406310119, + "flos": 452603884032.0, + "grad_norm": 0.03192912066727366, + "language_loss": 0.87151992, + "learning_rate": 0.0001784736281619907, + "loss": 0.88300121, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.74658203, + "step": 3798, + "time_per_iteration": 2.582390785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146414, + "balance_loss_mlp": 1.07155061, + "epoch": 0.7308580223162755, + "flos": 513029695488.0, + "grad_norm": 0.051326436791091785, + "language_loss": 0.79766852, + "learning_rate": 0.00017823510467027232, + "loss": 0.80913264, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.74707031, + "step": 3799, + "time_per_iteration": 2.75164794921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114555, + "balance_loss_mlp": 1.07078159, + "epoch": 0.7310504040015391, + "flos": 376282853376.0, + "grad_norm": 0.04144001955179666, + "language_loss": 0.8475759, + "learning_rate": 0.00017799670609876516, + "loss": 0.85903138, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.74609375, + "step": 3800, + "time_per_iteration": 2.5519416332244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114588, + "balance_loss_mlp": 1.07106447, + "epoch": 0.7312427856868026, + "flos": 550381478400.0, + "grad_norm": 0.03386508062276854, + "language_loss": 0.93402916, + "learning_rate": 0.00017775843254002366, + "loss": 0.94548798, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.74658203, + "step": 3801, + "time_per_iteration": 4.189229965209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144132, + "balance_loss_mlp": 1.06917357, + "epoch": 0.7314351673720662, + "flos": 768677050368.0, + "grad_norm": 0.03513626967715429, + "language_loss": 0.89011091, + "learning_rate": 0.00017752028408655367, + "loss": 0.9015522, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.74804688, + "step": 3802, + "time_per_iteration": 3.0296835899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114212, + "balance_loss_mlp": 1.06716144, + "epoch": 0.7316275490573297, + "flos": 487704387072.0, + "grad_norm": 0.036348088487259234, + "language_loss": 0.90090084, + "learning_rate": 0.00017728226083081272, + "loss": 0.91232204, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.74804688, + "step": 3803, + "time_per_iteration": 2.5504109859466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142386, + "balance_loss_mlp": 1.06742704, + "epoch": 0.7318199307425933, + "flos": 474412283904.0, + "grad_norm": 0.03547640994648555, + "language_loss": 0.86963499, + "learning_rate": 0.00017704436286520965, + "loss": 0.88105881, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.74804688, + "step": 3804, + "time_per_iteration": 2.5794951915740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141426, + "balance_loss_mlp": 1.06665754, + "epoch": 0.7320123124278569, + "flos": 550511734272.0, + "grad_norm": 0.04039315575901835, + "language_loss": 0.89054638, + "learning_rate": 0.0001768065902821046, + "loss": 0.90196061, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.74609375, + "step": 3805, + "time_per_iteration": 2.684680700302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141527, + "balance_loss_mlp": 1.06675947, + "epoch": 0.7322046941131204, + "flos": 571899167232.0, + "grad_norm": 0.036858739394668875, + "language_loss": 0.87521064, + "learning_rate": 0.00017656894317380907, + "loss": 0.88662589, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.74609375, + "step": 3806, + "time_per_iteration": 2.7203333377838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147461, + "balance_loss_mlp": 1.07460022, + "epoch": 0.732397075798384, + "flos": 1472501042688.0, + "grad_norm": 0.00876082834102495, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77178729, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.72851562, + "step": 3807, + "time_per_iteration": 4.985222816467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143568, + "balance_loss_mlp": 1.06884801, + "epoch": 0.7325894574836476, + "flos": 465830859264.0, + "grad_norm": 0.03431257016679264, + "language_loss": 0.883228, + "learning_rate": 0.00017609402575064875, + "loss": 0.89466369, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.74560547, + "step": 3808, + "time_per_iteration": 2.5505616664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150323, + "balance_loss_mlp": 1.07560253, + "epoch": 0.7327818391689112, + "flos": 496481195520.0, + "grad_norm": 0.036747437689303115, + "language_loss": 0.86707413, + "learning_rate": 0.00017585675562016367, + "loss": 0.87857741, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.74560547, + "step": 3809, + "time_per_iteration": 2.566805362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148865, + "balance_loss_mlp": 1.07600403, + "epoch": 0.7329742208541746, + "flos": 1436679403008.0, + "grad_norm": 0.008652563544013954, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78361714, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.72851562, + "step": 3810, + "time_per_iteration": 4.843864440917969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143684, + "balance_loss_mlp": 1.06910706, + "epoch": 0.7331666025394382, + "flos": 497868347904.0, + "grad_norm": 0.0400416063155724, + "language_loss": 0.90367377, + "learning_rate": 0.00017538259298196474, + "loss": 0.91511071, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.74414062, + "step": 3811, + "time_per_iteration": 2.573604106903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146365, + "balance_loss_mlp": 1.07174027, + "epoch": 0.7333589842247018, + "flos": 539638096896.0, + "grad_norm": 0.03197642151293291, + "language_loss": 0.86813134, + "learning_rate": 0.00017514570065833745, + "loss": 0.87959504, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.74462891, + "step": 3812, + "time_per_iteration": 2.6921682357788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146575, + "balance_loss_mlp": 1.0719502, + "epoch": 0.7335513659099654, + "flos": 492041762304.0, + "grad_norm": 0.0378422764823117, + "language_loss": 0.86487865, + "learning_rate": 0.00017490893445433426, + "loss": 0.87634438, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.74462891, + "step": 3813, + "time_per_iteration": 2.634765148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146185, + "balance_loss_mlp": 1.07160771, + "epoch": 0.733743747595229, + "flos": 563252614656.0, + "grad_norm": 0.03359115001415202, + "language_loss": 0.86180258, + "learning_rate": 0.00017467229446187587, + "loss": 0.87326443, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.74414062, + "step": 3814, + "time_per_iteration": 2.6770167350769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146242, + "balance_loss_mlp": 1.07166481, + "epoch": 0.7339361292804925, + "flos": 539648830464.0, + "grad_norm": 0.03482367170061421, + "language_loss": 0.86801744, + "learning_rate": 0.00017443578077283424, + "loss": 0.87947989, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.74414062, + "step": 3815, + "time_per_iteration": 2.6352267265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144002, + "balance_loss_mlp": 1.06937671, + "epoch": 0.734128510965756, + "flos": 549561011712.0, + "grad_norm": 0.030322366631391387, + "language_loss": 0.89759493, + "learning_rate": 0.0001741993934790319, + "loss": 0.90903497, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.74462891, + "step": 3816, + "time_per_iteration": 2.793721914291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142717, + "balance_loss_mlp": 1.06799662, + "epoch": 0.7343208926510196, + "flos": 541201167360.0, + "grad_norm": 0.038181865946918005, + "language_loss": 0.887739, + "learning_rate": 0.00017396313267224273, + "loss": 0.89916623, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.74560547, + "step": 3817, + "time_per_iteration": 2.773219347000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145721, + "balance_loss_mlp": 1.07090569, + "epoch": 0.7345132743362832, + "flos": 572170412544.0, + "grad_norm": 0.036498541155499, + "language_loss": 0.93785435, + "learning_rate": 0.0001737269984441912, + "loss": 0.94931155, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.74658203, + "step": 3818, + "time_per_iteration": 2.6538641452789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.06592357, + "epoch": 0.7347056560215467, + "flos": 546480532992.0, + "grad_norm": 0.03219237397324587, + "language_loss": 0.8964963, + "learning_rate": 0.00017349099088655263, + "loss": 0.90790182, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.74462891, + "step": 3819, + "time_per_iteration": 2.7040135860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140645, + "balance_loss_mlp": 1.06606805, + "epoch": 0.7348980377068103, + "flos": 597076755456.0, + "grad_norm": 0.033091718107472336, + "language_loss": 0.85581368, + "learning_rate": 0.00017325511009095375, + "loss": 0.86722016, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.74414062, + "step": 3820, + "time_per_iteration": 4.160353183746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142615, + "balance_loss_mlp": 1.06798947, + "epoch": 0.7350904193920739, + "flos": 539611900416.0, + "grad_norm": 0.031456925706235525, + "language_loss": 0.88030791, + "learning_rate": 0.00017301935614897113, + "loss": 0.89173406, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.74462891, + "step": 3821, + "time_per_iteration": 2.6948046684265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142475, + "balance_loss_mlp": 1.06789804, + "epoch": 0.7352828010773375, + "flos": 514061008896.0, + "grad_norm": 0.030574399918046426, + "language_loss": 0.85837513, + "learning_rate": 0.00017278372915213274, + "loss": 0.86979991, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.74414062, + "step": 3822, + "time_per_iteration": 2.6384036540985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146408, + "balance_loss_mlp": 1.07354736, + "epoch": 0.735475182762601, + "flos": 1557255777792.0, + "grad_norm": 0.0051515936537080845, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.81040251, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.72851562, + "step": 3823, + "time_per_iteration": 6.475368976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140808, + "balance_loss_mlp": 1.06618333, + "epoch": 0.7356675644478645, + "flos": 682611024384.0, + "grad_norm": 0.03514206822018316, + "language_loss": 0.85822678, + "learning_rate": 0.00017231285635975314, + "loss": 0.86963487, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.74462891, + "step": 3824, + "time_per_iteration": 2.881985664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140396, + "balance_loss_mlp": 1.0657233, + "epoch": 0.7358599461331281, + "flos": 516231697920.0, + "grad_norm": 0.03601426366769367, + "language_loss": 0.88078141, + "learning_rate": 0.00017207761074702115, + "loss": 0.89218545, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.74511719, + "step": 3825, + "time_per_iteration": 2.588801860809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142954, + "balance_loss_mlp": 1.06818557, + "epoch": 0.7360523278183917, + "flos": 444916786176.0, + "grad_norm": 0.029137218094429037, + "language_loss": 0.87851697, + "learning_rate": 0.0001718424924450514, + "loss": 0.88994652, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.74609375, + "step": 3826, + "time_per_iteration": 2.596510410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145079, + "balance_loss_mlp": 1.07050133, + "epoch": 0.7362447095036553, + "flos": 604551005184.0, + "grad_norm": 0.02824128078517694, + "language_loss": 0.89933646, + "learning_rate": 0.00017160750154512482, + "loss": 0.91078722, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.74414062, + "step": 3827, + "time_per_iteration": 2.737093687057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139545, + "balance_loss_mlp": 1.06496727, + "epoch": 0.7364370911889189, + "flos": 554250223104.0, + "grad_norm": 0.030336693640123275, + "language_loss": 0.87611473, + "learning_rate": 0.0001713726381384731, + "loss": 0.88751018, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.74414062, + "step": 3828, + "time_per_iteration": 2.7642135620117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.06553614, + "epoch": 0.7366294728741823, + "flos": 449990034432.0, + "grad_norm": 0.03985156313807423, + "language_loss": 0.86582565, + "learning_rate": 0.00017113790231627812, + "loss": 0.87722576, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.74365234, + "step": 3829, + "time_per_iteration": 2.471085786819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144356, + "balance_loss_mlp": 1.07168579, + "epoch": 0.7368218545594459, + "flos": 1538703048192.0, + "grad_norm": 0.005233117744578673, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80402577, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.7265625, + "step": 3830, + "time_per_iteration": 4.7661731243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146943, + "balance_loss_mlp": 1.072366, + "epoch": 0.7370142362447095, + "flos": 516472743936.0, + "grad_norm": 0.03645785594600137, + "language_loss": 0.87339807, + "learning_rate": 0.00017066881378973936, + "loss": 0.88486743, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.74414062, + "step": 3831, + "time_per_iteration": 2.6248505115509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146898, + "balance_loss_mlp": 1.0723207, + "epoch": 0.7372066179299731, + "flos": 501904278528.0, + "grad_norm": 0.03165196577405493, + "language_loss": 0.87413478, + "learning_rate": 0.00017043446126751189, + "loss": 0.88560379, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.74414062, + "step": 3832, + "time_per_iteration": 2.6783525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144804, + "balance_loss_mlp": 1.07022643, + "epoch": 0.7373989996152366, + "flos": 559167019008.0, + "grad_norm": 0.037114015277278894, + "language_loss": 0.82006979, + "learning_rate": 0.00017020023669397376, + "loss": 0.83151782, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.74414062, + "step": 3833, + "time_per_iteration": 2.6736700534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142842, + "balance_loss_mlp": 1.06816959, + "epoch": 0.7375913813005002, + "flos": 507780529152.0, + "grad_norm": 0.035309103887572656, + "language_loss": 0.88040781, + "learning_rate": 0.0001699661401600589, + "loss": 0.89183623, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.74511719, + "step": 3834, + "time_per_iteration": 2.566554069519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114318, + "balance_loss_mlp": 1.06860292, + "epoch": 0.7377837629857638, + "flos": 487155165696.0, + "grad_norm": 0.03517908569874834, + "language_loss": 0.83206999, + "learning_rate": 0.00016973217175665205, + "loss": 0.84350181, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.74414062, + "step": 3835, + "time_per_iteration": 2.5718719959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144836, + "balance_loss_mlp": 1.07197571, + "epoch": 0.7379761446710273, + "flos": 1417877621760.0, + "grad_norm": 0.005454955067060188, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82310998, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.72851562, + "step": 3836, + "time_per_iteration": 4.927332401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113978, + "balance_loss_mlp": 1.065346, + "epoch": 0.7381685263562909, + "flos": 630909628416.0, + "grad_norm": 0.03248613748529956, + "language_loss": 0.88913381, + "learning_rate": 0.00016926461970465047, + "loss": 0.90053165, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.74316406, + "step": 3837, + "time_per_iteration": 2.775867462158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140207, + "balance_loss_mlp": 1.06591558, + "epoch": 0.7383609080415544, + "flos": 740651297280.0, + "grad_norm": 0.029601422195490622, + "language_loss": 0.88803387, + "learning_rate": 0.00016903103623757516, + "loss": 0.89943594, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.7421875, + "step": 3838, + "time_per_iteration": 3.0490381717681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114028, + "balance_loss_mlp": 1.0659889, + "epoch": 0.738553289726818, + "flos": 551256339456.0, + "grad_norm": 0.036589238474362976, + "language_loss": 0.84502995, + "learning_rate": 0.00016879758126404738, + "loss": 0.85643274, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.7421875, + "step": 3839, + "time_per_iteration": 2.7638185024261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140469, + "balance_loss_mlp": 1.06598663, + "epoch": 0.7387456714120816, + "flos": 911775504384.0, + "grad_norm": 0.03874838451291343, + "language_loss": 0.85589796, + "learning_rate": 0.00016856425487470216, + "loss": 0.86730266, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.74316406, + "step": 3840, + "time_per_iteration": 3.1033904552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139827, + "balance_loss_mlp": 1.06548798, + "epoch": 0.7389380530973452, + "flos": 854195856384.0, + "grad_norm": 0.035495854767005654, + "language_loss": 0.84398341, + "learning_rate": 0.00016833105716012486, + "loss": 0.85538161, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.7421875, + "step": 3841, + "time_per_iteration": 3.1338374614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011399, + "balance_loss_mlp": 1.06551313, + "epoch": 0.7391304347826086, + "flos": 818419878912.0, + "grad_norm": 0.034862132205022836, + "language_loss": 0.89572388, + "learning_rate": 0.00016809798821085088, + "loss": 0.90712291, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.74267578, + "step": 3842, + "time_per_iteration": 2.980786085128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140329, + "balance_loss_mlp": 1.06622851, + "epoch": 0.7393228164678722, + "flos": 573937598976.0, + "grad_norm": 0.03111800184883808, + "language_loss": 0.93200815, + "learning_rate": 0.00016786504811736565, + "loss": 0.94341135, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.74072266, + "step": 3843, + "time_per_iteration": 2.669473171234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140191, + "balance_loss_mlp": 1.06618571, + "epoch": 0.7395151981531358, + "flos": 686575096320.0, + "grad_norm": 0.030093907505068344, + "language_loss": 0.86420381, + "learning_rate": 0.00016763223697010442, + "loss": 0.8756057, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.74023438, + "step": 3844, + "time_per_iteration": 2.99284291267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140327, + "balance_loss_mlp": 1.06632161, + "epoch": 0.7397075798383994, + "flos": 557454226944.0, + "grad_norm": 0.030952263508457714, + "language_loss": 0.88928902, + "learning_rate": 0.00016739955485945256, + "loss": 0.90069234, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.74023438, + "step": 3845, + "time_per_iteration": 2.7834365367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143729, + "balance_loss_mlp": 1.06972384, + "epoch": 0.739899961523663, + "flos": 547822023168.0, + "grad_norm": 0.0384067269834895, + "language_loss": 0.91738451, + "learning_rate": 0.00016716700187574513, + "loss": 0.9288218, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.74023438, + "step": 3846, + "time_per_iteration": 2.686281681060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142346, + "balance_loss_mlp": 1.06824505, + "epoch": 0.7400923432089265, + "flos": 610303730688.0, + "grad_norm": 0.03341447658559241, + "language_loss": 0.87943906, + "learning_rate": 0.0001669345781092675, + "loss": 0.89086246, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.74072266, + "step": 3847, + "time_per_iteration": 2.7001636028289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146926, + "balance_loss_mlp": 1.07258725, + "epoch": 0.7402847248941901, + "flos": 592179425280.0, + "grad_norm": 0.03705340018944972, + "language_loss": 0.92317855, + "learning_rate": 0.0001667022836502546, + "loss": 0.9346478, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.74169922, + "step": 3848, + "time_per_iteration": 2.7301111221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.07263577, + "epoch": 0.7404771065794536, + "flos": 478304497152.0, + "grad_norm": 0.03758678291398601, + "language_loss": 0.88680065, + "learning_rate": 0.00016647011858889077, + "loss": 0.89827085, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.7421875, + "step": 3849, + "time_per_iteration": 2.5619609355926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145959, + "balance_loss_mlp": 1.07152426, + "epoch": 0.7406694882647172, + "flos": 497466846720.0, + "grad_norm": 0.035398733472562116, + "language_loss": 0.90902388, + "learning_rate": 0.00016623808301531056, + "loss": 0.92048347, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.74267578, + "step": 3850, + "time_per_iteration": 2.6344494819641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_mlp": 1.07042766, + "epoch": 0.7408618699499807, + "flos": 563326474752.0, + "grad_norm": 0.04248736642040007, + "language_loss": 0.8449176, + "learning_rate": 0.00016600617701959842, + "loss": 0.85636574, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.7421875, + "step": 3851, + "time_per_iteration": 2.764845609664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152382, + "balance_loss_mlp": 1.07971191, + "epoch": 0.7410542516352443, + "flos": 1391469333504.0, + "grad_norm": 0.006017952028820176, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79996192, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.7265625, + "step": 3852, + "time_per_iteration": 4.992438316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143972, + "balance_loss_mlp": 1.06968081, + "epoch": 0.7412466333205079, + "flos": 671211634176.0, + "grad_norm": 0.03177898311172259, + "language_loss": 0.86077726, + "learning_rate": 0.00016554275412186315, + "loss": 0.872217, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.74169922, + "step": 3853, + "time_per_iteration": 2.809633731842041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143877, + "balance_loss_mlp": 1.0695858, + "epoch": 0.7414390150057715, + "flos": 490318236672.0, + "grad_norm": 0.037394191958696615, + "language_loss": 0.85646808, + "learning_rate": 0.0001653112373997568, + "loss": 0.86790681, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.74169922, + "step": 3854, + "time_per_iteration": 2.6653616428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144328, + "balance_loss_mlp": 1.07013178, + "epoch": 0.7416313966910351, + "flos": 600493607424.0, + "grad_norm": 0.037760188692200464, + "language_loss": 0.80141521, + "learning_rate": 0.0001650798506153517, + "loss": 0.81285852, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.74072266, + "step": 3855, + "time_per_iteration": 2.6987767219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143504, + "balance_loss_mlp": 1.06921279, + "epoch": 0.7418237783762985, + "flos": 543586705920.0, + "grad_norm": 0.04363259370366351, + "language_loss": 0.89603698, + "learning_rate": 0.00016484859385848023, + "loss": 0.90747201, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.74121094, + "step": 3856, + "time_per_iteration": 2.6623427867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143237, + "balance_loss_mlp": 1.06889808, + "epoch": 0.7420161600615621, + "flos": 545223636480.0, + "grad_norm": 0.03643329679811027, + "language_loss": 0.82348394, + "learning_rate": 0.0001646174672189243, + "loss": 0.83491635, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.74169922, + "step": 3857, + "time_per_iteration": 2.663518190383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143563, + "balance_loss_mlp": 1.0692718, + "epoch": 0.7422085417468257, + "flos": 528210508800.0, + "grad_norm": 0.03811276290038686, + "language_loss": 0.85172391, + "learning_rate": 0.00016438647078641488, + "loss": 0.86315954, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.74121094, + "step": 3858, + "time_per_iteration": 2.5988457202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145341, + "balance_loss_mlp": 1.07133579, + "epoch": 0.7424009234320893, + "flos": 509760563712.0, + "grad_norm": 0.034205456810992727, + "language_loss": 0.87813514, + "learning_rate": 0.00016415560465063344, + "loss": 0.88958859, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.73925781, + "step": 3859, + "time_per_iteration": 2.7205588817596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145531, + "balance_loss_mlp": 1.07138264, + "epoch": 0.7425933051173528, + "flos": 513607114752.0, + "grad_norm": 0.03574871107412609, + "language_loss": 0.83894295, + "learning_rate": 0.0001639248689012095, + "loss": 0.85039824, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.74023438, + "step": 3860, + "time_per_iteration": 2.604342460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145572, + "balance_loss_mlp": 1.07142365, + "epoch": 0.7427856868026164, + "flos": 459377189376.0, + "grad_norm": 0.03221086554930489, + "language_loss": 0.91824234, + "learning_rate": 0.00016369426362772271, + "loss": 0.92969811, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.74023438, + "step": 3861, + "time_per_iteration": 2.787710189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140907, + "balance_loss_mlp": 1.06666386, + "epoch": 0.74297806848788, + "flos": 606187935744.0, + "grad_norm": 0.034095856542736835, + "language_loss": 0.84967786, + "learning_rate": 0.00016346378891970233, + "loss": 0.86108696, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.74072266, + "step": 3862, + "time_per_iteration": 2.791630744934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140095, + "balance_loss_mlp": 1.06594658, + "epoch": 0.7431704501731435, + "flos": 893069776896.0, + "grad_norm": 0.035970776867332244, + "language_loss": 0.86936057, + "learning_rate": 0.00016323344486662633, + "loss": 0.8807615, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.74023438, + "step": 3863, + "time_per_iteration": 3.3644163608551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140007, + "balance_loss_mlp": 1.06562018, + "epoch": 0.7433628318584071, + "flos": 593351728128.0, + "grad_norm": 0.03309073679941976, + "language_loss": 0.8318609, + "learning_rate": 0.00016300323155792247, + "loss": 0.84326088, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.7421875, + "step": 3864, + "time_per_iteration": 2.9201974868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140802, + "balance_loss_mlp": 1.06655836, + "epoch": 0.7435552135436706, + "flos": 478189704192.0, + "grad_norm": 0.032691738541971056, + "language_loss": 0.93297988, + "learning_rate": 0.00016277314908296687, + "loss": 0.94438791, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.74072266, + "step": 3865, + "time_per_iteration": 2.662276268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140447, + "balance_loss_mlp": 1.06606066, + "epoch": 0.7437475952289342, + "flos": 674431100928.0, + "grad_norm": 0.04227589537607751, + "language_loss": 0.82037443, + "learning_rate": 0.00016254319753108604, + "loss": 0.83177888, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.7421875, + "step": 3866, + "time_per_iteration": 2.818756341934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140124, + "balance_loss_mlp": 1.06573772, + "epoch": 0.7439399769141978, + "flos": 771770264064.0, + "grad_norm": 0.04121075784978914, + "language_loss": 0.82100695, + "learning_rate": 0.00016231337699155492, + "loss": 0.83240819, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.7421875, + "step": 3867, + "time_per_iteration": 2.9714555740356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139588, + "balance_loss_mlp": 1.06539237, + "epoch": 0.7441323585994614, + "flos": 649038663168.0, + "grad_norm": 0.03532933640628425, + "language_loss": 0.82657182, + "learning_rate": 0.0001620836875535977, + "loss": 0.83796769, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.74023438, + "step": 3868, + "time_per_iteration": 2.849938154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139487, + "balance_loss_mlp": 1.06548178, + "epoch": 0.7443247402847248, + "flos": 566500279296.0, + "grad_norm": 0.031528263247616775, + "language_loss": 0.85388362, + "learning_rate": 0.00016185412930638766, + "loss": 0.86527848, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.73925781, + "step": 3869, + "time_per_iteration": 2.7786920070648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139674, + "balance_loss_mlp": 1.06547797, + "epoch": 0.7445171219699884, + "flos": 579679590912.0, + "grad_norm": 0.0366739337080916, + "language_loss": 0.87914336, + "learning_rate": 0.00016162470233904765, + "loss": 0.89054006, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.74023438, + "step": 3870, + "time_per_iteration": 2.705364465713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147326, + "balance_loss_mlp": 1.07351112, + "epoch": 0.744709503655252, + "flos": 620029260288.0, + "grad_norm": 0.03364023309307919, + "language_loss": 0.86704087, + "learning_rate": 0.00016139540674064856, + "loss": 0.87851417, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.73828125, + "step": 3871, + "time_per_iteration": 2.727344512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147794, + "balance_loss_mlp": 1.07388413, + "epoch": 0.7449018853405156, + "flos": 529680253440.0, + "grad_norm": 0.03265362950694584, + "language_loss": 0.82158148, + "learning_rate": 0.00016116624260021113, + "loss": 0.83305943, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.73876953, + "step": 3872, + "time_per_iteration": 2.733447551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147438, + "balance_loss_mlp": 1.0736239, + "epoch": 0.7450942670257792, + "flos": 434223069696.0, + "grad_norm": 0.03568420204032938, + "language_loss": 0.89293343, + "learning_rate": 0.0001609372100067046, + "loss": 0.90440786, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.73828125, + "step": 3873, + "time_per_iteration": 2.5226526260375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141021, + "balance_loss_mlp": 1.06682503, + "epoch": 0.7452866487110427, + "flos": 698165140992.0, + "grad_norm": 0.04021816698405521, + "language_loss": 0.90011704, + "learning_rate": 0.0001607083090490475, + "loss": 0.91152722, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.74023438, + "step": 3874, + "time_per_iteration": 2.897472381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138845, + "balance_loss_mlp": 1.06464863, + "epoch": 0.7454790303963063, + "flos": 513279473664.0, + "grad_norm": 0.03827241503421356, + "language_loss": 0.86578858, + "learning_rate": 0.00016047953981610714, + "loss": 0.877177, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.74023438, + "step": 3875, + "time_per_iteration": 2.7049574851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153999, + "balance_loss_mlp": 1.08171082, + "epoch": 0.7456714120815698, + "flos": 1328874107904.0, + "grad_norm": 0.014146468768439814, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.8088364, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.72460938, + "step": 3876, + "time_per_iteration": 4.997116804122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147349, + "balance_loss_mlp": 1.0731051, + "epoch": 0.7458637937668334, + "flos": 722971427328.0, + "grad_norm": 0.03963419785288614, + "language_loss": 0.8521378, + "learning_rate": 0.0001600223968795889, + "loss": 0.86361128, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.74072266, + "step": 3877, + "time_per_iteration": 2.8971540927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147774, + "balance_loss_mlp": 1.07548523, + "epoch": 0.746056175452097, + "flos": 1504866172416.0, + "grad_norm": 0.01288298570823651, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76843846, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.72460938, + "step": 3878, + "time_per_iteration": 4.937422275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144499, + "balance_loss_mlp": 1.07025564, + "epoch": 0.7462485571373605, + "flos": 521294212608.0, + "grad_norm": 0.03493161366736204, + "language_loss": 0.85764599, + "learning_rate": 0.00015956578190706483, + "loss": 0.86909091, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.74072266, + "step": 3879, + "time_per_iteration": 2.68503737449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144, + "balance_loss_mlp": 1.06980455, + "epoch": 0.7464409388226241, + "flos": 482166511104.0, + "grad_norm": 0.03362253888482968, + "language_loss": 0.79837132, + "learning_rate": 0.00015933767262892468, + "loss": 0.80981129, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.74072266, + "step": 3880, + "time_per_iteration": 2.693495988845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144861, + "balance_loss_mlp": 1.07071245, + "epoch": 0.7466333205078877, + "flos": 487741317120.0, + "grad_norm": 0.04222777509687144, + "language_loss": 0.88058239, + "learning_rate": 0.00015910969560762927, + "loss": 0.89203095, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.74023438, + "step": 3881, + "time_per_iteration": 2.562688112258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148453, + "balance_loss_mlp": 1.07416224, + "epoch": 0.7468257021931513, + "flos": 612407290368.0, + "grad_norm": 0.034328627776477647, + "language_loss": 0.8732987, + "learning_rate": 0.00015888185093168727, + "loss": 0.88478327, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.74121094, + "step": 3882, + "time_per_iteration": 2.718461036682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.072142, + "epoch": 0.7470180838784147, + "flos": 534484257792.0, + "grad_norm": 0.03431059853024658, + "language_loss": 0.85983026, + "learning_rate": 0.00015865413868955581, + "loss": 0.87129557, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.7421875, + "step": 3883, + "time_per_iteration": 2.6472575664520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146306, + "balance_loss_mlp": 1.07225311, + "epoch": 0.7472104655636783, + "flos": 740672764416.0, + "grad_norm": 0.030267060700337457, + "language_loss": 0.87475348, + "learning_rate": 0.00015842655896964054, + "loss": 0.88621652, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.73974609, + "step": 3884, + "time_per_iteration": 3.015573024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145315, + "balance_loss_mlp": 1.07107127, + "epoch": 0.7474028472489419, + "flos": 641501286912.0, + "grad_norm": 0.03713221878515122, + "language_loss": 0.79442894, + "learning_rate": 0.00015819911186029567, + "loss": 0.8058821, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.74121094, + "step": 3885, + "time_per_iteration": 2.7972114086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145173, + "balance_loss_mlp": 1.07078624, + "epoch": 0.7475952289342055, + "flos": 591326031360.0, + "grad_norm": 0.035996478944381224, + "language_loss": 0.90933514, + "learning_rate": 0.00015797179744982443, + "loss": 0.92078686, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.7421875, + "step": 3886, + "time_per_iteration": 2.699364185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145253, + "balance_loss_mlp": 1.07100964, + "epoch": 0.7477876106194691, + "flos": 489219793920.0, + "grad_norm": 0.03742232117847866, + "language_loss": 0.83403462, + "learning_rate": 0.00015774461582647765, + "loss": 0.84548712, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.74121094, + "step": 3887, + "time_per_iteration": 2.6602365970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146333, + "balance_loss_mlp": 1.07199454, + "epoch": 0.7479799923047326, + "flos": 555789098496.0, + "grad_norm": 0.03709849655597122, + "language_loss": 0.85774076, + "learning_rate": 0.00015751756707845505, + "loss": 0.86920416, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.74169922, + "step": 3888, + "time_per_iteration": 2.6497113704681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145173, + "balance_loss_mlp": 1.07097745, + "epoch": 0.7481723739899961, + "flos": 768789841920.0, + "grad_norm": 0.0326002931336663, + "language_loss": 0.92530739, + "learning_rate": 0.00015729065129390502, + "loss": 0.93675911, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.74121094, + "step": 3889, + "time_per_iteration": 3.0129857063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145589, + "balance_loss_mlp": 1.07129776, + "epoch": 0.7483647556752597, + "flos": 497160672768.0, + "grad_norm": 0.03921764888683204, + "language_loss": 0.87742007, + "learning_rate": 0.0001570638685609241, + "loss": 0.88887596, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.74169922, + "step": 3890, + "time_per_iteration": 2.6674981117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145557, + "balance_loss_mlp": 1.07126558, + "epoch": 0.7485571373605233, + "flos": 473826132480.0, + "grad_norm": 0.036715319135455414, + "language_loss": 0.85719097, + "learning_rate": 0.00015683721896755693, + "loss": 0.8686465, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.74169922, + "step": 3891, + "time_per_iteration": 2.524322271347046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153778, + "balance_loss_mlp": 1.0816803, + "epoch": 0.7487495190457868, + "flos": 1557898324992.0, + "grad_norm": 0.009583293732515121, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83364266, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.72265625, + "step": 3892, + "time_per_iteration": 4.967085361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114376, + "balance_loss_mlp": 1.06980217, + "epoch": 0.7489419007310504, + "flos": 582966187008.0, + "grad_norm": 0.03314224500682494, + "language_loss": 0.89740062, + "learning_rate": 0.00015638431955158528, + "loss": 0.90883827, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.73974609, + "step": 3893, + "time_per_iteration": 2.7170591354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143436, + "balance_loss_mlp": 1.06952667, + "epoch": 0.749134282416314, + "flos": 568697164800.0, + "grad_norm": 0.032778698573620556, + "language_loss": 0.85919845, + "learning_rate": 0.00015615806990481186, + "loss": 0.87063277, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.73925781, + "step": 3894, + "time_per_iteration": 2.6996026039123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143061, + "balance_loss_mlp": 1.06915176, + "epoch": 0.7493266641015776, + "flos": 534165348864.0, + "grad_norm": 0.030394188724740954, + "language_loss": 0.88159597, + "learning_rate": 0.00015593195374931452, + "loss": 0.89302653, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.73876953, + "step": 3895, + "time_per_iteration": 2.7341361045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146619, + "balance_loss_mlp": 1.0727098, + "epoch": 0.7495190457868411, + "flos": 524717795328.0, + "grad_norm": 0.03863238275082747, + "language_loss": 0.84834325, + "learning_rate": 0.00015570597117287922, + "loss": 0.8598094, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.73925781, + "step": 3896, + "time_per_iteration": 2.659959077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144958, + "balance_loss_mlp": 1.07123923, + "epoch": 0.7497114274721046, + "flos": 515189650944.0, + "grad_norm": 0.036153955885896226, + "language_loss": 0.83024484, + "learning_rate": 0.0001554801222632406, + "loss": 0.84169447, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.73730469, + "step": 3897, + "time_per_iteration": 2.5906412601470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145811, + "balance_loss_mlp": 1.07199693, + "epoch": 0.7499038091573682, + "flos": 495997102080.0, + "grad_norm": 0.03335147628193477, + "language_loss": 0.89782715, + "learning_rate": 0.00015525440710808052, + "loss": 0.90928525, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.73828125, + "step": 3898, + "time_per_iteration": 2.615407705307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145951, + "balance_loss_mlp": 1.07199407, + "epoch": 0.7500961908426318, + "flos": 738988170240.0, + "grad_norm": 0.03474247339269188, + "language_loss": 0.84343684, + "learning_rate": 0.00015502882579502953, + "loss": 0.85489637, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.73925781, + "step": 3899, + "time_per_iteration": 3.010974645614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114743, + "balance_loss_mlp": 1.07361519, + "epoch": 0.7502885725278954, + "flos": 534536650752.0, + "grad_norm": 0.03268230414324022, + "language_loss": 0.88787687, + "learning_rate": 0.00015480337841166592, + "loss": 0.89935118, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.73828125, + "step": 3900, + "time_per_iteration": 2.7430782318115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147759, + "balance_loss_mlp": 1.07399249, + "epoch": 0.7504809542131589, + "flos": 590557957632.0, + "grad_norm": 0.04375512425984308, + "language_loss": 0.87710261, + "learning_rate": 0.00015457806504551647, + "loss": 0.8885802, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.73779297, + "step": 3901, + "time_per_iteration": 2.8651504516601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148011, + "balance_loss_mlp": 1.0741967, + "epoch": 0.7506733358984224, + "flos": 512582532096.0, + "grad_norm": 0.0332649439615325, + "language_loss": 0.82646012, + "learning_rate": 0.0001543528857840554, + "loss": 0.83794028, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.73828125, + "step": 3902, + "time_per_iteration": 2.6909492015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144161, + "balance_loss_mlp": 1.07025158, + "epoch": 0.750865717583686, + "flos": 540382702080.0, + "grad_norm": 0.03600709682352738, + "language_loss": 0.85171556, + "learning_rate": 0.000154127840714705, + "loss": 0.86315715, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.73925781, + "step": 3903, + "time_per_iteration": 2.7624754905700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144936, + "balance_loss_mlp": 1.0707401, + "epoch": 0.7510580992689496, + "flos": 477540426240.0, + "grad_norm": 0.045315321448851864, + "language_loss": 0.87899154, + "learning_rate": 0.00015390292992483557, + "loss": 0.89044094, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.74072266, + "step": 3904, + "time_per_iteration": 2.512664794921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141177, + "balance_loss_mlp": 1.06707633, + "epoch": 0.7512504809542132, + "flos": 580200614400.0, + "grad_norm": 0.0336140335329932, + "language_loss": 0.89387548, + "learning_rate": 0.00015367815350176523, + "loss": 0.90528727, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.74072266, + "step": 3905, + "time_per_iteration": 2.743971824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139798, + "balance_loss_mlp": 1.06550705, + "epoch": 0.7514428626394767, + "flos": 419563279872.0, + "grad_norm": 0.033015406559801515, + "language_loss": 0.88140541, + "learning_rate": 0.00015345351153275987, + "loss": 0.89280337, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.74169922, + "step": 3906, + "time_per_iteration": 2.5664329528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137169, + "balance_loss_mlp": 1.06335413, + "epoch": 0.7516352443247403, + "flos": 642254624256.0, + "grad_norm": 0.03633245053817903, + "language_loss": 0.85467315, + "learning_rate": 0.00015322900410503332, + "loss": 0.86604482, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.73828125, + "step": 3907, + "time_per_iteration": 2.797030210494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139178, + "balance_loss_mlp": 1.0650295, + "epoch": 0.7518276260100039, + "flos": 582191382528.0, + "grad_norm": 0.03436736061108426, + "language_loss": 0.8251732, + "learning_rate": 0.00015300463130574703, + "loss": 0.83656502, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.74023438, + "step": 3908, + "time_per_iteration": 2.8524422645568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139345, + "balance_loss_mlp": 1.06524479, + "epoch": 0.7520200076952674, + "flos": 688615529472.0, + "grad_norm": 0.03139939166900202, + "language_loss": 0.85847479, + "learning_rate": 0.00015278039322201033, + "loss": 0.86986822, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.73974609, + "step": 3909, + "time_per_iteration": 2.9437077045440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113959, + "balance_loss_mlp": 1.0656805, + "epoch": 0.7522123893805309, + "flos": 487415677440.0, + "grad_norm": 0.04345489019259924, + "language_loss": 0.85063672, + "learning_rate": 0.00015255628994088004, + "loss": 0.86203265, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.73876953, + "step": 3910, + "time_per_iteration": 2.5493288040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139511, + "balance_loss_mlp": 1.0655055, + "epoch": 0.7524047710657945, + "flos": 820591294464.0, + "grad_norm": 0.035053470769469915, + "language_loss": 0.79975402, + "learning_rate": 0.00015233232154936082, + "loss": 0.81114912, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.73925781, + "step": 3911, + "time_per_iteration": 3.2801201343536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136453, + "balance_loss_mlp": 1.06259108, + "epoch": 0.7525971527510581, + "flos": 700780992000.0, + "grad_norm": 0.03701963339686214, + "language_loss": 0.80987895, + "learning_rate": 0.0001521084881344048, + "loss": 0.82124352, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.73876953, + "step": 3912, + "time_per_iteration": 2.864623785018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136423, + "balance_loss_mlp": 1.06260836, + "epoch": 0.7527895344363217, + "flos": 634949561856.0, + "grad_norm": 0.03193238845442204, + "language_loss": 0.90964454, + "learning_rate": 0.00015188478978291208, + "loss": 0.92100877, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.73828125, + "step": 3913, + "time_per_iteration": 2.817735433578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_mlp": 1.06423438, + "epoch": 0.7529819161215853, + "flos": 563932091904.0, + "grad_norm": 0.03160281710037872, + "language_loss": 0.90830052, + "learning_rate": 0.00015166122658173014, + "loss": 0.91968054, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.73779297, + "step": 3914, + "time_per_iteration": 2.769164562225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143642, + "balance_loss_mlp": 1.06992257, + "epoch": 0.7531742978068487, + "flos": 691956519936.0, + "grad_norm": 0.03347021027562271, + "language_loss": 0.9305917, + "learning_rate": 0.00015143779861765332, + "loss": 0.94202816, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.73730469, + "step": 3915, + "time_per_iteration": 2.8637077808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143643, + "balance_loss_mlp": 1.07001936, + "epoch": 0.7533666794921123, + "flos": 682306851840.0, + "grad_norm": 0.03059680855463854, + "language_loss": 0.85590506, + "learning_rate": 0.00015121450597742458, + "loss": 0.86734146, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.73632812, + "step": 3916, + "time_per_iteration": 2.822169065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143917, + "balance_loss_mlp": 1.0701977, + "epoch": 0.7535590611773759, + "flos": 624813798912.0, + "grad_norm": 0.03788604820756776, + "language_loss": 0.84024751, + "learning_rate": 0.00015099134874773369, + "loss": 0.85168672, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.73730469, + "step": 3917, + "time_per_iteration": 2.739708185195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143474, + "balance_loss_mlp": 1.06975508, + "epoch": 0.7537514428626395, + "flos": 520493211648.0, + "grad_norm": 0.03128503546806215, + "language_loss": 0.84470636, + "learning_rate": 0.00015076832701521793, + "loss": 0.85614109, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.73730469, + "step": 3918, + "time_per_iteration": 2.7321834564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143927, + "balance_loss_mlp": 1.07016027, + "epoch": 0.753943824547903, + "flos": 725034054144.0, + "grad_norm": 0.04314682819864583, + "language_loss": 0.87482226, + "learning_rate": 0.000150545440866462, + "loss": 0.88626158, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.73779297, + "step": 3919, + "time_per_iteration": 2.9775331020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138634, + "balance_loss_mlp": 1.06486762, + "epoch": 0.7541362062331666, + "flos": 438467119104.0, + "grad_norm": 0.052938940004614674, + "language_loss": 0.83896869, + "learning_rate": 0.000150322690387998, + "loss": 0.85035503, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.73779297, + "step": 3920, + "time_per_iteration": 2.49090576171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137452, + "balance_loss_mlp": 1.06363773, + "epoch": 0.7543285879184302, + "flos": 566343826944.0, + "grad_norm": 0.033797104064901606, + "language_loss": 0.79905725, + "learning_rate": 0.00015010007566630535, + "loss": 0.81043172, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.73828125, + "step": 3921, + "time_per_iteration": 2.731271266937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136388, + "balance_loss_mlp": 1.06257319, + "epoch": 0.7545209696036937, + "flos": 522058283520.0, + "grad_norm": 0.038458937044939336, + "language_loss": 0.86757135, + "learning_rate": 0.00014987759678781077, + "loss": 0.87893528, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.73828125, + "step": 3922, + "time_per_iteration": 2.6090140342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137282, + "balance_loss_mlp": 1.06356251, + "epoch": 0.7547133512889573, + "flos": 617209293312.0, + "grad_norm": 0.03880443282291728, + "language_loss": 0.87359434, + "learning_rate": 0.00014965525383888795, + "loss": 0.88496715, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.73730469, + "step": 3923, + "time_per_iteration": 2.7862982749938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142684, + "balance_loss_mlp": 1.06867838, + "epoch": 0.7549057329742208, + "flos": 752141285376.0, + "grad_norm": 0.034394345643830246, + "language_loss": 0.76875985, + "learning_rate": 0.00014943304690585851, + "loss": 0.78018677, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.73876953, + "step": 3924, + "time_per_iteration": 2.910545825958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143742, + "balance_loss_mlp": 1.06964111, + "epoch": 0.7550981146594844, + "flos": 515450162688.0, + "grad_norm": 0.03861308320303695, + "language_loss": 0.84874004, + "learning_rate": 0.0001492109760749908, + "loss": 0.8601774, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.73925781, + "step": 3925, + "time_per_iteration": 2.6297590732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114885, + "balance_loss_mlp": 1.07503557, + "epoch": 0.755290496344748, + "flos": 523026470400.0, + "grad_norm": 0.03619284623478051, + "language_loss": 0.84284902, + "learning_rate": 0.00014898904143250002, + "loss": 0.85433757, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.73828125, + "step": 3926, + "time_per_iteration": 2.6899092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155189, + "balance_loss_mlp": 1.082901, + "epoch": 0.7554828780300116, + "flos": 1417703705088.0, + "grad_norm": 0.01325688578051584, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76910388, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.72460938, + "step": 3927, + "time_per_iteration": 4.904372692108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141123, + "balance_loss_mlp": 1.06683159, + "epoch": 0.7556752597152752, + "flos": 557985984000.0, + "grad_norm": 0.031943357844755736, + "language_loss": 0.84718072, + "learning_rate": 0.0001485455810572474, + "loss": 0.85859191, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.74121094, + "step": 3928, + "time_per_iteration": 2.6653287410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139674, + "balance_loss_mlp": 1.06519186, + "epoch": 0.7558676414005386, + "flos": 564741825024.0, + "grad_norm": 0.03222629584019241, + "language_loss": 0.88709021, + "learning_rate": 0.00014832405549665236, + "loss": 0.89848697, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.74316406, + "step": 3929, + "time_per_iteration": 2.69524884223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114176, + "balance_loss_mlp": 1.0672785, + "epoch": 0.7560600230858022, + "flos": 562534205952.0, + "grad_norm": 0.03584285097744866, + "language_loss": 0.82973742, + "learning_rate": 0.00014810266646876746, + "loss": 0.84115505, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.74316406, + "step": 3930, + "time_per_iteration": 2.781097888946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141215, + "balance_loss_mlp": 1.06663764, + "epoch": 0.7562524047710658, + "flos": 720957190656.0, + "grad_norm": 0.038983110262219116, + "language_loss": 0.82315147, + "learning_rate": 0.00014788141405954364, + "loss": 0.83456367, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.74414062, + "step": 3931, + "time_per_iteration": 2.9991354942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140296, + "balance_loss_mlp": 1.06571853, + "epoch": 0.7564447864563294, + "flos": 544396439040.0, + "grad_norm": 0.037101319530533854, + "language_loss": 0.90224212, + "learning_rate": 0.00014766029835487865, + "loss": 0.91364509, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.74414062, + "step": 3932, + "time_per_iteration": 2.692891836166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144662, + "balance_loss_mlp": 1.07008481, + "epoch": 0.7566371681415929, + "flos": 727093953024.0, + "grad_norm": 0.03778072998608002, + "language_loss": 0.86007833, + "learning_rate": 0.0001474393194406173, + "loss": 0.87152493, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.74414062, + "step": 3933, + "time_per_iteration": 2.891930341720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146005, + "balance_loss_mlp": 1.07142723, + "epoch": 0.7568295498268565, + "flos": 577806343680.0, + "grad_norm": 0.03260015867991467, + "language_loss": 0.84333152, + "learning_rate": 0.00014721847740255112, + "loss": 0.85479152, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.74414062, + "step": 3934, + "time_per_iteration": 2.799757242202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151863, + "balance_loss_mlp": 1.07919312, + "epoch": 0.75702193151212, + "flos": 1523216060928.0, + "grad_norm": 0.00897818069303787, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.75063783, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.7265625, + "step": 3935, + "time_per_iteration": 4.575445175170898 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146571, + "balance_loss_mlp": 1.07199419, + "epoch": 0.7572143131973836, + "flos": 526488984576.0, + "grad_norm": 0.039044960519486104, + "language_loss": 0.83207357, + "learning_rate": 0.00014677720429790526, + "loss": 0.8435393, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.74414062, + "step": 3936, + "time_per_iteration": 2.6141350269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143608, + "balance_loss_mlp": 1.06917346, + "epoch": 0.7574066948826472, + "flos": 551823025152.0, + "grad_norm": 0.030693904946920876, + "language_loss": 0.88398033, + "learning_rate": 0.0001465567734026429, + "loss": 0.89541638, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.74267578, + "step": 3937, + "time_per_iteration": 2.738377571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136961, + "balance_loss_mlp": 1.06219339, + "epoch": 0.7575990765679107, + "flos": 396769228800.0, + "grad_norm": 0.04103098357371863, + "language_loss": 0.88068545, + "learning_rate": 0.00014633647972621034, + "loss": 0.89205503, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.74609375, + "step": 3938, + "time_per_iteration": 2.4616434574127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138132, + "balance_loss_mlp": 1.06336367, + "epoch": 0.7577914582531743, + "flos": 586185653760.0, + "grad_norm": 0.030008665391221847, + "language_loss": 0.90353823, + "learning_rate": 0.00014611632335413354, + "loss": 0.91491956, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.74609375, + "step": 3939, + "time_per_iteration": 2.775031805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113606, + "balance_loss_mlp": 1.06143546, + "epoch": 0.7579838399384379, + "flos": 822484007424.0, + "grad_norm": 0.031088983596600554, + "language_loss": 0.87266111, + "learning_rate": 0.00014589630437188456, + "loss": 0.8840217, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.74462891, + "step": 3940, + "time_per_iteration": 3.1587963104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.06187153, + "epoch": 0.7581762216237015, + "flos": 444805996032.0, + "grad_norm": 0.04449780821151478, + "language_loss": 0.84434611, + "learning_rate": 0.00014567642286488253, + "loss": 0.85571206, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.74560547, + "step": 3941, + "time_per_iteration": 2.541396141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146151, + "balance_loss_mlp": 1.07143092, + "epoch": 0.7583686033089649, + "flos": 541939041792.0, + "grad_norm": 0.045311193933261745, + "language_loss": 0.84473586, + "learning_rate": 0.00014545667891849258, + "loss": 0.85619736, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.74560547, + "step": 3942, + "time_per_iteration": 2.653228998184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146078, + "balance_loss_mlp": 1.07150042, + "epoch": 0.7585609849942285, + "flos": 523612621824.0, + "grad_norm": 0.032810068859795746, + "language_loss": 0.87606031, + "learning_rate": 0.00014523707261802733, + "loss": 0.88752109, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.74414062, + "step": 3943, + "time_per_iteration": 2.6271109580993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145321, + "balance_loss_mlp": 1.07064807, + "epoch": 0.7587533666794921, + "flos": 542907228672.0, + "grad_norm": 0.03968141925916535, + "language_loss": 0.87281996, + "learning_rate": 0.00014501760404874527, + "loss": 0.88427311, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.74511719, + "step": 3944, + "time_per_iteration": 2.696624279022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143644, + "balance_loss_mlp": 1.06921005, + "epoch": 0.7589457483647557, + "flos": 607520693760.0, + "grad_norm": 0.03527343203685723, + "language_loss": 0.909307, + "learning_rate": 0.00014479827329585176, + "loss": 0.92074347, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.74267578, + "step": 3945, + "time_per_iteration": 2.7308402061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141632, + "balance_loss_mlp": 1.06724524, + "epoch": 0.7591381300500193, + "flos": 556251724800.0, + "grad_norm": 0.03227407382042984, + "language_loss": 0.88668191, + "learning_rate": 0.00014457908044449846, + "loss": 0.89809817, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.7421875, + "step": 3946, + "time_per_iteration": 2.723604917526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145154, + "balance_loss_mlp": 1.07076728, + "epoch": 0.7593305117352828, + "flos": 530813624832.0, + "grad_norm": 0.032659275008273744, + "language_loss": 0.87264967, + "learning_rate": 0.00014436002557978371, + "loss": 0.88410115, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.7421875, + "step": 3947, + "time_per_iteration": 2.7849090099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151436, + "balance_loss_mlp": 1.07876587, + "epoch": 0.7595228934205464, + "flos": 1505922955776.0, + "grad_norm": 0.01242422674418897, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77794582, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.7265625, + "step": 3948, + "time_per_iteration": 4.869319200515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141564, + "balance_loss_mlp": 1.06717777, + "epoch": 0.7597152751058099, + "flos": 456467899392.0, + "grad_norm": 0.03330137470124234, + "language_loss": 0.84041482, + "learning_rate": 0.0001439223301503945, + "loss": 0.85183042, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.7421875, + "step": 3949, + "time_per_iteration": 2.511057138442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141649, + "balance_loss_mlp": 1.06721532, + "epoch": 0.7599076567910735, + "flos": 686798678016.0, + "grad_norm": 0.040114283676211684, + "language_loss": 0.80981869, + "learning_rate": 0.00014370368975564834, + "loss": 0.82123518, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.74267578, + "step": 3950, + "time_per_iteration": 3.0096349716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144078, + "balance_loss_mlp": 1.06973898, + "epoch": 0.760100038476337, + "flos": 533494603776.0, + "grad_norm": 0.03798147365213374, + "language_loss": 0.88830221, + "learning_rate": 0.00014348518768739766, + "loss": 0.89974296, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.74169922, + "step": 3951, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146828, + "balance_loss_mlp": 1.07415771, + "epoch": 0.7602924201616006, + "flos": 1474916780544.0, + "grad_norm": 0.005782127135677509, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77874869, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.7265625, + "step": 3952, + "time_per_iteration": 4.8369224071502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142903, + "balance_loss_mlp": 1.06875467, + "epoch": 0.7604848018468642, + "flos": 776040509952.0, + "grad_norm": 0.03364559855712782, + "language_loss": 0.90537649, + "learning_rate": 0.00014304859886964867, + "loss": 0.91680551, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.74072266, + "step": 3953, + "time_per_iteration": 2.9843015670776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_mlp": 1.06871259, + "epoch": 0.7606771835321278, + "flos": 559260344832.0, + "grad_norm": 0.034495919290042885, + "language_loss": 0.88372874, + "learning_rate": 0.00014283051228964878, + "loss": 0.89515591, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.74023438, + "step": 3954, + "time_per_iteration": 2.6971194744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143086, + "balance_loss_mlp": 1.06912816, + "epoch": 0.7608695652173914, + "flos": 526432588800.0, + "grad_norm": 0.03600141615552244, + "language_loss": 0.87487853, + "learning_rate": 0.00014261256437514197, + "loss": 0.88630933, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.73974609, + "step": 3955, + "time_per_iteration": 2.641023635864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143325, + "balance_loss_mlp": 1.06932008, + "epoch": 0.7610619469026548, + "flos": 616167246336.0, + "grad_norm": 0.03384728426849952, + "language_loss": 0.87191808, + "learning_rate": 0.0001423947552107428, + "loss": 0.88335133, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.73974609, + "step": 3956, + "time_per_iteration": 2.7422232627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143723, + "balance_loss_mlp": 1.06981361, + "epoch": 0.7612543285879184, + "flos": 864817714176.0, + "grad_norm": 0.03496249839254083, + "language_loss": 0.82073259, + "learning_rate": 0.00014217708488101243, + "loss": 0.83216989, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.73925781, + "step": 3957, + "time_per_iteration": 3.1032650470733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142422, + "balance_loss_mlp": 1.06822646, + "epoch": 0.761446710273182, + "flos": 554727585792.0, + "grad_norm": 0.03657356062959036, + "language_loss": 0.82088828, + "learning_rate": 0.0001419595534704579, + "loss": 0.83231246, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.74121094, + "step": 3958, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145496, + "balance_loss_mlp": 1.07149136, + "epoch": 0.7616390919584456, + "flos": 468325186560.0, + "grad_norm": 0.0357245127474846, + "language_loss": 0.85904223, + "learning_rate": 0.00014174216106353237, + "loss": 0.87049717, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.73974609, + "step": 3959, + "time_per_iteration": 2.595851421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143762, + "balance_loss_mlp": 1.06966209, + "epoch": 0.7618314736437091, + "flos": 499431418368.0, + "grad_norm": 0.03393548471878093, + "language_loss": 0.81279588, + "learning_rate": 0.00014152490774463512, + "loss": 0.82423347, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.73974609, + "step": 3960, + "time_per_iteration": 2.589545488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143507, + "balance_loss_mlp": 1.06931114, + "epoch": 0.7620238553289727, + "flos": 435451768320.0, + "grad_norm": 0.03935121424248522, + "language_loss": 0.92124438, + "learning_rate": 0.00014130779359811135, + "loss": 0.93267947, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.74072266, + "step": 3961, + "time_per_iteration": 2.455334424972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114569, + "balance_loss_mlp": 1.07144618, + "epoch": 0.7622162370142362, + "flos": 665541500928.0, + "grad_norm": 0.033439971209903066, + "language_loss": 0.90740561, + "learning_rate": 0.0001410908187082521, + "loss": 0.91886252, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.74072266, + "step": 3962, + "time_per_iteration": 2.849613904953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145492, + "balance_loss_mlp": 1.07105827, + "epoch": 0.7624086186994998, + "flos": 559028030976.0, + "grad_norm": 0.03941593540167477, + "language_loss": 0.90269017, + "learning_rate": 0.0001408739831592949, + "loss": 0.91414511, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.74267578, + "step": 3963, + "time_per_iteration": 2.638357639312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114573, + "balance_loss_mlp": 1.07134342, + "epoch": 0.7626010003847634, + "flos": 630286546944.0, + "grad_norm": 0.03652031952844941, + "language_loss": 0.82416636, + "learning_rate": 0.0001406572870354224, + "loss": 0.83562368, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.7421875, + "step": 3964, + "time_per_iteration": 2.8123042583465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145859, + "balance_loss_mlp": 1.07142508, + "epoch": 0.7627933820700269, + "flos": 438849154560.0, + "grad_norm": 0.03432760394377559, + "language_loss": 0.91489524, + "learning_rate": 0.00014044073042076337, + "loss": 0.92635381, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.74267578, + "step": 3965, + "time_per_iteration": 2.536203145980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146519, + "balance_loss_mlp": 1.0722276, + "epoch": 0.7629857637552905, + "flos": 533794046976.0, + "grad_norm": 0.02784014268631594, + "language_loss": 0.9243055, + "learning_rate": 0.00014022431339939302, + "loss": 0.93577063, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.74121094, + "step": 3966, + "time_per_iteration": 2.6469874382019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145692, + "balance_loss_mlp": 1.07135272, + "epoch": 0.7631781454405541, + "flos": 681236606976.0, + "grad_norm": 0.04013351668688065, + "language_loss": 0.82884651, + "learning_rate": 0.00014000803605533163, + "loss": 0.84030342, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.74169922, + "step": 3967, + "time_per_iteration": 2.802208185195923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145081, + "balance_loss_mlp": 1.07074177, + "epoch": 0.7633705271258177, + "flos": 508488204288.0, + "grad_norm": 0.04349575646472503, + "language_loss": 0.88445222, + "learning_rate": 0.00013979189847254553, + "loss": 0.89590299, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.74169922, + "step": 3968, + "time_per_iteration": 2.5820798873901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145241, + "balance_loss_mlp": 1.07085466, + "epoch": 0.7635629088110811, + "flos": 620037992448.0, + "grad_norm": 0.0345033477005795, + "language_loss": 0.85449362, + "learning_rate": 0.00013957590073494674, + "loss": 0.86594605, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.7421875, + "step": 3969, + "time_per_iteration": 2.7904934883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139507, + "balance_loss_mlp": 1.0648824, + "epoch": 0.7637552904963447, + "flos": 639566914560.0, + "grad_norm": 0.03972116820389674, + "language_loss": 0.84200621, + "learning_rate": 0.0001393600429263931, + "loss": 0.8534013, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.74462891, + "step": 3970, + "time_per_iteration": 2.7333059310913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145393, + "balance_loss_mlp": 1.07272339, + "epoch": 0.7639476721816083, + "flos": 1566683865600.0, + "grad_norm": 0.008603454608039083, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75890285, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.7265625, + "step": 3971, + "time_per_iteration": 4.924766302108765 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139229, + "balance_loss_mlp": 1.06484199, + "epoch": 0.7641400538668719, + "flos": 497019683328.0, + "grad_norm": 0.0358458499629568, + "language_loss": 0.86623794, + "learning_rate": 0.0001389287474315804, + "loss": 0.87763023, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.7421875, + "step": 3972, + "time_per_iteration": 2.6104958057403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139206, + "balance_loss_mlp": 1.06481898, + "epoch": 0.7643324355521355, + "flos": 579514406400.0, + "grad_norm": 0.02970253105840928, + "language_loss": 0.84359801, + "learning_rate": 0.00013871330991276505, + "loss": 0.85499001, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.7421875, + "step": 3973, + "time_per_iteration": 2.7183613777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145413, + "balance_loss_mlp": 1.07102644, + "epoch": 0.764524817237399, + "flos": 786232668672.0, + "grad_norm": 0.038742643805220495, + "language_loss": 0.85575706, + "learning_rate": 0.00013849801265788247, + "loss": 0.86721122, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.7421875, + "step": 3974, + "time_per_iteration": 3.0245180130004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145329, + "balance_loss_mlp": 1.07094204, + "epoch": 0.7647171989226625, + "flos": 527298717696.0, + "grad_norm": 0.0343294309098999, + "language_loss": 0.88214505, + "learning_rate": 0.00013828285575051818, + "loss": 0.89359832, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.7421875, + "step": 3975, + "time_per_iteration": 2.6501829624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143749, + "balance_loss_mlp": 1.06964874, + "epoch": 0.7649095806079261, + "flos": 556028143104.0, + "grad_norm": 0.034577120087892245, + "language_loss": 0.88279045, + "learning_rate": 0.0001380678392742035, + "loss": 0.89422792, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.74072266, + "step": 3976, + "time_per_iteration": 2.717852830886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143601, + "balance_loss_mlp": 1.06921458, + "epoch": 0.7651019622931897, + "flos": 650388885504.0, + "grad_norm": 0.0329487622471132, + "language_loss": 0.89186555, + "learning_rate": 0.00013785296331241526, + "loss": 0.90330154, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.7421875, + "step": 3977, + "time_per_iteration": 2.877988576889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113775, + "balance_loss_mlp": 1.06336296, + "epoch": 0.7652943439784533, + "flos": 1048112113152.0, + "grad_norm": 0.034644421756337376, + "language_loss": 0.92511564, + "learning_rate": 0.00013763822794857583, + "loss": 0.9364931, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.7421875, + "step": 3978, + "time_per_iteration": 3.3197543621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113835, + "balance_loss_mlp": 1.06386817, + "epoch": 0.7654867256637168, + "flos": 505414456320.0, + "grad_norm": 0.032056341535250436, + "language_loss": 0.94870603, + "learning_rate": 0.00013742363326605278, + "loss": 0.96008945, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.74316406, + "step": 3979, + "time_per_iteration": 2.714352607727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.06330967, + "epoch": 0.7656791073489804, + "flos": 575863239168.0, + "grad_norm": 0.03156054452878063, + "language_loss": 0.82591552, + "learning_rate": 0.00013720917934815935, + "loss": 0.83729297, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.74267578, + "step": 3980, + "time_per_iteration": 2.717848300933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_mlp": 1.06394827, + "epoch": 0.765871489034244, + "flos": 493791484416.0, + "grad_norm": 0.0408766328487834, + "language_loss": 0.88351345, + "learning_rate": 0.00013699486627815344, + "loss": 0.89489782, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.74316406, + "step": 3981, + "time_per_iteration": 2.570958137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114649, + "balance_loss_mlp": 1.07215071, + "epoch": 0.7660638707195075, + "flos": 487051106304.0, + "grad_norm": 0.03334801499225344, + "language_loss": 0.87230325, + "learning_rate": 0.00013678069413923928, + "loss": 0.8837682, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.74169922, + "step": 3982, + "time_per_iteration": 2.59192156791687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.07168806, + "epoch": 0.766256252404771, + "flos": 445242425856.0, + "grad_norm": 0.033038982399311745, + "language_loss": 0.86065191, + "learning_rate": 0.00013656666301456555, + "loss": 0.8721112, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.74121094, + "step": 3983, + "time_per_iteration": 2.5096640586853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139926, + "balance_loss_mlp": 1.06568277, + "epoch": 0.7664486340900346, + "flos": 486213175296.0, + "grad_norm": 0.0343473148612919, + "language_loss": 0.88720405, + "learning_rate": 0.0001363527729872267, + "loss": 0.89860332, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.74072266, + "step": 3984, + "time_per_iteration": 2.652386426925659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138175, + "balance_loss_mlp": 1.06359744, + "epoch": 0.7666410157752982, + "flos": 647384268288.0, + "grad_norm": 0.033932927272579565, + "language_loss": 0.81177199, + "learning_rate": 0.00013613902414026207, + "loss": 0.82315373, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.74414062, + "step": 3985, + "time_per_iteration": 2.785083055496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138176, + "balance_loss_mlp": 1.06359911, + "epoch": 0.7668333974605618, + "flos": 775660475904.0, + "grad_norm": 0.03599596212719163, + "language_loss": 0.86968917, + "learning_rate": 0.00013592541655665642, + "loss": 0.88107091, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.74414062, + "step": 3986, + "time_per_iteration": 3.013932704925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144262, + "balance_loss_mlp": 1.06987572, + "epoch": 0.7670257791458254, + "flos": 614512851456.0, + "grad_norm": 0.036460289004419034, + "language_loss": 0.90080905, + "learning_rate": 0.00013571195031933947, + "loss": 0.91225165, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.7421875, + "step": 3987, + "time_per_iteration": 2.6782960891723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114978, + "balance_loss_mlp": 1.0776825, + "epoch": 0.7672181608310888, + "flos": 1488362608128.0, + "grad_norm": 0.008503355118198302, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81631124, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.72265625, + "step": 3988, + "time_per_iteration": 4.697616338729858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135719, + "balance_loss_mlp": 1.06128454, + "epoch": 0.7674105425163524, + "flos": 611866801152.0, + "grad_norm": 0.03376269838630617, + "language_loss": 0.9032138, + "learning_rate": 0.00013528544221501655, + "loss": 0.91457105, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.74267578, + "step": 3989, + "time_per_iteration": 2.731600284576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135719, + "balance_loss_mlp": 1.06118917, + "epoch": 0.767602924201616, + "flos": 846604085760.0, + "grad_norm": 0.0353786451651817, + "language_loss": 0.86480021, + "learning_rate": 0.00013507240051359586, + "loss": 0.8761574, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.74365234, + "step": 3990, + "time_per_iteration": 3.0497024059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135952, + "balance_loss_mlp": 1.06156516, + "epoch": 0.7677953058868796, + "flos": 528145380864.0, + "grad_norm": 0.040368948500693246, + "language_loss": 0.91154569, + "learning_rate": 0.00013485950048963425, + "loss": 0.92290527, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.7421875, + "step": 3991, + "time_per_iteration": 2.596708059310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135888, + "balance_loss_mlp": 1.06145394, + "epoch": 0.7679876875721431, + "flos": 925111268352.0, + "grad_norm": 0.05870608675269832, + "language_loss": 0.88347316, + "learning_rate": 0.00013464674222578643, + "loss": 0.89483202, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.74267578, + "step": 3992, + "time_per_iteration": 3.1901588439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114079, + "balance_loss_mlp": 1.06640303, + "epoch": 0.7681800692574067, + "flos": 459018622464.0, + "grad_norm": 0.03723022902665057, + "language_loss": 0.87956703, + "learning_rate": 0.00013443412580465292, + "loss": 0.89097494, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.7421875, + "step": 3993, + "time_per_iteration": 2.603252649307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141634, + "balance_loss_mlp": 1.06724763, + "epoch": 0.7683724509426703, + "flos": 659732379648.0, + "grad_norm": 0.0341053080993109, + "language_loss": 0.8901087, + "learning_rate": 0.00013422165130877857, + "loss": 0.90152502, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.7421875, + "step": 3994, + "time_per_iteration": 2.911731004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142658, + "balance_loss_mlp": 1.06827152, + "epoch": 0.7685648326279338, + "flos": 556338319872.0, + "grad_norm": 0.037345354137488074, + "language_loss": 0.84750074, + "learning_rate": 0.00013400931882065327, + "loss": 0.85892731, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.7421875, + "step": 3995, + "time_per_iteration": 2.6689093112945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142291, + "balance_loss_mlp": 1.06790483, + "epoch": 0.7687572143131974, + "flos": 688743783936.0, + "grad_norm": 0.03341807173983279, + "language_loss": 0.85686117, + "learning_rate": 0.0001337971284227118, + "loss": 0.86828411, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.7421875, + "step": 3996, + "time_per_iteration": 3.0353329181671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.07644653, + "epoch": 0.7689495959984609, + "flos": 1492665781248.0, + "grad_norm": 0.006288320283860005, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77266961, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.72265625, + "step": 3997, + "time_per_iteration": 4.911880731582642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144884, + "balance_loss_mlp": 1.07049692, + "epoch": 0.7691419776837245, + "flos": 571499667456.0, + "grad_norm": 0.031757425540639796, + "language_loss": 0.84642863, + "learning_rate": 0.0001333731742268438, + "loss": 0.85787749, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.7421875, + "step": 3998, + "time_per_iteration": 2.6962177753448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145361, + "balance_loss_mlp": 1.07097435, + "epoch": 0.7693343593689881, + "flos": 521190153216.0, + "grad_norm": 0.03369214696754818, + "language_loss": 0.89708233, + "learning_rate": 0.0001331614105935109, + "loss": 0.9085359, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.7421875, + "step": 3999, + "time_per_iteration": 2.6809701919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114508, + "balance_loss_mlp": 1.07074106, + "epoch": 0.7695267410542517, + "flos": 661551232512.0, + "grad_norm": 0.03371243854874441, + "language_loss": 0.88376063, + "learning_rate": 0.00013294978937954883, + "loss": 0.8952114, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.74169922, + "step": 4000, + "time_per_iteration": 2.867079973220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114193, + "balance_loss_mlp": 1.06754363, + "epoch": 0.7697191227395151, + "flos": 547858953216.0, + "grad_norm": 0.037308762350110276, + "language_loss": 0.89336216, + "learning_rate": 0.00013273831066711655, + "loss": 0.90478146, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.7421875, + "step": 4001, + "time_per_iteration": 2.5953049659729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141038, + "balance_loss_mlp": 1.06684196, + "epoch": 0.7699115044247787, + "flos": 541695994368.0, + "grad_norm": 0.03259494083798661, + "language_loss": 0.84480441, + "learning_rate": 0.00013252697453831747, + "loss": 0.85621476, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.74121094, + "step": 4002, + "time_per_iteration": 2.685664653778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140964, + "balance_loss_mlp": 1.06686342, + "epoch": 0.7701038861100423, + "flos": 564142938624.0, + "grad_norm": 0.03879527633270508, + "language_loss": 0.87191802, + "learning_rate": 0.00013231578107519916, + "loss": 0.8833276, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.74072266, + "step": 4003, + "time_per_iteration": 2.8707611560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142721, + "balance_loss_mlp": 1.06843019, + "epoch": 0.7702962677953059, + "flos": 482733196800.0, + "grad_norm": 0.03964954780213044, + "language_loss": 0.87790287, + "learning_rate": 0.00013210473035975422, + "loss": 0.88933003, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.74169922, + "step": 4004, + "time_per_iteration": 2.577669143676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137199, + "balance_loss_mlp": 1.06266928, + "epoch": 0.7704886494805695, + "flos": 771805192704.0, + "grad_norm": 0.03541890764411222, + "language_loss": 0.90018678, + "learning_rate": 0.0001318938224739201, + "loss": 0.91155875, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.74365234, + "step": 4005, + "time_per_iteration": 3.054161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138331, + "balance_loss_mlp": 1.06384909, + "epoch": 0.770681031165833, + "flos": 602317189632.0, + "grad_norm": 0.032853196947195275, + "language_loss": 0.87994003, + "learning_rate": 0.00013168305749957843, + "loss": 0.89132333, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.74316406, + "step": 4006, + "time_per_iteration": 2.742284059524536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139142, + "balance_loss_mlp": 1.06461227, + "epoch": 0.7708734128510966, + "flos": 497095544832.0, + "grad_norm": 0.034737097331234285, + "language_loss": 0.87459195, + "learning_rate": 0.00013147243551855532, + "loss": 0.88598335, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.74365234, + "step": 4007, + "time_per_iteration": 2.565561532974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138441, + "balance_loss_mlp": 1.06400645, + "epoch": 0.7710657945363601, + "flos": 568454117376.0, + "grad_norm": 0.028865688800901353, + "language_loss": 0.84292293, + "learning_rate": 0.00013126195661262148, + "loss": 0.85430735, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.74267578, + "step": 4008, + "time_per_iteration": 2.76387357711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143893, + "balance_loss_mlp": 1.06969726, + "epoch": 0.7712581762216237, + "flos": 605749504512.0, + "grad_norm": 0.03137791389810697, + "language_loss": 0.90203846, + "learning_rate": 0.00013105162086349216, + "loss": 0.91347742, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.74121094, + "step": 4009, + "time_per_iteration": 2.8172740936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144917, + "balance_loss_mlp": 1.07057822, + "epoch": 0.7714505579068872, + "flos": 531996661248.0, + "grad_norm": 0.03056437231076115, + "language_loss": 0.89419609, + "learning_rate": 0.00013084142835282687, + "loss": 0.90564525, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.74169922, + "step": 4010, + "time_per_iteration": 2.7165045738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150368, + "balance_loss_mlp": 1.07769775, + "epoch": 0.7716429395921508, + "flos": 1425380069376.0, + "grad_norm": 0.007418114590999428, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.81034732, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.7265625, + "step": 4011, + "time_per_iteration": 4.772608757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143697, + "balance_loss_mlp": 1.06978679, + "epoch": 0.7718353212774144, + "flos": 579586265088.0, + "grad_norm": 0.032910193378974356, + "language_loss": 0.94427228, + "learning_rate": 0.0001304214733732485, + "loss": 0.95570928, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.73925781, + "step": 4012, + "time_per_iteration": 2.789973258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143696, + "balance_loss_mlp": 1.06969118, + "epoch": 0.772027702962678, + "flos": 511772798976.0, + "grad_norm": 0.03524437980359451, + "language_loss": 0.87796986, + "learning_rate": 0.00013021171106737672, + "loss": 0.8894068, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.74023438, + "step": 4013, + "time_per_iteration": 2.71975040435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113924, + "balance_loss_mlp": 1.06499684, + "epoch": 0.7722200846479416, + "flos": 526747494912.0, + "grad_norm": 0.030121234112763372, + "language_loss": 0.84496903, + "learning_rate": 0.00013000209232605071, + "loss": 0.85636145, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.74121094, + "step": 4014, + "time_per_iteration": 2.6892056465148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139469, + "balance_loss_mlp": 1.06508267, + "epoch": 0.772412466333205, + "flos": 480601439232.0, + "grad_norm": 0.03460224041299985, + "language_loss": 0.83357382, + "learning_rate": 0.0001297926172306519, + "loss": 0.84496856, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.7421875, + "step": 4015, + "time_per_iteration": 2.6161460876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138641, + "balance_loss_mlp": 1.06449294, + "epoch": 0.7726048480184686, + "flos": 907312602624.0, + "grad_norm": 0.03829273799260643, + "language_loss": 0.83440059, + "learning_rate": 0.0001295832858625055, + "loss": 0.84578699, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.74023438, + "step": 4016, + "time_per_iteration": 3.286180019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137589, + "balance_loss_mlp": 1.06329787, + "epoch": 0.7727972297037322, + "flos": 632566024704.0, + "grad_norm": 0.037636726324715264, + "language_loss": 0.7551474, + "learning_rate": 0.00012937409830288154, + "loss": 0.7665233, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.74121094, + "step": 4017, + "time_per_iteration": 2.8370349407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142791, + "balance_loss_mlp": 1.0688808, + "epoch": 0.7729896113889958, + "flos": 415673068032.0, + "grad_norm": 0.038209347580389144, + "language_loss": 0.9001559, + "learning_rate": 0.00012916505463299362, + "loss": 0.91158378, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.73925781, + "step": 4018, + "time_per_iteration": 2.519319772720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141641, + "balance_loss_mlp": 1.06754065, + "epoch": 0.7731819930742593, + "flos": 670104459264.0, + "grad_norm": 0.03754903876157777, + "language_loss": 0.83159339, + "learning_rate": 0.00012895615493399972, + "loss": 0.84300983, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.74072266, + "step": 4019, + "time_per_iteration": 2.8084754943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136472, + "balance_loss_mlp": 1.06203771, + "epoch": 0.7733743747595229, + "flos": 490858725888.0, + "grad_norm": 0.052975326566308774, + "language_loss": 0.88814008, + "learning_rate": 0.00012874739928700192, + "loss": 0.89950484, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.74267578, + "step": 4020, + "time_per_iteration": 2.6240487098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113737, + "balance_loss_mlp": 1.06307888, + "epoch": 0.7735667564447865, + "flos": 660887218176.0, + "grad_norm": 0.04201046633060088, + "language_loss": 0.84696388, + "learning_rate": 0.00012853878777304624, + "loss": 0.85833752, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.74121094, + "step": 4021, + "time_per_iteration": 2.873288154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135595, + "balance_loss_mlp": 1.06120825, + "epoch": 0.77375913813005, + "flos": 534490988544.0, + "grad_norm": 0.02933243833596509, + "language_loss": 0.88221383, + "learning_rate": 0.000128330320473123, + "loss": 0.89356983, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.7421875, + "step": 4022, + "time_per_iteration": 2.6959497928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138573, + "balance_loss_mlp": 1.06590271, + "epoch": 0.7739515198153136, + "flos": 1523379244032.0, + "grad_norm": 0.005476553783658496, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79470468, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.7265625, + "step": 4023, + "time_per_iteration": 4.908393621444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136949, + "balance_loss_mlp": 1.06256282, + "epoch": 0.7741439015005771, + "flos": 641251508736.0, + "grad_norm": 0.0388161486580036, + "language_loss": 0.86722291, + "learning_rate": 0.0001279138188390543, + "loss": 0.87859237, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.7421875, + "step": 4024, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142263, + "balance_loss_mlp": 1.06835282, + "epoch": 0.7743362831858407, + "flos": 667023980544.0, + "grad_norm": 0.03451580070650428, + "language_loss": 0.90432525, + "learning_rate": 0.00012770578466660915, + "loss": 0.91574788, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.73925781, + "step": 4025, + "time_per_iteration": 2.862123489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142172, + "balance_loss_mlp": 1.06807196, + "epoch": 0.7745286648711043, + "flos": 563993217024.0, + "grad_norm": 0.03283033762939225, + "language_loss": 0.85806942, + "learning_rate": 0.0001274978950315968, + "loss": 0.86949122, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.74072266, + "step": 4026, + "time_per_iteration": 2.802757501602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137274, + "balance_loss_mlp": 1.06288695, + "epoch": 0.7747210465563679, + "flos": 517961954304.0, + "grad_norm": 0.042128094380904035, + "language_loss": 0.87673521, + "learning_rate": 0.00012729015001472716, + "loss": 0.88810796, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.7421875, + "step": 4027, + "time_per_iteration": 2.6692821979522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137346, + "balance_loss_mlp": 1.06295931, + "epoch": 0.7749134282416313, + "flos": 635368527360.0, + "grad_norm": 0.03931555017475162, + "language_loss": 0.86517704, + "learning_rate": 0.00012708254969665418, + "loss": 0.87655056, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.7421875, + "step": 4028, + "time_per_iteration": 2.7921457290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138509, + "balance_loss_mlp": 1.0641222, + "epoch": 0.7751058099268949, + "flos": 496350939648.0, + "grad_norm": 0.04579390573234304, + "language_loss": 0.889467, + "learning_rate": 0.00012687509415797526, + "loss": 0.90085208, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.7421875, + "step": 4029, + "time_per_iteration": 2.5587246417999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137293, + "balance_loss_mlp": 1.06304908, + "epoch": 0.7752981916121585, + "flos": 511362565632.0, + "grad_norm": 0.03685004486441248, + "language_loss": 0.85761744, + "learning_rate": 0.00012666778347923208, + "loss": 0.86899036, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.74072266, + "step": 4030, + "time_per_iteration": 2.6332554817199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143646, + "balance_loss_mlp": 1.06978357, + "epoch": 0.7754905732974221, + "flos": 498565289472.0, + "grad_norm": 0.03255854062300405, + "language_loss": 0.87846529, + "learning_rate": 0.0001264606177409092, + "loss": 0.88990176, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.73876953, + "step": 4031, + "time_per_iteration": 2.6323087215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139185, + "balance_loss_mlp": 1.06498873, + "epoch": 0.7756829549826857, + "flos": 481782474240.0, + "grad_norm": 0.03677638670321597, + "language_loss": 0.90051126, + "learning_rate": 0.00012625359702343609, + "loss": 0.91190314, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.74023438, + "step": 4032, + "time_per_iteration": 2.764946937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136066, + "balance_loss_mlp": 1.06186974, + "epoch": 0.7758753366679492, + "flos": 553685538816.0, + "grad_norm": 0.03552074396287166, + "language_loss": 0.89551866, + "learning_rate": 0.00012604672140718504, + "loss": 0.90687937, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.74072266, + "step": 4033, + "time_per_iteration": 2.616276741027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136731, + "balance_loss_mlp": 1.06243956, + "epoch": 0.7760677183532128, + "flos": 705063972864.0, + "grad_norm": 0.03368756555440988, + "language_loss": 0.82777321, + "learning_rate": 0.00012583999097247233, + "loss": 0.83914053, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.74121094, + "step": 4034, + "time_per_iteration": 2.8126814365386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136753, + "balance_loss_mlp": 1.06255746, + "epoch": 0.7762601000384763, + "flos": 524478750720.0, + "grad_norm": 0.036921944541312396, + "language_loss": 0.85384995, + "learning_rate": 0.0001256334057995578, + "loss": 0.86521751, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.74072266, + "step": 4035, + "time_per_iteration": 2.6846728324890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138277, + "balance_loss_mlp": 1.0641768, + "epoch": 0.7764524817237399, + "flos": 558617797632.0, + "grad_norm": 0.033254007354158545, + "language_loss": 0.89694679, + "learning_rate": 0.000125426965968645, + "loss": 0.90832961, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.73974609, + "step": 4036, + "time_per_iteration": 2.747835636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144007, + "balance_loss_mlp": 1.07009733, + "epoch": 0.7766448634090035, + "flos": 580816965120.0, + "grad_norm": 0.036524717116784906, + "language_loss": 0.87124515, + "learning_rate": 0.00012522067155988092, + "loss": 0.88268518, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.73925781, + "step": 4037, + "time_per_iteration": 2.7287211418151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011441, + "balance_loss_mlp": 1.07028544, + "epoch": 0.776837245094267, + "flos": 636818806272.0, + "grad_norm": 0.04076227552668926, + "language_loss": 0.80187047, + "learning_rate": 0.00012501452265335617, + "loss": 0.81331146, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.73828125, + "step": 4038, + "time_per_iteration": 2.811866283416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138752, + "balance_loss_mlp": 1.06455588, + "epoch": 0.7770296267795306, + "flos": 615813408768.0, + "grad_norm": 0.0355390445236554, + "language_loss": 0.87746716, + "learning_rate": 0.0001248085193291047, + "loss": 0.88885468, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.74023438, + "step": 4039, + "time_per_iteration": 2.734161853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137901, + "balance_loss_mlp": 1.06380022, + "epoch": 0.7772220084647942, + "flos": 880295969280.0, + "grad_norm": 0.030150697576870535, + "language_loss": 0.86369264, + "learning_rate": 0.00012460266166710443, + "loss": 0.87507164, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.73974609, + "step": 4040, + "time_per_iteration": 3.137223243713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146543, + "balance_loss_mlp": 1.07215619, + "epoch": 0.7774143901500578, + "flos": 841038011904.0, + "grad_norm": 0.03809465045400834, + "language_loss": 0.82413107, + "learning_rate": 0.00012439694974727633, + "loss": 0.8355965, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.7421875, + "step": 4041, + "time_per_iteration": 3.0596840381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146741, + "balance_loss_mlp": 1.07225895, + "epoch": 0.7776067718353212, + "flos": 569228921856.0, + "grad_norm": 0.03500635055952716, + "language_loss": 0.84672141, + "learning_rate": 0.00012419138364948458, + "loss": 0.85818887, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.74316406, + "step": 4042, + "time_per_iteration": 2.697154998779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153394, + "balance_loss_mlp": 1.07919836, + "epoch": 0.7777991535205848, + "flos": 747209026560.0, + "grad_norm": 0.038117976475530245, + "language_loss": 0.87011731, + "learning_rate": 0.00012398596345353702, + "loss": 0.88165122, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.74072266, + "step": 4043, + "time_per_iteration": 2.903593063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145251, + "balance_loss_mlp": 1.07086432, + "epoch": 0.7779915352058484, + "flos": 539182201344.0, + "grad_norm": 0.034270473867383876, + "language_loss": 0.87845659, + "learning_rate": 0.0001237806892391851, + "loss": 0.88990903, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.7421875, + "step": 4044, + "time_per_iteration": 2.713480234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145012, + "balance_loss_mlp": 1.0706259, + "epoch": 0.778183916891112, + "flos": 635954678784.0, + "grad_norm": 0.03512178084580865, + "language_loss": 0.85495478, + "learning_rate": 0.0001235755610861233, + "loss": 0.86640489, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.7421875, + "step": 4045, + "time_per_iteration": 2.732534170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141546, + "balance_loss_mlp": 1.06711173, + "epoch": 0.7783762985763756, + "flos": 589789157376.0, + "grad_norm": 0.036702613640591464, + "language_loss": 0.89351201, + "learning_rate": 0.0001233705790739893, + "loss": 0.90492749, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.74267578, + "step": 4046, + "time_per_iteration": 2.7078564167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139744, + "balance_loss_mlp": 1.06535733, + "epoch": 0.7785686802616391, + "flos": 932240412672.0, + "grad_norm": 0.03647485158303252, + "language_loss": 0.79245514, + "learning_rate": 0.0001231657432823643, + "loss": 0.80385262, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.7421875, + "step": 4047, + "time_per_iteration": 3.204200029373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114146, + "balance_loss_mlp": 1.06707299, + "epoch": 0.7787610619469026, + "flos": 498956057088.0, + "grad_norm": 0.04086385671919431, + "language_loss": 0.84949565, + "learning_rate": 0.0001229610537907725, + "loss": 0.86091024, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.7421875, + "step": 4048, + "time_per_iteration": 2.587411403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139179, + "balance_loss_mlp": 1.06483984, + "epoch": 0.7789534436321662, + "flos": 516650663424.0, + "grad_norm": 0.0370984959952915, + "language_loss": 0.95913208, + "learning_rate": 0.00012275651067868143, + "loss": 0.97052377, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.74169922, + "step": 4049, + "time_per_iteration": 2.6297829151153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145054, + "balance_loss_mlp": 1.07095397, + "epoch": 0.7791458253174298, + "flos": 990061106688.0, + "grad_norm": 0.049766868205719794, + "language_loss": 0.84448528, + "learning_rate": 0.00012255211402550182, + "loss": 0.85593581, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.74072266, + "step": 4050, + "time_per_iteration": 3.2185845375061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138393, + "balance_loss_mlp": 1.06400621, + "epoch": 0.7793382070026933, + "flos": 630184488960.0, + "grad_norm": 0.041629514228615855, + "language_loss": 0.82138163, + "learning_rate": 0.00012234786391058727, + "loss": 0.83276558, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.7421875, + "step": 4051, + "time_per_iteration": 2.7984745502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114015, + "balance_loss_mlp": 1.06590664, + "epoch": 0.7795305886879569, + "flos": 532762733568.0, + "grad_norm": 0.042901247751836985, + "language_loss": 0.90027404, + "learning_rate": 0.0001221437604132352, + "loss": 0.91167557, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.74072266, + "step": 4052, + "time_per_iteration": 2.6062204837799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139339, + "balance_loss_mlp": 1.06490481, + "epoch": 0.7797229703732205, + "flos": 613141161984.0, + "grad_norm": 0.0426206226565264, + "language_loss": 0.86529624, + "learning_rate": 0.0001219398036126852, + "loss": 0.87668967, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.74267578, + "step": 4053, + "time_per_iteration": 2.7453675270080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137791, + "balance_loss_mlp": 1.06340408, + "epoch": 0.7799153520584841, + "flos": 873794635776.0, + "grad_norm": 0.03320369943222444, + "language_loss": 0.82415718, + "learning_rate": 0.00012173599358812027, + "loss": 0.83553505, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.7421875, + "step": 4054, + "time_per_iteration": 3.2739408016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137317, + "balance_loss_mlp": 1.06293011, + "epoch": 0.7801077337437476, + "flos": 584744107008.0, + "grad_norm": 0.03804124847596099, + "language_loss": 0.87714571, + "learning_rate": 0.0001215323304186668, + "loss": 0.88851887, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.7421875, + "step": 4055, + "time_per_iteration": 2.7659378051757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_mlp": 1.06319451, + "epoch": 0.7803001154290111, + "flos": 602280259584.0, + "grad_norm": 0.03158827116137511, + "language_loss": 0.91988087, + "learning_rate": 0.00012132881418339364, + "loss": 0.93125427, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.74072266, + "step": 4056, + "time_per_iteration": 2.7168469429016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114492, + "balance_loss_mlp": 1.07263184, + "epoch": 0.7804924971142747, + "flos": 1482925515264.0, + "grad_norm": 0.005095674237873183, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78662485, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.72460938, + "step": 4057, + "time_per_iteration": 4.8585734367370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113894, + "balance_loss_mlp": 1.06460154, + "epoch": 0.7806848787995383, + "flos": 631515245568.0, + "grad_norm": 0.03359665860494396, + "language_loss": 0.81806797, + "learning_rate": 0.00012092222283137944, + "loss": 0.8294574, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.74169922, + "step": 4058, + "time_per_iteration": 2.757882595062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115152, + "balance_loss_mlp": 1.079422, + "epoch": 0.7808772604848019, + "flos": 1420745252352.0, + "grad_norm": 0.008112478231263178, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.8005783, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.72265625, + "step": 4059, + "time_per_iteration": 4.779797315597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011384, + "balance_loss_mlp": 1.06406116, + "epoch": 0.7810696421700654, + "flos": 733103187456.0, + "grad_norm": 0.03176373649090862, + "language_loss": 0.88107026, + "learning_rate": 0.00012051622016348856, + "loss": 0.89245427, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.74169922, + "step": 4060, + "time_per_iteration": 3.0269150733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138414, + "balance_loss_mlp": 1.06412303, + "epoch": 0.781262023855329, + "flos": 425837028864.0, + "grad_norm": 0.038145388321841694, + "language_loss": 0.90811419, + "learning_rate": 0.00012031343978315539, + "loss": 0.91949832, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.74121094, + "step": 4061, + "time_per_iteration": 2.459432363510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136363, + "balance_loss_mlp": 1.06197631, + "epoch": 0.7814544055405925, + "flos": 502073465856.0, + "grad_norm": 0.03753829813607959, + "language_loss": 0.87161046, + "learning_rate": 0.00012011080681021774, + "loss": 0.88297415, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.7421875, + "step": 4062, + "time_per_iteration": 2.691654920578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136384, + "balance_loss_mlp": 1.06204486, + "epoch": 0.7816467872258561, + "flos": 463392927744.0, + "grad_norm": 0.03545714253981061, + "language_loss": 0.90689021, + "learning_rate": 0.00011990832132334512, + "loss": 0.91825402, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.74169922, + "step": 4063, + "time_per_iteration": 2.501356363296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011365, + "balance_loss_mlp": 1.06211364, + "epoch": 0.7818391689111197, + "flos": 742107580416.0, + "grad_norm": 0.03646375779692072, + "language_loss": 0.8761006, + "learning_rate": 0.00011970598340114897, + "loss": 0.8874656, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.7421875, + "step": 4064, + "time_per_iteration": 2.9211695194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138234, + "balance_loss_mlp": 1.06389523, + "epoch": 0.7820315505963832, + "flos": 548805672960.0, + "grad_norm": 0.037373767627345386, + "language_loss": 0.88286138, + "learning_rate": 0.00011950379312218396, + "loss": 0.89424372, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.74169922, + "step": 4065, + "time_per_iteration": 2.7662761211395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139451, + "balance_loss_mlp": 1.06511247, + "epoch": 0.7822239322816468, + "flos": 730259025408.0, + "grad_norm": 0.031688812892368586, + "language_loss": 0.90089023, + "learning_rate": 0.00011930175056494719, + "loss": 0.91228467, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.74169922, + "step": 4066, + "time_per_iteration": 2.8510522842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145428, + "balance_loss_mlp": 1.07137561, + "epoch": 0.7824163139669104, + "flos": 452985919488.0, + "grad_norm": 0.030648314991386538, + "language_loss": 0.79762566, + "learning_rate": 0.00011909985580787885, + "loss": 0.80907995, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.73974609, + "step": 4067, + "time_per_iteration": 2.6272332668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144706, + "balance_loss_mlp": 1.07074893, + "epoch": 0.782608695652174, + "flos": 541620132864.0, + "grad_norm": 0.030654260562385374, + "language_loss": 0.85639668, + "learning_rate": 0.00011889810892936137, + "loss": 0.86784375, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.73974609, + "step": 4068, + "time_per_iteration": 2.7750964164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114329, + "balance_loss_mlp": 1.06899869, + "epoch": 0.7828010773374374, + "flos": 501428917248.0, + "grad_norm": 0.03582388212815207, + "language_loss": 0.82907784, + "learning_rate": 0.00011869651000771959, + "loss": 0.84051073, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.74169922, + "step": 4069, + "time_per_iteration": 2.8643925189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138233, + "balance_loss_mlp": 1.06389439, + "epoch": 0.782993459022701, + "flos": 601917689856.0, + "grad_norm": 0.03429166344261292, + "language_loss": 0.87759733, + "learning_rate": 0.00011849505912122117, + "loss": 0.88897967, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.74169922, + "step": 4070, + "time_per_iteration": 2.6959619522094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138061, + "balance_loss_mlp": 1.06377029, + "epoch": 0.7831858407079646, + "flos": 811475384832.0, + "grad_norm": 0.039746496548432604, + "language_loss": 0.82642615, + "learning_rate": 0.00011829375634807654, + "loss": 0.8378067, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.74121094, + "step": 4071, + "time_per_iteration": 3.0114569664001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136715, + "balance_loss_mlp": 1.06247175, + "epoch": 0.7833782223932282, + "flos": 808012870656.0, + "grad_norm": 0.03273964905208881, + "language_loss": 0.857427, + "learning_rate": 0.00011809260176643821, + "loss": 0.86879414, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.74121094, + "step": 4072, + "time_per_iteration": 3.0994741916656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06206274, + "epoch": 0.7835706040784918, + "flos": 521899829760.0, + "grad_norm": 0.04024817722432492, + "language_loss": 0.88959461, + "learning_rate": 0.00011789159545440131, + "loss": 0.9009558, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.74023438, + "step": 4073, + "time_per_iteration": 2.644077777862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138181, + "balance_loss_mlp": 1.06398499, + "epoch": 0.7837629857637552, + "flos": 506743211520.0, + "grad_norm": 0.03009333087268268, + "language_loss": 0.86380607, + "learning_rate": 0.00011769073749000348, + "loss": 0.87518787, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.74023438, + "step": 4074, + "time_per_iteration": 2.7675211429595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138086, + "balance_loss_mlp": 1.06384242, + "epoch": 0.7839553674490188, + "flos": 517134756864.0, + "grad_norm": 0.03603773685865746, + "language_loss": 0.81149113, + "learning_rate": 0.0001174900279512246, + "loss": 0.82287204, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.74072266, + "step": 4075, + "time_per_iteration": 2.559067964553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138281, + "balance_loss_mlp": 1.06418085, + "epoch": 0.7841477491342824, + "flos": 507650273280.0, + "grad_norm": 0.04900023922641464, + "language_loss": 0.86111671, + "learning_rate": 0.00011728946691598707, + "loss": 0.87249947, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.73974609, + "step": 4076, + "time_per_iteration": 2.601316213607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139089, + "balance_loss_mlp": 1.06498837, + "epoch": 0.784340130819546, + "flos": 720904797696.0, + "grad_norm": 0.037946042945582265, + "language_loss": 0.81358349, + "learning_rate": 0.00011708905446215561, + "loss": 0.82497436, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.73974609, + "step": 4077, + "time_per_iteration": 2.8491528034210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138777, + "balance_loss_mlp": 1.06477141, + "epoch": 0.7845325125048095, + "flos": 515513289216.0, + "grad_norm": 0.03152801605769719, + "language_loss": 0.84297472, + "learning_rate": 0.00011688879066753711, + "loss": 0.85436249, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.73925781, + "step": 4078, + "time_per_iteration": 2.649890184402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139298, + "balance_loss_mlp": 1.06529319, + "epoch": 0.7847248941900731, + "flos": 467050825728.0, + "grad_norm": 0.04544253460314975, + "language_loss": 0.92901659, + "learning_rate": 0.00011668867560988122, + "loss": 0.9404096, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.73925781, + "step": 4079, + "time_per_iteration": 2.583395004272461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137742, + "balance_loss_mlp": 1.06383276, + "epoch": 0.7849172758753367, + "flos": 504083699712.0, + "grad_norm": 0.03256844135977144, + "language_loss": 0.89159727, + "learning_rate": 0.00011648870936687916, + "loss": 0.90297467, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.73876953, + "step": 4080, + "time_per_iteration": 2.729670524597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137567, + "balance_loss_mlp": 1.06375289, + "epoch": 0.7851096575606002, + "flos": 533031977472.0, + "grad_norm": 0.038157171447079044, + "language_loss": 0.83702409, + "learning_rate": 0.00011628889201616461, + "loss": 0.84839982, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.73828125, + "step": 4081, + "time_per_iteration": 2.6109676361083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139939, + "balance_loss_mlp": 1.06602943, + "epoch": 0.7853020392458638, + "flos": 571043771904.0, + "grad_norm": 0.03751217922846888, + "language_loss": 0.86986727, + "learning_rate": 0.00011608922363531393, + "loss": 0.88126665, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.73876953, + "step": 4082, + "time_per_iteration": 2.6544032096862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.06686151, + "epoch": 0.7854944209311273, + "flos": 833991459840.0, + "grad_norm": 0.051644606704595315, + "language_loss": 0.88386512, + "learning_rate": 0.00011588970430184504, + "loss": 0.8952738, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.73925781, + "step": 4083, + "time_per_iteration": 3.0330986976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137232, + "balance_loss_mlp": 1.06332254, + "epoch": 0.7856868026163909, + "flos": 561010066944.0, + "grad_norm": 0.028770858152958077, + "language_loss": 0.85727829, + "learning_rate": 0.00011569033409321822, + "loss": 0.86865062, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.73876953, + "step": 4084, + "time_per_iteration": 2.678072452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137339, + "balance_loss_mlp": 1.0635246, + "epoch": 0.7858791843016545, + "flos": 546267684864.0, + "grad_norm": 0.036494926225622726, + "language_loss": 0.77694023, + "learning_rate": 0.00011549111308683591, + "loss": 0.78831363, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.73828125, + "step": 4085, + "time_per_iteration": 2.67767596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137399, + "balance_loss_mlp": 1.06339443, + "epoch": 0.7860715659869181, + "flos": 381840195072.0, + "grad_norm": 0.03798884187272388, + "language_loss": 0.86288953, + "learning_rate": 0.00011529204136004251, + "loss": 0.87426353, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.73925781, + "step": 4086, + "time_per_iteration": 2.533773422241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143071, + "balance_loss_mlp": 1.068923, + "epoch": 0.7862639476721817, + "flos": 568512514560.0, + "grad_norm": 0.030679232207270264, + "language_loss": 0.87964737, + "learning_rate": 0.00011509311899012459, + "loss": 0.89107811, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.73974609, + "step": 4087, + "time_per_iteration": 2.76526141166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143134, + "balance_loss_mlp": 1.06903315, + "epoch": 0.7864563293574451, + "flos": 546322079232.0, + "grad_norm": 0.04187466244210811, + "language_loss": 0.83333945, + "learning_rate": 0.00011489434605431053, + "loss": 0.84477079, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.73925781, + "step": 4088, + "time_per_iteration": 2.6215317249298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113809, + "balance_loss_mlp": 1.06408453, + "epoch": 0.7866487110427087, + "flos": 564648499200.0, + "grad_norm": 0.03663955414764931, + "language_loss": 0.861283, + "learning_rate": 0.0001146957226297708, + "loss": 0.87266392, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.73925781, + "step": 4089, + "time_per_iteration": 2.673021078109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137912, + "balance_loss_mlp": 1.06381154, + "epoch": 0.7868410927279723, + "flos": 729558081024.0, + "grad_norm": 0.03607616248061006, + "language_loss": 0.80388957, + "learning_rate": 0.00011449724879361827, + "loss": 0.8152687, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.73974609, + "step": 4090, + "time_per_iteration": 2.9554953575134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138064, + "balance_loss_mlp": 1.06410635, + "epoch": 0.7870334744132359, + "flos": 522447049728.0, + "grad_norm": 0.04384771027998422, + "language_loss": 0.79606628, + "learning_rate": 0.00011429892462290687, + "loss": 0.80744684, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.73925781, + "step": 4091, + "time_per_iteration": 2.663344383239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137849, + "balance_loss_mlp": 1.06360543, + "epoch": 0.7872258560984994, + "flos": 452362838016.0, + "grad_norm": 0.03444063676499776, + "language_loss": 0.88160485, + "learning_rate": 0.00011410075019463295, + "loss": 0.89298332, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.74072266, + "step": 4092, + "time_per_iteration": 2.6327311992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137383, + "balance_loss_mlp": 1.06323516, + "epoch": 0.787418237783763, + "flos": 516249162240.0, + "grad_norm": 0.03476027857253962, + "language_loss": 0.84398365, + "learning_rate": 0.00011390272558573461, + "loss": 0.85535741, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.74023438, + "step": 4093, + "time_per_iteration": 2.675528049468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137221, + "balance_loss_mlp": 1.06316793, + "epoch": 0.7876106194690266, + "flos": 486056722944.0, + "grad_norm": 0.030632947109506273, + "language_loss": 0.84047627, + "learning_rate": 0.00011370485087309202, + "loss": 0.85184848, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.73974609, + "step": 4094, + "time_per_iteration": 2.6260645389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138901, + "balance_loss_mlp": 1.06465769, + "epoch": 0.7878030011542901, + "flos": 543929809920.0, + "grad_norm": 0.0372748045886788, + "language_loss": 0.83189571, + "learning_rate": 0.00011350712613352688, + "loss": 0.84328461, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.74072266, + "step": 4095, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138645, + "balance_loss_mlp": 1.06440127, + "epoch": 0.7879953828395537, + "flos": 517749106176.0, + "grad_norm": 0.04715116302825024, + "language_loss": 0.85976934, + "learning_rate": 0.00011330955144380283, + "loss": 0.87115586, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.74072266, + "step": 4096, + "time_per_iteration": 2.599391222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138884, + "balance_loss_mlp": 1.06464028, + "epoch": 0.7881877645248172, + "flos": 583376420352.0, + "grad_norm": 0.03608757830250762, + "language_loss": 0.90583527, + "learning_rate": 0.00011311212688062483, + "loss": 0.91722411, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.74072266, + "step": 4097, + "time_per_iteration": 2.7737503051757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141606, + "balance_loss_mlp": 1.06741059, + "epoch": 0.7883801462100808, + "flos": 590327645184.0, + "grad_norm": 0.09861102268280594, + "language_loss": 0.83454096, + "learning_rate": 0.0001129148525206402, + "loss": 0.84595704, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.74023438, + "step": 4098, + "time_per_iteration": 2.8053319454193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114196, + "balance_loss_mlp": 1.06766832, + "epoch": 0.7885725278953444, + "flos": 482741928960.0, + "grad_norm": 0.039263204911434944, + "language_loss": 0.9157722, + "learning_rate": 0.00011271772844043759, + "loss": 0.92719185, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.74121094, + "step": 4099, + "time_per_iteration": 2.6722400188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113855, + "balance_loss_mlp": 1.06440175, + "epoch": 0.788764909580608, + "flos": 758098126848.0, + "grad_norm": 0.0423984319236596, + "language_loss": 0.81897676, + "learning_rate": 0.00011252075471654727, + "loss": 0.83036232, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.74023438, + "step": 4100, + "time_per_iteration": 2.941443920135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138656, + "balance_loss_mlp": 1.0645076, + "epoch": 0.7889572912658714, + "flos": 703878935040.0, + "grad_norm": 0.03307179261397765, + "language_loss": 0.82702905, + "learning_rate": 0.00011232393142544133, + "loss": 0.83841556, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.74023438, + "step": 4101, + "time_per_iteration": 2.9557137489318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138555, + "balance_loss_mlp": 1.06435919, + "epoch": 0.789149672951135, + "flos": 737840062464.0, + "grad_norm": 0.034454067220804824, + "language_loss": 0.87124509, + "learning_rate": 0.00011212725864353323, + "loss": 0.88263059, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.74023438, + "step": 4102, + "time_per_iteration": 3.0640292167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145164, + "balance_loss_mlp": 1.07287598, + "epoch": 0.7893420546363986, + "flos": 1484487859200.0, + "grad_norm": 0.005768368046383886, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77481097, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.72460938, + "step": 4103, + "time_per_iteration": 4.858243227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140406, + "balance_loss_mlp": 1.06620967, + "epoch": 0.7895344363216622, + "flos": 510079472640.0, + "grad_norm": 0.047695984740599745, + "language_loss": 0.81464952, + "learning_rate": 0.00011173436491267291, + "loss": 0.82605356, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.74023438, + "step": 4104, + "time_per_iteration": 2.6253249645233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137981, + "balance_loss_mlp": 1.06378555, + "epoch": 0.7897268180069258, + "flos": 543037484544.0, + "grad_norm": 0.03504267179198509, + "language_loss": 0.86698043, + "learning_rate": 0.0001115381441162554, + "loss": 0.87836027, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.74023438, + "step": 4105, + "time_per_iteration": 2.644268274307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143089, + "balance_loss_mlp": 1.07080078, + "epoch": 0.7899191996921893, + "flos": 1415749867008.0, + "grad_norm": 0.006312961233255799, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.7472682, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.72460938, + "step": 4106, + "time_per_iteration": 4.874951601028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_mlp": 1.06486893, + "epoch": 0.7901115813774529, + "flos": 624021530112.0, + "grad_norm": 0.035685278807963586, + "language_loss": 0.89252567, + "learning_rate": 0.00011114615504234465, + "loss": 0.90391827, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.7421875, + "step": 4107, + "time_per_iteration": 2.759730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139038, + "balance_loss_mlp": 1.06488955, + "epoch": 0.7903039630627164, + "flos": 646804847616.0, + "grad_norm": 0.03564605308593673, + "language_loss": 0.86189628, + "learning_rate": 0.00011095038691703468, + "loss": 0.87328672, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.74023438, + "step": 4108, + "time_per_iteration": 2.8478689193725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141249, + "balance_loss_mlp": 1.0670532, + "epoch": 0.79049634474798, + "flos": 595611740160.0, + "grad_norm": 0.03583745426638565, + "language_loss": 0.86790907, + "learning_rate": 0.00011075476983417998, + "loss": 0.87932158, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.74072266, + "step": 4109, + "time_per_iteration": 2.8335795402526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139823, + "balance_loss_mlp": 1.0655793, + "epoch": 0.7906887264332435, + "flos": 717331493376.0, + "grad_norm": 0.038905447121572734, + "language_loss": 0.82716894, + "learning_rate": 0.00011055930386972579, + "loss": 0.83856714, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.74072266, + "step": 4110, + "time_per_iteration": 2.871617555618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06271601, + "epoch": 0.7908811081185071, + "flos": 791260254720.0, + "grad_norm": 0.03420948770513602, + "language_loss": 0.82615238, + "learning_rate": 0.00011036398909955863, + "loss": 0.8375206, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.74023438, + "step": 4111, + "time_per_iteration": 3.035374402999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137149, + "balance_loss_mlp": 1.06304824, + "epoch": 0.7910734898037707, + "flos": 643075090944.0, + "grad_norm": 0.03464769838403225, + "language_loss": 0.85694349, + "learning_rate": 0.00011016882559950648, + "loss": 0.86831492, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.73974609, + "step": 4112, + "time_per_iteration": 2.809424877166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136751, + "balance_loss_mlp": 1.06284177, + "epoch": 0.7912658714890343, + "flos": 670560354816.0, + "grad_norm": 0.03852457437308278, + "language_loss": 0.85799241, + "learning_rate": 0.00010997381344533853, + "loss": 0.86935997, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.73876953, + "step": 4113, + "time_per_iteration": 2.7723140716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139368, + "balance_loss_mlp": 1.06512499, + "epoch": 0.7914582531742979, + "flos": 558887041536.0, + "grad_norm": 0.03351504494890856, + "language_loss": 0.84678841, + "learning_rate": 0.00010977895271276517, + "loss": 0.85818207, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.74072266, + "step": 4114, + "time_per_iteration": 2.6767303943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138954, + "balance_loss_mlp": 1.06494868, + "epoch": 0.7916506348595613, + "flos": 571191492096.0, + "grad_norm": 0.04313250317632895, + "language_loss": 0.84584868, + "learning_rate": 0.00010958424347743807, + "loss": 0.85723823, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.73925781, + "step": 4115, + "time_per_iteration": 2.7286806106567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136476, + "balance_loss_mlp": 1.06266189, + "epoch": 0.7918430165448249, + "flos": 719645899776.0, + "grad_norm": 0.03512595532684894, + "language_loss": 0.8494817, + "learning_rate": 0.00010938968581494991, + "loss": 0.8608464, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.73828125, + "step": 4116, + "time_per_iteration": 2.9482476711273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.06277156, + "epoch": 0.7920353982300885, + "flos": 554736317952.0, + "grad_norm": 0.04228851157339113, + "language_loss": 0.83485335, + "learning_rate": 0.000109195279800835, + "loss": 0.84622014, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.73876953, + "step": 4117, + "time_per_iteration": 2.69572114944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139513, + "balance_loss_mlp": 1.06555605, + "epoch": 0.7922277799153521, + "flos": 811540512768.0, + "grad_norm": 0.03903964409517225, + "language_loss": 0.81738925, + "learning_rate": 0.00010900102551056834, + "loss": 0.82878435, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.73876953, + "step": 4118, + "time_per_iteration": 3.021683692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139717, + "balance_loss_mlp": 1.06580722, + "epoch": 0.7924201616006156, + "flos": 422244258816.0, + "grad_norm": 0.03704274036887823, + "language_loss": 0.89204621, + "learning_rate": 0.00010880692301956601, + "loss": 0.90344346, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.73876953, + "step": 4119, + "time_per_iteration": 2.509284019470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136195, + "balance_loss_mlp": 1.06238043, + "epoch": 0.7926125432858792, + "flos": 619104734208.0, + "grad_norm": 0.032195482380303, + "language_loss": 0.90015543, + "learning_rate": 0.00010861297240318518, + "loss": 0.91151732, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.73828125, + "step": 4120, + "time_per_iteration": 2.835418939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136735, + "balance_loss_mlp": 1.0630163, + "epoch": 0.7928049249711427, + "flos": 603611016192.0, + "grad_norm": 0.031028055346739136, + "language_loss": 0.90660435, + "learning_rate": 0.00010841917373672444, + "loss": 0.91797173, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.73730469, + "step": 4121, + "time_per_iteration": 2.7115211486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136879, + "balance_loss_mlp": 1.06306481, + "epoch": 0.7929973066564063, + "flos": 657231321600.0, + "grad_norm": 0.03886819591939463, + "language_loss": 0.83054501, + "learning_rate": 0.00010822552709542293, + "loss": 0.84191382, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.73828125, + "step": 4122, + "time_per_iteration": 2.811147928237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137962, + "balance_loss_mlp": 1.0642904, + "epoch": 0.7931896883416699, + "flos": 537434480640.0, + "grad_norm": 0.03139044095393014, + "language_loss": 0.90324616, + "learning_rate": 0.0001080320325544612, + "loss": 0.91462576, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.73681641, + "step": 4123, + "time_per_iteration": 2.6880621910095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.06381249, + "epoch": 0.7933820700269334, + "flos": 499068848640.0, + "grad_norm": 0.03512735769346207, + "language_loss": 0.87548339, + "learning_rate": 0.00010783869018895997, + "loss": 0.8868587, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.73730469, + "step": 4124, + "time_per_iteration": 2.6342406272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138023, + "balance_loss_mlp": 1.06425595, + "epoch": 0.793574451712197, + "flos": 538495993344.0, + "grad_norm": 0.03751622303181437, + "language_loss": 0.88749498, + "learning_rate": 0.00010764550007398189, + "loss": 0.89887518, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.73779297, + "step": 4125, + "time_per_iteration": 2.6272289752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137744, + "balance_loss_mlp": 1.0640254, + "epoch": 0.7937668333974606, + "flos": 489258725376.0, + "grad_norm": 0.034933857523794375, + "language_loss": 0.85822791, + "learning_rate": 0.00010745246228452982, + "loss": 0.86960542, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.73730469, + "step": 4126, + "time_per_iteration": 2.5639169216156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137347, + "balance_loss_mlp": 1.06358075, + "epoch": 0.7939592150827242, + "flos": 528479752704.0, + "grad_norm": 0.034679171376522114, + "language_loss": 0.86079615, + "learning_rate": 0.00010725957689554771, + "loss": 0.87216961, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.73779297, + "step": 4127, + "time_per_iteration": 2.7611310482025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137731, + "balance_loss_mlp": 1.06391644, + "epoch": 0.7941515967679876, + "flos": 542803169280.0, + "grad_norm": 0.03824880137917062, + "language_loss": 0.88766754, + "learning_rate": 0.00010706684398192013, + "loss": 0.89904475, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.73828125, + "step": 4128, + "time_per_iteration": 2.7266509532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138133, + "balance_loss_mlp": 1.06436622, + "epoch": 0.7943439784532512, + "flos": 519523023360.0, + "grad_norm": 0.040169030809423835, + "language_loss": 0.87296367, + "learning_rate": 0.00010687426361847313, + "loss": 0.88434494, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.73779297, + "step": 4129, + "time_per_iteration": 2.7299461364746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137822, + "balance_loss_mlp": 1.06405497, + "epoch": 0.7945363601385148, + "flos": 510060006912.0, + "grad_norm": 0.03365010231466857, + "language_loss": 0.9038803, + "learning_rate": 0.00010668183587997254, + "loss": 0.91525853, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.73779297, + "step": 4130, + "time_per_iteration": 2.5838053226470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137059, + "balance_loss_mlp": 1.06343496, + "epoch": 0.7947287418237784, + "flos": 652401120768.0, + "grad_norm": 0.02856230138733652, + "language_loss": 0.8155334, + "learning_rate": 0.0001064895608411256, + "loss": 0.826904, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.73632812, + "step": 4131, + "time_per_iteration": 2.855571746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140645, + "balance_loss_mlp": 1.0668304, + "epoch": 0.794921123509042, + "flos": 697372872192.0, + "grad_norm": 0.03566888341568189, + "language_loss": 0.84410554, + "learning_rate": 0.00010629743857657998, + "loss": 0.85551202, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.73828125, + "step": 4132, + "time_per_iteration": 2.8950796127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149963, + "balance_loss_mlp": 1.07805634, + "epoch": 0.7951135051943055, + "flos": 1406076730368.0, + "grad_norm": 0.009945360443955307, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71748632, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.72070312, + "step": 4133, + "time_per_iteration": 4.6428234577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137714, + "balance_loss_mlp": 1.06399536, + "epoch": 0.795305886879569, + "flos": 811449188352.0, + "grad_norm": 0.03756536523282242, + "language_loss": 0.86775541, + "learning_rate": 0.00010591365266868802, + "loss": 0.87913251, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.73730469, + "step": 4134, + "time_per_iteration": 2.9570915699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143425, + "balance_loss_mlp": 1.07132721, + "epoch": 0.7954982685648326, + "flos": 1429213885440.0, + "grad_norm": 0.0062941693525409926, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76655209, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.72265625, + "step": 4135, + "time_per_iteration": 4.914888143539429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_mlp": 1.06404912, + "epoch": 0.7956906502500962, + "flos": 390747259392.0, + "grad_norm": 0.0392560850681974, + "language_loss": 0.85252422, + "learning_rate": 0.00010553047875229166, + "loss": 0.86390382, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.73876953, + "step": 4136, + "time_per_iteration": 2.5757832527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137394, + "balance_loss_mlp": 1.06362712, + "epoch": 0.7958830319353598, + "flos": 516585535488.0, + "grad_norm": 0.03073809129555248, + "language_loss": 0.8796097, + "learning_rate": 0.00010533912147689328, + "loss": 0.89098364, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.73779297, + "step": 4137, + "time_per_iteration": 2.6300714015960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137078, + "balance_loss_mlp": 1.06335866, + "epoch": 0.7960754136206233, + "flos": 494926857216.0, + "grad_norm": 0.033442699276882225, + "language_loss": 0.87293124, + "learning_rate": 0.00010514791742243656, + "loss": 0.88430202, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.73730469, + "step": 4138, + "time_per_iteration": 2.5906717777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136999, + "balance_loss_mlp": 1.06323278, + "epoch": 0.7962677953058869, + "flos": 657005738496.0, + "grad_norm": 0.03903943901806541, + "language_loss": 0.87440938, + "learning_rate": 0.00010495686666315341, + "loss": 0.88577938, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.73779297, + "step": 4139, + "time_per_iteration": 2.909572124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113797, + "balance_loss_mlp": 1.06401289, + "epoch": 0.7964601769911505, + "flos": 543419520000.0, + "grad_norm": 0.08585465629101555, + "language_loss": 0.81986225, + "learning_rate": 0.00010476596927321635, + "loss": 0.83124197, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.73876953, + "step": 4140, + "time_per_iteration": 2.5994365215301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137712, + "balance_loss_mlp": 1.06389797, + "epoch": 0.796652558676414, + "flos": 538826362368.0, + "grad_norm": 0.03248172590146644, + "language_loss": 0.84015322, + "learning_rate": 0.00010457522532673835, + "loss": 0.85153031, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.73828125, + "step": 4141, + "time_per_iteration": 2.851498603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137565, + "balance_loss_mlp": 1.06375015, + "epoch": 0.7968449403616775, + "flos": 476051215872.0, + "grad_norm": 0.03503840732668985, + "language_loss": 0.8857249, + "learning_rate": 0.00010438463489777272, + "loss": 0.89710057, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.73828125, + "step": 4142, + "time_per_iteration": 2.56007981300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137015, + "balance_loss_mlp": 1.06320024, + "epoch": 0.7970373220469411, + "flos": 568725362688.0, + "grad_norm": 0.0411728476443369, + "language_loss": 0.82051033, + "learning_rate": 0.00010419419806031316, + "loss": 0.83188045, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.73828125, + "step": 4143, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138356, + "balance_loss_mlp": 1.0646373, + "epoch": 0.7972297037322047, + "flos": 557350167552.0, + "grad_norm": 0.048021721616636356, + "language_loss": 0.88371974, + "learning_rate": 0.00010400391488829403, + "loss": 0.89510334, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.73730469, + "step": 4144, + "time_per_iteration": 2.764263153076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137941, + "balance_loss_mlp": 1.06412661, + "epoch": 0.7974220854174683, + "flos": 577306787328.0, + "grad_norm": 0.030349731756734208, + "language_loss": 0.90217054, + "learning_rate": 0.00010381378545558984, + "loss": 0.9135499, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.73828125, + "step": 4145, + "time_per_iteration": 2.694387197494507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139239, + "balance_loss_mlp": 1.06552041, + "epoch": 0.7976144671027319, + "flos": 484055221248.0, + "grad_norm": 0.04602586335086132, + "language_loss": 0.89352703, + "learning_rate": 0.00010362380983601505, + "loss": 0.90491945, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.73730469, + "step": 4146, + "time_per_iteration": 2.5373778343200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.06528461, + "epoch": 0.7978068487879953, + "flos": 1079652773376.0, + "grad_norm": 0.026886472634432064, + "language_loss": 0.83036357, + "learning_rate": 0.00010343398810332477, + "loss": 0.84175408, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.73779297, + "step": 4147, + "time_per_iteration": 3.465343952178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137553, + "balance_loss_mlp": 1.06383419, + "epoch": 0.7979992304732589, + "flos": 735015366144.0, + "grad_norm": 0.0386131750052721, + "language_loss": 0.89394611, + "learning_rate": 0.00010324432033121467, + "loss": 0.9053216, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.73730469, + "step": 4148, + "time_per_iteration": 2.95272159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137647, + "balance_loss_mlp": 1.06397593, + "epoch": 0.7981916121585225, + "flos": 416750043648.0, + "grad_norm": 0.03182767294568272, + "language_loss": 0.87920535, + "learning_rate": 0.00010305480659332005, + "loss": 0.89058185, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.73681641, + "step": 4149, + "time_per_iteration": 2.6444265842437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113765, + "balance_loss_mlp": 1.0638833, + "epoch": 0.7983839938437861, + "flos": 466212894720.0, + "grad_norm": 0.047857965738547205, + "language_loss": 0.88751274, + "learning_rate": 0.00010286544696321682, + "loss": 0.89888918, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.73779297, + "step": 4150, + "time_per_iteration": 2.5789239406585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138352, + "balance_loss_mlp": 1.06472826, + "epoch": 0.7985763755290496, + "flos": 511623077376.0, + "grad_norm": 0.03835001072611694, + "language_loss": 0.83638573, + "learning_rate": 0.00010267624151442073, + "loss": 0.84776926, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.73632812, + "step": 4151, + "time_per_iteration": 2.670612096786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137657, + "balance_loss_mlp": 1.06408083, + "epoch": 0.7987687572143132, + "flos": 1012277738496.0, + "grad_norm": 0.03249576548614517, + "language_loss": 0.85286856, + "learning_rate": 0.000102487190320388, + "loss": 0.86424506, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.73583984, + "step": 4152, + "time_per_iteration": 3.3122832775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138879, + "balance_loss_mlp": 1.06520724, + "epoch": 0.7989611388995768, + "flos": 1022747873280.0, + "grad_norm": 0.03976712139414911, + "language_loss": 0.85336626, + "learning_rate": 0.00010229829345451475, + "loss": 0.86475503, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.73681641, + "step": 4153, + "time_per_iteration": 3.3512771129608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138777, + "balance_loss_mlp": 1.0651536, + "epoch": 0.7991535205848403, + "flos": 1103036978688.0, + "grad_norm": 0.04036200779620281, + "language_loss": 0.83784497, + "learning_rate": 0.00010210955099013724, + "loss": 0.84923279, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.73632812, + "step": 4154, + "time_per_iteration": 3.352534532546997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138505, + "balance_loss_mlp": 1.06492949, + "epoch": 0.7993459022701039, + "flos": 836279669760.0, + "grad_norm": 0.04342364986110735, + "language_loss": 0.81863582, + "learning_rate": 0.00010192096300053167, + "loss": 0.83002084, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.73583984, + "step": 4155, + "time_per_iteration": 3.055297374725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140351, + "balance_loss_mlp": 1.06672716, + "epoch": 0.7995382839553674, + "flos": 523769074176.0, + "grad_norm": 0.02922915705008151, + "language_loss": 0.89245528, + "learning_rate": 0.00010173252955891477, + "loss": 0.90385878, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.73632812, + "step": 4156, + "time_per_iteration": 2.741558790206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141174, + "balance_loss_mlp": 1.0675503, + "epoch": 0.799730665640631, + "flos": 538858563072.0, + "grad_norm": 0.03668807577756746, + "language_loss": 0.78405279, + "learning_rate": 0.00010154425073844253, + "loss": 0.79546452, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.73632812, + "step": 4157, + "time_per_iteration": 2.6747748851776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141717, + "balance_loss_mlp": 1.0680933, + "epoch": 0.7999230473258946, + "flos": 506067737088.0, + "grad_norm": 0.03089804381419182, + "language_loss": 0.86340404, + "learning_rate": 0.00010135612661221138, + "loss": 0.87482131, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.73632812, + "step": 4158, + "time_per_iteration": 2.565213680267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144064, + "balance_loss_mlp": 1.07034528, + "epoch": 0.8001154290111582, + "flos": 1028975960064.0, + "grad_norm": 0.0395229836188532, + "language_loss": 0.87076604, + "learning_rate": 0.00010116815725325751, + "loss": 0.88220668, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.73681641, + "step": 4159, + "time_per_iteration": 3.3038952350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142065, + "balance_loss_mlp": 1.06834638, + "epoch": 0.8003078106964217, + "flos": 752269539840.0, + "grad_norm": 0.03606815133795925, + "language_loss": 0.85251313, + "learning_rate": 0.00010098034273455725, + "loss": 0.8639338, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.73681641, + "step": 4160, + "time_per_iteration": 2.9671449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141282, + "balance_loss_mlp": 1.0676111, + "epoch": 0.8005001923816852, + "flos": 489525967872.0, + "grad_norm": 0.034755861099366334, + "language_loss": 0.84454644, + "learning_rate": 0.00010079268312902662, + "loss": 0.8559593, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.73632812, + "step": 4161, + "time_per_iteration": 2.6727142333984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140957, + "balance_loss_mlp": 1.06714249, + "epoch": 0.8006925740669488, + "flos": 514312788480.0, + "grad_norm": 0.03457602588260787, + "language_loss": 0.86664772, + "learning_rate": 0.0001006051785095215, + "loss": 0.8780573, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.73730469, + "step": 4162, + "time_per_iteration": 2.6881067752838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140369, + "balance_loss_mlp": 1.06674516, + "epoch": 0.8008849557522124, + "flos": 579679590912.0, + "grad_norm": 0.039589703999255765, + "language_loss": 0.84823501, + "learning_rate": 0.0001004178289488376, + "loss": 0.85963869, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.73632812, + "step": 4163, + "time_per_iteration": 2.7627196311950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140575, + "balance_loss_mlp": 1.06676042, + "epoch": 0.801077337437476, + "flos": 479680915968.0, + "grad_norm": 0.03562538391210133, + "language_loss": 0.88413119, + "learning_rate": 0.0001002306345197106, + "loss": 0.89553696, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.73730469, + "step": 4164, + "time_per_iteration": 2.6279873847961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140166, + "balance_loss_mlp": 1.06635118, + "epoch": 0.8012697191227395, + "flos": 677967475200.0, + "grad_norm": 0.04047488864482016, + "language_loss": 0.85436863, + "learning_rate": 0.00010004359529481571, + "loss": 0.86577028, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.73730469, + "step": 4165, + "time_per_iteration": 2.995342493057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114203, + "balance_loss_mlp": 1.06802452, + "epoch": 0.8014621008080031, + "flos": 1297170812928.0, + "grad_norm": 0.037617272041868384, + "language_loss": 0.87359077, + "learning_rate": 9.985671134676804e-05, + "loss": 0.88501108, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.73828125, + "step": 4166, + "time_per_iteration": 3.725456476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143421, + "balance_loss_mlp": 1.06941605, + "epoch": 0.8016544824932667, + "flos": 512825579520.0, + "grad_norm": 0.041033167099134404, + "language_loss": 0.89462924, + "learning_rate": 9.966998274812234e-05, + "loss": 0.90606344, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.73828125, + "step": 4167, + "time_per_iteration": 2.587735176086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143621, + "balance_loss_mlp": 1.06961536, + "epoch": 0.8018468641785302, + "flos": 536718073344.0, + "grad_norm": 0.04253470612408202, + "language_loss": 0.87705988, + "learning_rate": 9.948340957137308e-05, + "loss": 0.88849604, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.73828125, + "step": 4168, + "time_per_iteration": 2.645045042037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143563, + "balance_loss_mlp": 1.06950998, + "epoch": 0.8020392458637937, + "flos": 1025057550336.0, + "grad_norm": 0.04189552781046156, + "language_loss": 0.84953403, + "learning_rate": 9.929699188895447e-05, + "loss": 0.86096966, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.73876953, + "step": 4169, + "time_per_iteration": 3.2518906593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145859, + "balance_loss_mlp": 1.07376099, + "epoch": 0.8022316275490573, + "flos": 1565070403584.0, + "grad_norm": 0.005699099945185395, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79200262, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.72265625, + "step": 4170, + "time_per_iteration": 4.9828410148620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140499, + "balance_loss_mlp": 1.06644583, + "epoch": 0.8024240092343209, + "flos": 421601711616.0, + "grad_norm": 0.040177155372648383, + "language_loss": 0.88612646, + "learning_rate": 9.89246232965435e-05, + "loss": 0.89753145, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.73876953, + "step": 4171, + "time_per_iteration": 2.67098331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141029, + "balance_loss_mlp": 1.06702411, + "epoch": 0.8026163909195845, + "flos": 765162143232.0, + "grad_norm": 0.038738782156352326, + "language_loss": 0.84076917, + "learning_rate": 9.873867253111762e-05, + "loss": 0.85217947, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.73828125, + "step": 4172, + "time_per_iteration": 2.9889214038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141708, + "balance_loss_mlp": 1.06941986, + "epoch": 0.8028087726048481, + "flos": 1522141813248.0, + "grad_norm": 0.007464951030714858, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81406271, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.72460938, + "step": 4173, + "time_per_iteration": 5.007925987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136925, + "balance_loss_mlp": 1.06277657, + "epoch": 0.8030011542901115, + "flos": 518830084608.0, + "grad_norm": 0.0383067219529844, + "language_loss": 0.93575275, + "learning_rate": 9.836723842278733e-05, + "loss": 0.9471221, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.73974609, + "step": 4174, + "time_per_iteration": 2.5880677700042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137605, + "balance_loss_mlp": 1.06355226, + "epoch": 0.8031935359753751, + "flos": 546658452480.0, + "grad_norm": 0.035609660945247874, + "language_loss": 0.82692063, + "learning_rate": 9.818175522408646e-05, + "loss": 0.83829665, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.73876953, + "step": 4175, + "time_per_iteration": 2.6955156326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141225, + "balance_loss_mlp": 1.06717181, + "epoch": 0.8033859176606387, + "flos": 604735655424.0, + "grad_norm": 0.04032435514134155, + "language_loss": 0.8889333, + "learning_rate": 9.79964280250632e-05, + "loss": 0.90034556, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.73876953, + "step": 4176, + "time_per_iteration": 2.853034734725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137722, + "balance_loss_mlp": 1.06362164, + "epoch": 0.8035782993459023, + "flos": 566984372736.0, + "grad_norm": 0.03679613531109102, + "language_loss": 0.86388361, + "learning_rate": 9.781125689766795e-05, + "loss": 0.87526083, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.73925781, + "step": 4177, + "time_per_iteration": 2.7487175464630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.06372213, + "epoch": 0.8037706810311658, + "flos": 539472912384.0, + "grad_norm": 0.05184044937246734, + "language_loss": 0.90083796, + "learning_rate": 9.762624191379054e-05, + "loss": 0.91221571, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.73876953, + "step": 4178, + "time_per_iteration": 2.6330466270446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138006, + "balance_loss_mlp": 1.06390512, + "epoch": 0.8039630627164294, + "flos": 516194767872.0, + "grad_norm": 0.03661326628709558, + "language_loss": 0.84443927, + "learning_rate": 9.744138314526014e-05, + "loss": 0.85581934, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.73925781, + "step": 4179, + "time_per_iteration": 2.6247572898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141739, + "balance_loss_mlp": 1.06964111, + "epoch": 0.804155444401693, + "flos": 1481937136128.0, + "grad_norm": 0.005376898019679374, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.758753, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.72265625, + "step": 4180, + "time_per_iteration": 4.874308824539185 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113728, + "balance_loss_mlp": 1.06308401, + "epoch": 0.8043478260869565, + "flos": 522188539392.0, + "grad_norm": 0.04021078617434091, + "language_loss": 0.81771445, + "learning_rate": 9.707213454125396e-05, + "loss": 0.82908726, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.74023438, + "step": 4181, + "time_per_iteration": 2.693844795227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137199, + "balance_loss_mlp": 1.0630033, + "epoch": 0.8045402077722201, + "flos": 546563125248.0, + "grad_norm": 0.03164680023603822, + "language_loss": 0.85049474, + "learning_rate": 9.688774484913298e-05, + "loss": 0.86186671, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.74023438, + "step": 4182, + "time_per_iteration": 2.7522850036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136736, + "balance_loss_mlp": 1.06254017, + "epoch": 0.8047325894574836, + "flos": 679706463744.0, + "grad_norm": 0.03486353569754657, + "language_loss": 0.79253167, + "learning_rate": 9.670351165906921e-05, + "loss": 0.80389905, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.74023438, + "step": 4183, + "time_per_iteration": 2.911919116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137093, + "balance_loss_mlp": 1.06289673, + "epoch": 0.8049249711427472, + "flos": 588328144896.0, + "grad_norm": 0.03566696314646497, + "language_loss": 0.8362298, + "learning_rate": 9.65194350425882e-05, + "loss": 0.8476007, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.74023438, + "step": 4184, + "time_per_iteration": 2.7444334030151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113629, + "balance_loss_mlp": 1.06204677, + "epoch": 0.8051173528280108, + "flos": 815680502784.0, + "grad_norm": 0.03248361844772192, + "language_loss": 0.82128632, + "learning_rate": 9.633551507115452e-05, + "loss": 0.83264923, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.74072266, + "step": 4185, + "time_per_iteration": 3.1254687309265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136424, + "balance_loss_mlp": 1.06222832, + "epoch": 0.8053097345132744, + "flos": 726954964992.0, + "grad_norm": 0.030976719489159976, + "language_loss": 0.81902802, + "learning_rate": 9.615175181617259e-05, + "loss": 0.83039224, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.74023438, + "step": 4186, + "time_per_iteration": 2.9419145584106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136622, + "balance_loss_mlp": 1.06242585, + "epoch": 0.805502116198538, + "flos": 749430107136.0, + "grad_norm": 0.03914823623045536, + "language_loss": 0.85688961, + "learning_rate": 9.596814534898552e-05, + "loss": 0.86825585, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.74023438, + "step": 4187, + "time_per_iteration": 3.0158443450927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135947, + "balance_loss_mlp": 1.06184673, + "epoch": 0.8056944978838014, + "flos": 641481821184.0, + "grad_norm": 0.03272363751287634, + "language_loss": 0.91907942, + "learning_rate": 9.578469574087561e-05, + "loss": 0.93043882, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.73974609, + "step": 4188, + "time_per_iteration": 2.857875347137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136336, + "balance_loss_mlp": 1.06218791, + "epoch": 0.805886879569065, + "flos": 645784267776.0, + "grad_norm": 0.037643576136900954, + "language_loss": 0.82672054, + "learning_rate": 9.560140306306436e-05, + "loss": 0.83808386, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.73974609, + "step": 4189, + "time_per_iteration": 2.7978317737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135607, + "balance_loss_mlp": 1.06160235, + "epoch": 0.8060792612543286, + "flos": 662443557888.0, + "grad_norm": 0.03459706232601391, + "language_loss": 0.86474156, + "learning_rate": 9.541826738671233e-05, + "loss": 0.87609762, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.73876953, + "step": 4190, + "time_per_iteration": 2.808532476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135589, + "balance_loss_mlp": 1.06153619, + "epoch": 0.8062716429395922, + "flos": 456012003840.0, + "grad_norm": 0.03810258680601671, + "language_loss": 0.87435436, + "learning_rate": 9.523528878291904e-05, + "loss": 0.88571024, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.73925781, + "step": 4191, + "time_per_iteration": 2.5479166507720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011352, + "balance_loss_mlp": 1.06114757, + "epoch": 0.8064640246248557, + "flos": 527428973568.0, + "grad_norm": 0.03760103878345668, + "language_loss": 0.90479159, + "learning_rate": 9.50524673227231e-05, + "loss": 0.9161436, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.73925781, + "step": 4192, + "time_per_iteration": 2.595338821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135682, + "balance_loss_mlp": 1.0616293, + "epoch": 0.8066564063101193, + "flos": 866676225024.0, + "grad_norm": 0.03134383848670985, + "language_loss": 0.86391032, + "learning_rate": 9.486980307710208e-05, + "loss": 0.87526715, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.73925781, + "step": 4193, + "time_per_iteration": 3.1573548316955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136404, + "balance_loss_mlp": 1.06254196, + "epoch": 0.8068487879953828, + "flos": 531642823680.0, + "grad_norm": 0.03189422174274218, + "language_loss": 0.8618921, + "learning_rate": 9.468729611697246e-05, + "loss": 0.87325615, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.73779297, + "step": 4194, + "time_per_iteration": 2.6939430236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135828, + "balance_loss_mlp": 1.06191802, + "epoch": 0.8070411696806464, + "flos": 567246885888.0, + "grad_norm": 0.031528158130144396, + "language_loss": 0.86619771, + "learning_rate": 9.450494651319003e-05, + "loss": 0.87755609, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.73828125, + "step": 4195, + "time_per_iteration": 2.6411421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135645, + "balance_loss_mlp": 1.0615921, + "epoch": 0.80723355136591, + "flos": 988252987392.0, + "grad_norm": 0.028641893528927848, + "language_loss": 0.83544791, + "learning_rate": 9.432275433654885e-05, + "loss": 0.84680438, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.73925781, + "step": 4196, + "time_per_iteration": 3.284620761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136199, + "balance_loss_mlp": 1.06214666, + "epoch": 0.8074259330511735, + "flos": 568082815488.0, + "grad_norm": 0.03274043714207543, + "language_loss": 0.87193251, + "learning_rate": 9.414071965778221e-05, + "loss": 0.88329452, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.73876953, + "step": 4197, + "time_per_iteration": 2.8321473598480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134907, + "balance_loss_mlp": 1.06075931, + "epoch": 0.8076183147364371, + "flos": 495752053248.0, + "grad_norm": 0.03175873877301644, + "language_loss": 0.83771801, + "learning_rate": 9.395884254756242e-05, + "loss": 0.84906709, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.73974609, + "step": 4198, + "time_per_iteration": 2.7369918823242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.06098342, + "epoch": 0.8078106964217007, + "flos": 420867840000.0, + "grad_norm": 0.03527202560929497, + "language_loss": 0.84655821, + "learning_rate": 9.377712307650044e-05, + "loss": 0.85790622, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.73779297, + "step": 4199, + "time_per_iteration": 2.523756504058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134397, + "balance_loss_mlp": 1.06029618, + "epoch": 0.8080030781069643, + "flos": 528564346368.0, + "grad_norm": 0.03723834939135813, + "language_loss": 0.88157082, + "learning_rate": 9.359556131514602e-05, + "loss": 0.89291477, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.73974609, + "step": 4200, + "time_per_iteration": 2.6045093536376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134559, + "balance_loss_mlp": 1.06036282, + "epoch": 0.8081954597922277, + "flos": 545151777792.0, + "grad_norm": 0.03389487766318828, + "language_loss": 0.86047804, + "learning_rate": 9.341415733398733e-05, + "loss": 0.87182367, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.74023438, + "step": 4201, + "time_per_iteration": 2.6960625648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134481, + "balance_loss_mlp": 1.06038058, + "epoch": 0.8083878414774913, + "flos": 642133100544.0, + "grad_norm": 0.03528539994977632, + "language_loss": 0.79933041, + "learning_rate": 9.323291120345207e-05, + "loss": 0.81067526, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.73974609, + "step": 4202, + "time_per_iteration": 2.841066837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135094, + "balance_loss_mlp": 1.06099403, + "epoch": 0.8085802231627549, + "flos": 706905019392.0, + "grad_norm": 0.03577618457162915, + "language_loss": 0.77572632, + "learning_rate": 9.305182299390614e-05, + "loss": 0.78707725, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.73974609, + "step": 4203, + "time_per_iteration": 2.881850004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134762, + "balance_loss_mlp": 1.0607096, + "epoch": 0.8087726048480185, + "flos": 420661722624.0, + "grad_norm": 0.03818278195025951, + "language_loss": 0.93325853, + "learning_rate": 9.287089277565409e-05, + "loss": 0.94460618, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.73925781, + "step": 4204, + "time_per_iteration": 2.5712902545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134942, + "balance_loss_mlp": 1.06093681, + "epoch": 0.8089649865332821, + "flos": 509862621696.0, + "grad_norm": 0.028510707328060825, + "language_loss": 0.90784013, + "learning_rate": 9.269012061893922e-05, + "loss": 0.91918957, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.73925781, + "step": 4205, + "time_per_iteration": 2.774871587753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134999, + "balance_loss_mlp": 1.0608989, + "epoch": 0.8091573682185456, + "flos": 458261282304.0, + "grad_norm": 0.03265489614473136, + "language_loss": 0.88958049, + "learning_rate": 9.250950659394386e-05, + "loss": 0.90093046, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.73974609, + "step": 4206, + "time_per_iteration": 2.7118797302246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113461, + "balance_loss_mlp": 1.06079543, + "epoch": 0.8093497499038091, + "flos": 526374191616.0, + "grad_norm": 0.03169326833456576, + "language_loss": 0.8122524, + "learning_rate": 9.232905077078824e-05, + "loss": 0.82359844, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.73779297, + "step": 4207, + "time_per_iteration": 2.72802734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.06091356, + "epoch": 0.8095421315890727, + "flos": 490580749824.0, + "grad_norm": 0.036826369012514064, + "language_loss": 0.81312108, + "learning_rate": 9.214875321953164e-05, + "loss": 0.8244698, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.73876953, + "step": 4208, + "time_per_iteration": 2.605091094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113518, + "balance_loss_mlp": 1.06117523, + "epoch": 0.8097345132743363, + "flos": 626283543552.0, + "grad_norm": 0.03355343413507775, + "language_loss": 0.85747409, + "learning_rate": 9.196861401017164e-05, + "loss": 0.86882585, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.73876953, + "step": 4209, + "time_per_iteration": 2.776834726333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135546, + "balance_loss_mlp": 1.06149364, + "epoch": 0.8099268949595998, + "flos": 616872920064.0, + "grad_norm": 0.03618347801617859, + "language_loss": 0.8405565, + "learning_rate": 9.178863321264475e-05, + "loss": 0.85191202, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.73876953, + "step": 4210, + "time_per_iteration": 2.829793930053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136593, + "balance_loss_mlp": 1.06258821, + "epoch": 0.8101192766448634, + "flos": 480684031488.0, + "grad_norm": 0.03384381910797024, + "language_loss": 0.84874779, + "learning_rate": 9.160881089682566e-05, + "loss": 0.86011374, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.73828125, + "step": 4211, + "time_per_iteration": 2.6381702423095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136603, + "balance_loss_mlp": 1.06269372, + "epoch": 0.810311658330127, + "flos": 518326525440.0, + "grad_norm": 0.03431479693344864, + "language_loss": 0.91464251, + "learning_rate": 9.142914713252725e-05, + "loss": 0.92600852, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.73779297, + "step": 4212, + "time_per_iteration": 2.6007797718048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137699, + "balance_loss_mlp": 1.0639801, + "epoch": 0.8105040400153906, + "flos": 576987878400.0, + "grad_norm": 0.02918606823415051, + "language_loss": 0.87603903, + "learning_rate": 9.124964198950159e-05, + "loss": 0.88741606, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.73681641, + "step": 4213, + "time_per_iteration": 2.8085403442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137019, + "balance_loss_mlp": 1.06315684, + "epoch": 0.8106964217006541, + "flos": 640187994624.0, + "grad_norm": 0.033620937872648055, + "language_loss": 0.89619857, + "learning_rate": 9.107029553743862e-05, + "loss": 0.90756875, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.73730469, + "step": 4214, + "time_per_iteration": 2.884916305541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136885, + "balance_loss_mlp": 1.06297493, + "epoch": 0.8108888033859176, + "flos": 580584651264.0, + "grad_norm": 0.03884853564505628, + "language_loss": 0.866575, + "learning_rate": 9.089110784596672e-05, + "loss": 0.87794381, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.73779297, + "step": 4215, + "time_per_iteration": 2.6847498416900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136591, + "balance_loss_mlp": 1.06258559, + "epoch": 0.8110811850711812, + "flos": 561090657792.0, + "grad_norm": 0.03395287421728693, + "language_loss": 0.88044077, + "learning_rate": 9.071207898465284e-05, + "loss": 0.89180672, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.73828125, + "step": 4216, + "time_per_iteration": 2.7887377738952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145004, + "balance_loss_mlp": 1.07290649, + "epoch": 0.8112735667564448, + "flos": 1521066839040.0, + "grad_norm": 0.008024079584686653, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78405422, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.72265625, + "step": 4217, + "time_per_iteration": 4.71375584602356 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.06402028, + "epoch": 0.8114659484417084, + "flos": 617515467264.0, + "grad_norm": 0.0391225260866388, + "language_loss": 0.90230364, + "learning_rate": 9.035449803045792e-05, + "loss": 0.91368294, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.73779297, + "step": 4218, + "time_per_iteration": 2.8041131496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136237, + "balance_loss_mlp": 1.06242275, + "epoch": 0.8116583301269719, + "flos": 651261745152.0, + "grad_norm": 0.030797335982040666, + "language_loss": 0.83055115, + "learning_rate": 9.017594607640211e-05, + "loss": 0.84191352, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.73730469, + "step": 4219, + "time_per_iteration": 2.9443857669830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136868, + "balance_loss_mlp": 1.06295788, + "epoch": 0.8118507118122354, + "flos": 554195828736.0, + "grad_norm": 0.03810511170832895, + "language_loss": 0.85147524, + "learning_rate": 8.999755323015463e-05, + "loss": 0.86284399, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.73779297, + "step": 4220, + "time_per_iteration": 2.680670738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136982, + "balance_loss_mlp": 1.06326246, + "epoch": 0.812043093497499, + "flos": 545177974272.0, + "grad_norm": 0.03408780635951255, + "language_loss": 0.91583371, + "learning_rate": 8.981931956097384e-05, + "loss": 0.92720354, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.73681641, + "step": 4221, + "time_per_iteration": 2.642547369003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136863, + "balance_loss_mlp": 1.06295288, + "epoch": 0.8122354751827626, + "flos": 584574919680.0, + "grad_norm": 0.03129027929290594, + "language_loss": 0.87976468, + "learning_rate": 8.964124513805628e-05, + "loss": 0.89113331, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.73779297, + "step": 4222, + "time_per_iteration": 2.7617506980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142586, + "balance_loss_mlp": 1.07067871, + "epoch": 0.8124278568680262, + "flos": 1533860112384.0, + "grad_norm": 0.005717741019292163, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.7939266, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.72070312, + "step": 4223, + "time_per_iteration": 4.967041492462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135426, + "balance_loss_mlp": 1.06142044, + "epoch": 0.8126202385532897, + "flos": 433767174144.0, + "grad_norm": 0.038884513065240225, + "language_loss": 0.84713882, + "learning_rate": 8.928557430748668e-05, + "loss": 0.85849309, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.73876953, + "step": 4224, + "time_per_iteration": 2.5755624771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_mlp": 1.06830597, + "epoch": 0.8128126202385533, + "flos": 1551146486784.0, + "grad_norm": 0.0052150499454202155, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77636218, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.72460938, + "step": 4225, + "time_per_iteration": 4.887953281402588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136452, + "balance_loss_mlp": 1.06273341, + "epoch": 0.8130050019238169, + "flos": 529337149440.0, + "grad_norm": 0.038030015177674494, + "language_loss": 0.93251669, + "learning_rate": 8.893054129078077e-05, + "loss": 0.94388121, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.73681641, + "step": 4226, + "time_per_iteration": 2.6120243072509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135963, + "balance_loss_mlp": 1.06224418, + "epoch": 0.8131973836090804, + "flos": 544227251712.0, + "grad_norm": 0.04131080667228598, + "language_loss": 0.8568573, + "learning_rate": 8.875326413496037e-05, + "loss": 0.86821687, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.73681641, + "step": 4227, + "time_per_iteration": 2.7287051677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135709, + "balance_loss_mlp": 1.0617516, + "epoch": 0.8133897652943439, + "flos": 577578032640.0, + "grad_norm": 0.03865852336010986, + "language_loss": 0.86959839, + "learning_rate": 8.857614663928249e-05, + "loss": 0.88095552, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.73828125, + "step": 4228, + "time_per_iteration": 2.6870715618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.06219947, + "epoch": 0.8135821469796075, + "flos": 580350336000.0, + "grad_norm": 0.0387504778946499, + "language_loss": 0.84373677, + "learning_rate": 8.839918887251025e-05, + "loss": 0.85509503, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.73632812, + "step": 4229, + "time_per_iteration": 2.745539426803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135168, + "balance_loss_mlp": 1.06140161, + "epoch": 0.8137745286648711, + "flos": 651643780608.0, + "grad_norm": 0.037162762850376806, + "language_loss": 0.8921082, + "learning_rate": 8.822239090334472e-05, + "loss": 0.90345985, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.73730469, + "step": 4230, + "time_per_iteration": 2.971499443054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134218, + "balance_loss_mlp": 1.06035542, + "epoch": 0.8139669103501347, + "flos": 703127599104.0, + "grad_norm": 0.036809374739783886, + "language_loss": 0.81143808, + "learning_rate": 8.804575280042493e-05, + "loss": 0.82278025, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.73828125, + "step": 4231, + "time_per_iteration": 2.89591121673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134335, + "balance_loss_mlp": 1.06056821, + "epoch": 0.8141592920353983, + "flos": 651387271680.0, + "grad_norm": 0.04068280906456379, + "language_loss": 0.88771474, + "learning_rate": 8.786927463232774e-05, + "loss": 0.8990581, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.73730469, + "step": 4232, + "time_per_iteration": 2.777247905731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113425, + "balance_loss_mlp": 1.06029224, + "epoch": 0.8143516737206618, + "flos": 537844713984.0, + "grad_norm": 0.04131834896262191, + "language_loss": 0.85812843, + "learning_rate": 8.769295646756853e-05, + "loss": 0.86947101, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.73876953, + "step": 4233, + "time_per_iteration": 2.6038644313812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134212, + "balance_loss_mlp": 1.0605886, + "epoch": 0.8145440554059253, + "flos": 509363065344.0, + "grad_norm": 0.03311543445898947, + "language_loss": 0.86719936, + "learning_rate": 8.751679837459963e-05, + "loss": 0.87854147, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.73632812, + "step": 4234, + "time_per_iteration": 2.5994458198547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133864, + "balance_loss_mlp": 1.06024003, + "epoch": 0.8147364370911889, + "flos": 636287049216.0, + "grad_norm": 0.02964347408998998, + "language_loss": 0.90857178, + "learning_rate": 8.734080042181181e-05, + "loss": 0.91991043, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.73632812, + "step": 4235, + "time_per_iteration": 2.831850051879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133974, + "balance_loss_mlp": 1.0603503, + "epoch": 0.8149288187764525, + "flos": 423705271296.0, + "grad_norm": 0.03639444166963084, + "language_loss": 0.83094406, + "learning_rate": 8.716496267753343e-05, + "loss": 0.84228379, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.73632812, + "step": 4236, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.06227982, + "epoch": 0.8151212004617161, + "flos": 598620360192.0, + "grad_norm": 0.03190443114038452, + "language_loss": 0.85766506, + "learning_rate": 8.698928521003097e-05, + "loss": 0.8690241, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.73632812, + "step": 4237, + "time_per_iteration": 2.7593436241149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141319, + "balance_loss_mlp": 1.06941223, + "epoch": 0.8153135821469796, + "flos": 1482412497408.0, + "grad_norm": 0.006034012067476844, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78994167, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.72070312, + "step": 4238, + "time_per_iteration": 5.0358593463897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135115, + "balance_loss_mlp": 1.06149137, + "epoch": 0.8155059638322432, + "flos": 438011223552.0, + "grad_norm": 0.03574751342036468, + "language_loss": 0.86546302, + "learning_rate": 8.663841137810741e-05, + "loss": 0.87681419, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.73632812, + "step": 4239, + "time_per_iteration": 2.5296990871429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134629, + "balance_loss_mlp": 1.06100523, + "epoch": 0.8156983455175068, + "flos": 795819210240.0, + "grad_norm": 0.036631860682182917, + "language_loss": 0.90299451, + "learning_rate": 8.646321514990763e-05, + "loss": 0.91434073, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.73632812, + "step": 4240, + "time_per_iteration": 3.116800308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.06040537, + "epoch": 0.8158907272027703, + "flos": 687193448448.0, + "grad_norm": 0.03497799399814432, + "language_loss": 0.86212909, + "learning_rate": 8.628817947092616e-05, + "loss": 0.87346935, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.73632812, + "step": 4241, + "time_per_iteration": 2.8215630054473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113408, + "balance_loss_mlp": 1.06040835, + "epoch": 0.8160831088880338, + "flos": 488030026752.0, + "grad_norm": 0.04917888887057411, + "language_loss": 0.90205991, + "learning_rate": 8.611330440911797e-05, + "loss": 0.91340065, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.73681641, + "step": 4242, + "time_per_iteration": 2.5900723934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.06033301, + "epoch": 0.8162754905732974, + "flos": 465822127104.0, + "grad_norm": 0.03688342086176751, + "language_loss": 0.8533777, + "learning_rate": 8.593859003237558e-05, + "loss": 0.86471725, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.73632812, + "step": 4243, + "time_per_iteration": 2.560988664627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138031, + "balance_loss_mlp": 1.06593323, + "epoch": 0.816467872258561, + "flos": 1242143341056.0, + "grad_norm": 0.003656687556676087, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76423156, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.72265625, + "step": 4244, + "time_per_iteration": 4.697356462478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134208, + "balance_loss_mlp": 1.06058443, + "epoch": 0.8166602539438246, + "flos": 688402681344.0, + "grad_norm": 0.0314239637841158, + "language_loss": 0.90210414, + "learning_rate": 8.558964360534615e-05, + "loss": 0.91344625, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.73632812, + "step": 4245, + "time_per_iteration": 2.9143781661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138077, + "balance_loss_mlp": 1.065979, + "epoch": 0.8168526356290882, + "flos": 1493916673536.0, + "grad_norm": 0.0037263758813665952, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.74112821, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.72265625, + "step": 4246, + "time_per_iteration": 4.9454734325408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133915, + "balance_loss_mlp": 1.06029105, + "epoch": 0.8170450173143516, + "flos": 579299556864.0, + "grad_norm": 0.030493016441410038, + "language_loss": 0.89006281, + "learning_rate": 8.524134073172984e-05, + "loss": 0.901402, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.73632812, + "step": 4247, + "time_per_iteration": 2.716303586959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133981, + "balance_loss_mlp": 1.06035721, + "epoch": 0.8172373989996152, + "flos": 572437655040.0, + "grad_norm": 0.032931273654240076, + "language_loss": 0.89490271, + "learning_rate": 8.506743079651974e-05, + "loss": 0.90624249, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.73632812, + "step": 4248, + "time_per_iteration": 2.8293991088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134211, + "balance_loss_mlp": 1.06063545, + "epoch": 0.8174297806848788, + "flos": 529858172928.0, + "grad_norm": 0.037171294021196906, + "language_loss": 0.85910308, + "learning_rate": 8.489368195241948e-05, + "loss": 0.87044525, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.73583984, + "step": 4249, + "time_per_iteration": 2.6829066276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134079, + "balance_loss_mlp": 1.06059849, + "epoch": 0.8176221623701424, + "flos": 570268967424.0, + "grad_norm": 0.034080250978502535, + "language_loss": 0.8438381, + "learning_rate": 8.47200942668846e-05, + "loss": 0.85517883, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.73486328, + "step": 4250, + "time_per_iteration": 2.8265514373779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_mlp": 1.06237853, + "epoch": 0.8178145440554059, + "flos": 657706682880.0, + "grad_norm": 0.03911715002347649, + "language_loss": 0.85039294, + "learning_rate": 8.454666780730735e-05, + "loss": 0.8617515, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.73486328, + "step": 4251, + "time_per_iteration": 2.8799848556518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136417, + "balance_loss_mlp": 1.06298411, + "epoch": 0.8180069257406695, + "flos": 547055950848.0, + "grad_norm": 0.03495030858038778, + "language_loss": 0.925497, + "learning_rate": 8.437340264101828e-05, + "loss": 0.93686116, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.734375, + "step": 4252, + "time_per_iteration": 2.741757392883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134952, + "balance_loss_mlp": 1.06156695, + "epoch": 0.818199307425933, + "flos": 620411295744.0, + "grad_norm": 0.03572313096621812, + "language_loss": 0.89690208, + "learning_rate": 8.420029883528474e-05, + "loss": 0.90825158, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.73388672, + "step": 4253, + "time_per_iteration": 2.7292418479919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135002, + "balance_loss_mlp": 1.06152105, + "epoch": 0.8183916891111966, + "flos": 648934603776.0, + "grad_norm": 0.03748901013328147, + "language_loss": 0.82274991, + "learning_rate": 8.402735645731157e-05, + "loss": 0.83409989, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.73486328, + "step": 4254, + "time_per_iteration": 2.910111665725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134818, + "balance_loss_mlp": 1.06152833, + "epoch": 0.8185840707964602, + "flos": 500102163456.0, + "grad_norm": 0.038471995455164235, + "language_loss": 0.82772928, + "learning_rate": 8.385457557424098e-05, + "loss": 0.83907747, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.73291016, + "step": 4255, + "time_per_iteration": 2.5621390342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134815, + "balance_loss_mlp": 1.06142998, + "epoch": 0.8187764524817237, + "flos": 787611088896.0, + "grad_norm": 0.030170748899510557, + "language_loss": 0.84222317, + "learning_rate": 8.368195625315251e-05, + "loss": 0.8535713, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.73388672, + "step": 4256, + "time_per_iteration": 3.078824996948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134992, + "balance_loss_mlp": 1.06170225, + "epoch": 0.8189688341669873, + "flos": 551786095104.0, + "grad_norm": 0.03557729872276572, + "language_loss": 0.84799671, + "learning_rate": 8.350949856106283e-05, + "loss": 0.85934663, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.73291016, + "step": 4257, + "time_per_iteration": 2.7947750091552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137352, + "balance_loss_mlp": 1.06544495, + "epoch": 0.8191612158522509, + "flos": 1354880894976.0, + "grad_norm": 0.0054924176528901095, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72286695, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.72070312, + "step": 4258, + "time_per_iteration": 4.84255051612854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134347, + "balance_loss_mlp": 1.06096172, + "epoch": 0.8193535975375145, + "flos": 545299497984.0, + "grad_norm": 0.03816003226358518, + "language_loss": 0.88573909, + "learning_rate": 8.316506833163318e-05, + "loss": 0.89708257, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.73388672, + "step": 4259, + "time_per_iteration": 2.6227800846099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134587, + "balance_loss_mlp": 1.06110692, + "epoch": 0.8195459792227779, + "flos": 867227447808.0, + "grad_norm": 0.030985411869637765, + "language_loss": 0.89433575, + "learning_rate": 8.299309592801297e-05, + "loss": 0.90568173, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.73486328, + "step": 4260, + "time_per_iteration": 3.120332717895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136288, + "balance_loss_mlp": 1.06299853, + "epoch": 0.8197383609080415, + "flos": 570409956864.0, + "grad_norm": 0.03501003143671651, + "language_loss": 0.85849857, + "learning_rate": 8.282128542083101e-05, + "loss": 0.86986148, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.73291016, + "step": 4261, + "time_per_iteration": 2.7042295932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113623, + "balance_loss_mlp": 1.06284475, + "epoch": 0.8199307425933051, + "flos": 531885871104.0, + "grad_norm": 0.03573115992813463, + "language_loss": 0.89631218, + "learning_rate": 8.264963687678978e-05, + "loss": 0.90767449, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.73388672, + "step": 4262, + "time_per_iteration": 2.698512554168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136104, + "balance_loss_mlp": 1.0625757, + "epoch": 0.8201231242785687, + "flos": 568230535680.0, + "grad_norm": 0.03738858607219498, + "language_loss": 0.8919028, + "learning_rate": 8.247815036252921e-05, + "loss": 0.90326387, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.73535156, + "step": 4263, + "time_per_iteration": 2.7295687198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.0632956, + "epoch": 0.8203155059638323, + "flos": 1232383431168.0, + "grad_norm": 0.035805039372270496, + "language_loss": 0.86680698, + "learning_rate": 8.230682594462652e-05, + "loss": 0.87817383, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.73388672, + "step": 4264, + "time_per_iteration": 3.529435634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137219, + "balance_loss_mlp": 1.0639292, + "epoch": 0.8205078876490958, + "flos": 575279089152.0, + "grad_norm": 0.03283426930312581, + "language_loss": 0.84526485, + "learning_rate": 8.213566368959558e-05, + "loss": 0.856637, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.73291016, + "step": 4265, + "time_per_iteration": 2.6853911876678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136935, + "balance_loss_mlp": 1.06354988, + "epoch": 0.8207002693343594, + "flos": 932985017856.0, + "grad_norm": 0.03554909182622845, + "language_loss": 0.83231854, + "learning_rate": 8.196466366388744e-05, + "loss": 0.84368789, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.73388672, + "step": 4266, + "time_per_iteration": 3.2028071880340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.06358302, + "epoch": 0.8208926510196229, + "flos": 550659454464.0, + "grad_norm": 0.030804523886097362, + "language_loss": 0.84640598, + "learning_rate": 8.179382593389029e-05, + "loss": 0.85777473, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.73291016, + "step": 4267, + "time_per_iteration": 2.650616407394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113631, + "balance_loss_mlp": 1.06297278, + "epoch": 0.8210850327048865, + "flos": 649411966464.0, + "grad_norm": 0.034163705244185175, + "language_loss": 0.86939591, + "learning_rate": 8.162315056592918e-05, + "loss": 0.880759, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.73339844, + "step": 4268, + "time_per_iteration": 2.8432037830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135687, + "balance_loss_mlp": 1.06239724, + "epoch": 0.82127741439015, + "flos": 602697223680.0, + "grad_norm": 0.0327614409719618, + "language_loss": 0.85872579, + "learning_rate": 8.145263762626615e-05, + "loss": 0.87008262, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.73291016, + "step": 4269, + "time_per_iteration": 2.794907808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136528, + "balance_loss_mlp": 1.06314278, + "epoch": 0.8214697960754136, + "flos": 475853830656.0, + "grad_norm": 0.03329504882056361, + "language_loss": 0.88679749, + "learning_rate": 8.128228718110015e-05, + "loss": 0.89816278, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.73388672, + "step": 4270, + "time_per_iteration": 2.6682534217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137291, + "balance_loss_mlp": 1.06395364, + "epoch": 0.8216621777606772, + "flos": 905093523456.0, + "grad_norm": 0.04141096199227741, + "language_loss": 0.89987427, + "learning_rate": 8.11120992965671e-05, + "loss": 0.91124725, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.73339844, + "step": 4271, + "time_per_iteration": 3.0566489696502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137158, + "balance_loss_mlp": 1.06372499, + "epoch": 0.8218545594459408, + "flos": 515495824896.0, + "grad_norm": 0.03644141192614607, + "language_loss": 0.88000762, + "learning_rate": 8.094207403873998e-05, + "loss": 0.89137918, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.734375, + "step": 4272, + "time_per_iteration": 2.6066787242889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136048, + "balance_loss_mlp": 1.06261528, + "epoch": 0.8220469411312044, + "flos": 495558670848.0, + "grad_norm": 0.033626065990782314, + "language_loss": 0.90746641, + "learning_rate": 8.077221147362829e-05, + "loss": 0.91882682, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.734375, + "step": 4273, + "time_per_iteration": 2.6172597408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137405, + "balance_loss_mlp": 1.0640676, + "epoch": 0.8222393228164678, + "flos": 387276013056.0, + "grad_norm": 0.041107028258718356, + "language_loss": 0.94696027, + "learning_rate": 8.060251166717835e-05, + "loss": 0.95833433, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.73339844, + "step": 4274, + "time_per_iteration": 2.4571101665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136137, + "balance_loss_mlp": 1.06270397, + "epoch": 0.8224317045017314, + "flos": 537629864448.0, + "grad_norm": 0.036324046899494276, + "language_loss": 0.90921676, + "learning_rate": 8.043297468527383e-05, + "loss": 0.92057812, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.734375, + "step": 4275, + "time_per_iteration": 2.6465563774108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137298, + "balance_loss_mlp": 1.06396043, + "epoch": 0.822624086186995, + "flos": 555947552256.0, + "grad_norm": 0.03930955148337389, + "language_loss": 0.87730598, + "learning_rate": 8.02636005937346e-05, + "loss": 0.88867891, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.73339844, + "step": 4276, + "time_per_iteration": 2.6447408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137809, + "balance_loss_mlp": 1.06451952, + "epoch": 0.8228164678722586, + "flos": 540717073920.0, + "grad_norm": 0.032348524230564446, + "language_loss": 0.8416298, + "learning_rate": 8.009438945831771e-05, + "loss": 0.85300791, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.73291016, + "step": 4277, + "time_per_iteration": 2.725992441177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137287, + "balance_loss_mlp": 1.06404459, + "epoch": 0.8230088495575221, + "flos": 474262562304.0, + "grad_norm": 0.0328588755399637, + "language_loss": 0.84125638, + "learning_rate": 7.992534134471641e-05, + "loss": 0.8526293, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.73242188, + "step": 4278, + "time_per_iteration": 2.722247362136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137164, + "balance_loss_mlp": 1.0638746, + "epoch": 0.8232012312427857, + "flos": 592750113792.0, + "grad_norm": 0.04012924603788627, + "language_loss": 0.88655663, + "learning_rate": 7.975645631856127e-05, + "loss": 0.89792836, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.73291016, + "step": 4279, + "time_per_iteration": 2.67391037940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_mlp": 1.06471694, + "epoch": 0.8233936129280492, + "flos": 573787877376.0, + "grad_norm": 0.031871243045387916, + "language_loss": 0.79251921, + "learning_rate": 7.958773444541916e-05, + "loss": 0.80389881, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.73242188, + "step": 4280, + "time_per_iteration": 2.7263128757476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138187, + "balance_loss_mlp": 1.06499279, + "epoch": 0.8235859946133128, + "flos": 732749349888.0, + "grad_norm": 0.030378228316341748, + "language_loss": 0.82564437, + "learning_rate": 7.941917579079383e-05, + "loss": 0.83702624, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.73193359, + "step": 4281, + "time_per_iteration": 3.002906322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138376, + "balance_loss_mlp": 1.06522954, + "epoch": 0.8237783762985764, + "flos": 571397609472.0, + "grad_norm": 0.035495855879207304, + "language_loss": 0.86794972, + "learning_rate": 7.92507804201253e-05, + "loss": 0.8793335, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.73144531, + "step": 4282, + "time_per_iteration": 2.662153720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141701, + "balance_loss_mlp": 1.07017517, + "epoch": 0.8239707579838399, + "flos": 1469424566784.0, + "grad_norm": 0.006000143567348165, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76439381, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.71679688, + "step": 4283, + "time_per_iteration": 4.955921649932861 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134641, + "balance_loss_mlp": 1.0613029, + "epoch": 0.8241631396691035, + "flos": 468296988672.0, + "grad_norm": 0.03760259633973049, + "language_loss": 0.85799181, + "learning_rate": 7.89144797921037e-05, + "loss": 0.86933821, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.73339844, + "step": 4284, + "time_per_iteration": 2.670642614364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137108, + "balance_loss_mlp": 1.06520081, + "epoch": 0.8243555213543671, + "flos": 1542549599232.0, + "grad_norm": 0.0035179548887658537, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.79071379, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.72070312, + "step": 4285, + "time_per_iteration": 4.919512510299683 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135036, + "balance_loss_mlp": 1.06169832, + "epoch": 0.8245479030396307, + "flos": 798862758912.0, + "grad_norm": 0.02838711581178409, + "language_loss": 0.8627755, + "learning_rate": 7.85788330836078e-05, + "loss": 0.87412584, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.73339844, + "step": 4286, + "time_per_iteration": 3.106489419937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135135, + "balance_loss_mlp": 1.06170166, + "epoch": 0.8247402847248941, + "flos": 647399731200.0, + "grad_norm": 0.035275587559529614, + "language_loss": 0.81354994, + "learning_rate": 7.841125511210878e-05, + "loss": 0.82490128, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.734375, + "step": 4287, + "time_per_iteration": 2.8796138763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135286, + "balance_loss_mlp": 1.06199634, + "epoch": 0.8249326664101577, + "flos": 605619248640.0, + "grad_norm": 0.03206789384595215, + "language_loss": 0.83634263, + "learning_rate": 7.824384081587637e-05, + "loss": 0.84769547, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.73291016, + "step": 4288, + "time_per_iteration": 2.846707820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134793, + "balance_loss_mlp": 1.06155086, + "epoch": 0.8251250480954213, + "flos": 825826999296.0, + "grad_norm": 0.09140379180840759, + "language_loss": 0.91303772, + "learning_rate": 7.807659025990637e-05, + "loss": 0.92438555, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.73242188, + "step": 4289, + "time_per_iteration": 3.1333796977996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134594, + "balance_loss_mlp": 1.06125653, + "epoch": 0.8253174297806849, + "flos": 758675546112.0, + "grad_norm": 0.03823856900412753, + "language_loss": 0.83296132, + "learning_rate": 7.790950350913112e-05, + "loss": 0.8443073, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.73339844, + "step": 4290, + "time_per_iteration": 2.9032602310180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134661, + "balance_loss_mlp": 1.06141841, + "epoch": 0.8255098114659485, + "flos": 795993126912.0, + "grad_norm": 0.03957304400162463, + "language_loss": 0.91916239, + "learning_rate": 7.774258062841971e-05, + "loss": 0.93050897, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.73242188, + "step": 4291, + "time_per_iteration": 3.2001283168792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135868, + "balance_loss_mlp": 1.06272089, + "epoch": 0.825702193151212, + "flos": 711680825856.0, + "grad_norm": 0.035067281879066665, + "language_loss": 0.82225877, + "learning_rate": 7.757582168257731e-05, + "loss": 0.83361745, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.73144531, + "step": 4292, + "time_per_iteration": 2.863765001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137284, + "balance_loss_mlp": 1.06413746, + "epoch": 0.8258945748364755, + "flos": 684668921856.0, + "grad_norm": 0.032242786757735724, + "language_loss": 0.85239249, + "learning_rate": 7.740922673634537e-05, + "loss": 0.8637653, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.73144531, + "step": 4293, + "time_per_iteration": 2.907665729522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136851, + "balance_loss_mlp": 1.06360924, + "epoch": 0.8260869565217391, + "flos": 595680870912.0, + "grad_norm": 0.0674529865816818, + "language_loss": 0.82838464, + "learning_rate": 7.724279585440186e-05, + "loss": 0.83975315, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.73242188, + "step": 4294, + "time_per_iteration": 2.7359163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.06290495, + "epoch": 0.8262793382070027, + "flos": 652652900352.0, + "grad_norm": 0.037208876536065486, + "language_loss": 0.90246564, + "learning_rate": 7.707652910136098e-05, + "loss": 0.91382712, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.73242188, + "step": 4295, + "time_per_iteration": 2.7886202335357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135781, + "balance_loss_mlp": 1.0624913, + "epoch": 0.8264717198922663, + "flos": 539957005824.0, + "grad_norm": 0.03534933797875362, + "language_loss": 0.89258248, + "learning_rate": 7.691042654177315e-05, + "loss": 0.90394032, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.73291016, + "step": 4296, + "time_per_iteration": 2.651456594467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135891, + "balance_loss_mlp": 1.0626967, + "epoch": 0.8266641015775298, + "flos": 539993935872.0, + "grad_norm": 0.03536676261879614, + "language_loss": 0.81180108, + "learning_rate": 7.674448824012514e-05, + "loss": 0.82316005, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.73193359, + "step": 4297, + "time_per_iteration": 2.691899061203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136173, + "balance_loss_mlp": 1.06278765, + "epoch": 0.8268564832627934, + "flos": 586502561280.0, + "grad_norm": 0.03294900814096248, + "language_loss": 0.88706392, + "learning_rate": 7.657871426083979e-05, + "loss": 0.89842564, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.73388672, + "step": 4298, + "time_per_iteration": 3.3337292671203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150659, + "balance_loss_mlp": 1.07727432, + "epoch": 0.827048864948057, + "flos": 431570288640.0, + "grad_norm": 0.03920761424756738, + "language_loss": 0.88906097, + "learning_rate": 7.641310466827667e-05, + "loss": 0.90056753, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.73388672, + "step": 4299, + "time_per_iteration": 3.4399309158325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150747, + "balance_loss_mlp": 1.07740986, + "epoch": 0.8272412466333205, + "flos": 1390500241920.0, + "grad_norm": 0.03570603995956023, + "language_loss": 0.89542663, + "learning_rate": 7.624765952673069e-05, + "loss": 0.90693414, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.73339844, + "step": 4300, + "time_per_iteration": 3.9774158000946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150112, + "balance_loss_mlp": 1.07667911, + "epoch": 0.827433628318584, + "flos": 539349387264.0, + "grad_norm": 0.034642967404352416, + "language_loss": 0.87599683, + "learning_rate": 7.608237890043335e-05, + "loss": 0.8874979, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.734375, + "step": 4301, + "time_per_iteration": 2.814303398132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114939, + "balance_loss_mlp": 1.0759089, + "epoch": 0.8276260100038476, + "flos": 732063141888.0, + "grad_norm": 0.044295314753443144, + "language_loss": 0.82156098, + "learning_rate": 7.59172628535526e-05, + "loss": 0.8330549, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.73486328, + "step": 4302, + "time_per_iteration": 3.0075466632843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144301, + "balance_loss_mlp": 1.07086802, + "epoch": 0.8278183916891112, + "flos": 872661264384.0, + "grad_norm": 0.03293198528529039, + "language_loss": 0.86338317, + "learning_rate": 7.575231145019196e-05, + "loss": 0.87482619, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.734375, + "step": 4303, + "time_per_iteration": 3.220668077468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144066, + "balance_loss_mlp": 1.0707283, + "epoch": 0.8280107733743748, + "flos": 595698335232.0, + "grad_norm": 0.03223563949514157, + "language_loss": 0.81716228, + "learning_rate": 7.558752475439134e-05, + "loss": 0.82860291, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.73339844, + "step": 4304, + "time_per_iteration": 2.810628652572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142863, + "balance_loss_mlp": 1.06942999, + "epoch": 0.8282031550596384, + "flos": 770027272704.0, + "grad_norm": 0.03508054216090567, + "language_loss": 0.87922353, + "learning_rate": 7.542290283012653e-05, + "loss": 0.89065218, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.734375, + "step": 4305, + "time_per_iteration": 3.1161751747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142615, + "balance_loss_mlp": 1.06922984, + "epoch": 0.8283955367449019, + "flos": 697446732288.0, + "grad_norm": 0.03898160364369505, + "language_loss": 0.82788968, + "learning_rate": 7.525844574130947e-05, + "loss": 0.83931583, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.73388672, + "step": 4306, + "time_per_iteration": 2.9796903133392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142397, + "balance_loss_mlp": 1.06896424, + "epoch": 0.8285879184301654, + "flos": 661937997312.0, + "grad_norm": 0.035115838558733896, + "language_loss": 0.87112027, + "learning_rate": 7.509415355178806e-05, + "loss": 0.88254428, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.734375, + "step": 4307, + "time_per_iteration": 2.9617509841918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138818, + "balance_loss_mlp": 1.06543314, + "epoch": 0.828780300115429, + "flos": 559772636160.0, + "grad_norm": 0.04100434212152103, + "language_loss": 0.82768691, + "learning_rate": 7.493002632534618e-05, + "loss": 0.83907503, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.73388672, + "step": 4308, + "time_per_iteration": 2.727365016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.06439769, + "epoch": 0.8289726818006926, + "flos": 832371993600.0, + "grad_norm": 0.035278553055239026, + "language_loss": 0.86246669, + "learning_rate": 7.476606412570352e-05, + "loss": 0.87384403, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.73339844, + "step": 4309, + "time_per_iteration": 3.108769416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154857, + "balance_loss_mlp": 1.08161438, + "epoch": 0.8291650634859561, + "flos": 733554353664.0, + "grad_norm": 0.0366695194121263, + "language_loss": 0.85579491, + "learning_rate": 7.460226701651624e-05, + "loss": 0.86734343, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.73242188, + "step": 4310, + "time_per_iteration": 2.954108238220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153477, + "balance_loss_mlp": 1.08013999, + "epoch": 0.8293574451712197, + "flos": 862469105664.0, + "grad_norm": 0.03497290190762598, + "language_loss": 0.85557121, + "learning_rate": 7.443863506137566e-05, + "loss": 0.86710596, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.73339844, + "step": 4311, + "time_per_iteration": 3.2707061767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145208, + "balance_loss_mlp": 1.071823, + "epoch": 0.8295498268564833, + "flos": 496290541056.0, + "grad_norm": 0.030603174986020117, + "language_loss": 0.85576063, + "learning_rate": 7.427516832380948e-05, + "loss": 0.86721271, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.73388672, + "step": 4312, + "time_per_iteration": 2.8450915813446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011424, + "balance_loss_mlp": 1.06896734, + "epoch": 0.8297422085417469, + "flos": 555654839808.0, + "grad_norm": 0.0318834502446829, + "language_loss": 0.82207704, + "learning_rate": 7.4111866867281e-05, + "loss": 0.8335011, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.734375, + "step": 4313, + "time_per_iteration": 2.8910624980926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141956, + "balance_loss_mlp": 1.06852293, + "epoch": 0.8299345902270104, + "flos": 1249487883264.0, + "grad_norm": 0.032916410073977276, + "language_loss": 0.8188554, + "learning_rate": 7.39487307551896e-05, + "loss": 0.83027506, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.734375, + "step": 4314, + "time_per_iteration": 3.6977193355560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138238, + "balance_loss_mlp": 1.06480479, + "epoch": 0.8301269719122739, + "flos": 586409235456.0, + "grad_norm": 0.03544125426025781, + "language_loss": 0.86962932, + "learning_rate": 7.378576005087034e-05, + "loss": 0.88101172, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.734375, + "step": 4315, + "time_per_iteration": 2.764580011367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137941, + "balance_loss_mlp": 1.06446016, + "epoch": 0.8303193535975375, + "flos": 510776414208.0, + "grad_norm": 0.03851406833152273, + "language_loss": 0.89923644, + "learning_rate": 7.362295481759412e-05, + "loss": 0.91061592, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.73486328, + "step": 4316, + "time_per_iteration": 2.6864657402038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139377, + "balance_loss_mlp": 1.06556237, + "epoch": 0.8305117352828011, + "flos": 581765686272.0, + "grad_norm": 0.03996280155822034, + "language_loss": 0.87696218, + "learning_rate": 7.346031511856722e-05, + "loss": 0.88835597, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.73730469, + "step": 4317, + "time_per_iteration": 2.7490365505218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138995, + "balance_loss_mlp": 1.06508517, + "epoch": 0.8307041169680647, + "flos": 482648603136.0, + "grad_norm": 0.03410540332175001, + "language_loss": 0.83901942, + "learning_rate": 7.329784101693232e-05, + "loss": 0.85040939, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.73779297, + "step": 4318, + "time_per_iteration": 2.633737087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140154, + "balance_loss_mlp": 1.06629157, + "epoch": 0.8308964986533282, + "flos": 625753787904.0, + "grad_norm": 0.039585355181565605, + "language_loss": 0.87891459, + "learning_rate": 7.313553257576727e-05, + "loss": 0.89031613, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.73730469, + "step": 4319, + "time_per_iteration": 2.73393177986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137292, + "balance_loss_mlp": 1.06362104, + "epoch": 0.8310888803385917, + "flos": 828705363456.0, + "grad_norm": 0.038987738379061505, + "language_loss": 0.83643472, + "learning_rate": 7.297338985808589e-05, + "loss": 0.84780765, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.73583984, + "step": 4320, + "time_per_iteration": 3.0508508682250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137036, + "balance_loss_mlp": 1.06350768, + "epoch": 0.8312812620238553, + "flos": 584946221568.0, + "grad_norm": 0.030329036309150237, + "language_loss": 0.85852158, + "learning_rate": 7.281141292683746e-05, + "loss": 0.86989194, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.73486328, + "step": 4321, + "time_per_iteration": 2.864978551864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136398, + "balance_loss_mlp": 1.06277454, + "epoch": 0.8314736437091189, + "flos": 1117369127424.0, + "grad_norm": 0.04535130746874187, + "language_loss": 0.79764462, + "learning_rate": 7.26496018449071e-05, + "loss": 0.8090086, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.73535156, + "step": 4322, + "time_per_iteration": 3.5574073791503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113609, + "balance_loss_mlp": 1.06237078, + "epoch": 0.8316660253943825, + "flos": 518558839296.0, + "grad_norm": 0.03678795377404695, + "language_loss": 0.86844653, + "learning_rate": 7.248795667511543e-05, + "loss": 0.87980741, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.73632812, + "step": 4323, + "time_per_iteration": 2.8555359840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136953, + "balance_loss_mlp": 1.06328201, + "epoch": 0.831858407079646, + "flos": 796696072704.0, + "grad_norm": 0.032683299236101075, + "language_loss": 0.82923019, + "learning_rate": 7.232647748021864e-05, + "loss": 0.84059966, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.73632812, + "step": 4324, + "time_per_iteration": 3.0507915019989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135863, + "balance_loss_mlp": 1.06223941, + "epoch": 0.8320507887649096, + "flos": 551041489920.0, + "grad_norm": 0.03984980567953029, + "language_loss": 0.88372821, + "learning_rate": 7.216516432290843e-05, + "loss": 0.89508682, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.73583984, + "step": 4325, + "time_per_iteration": 2.910611867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135315, + "balance_loss_mlp": 1.06178665, + "epoch": 0.8322431704501732, + "flos": 480351661056.0, + "grad_norm": 0.03873731479113487, + "language_loss": 0.86735284, + "learning_rate": 7.20040172658123e-05, + "loss": 0.87870598, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.73535156, + "step": 4326, + "time_per_iteration": 2.637766122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113728, + "balance_loss_mlp": 1.06375158, + "epoch": 0.8324355521354367, + "flos": 573546831360.0, + "grad_norm": 0.031469774572695536, + "language_loss": 0.89963889, + "learning_rate": 7.184303637149308e-05, + "loss": 0.9110117, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.73535156, + "step": 4327, + "time_per_iteration": 2.7417519092559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136846, + "balance_loss_mlp": 1.06341326, + "epoch": 0.8326279338207002, + "flos": 504439538688.0, + "grad_norm": 0.03407361480864025, + "language_loss": 0.8678869, + "learning_rate": 7.168222170244888e-05, + "loss": 0.87925529, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.734375, + "step": 4328, + "time_per_iteration": 2.7490806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113636, + "balance_loss_mlp": 1.06283176, + "epoch": 0.8328203155059638, + "flos": 606950005248.0, + "grad_norm": 0.0316879397336073, + "language_loss": 0.85139227, + "learning_rate": 7.152157332111364e-05, + "loss": 0.86275589, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.73535156, + "step": 4329, + "time_per_iteration": 3.043998956680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136964, + "balance_loss_mlp": 1.06353128, + "epoch": 0.8330126971912274, + "flos": 699122594304.0, + "grad_norm": 0.03501346929276039, + "language_loss": 0.90436953, + "learning_rate": 7.136109128985663e-05, + "loss": 0.91573918, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.734375, + "step": 4330, + "time_per_iteration": 2.9104068279266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06338286, + "epoch": 0.833205078876491, + "flos": 495020183040.0, + "grad_norm": 0.039903195298822546, + "language_loss": 0.91142917, + "learning_rate": 7.120077567098249e-05, + "loss": 0.92279732, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.734375, + "step": 4331, + "time_per_iteration": 2.539658784866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136663, + "balance_loss_mlp": 1.06327808, + "epoch": 0.8333974605617546, + "flos": 483794709504.0, + "grad_norm": 0.031623545880620704, + "language_loss": 0.86857003, + "learning_rate": 7.104062652673115e-05, + "loss": 0.87993664, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.73388672, + "step": 4332, + "time_per_iteration": 2.592482566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136652, + "balance_loss_mlp": 1.063362, + "epoch": 0.833589842247018, + "flos": 688040111616.0, + "grad_norm": 0.04080208699909347, + "language_loss": 0.87699354, + "learning_rate": 7.088064391927818e-05, + "loss": 0.88836008, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.73291016, + "step": 4333, + "time_per_iteration": 2.8243579864501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136297, + "balance_loss_mlp": 1.06300712, + "epoch": 0.8337822239322816, + "flos": 883191797760.0, + "grad_norm": 0.034267642896518694, + "language_loss": 0.87079096, + "learning_rate": 7.072082791073419e-05, + "loss": 0.88215387, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.73291016, + "step": 4334, + "time_per_iteration": 3.095567226409912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136658, + "balance_loss_mlp": 1.06341565, + "epoch": 0.8339746056175452, + "flos": 498157057536.0, + "grad_norm": 0.036797660488946164, + "language_loss": 0.87406766, + "learning_rate": 7.056117856314531e-05, + "loss": 0.88543415, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.73242188, + "step": 4335, + "time_per_iteration": 2.6543936729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138151, + "balance_loss_mlp": 1.06490886, + "epoch": 0.8341669873028088, + "flos": 511503555072.0, + "grad_norm": 0.033824511697931096, + "language_loss": 0.91365576, + "learning_rate": 7.040169593849289e-05, + "loss": 0.92503732, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.73242188, + "step": 4336, + "time_per_iteration": 2.6173272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141861, + "balance_loss_mlp": 1.06852305, + "epoch": 0.8343593689880723, + "flos": 693541057536.0, + "grad_norm": 0.036766896527395135, + "language_loss": 0.89182138, + "learning_rate": 7.024238009869366e-05, + "loss": 0.90323997, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.73339844, + "step": 4337, + "time_per_iteration": 2.832035779953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113989, + "balance_loss_mlp": 1.06650496, + "epoch": 0.8345517506733359, + "flos": 553516351488.0, + "grad_norm": 0.03709810498280935, + "language_loss": 0.83323646, + "learning_rate": 7.008323110559956e-05, + "loss": 0.84463537, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.73388672, + "step": 4338, + "time_per_iteration": 2.7567930221557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140629, + "balance_loss_mlp": 1.06743467, + "epoch": 0.8347441323585995, + "flos": 593267134464.0, + "grad_norm": 0.04006529314442172, + "language_loss": 0.80799747, + "learning_rate": 6.992424902099754e-05, + "loss": 0.81940377, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.73193359, + "step": 4339, + "time_per_iteration": 2.7979674339294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140723, + "balance_loss_mlp": 1.06752896, + "epoch": 0.834936514043863, + "flos": 616091384832.0, + "grad_norm": 0.03516018404637607, + "language_loss": 0.89085752, + "learning_rate": 6.976543390660983e-05, + "loss": 0.90226471, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.73193359, + "step": 4340, + "time_per_iteration": 3.017014980316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140539, + "balance_loss_mlp": 1.0673449, + "epoch": 0.8351288957291266, + "flos": 468863674368.0, + "grad_norm": 0.040869831177599326, + "language_loss": 0.83971238, + "learning_rate": 6.960678582409424e-05, + "loss": 0.85111785, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.73193359, + "step": 4341, + "time_per_iteration": 3.5495381355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114012, + "balance_loss_mlp": 1.06697321, + "epoch": 0.8353212774143901, + "flos": 510348716544.0, + "grad_norm": 0.04414728367362659, + "language_loss": 0.83281082, + "learning_rate": 6.944830483504328e-05, + "loss": 0.84421206, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.73144531, + "step": 4342, + "time_per_iteration": 2.8123908042907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140117, + "balance_loss_mlp": 1.06697071, + "epoch": 0.8355136590996537, + "flos": 689017030656.0, + "grad_norm": 0.03677224015719086, + "language_loss": 0.85329032, + "learning_rate": 6.928999100098483e-05, + "loss": 0.8646915, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.73144531, + "step": 4343, + "time_per_iteration": 2.8525094985961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140369, + "balance_loss_mlp": 1.06712639, + "epoch": 0.8357060407849173, + "flos": 985975511040.0, + "grad_norm": 0.03601056440929186, + "language_loss": 0.88194978, + "learning_rate": 6.913184438338138e-05, + "loss": 0.89335346, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.73242188, + "step": 4344, + "time_per_iteration": 3.206106185913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141569, + "balance_loss_mlp": 1.06842268, + "epoch": 0.8358984224701809, + "flos": 844507256832.0, + "grad_norm": 0.03403059716979156, + "language_loss": 0.8941586, + "learning_rate": 6.89738650436313e-05, + "loss": 0.90557432, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.73144531, + "step": 4345, + "time_per_iteration": 3.211400032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141203, + "balance_loss_mlp": 1.06796038, + "epoch": 0.8360908041554445, + "flos": 627418916352.0, + "grad_norm": 0.033473351355860013, + "language_loss": 0.86278164, + "learning_rate": 6.881605304306748e-05, + "loss": 0.87419367, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.73242188, + "step": 4346, + "time_per_iteration": 2.8406436443328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141884, + "balance_loss_mlp": 1.06878495, + "epoch": 0.8362831858407079, + "flos": 577222193664.0, + "grad_norm": 0.034289712493456775, + "language_loss": 0.89250559, + "learning_rate": 6.865840844295796e-05, + "loss": 0.90392447, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.73095703, + "step": 4347, + "time_per_iteration": 2.8221635818481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114195, + "balance_loss_mlp": 1.06885087, + "epoch": 0.8364755675259715, + "flos": 835183228416.0, + "grad_norm": 0.040230317170211145, + "language_loss": 0.8577764, + "learning_rate": 6.850093130450569e-05, + "loss": 0.86919594, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.73095703, + "step": 4348, + "time_per_iteration": 3.087906837463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142182, + "balance_loss_mlp": 1.0691303, + "epoch": 0.8366679492112351, + "flos": 583563072000.0, + "grad_norm": 0.04163204479707521, + "language_loss": 0.91017622, + "learning_rate": 6.834362168884912e-05, + "loss": 0.92159808, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.73046875, + "step": 4349, + "time_per_iteration": 2.6955840587615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141373, + "balance_loss_mlp": 1.06817806, + "epoch": 0.8368603308964987, + "flos": 612880650240.0, + "grad_norm": 0.03976549497353498, + "language_loss": 0.93744481, + "learning_rate": 6.818647965706076e-05, + "loss": 0.94885856, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.73193359, + "step": 4350, + "time_per_iteration": 2.8501739501953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142129, + "balance_loss_mlp": 1.06902957, + "epoch": 0.8370527125817622, + "flos": 508264622592.0, + "grad_norm": 0.03390143622863109, + "language_loss": 0.8937093, + "learning_rate": 6.802950527014884e-05, + "loss": 0.90513057, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.73095703, + "step": 4351, + "time_per_iteration": 2.7211203575134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140268, + "balance_loss_mlp": 1.06707358, + "epoch": 0.8372450942670258, + "flos": 772282555392.0, + "grad_norm": 0.04155998502814681, + "language_loss": 0.86906236, + "learning_rate": 6.787269858905603e-05, + "loss": 0.88046503, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.73193359, + "step": 4352, + "time_per_iteration": 2.9425594806671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140282, + "balance_loss_mlp": 1.06703997, + "epoch": 0.8374374759522893, + "flos": 580361069568.0, + "grad_norm": 0.036304027113603754, + "language_loss": 0.89294255, + "learning_rate": 6.771605967466033e-05, + "loss": 0.90434539, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.73242188, + "step": 4353, + "time_per_iteration": 2.686323881149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139987, + "balance_loss_mlp": 1.06669676, + "epoch": 0.8376298576375529, + "flos": 789527996928.0, + "grad_norm": 0.03911073314318024, + "language_loss": 0.87069052, + "learning_rate": 6.755958858777434e-05, + "loss": 0.88209045, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.73291016, + "step": 4354, + "time_per_iteration": 3.059568166732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140086, + "balance_loss_mlp": 1.06679642, + "epoch": 0.8378222393228165, + "flos": 578722137600.0, + "grad_norm": 0.03555136596776637, + "language_loss": 0.85425603, + "learning_rate": 6.74032853891452e-05, + "loss": 0.86565685, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.73291016, + "step": 4355, + "time_per_iteration": 2.7401504516601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138958, + "balance_loss_mlp": 1.06566799, + "epoch": 0.83801462100808, + "flos": 481858335744.0, + "grad_norm": 0.03498215623204101, + "language_loss": 0.86501992, + "learning_rate": 6.724715013945548e-05, + "loss": 0.87640953, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.73291016, + "step": 4356, + "time_per_iteration": 2.637608528137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139048, + "balance_loss_mlp": 1.06580544, + "epoch": 0.8382070026933436, + "flos": 551996941824.0, + "grad_norm": 0.03258486084339394, + "language_loss": 0.93043453, + "learning_rate": 6.709118289932226e-05, + "loss": 0.94182503, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.73242188, + "step": 4357, + "time_per_iteration": 2.803379535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139227, + "balance_loss_mlp": 1.06584203, + "epoch": 0.8383993843786072, + "flos": 626225146368.0, + "grad_norm": 0.04207482015939984, + "language_loss": 0.87703115, + "learning_rate": 6.693538372929725e-05, + "loss": 0.88842344, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.73388672, + "step": 4358, + "time_per_iteration": 2.893259286880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139769, + "balance_loss_mlp": 1.06652725, + "epoch": 0.8385917660638708, + "flos": 492135088128.0, + "grad_norm": 0.038027162181002674, + "language_loss": 0.91387022, + "learning_rate": 6.677975268986719e-05, + "loss": 0.92526793, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.73242188, + "step": 4359, + "time_per_iteration": 2.580935001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140209, + "balance_loss_mlp": 1.06691909, + "epoch": 0.8387841477491342, + "flos": 467869291008.0, + "grad_norm": 0.03829625401791919, + "language_loss": 0.91665077, + "learning_rate": 6.662428984145336e-05, + "loss": 0.92805284, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.73291016, + "step": 4360, + "time_per_iteration": 2.583767890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144524, + "balance_loss_mlp": 1.07299805, + "epoch": 0.8389765294343978, + "flos": 1567597658112.0, + "grad_norm": 0.007274153524221762, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72924709, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.71679688, + "step": 4361, + "time_per_iteration": 5.073408365249634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138412, + "balance_loss_mlp": 1.06521726, + "epoch": 0.8391689111196614, + "flos": 603411629568.0, + "grad_norm": 0.030598309130581258, + "language_loss": 0.86443758, + "learning_rate": 6.631386895903308e-05, + "loss": 0.87582171, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.73193359, + "step": 4362, + "time_per_iteration": 2.8680214881896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138687, + "balance_loss_mlp": 1.06544518, + "epoch": 0.839361292804925, + "flos": 443968065024.0, + "grad_norm": 0.03783251777685458, + "language_loss": 0.84810257, + "learning_rate": 6.615891104554261e-05, + "loss": 0.85948944, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.73242188, + "step": 4363, + "time_per_iteration": 2.5391616821289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138227, + "balance_loss_mlp": 1.06493664, + "epoch": 0.8395536744901886, + "flos": 595298835456.0, + "grad_norm": 0.034478723046930226, + "language_loss": 0.87398577, + "learning_rate": 6.600412156410057e-05, + "loss": 0.88536799, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.73291016, + "step": 4364, + "time_per_iteration": 2.712852716445923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138545, + "balance_loss_mlp": 1.06525552, + "epoch": 0.8397460561754521, + "flos": 891334791168.0, + "grad_norm": 0.03388693894725111, + "language_loss": 0.89365327, + "learning_rate": 6.58495005748016e-05, + "loss": 0.90503871, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.73291016, + "step": 4365, + "time_per_iteration": 3.19172739982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138646, + "balance_loss_mlp": 1.06540406, + "epoch": 0.8399384378607156, + "flos": 554560399872.0, + "grad_norm": 0.034766159346027045, + "language_loss": 0.93272662, + "learning_rate": 6.569504813767463e-05, + "loss": 0.94411302, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.73242188, + "step": 4366, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138425, + "balance_loss_mlp": 1.06523097, + "epoch": 0.8401308195459792, + "flos": 519963456000.0, + "grad_norm": 0.031091903503957602, + "language_loss": 0.87725037, + "learning_rate": 6.554076431268341e-05, + "loss": 0.88863462, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.73193359, + "step": 4367, + "time_per_iteration": 2.6440939903259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138179, + "balance_loss_mlp": 1.06488955, + "epoch": 0.8403232012312428, + "flos": 686295118848.0, + "grad_norm": 0.03330958137241384, + "language_loss": 0.84921622, + "learning_rate": 6.538664915972648e-05, + "loss": 0.86059797, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.73291016, + "step": 4368, + "time_per_iteration": 3.006840944290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136367, + "balance_loss_mlp": 1.06307733, + "epoch": 0.8405155829165063, + "flos": 578669744640.0, + "grad_norm": 0.040494146128891996, + "language_loss": 0.82172203, + "learning_rate": 6.523270273863652e-05, + "loss": 0.83308572, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.73291016, + "step": 4369, + "time_per_iteration": 2.726771354675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136099, + "balance_loss_mlp": 1.06290472, + "epoch": 0.8407079646017699, + "flos": 457566342144.0, + "grad_norm": 0.03926161531299747, + "language_loss": 0.92181575, + "learning_rate": 6.507892510918079e-05, + "loss": 0.93317676, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.73193359, + "step": 4370, + "time_per_iteration": 2.5662474632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136174, + "balance_loss_mlp": 1.06288445, + "epoch": 0.8409003462870335, + "flos": 535999664640.0, + "grad_norm": 0.03344035414756239, + "language_loss": 0.86222243, + "learning_rate": 6.492531633106114e-05, + "loss": 0.87358415, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.73291016, + "step": 4371, + "time_per_iteration": 2.7723512649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136298, + "balance_loss_mlp": 1.0631038, + "epoch": 0.8410927279722971, + "flos": 557899388928.0, + "grad_norm": 0.03943054767144193, + "language_loss": 0.82708782, + "learning_rate": 6.477187646391374e-05, + "loss": 0.83845079, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.73193359, + "step": 4372, + "time_per_iteration": 2.725720167160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141853, + "balance_loss_mlp": 1.07013702, + "epoch": 0.8412851096575606, + "flos": 1552926408192.0, + "grad_norm": 0.004959659749384099, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78820974, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.71875, + "step": 4373, + "time_per_iteration": 4.933819770812988 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136162, + "balance_loss_mlp": 1.06296706, + "epoch": 0.8414774913428241, + "flos": 553108119552.0, + "grad_norm": 0.03645525381144212, + "language_loss": 0.84143221, + "learning_rate": 6.446550370075271e-05, + "loss": 0.85279381, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.73193359, + "step": 4374, + "time_per_iteration": 2.7640419006347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140726, + "balance_loss_mlp": 1.06743658, + "epoch": 0.8416698730280877, + "flos": 574069856256.0, + "grad_norm": 0.035030184778751555, + "language_loss": 0.82005304, + "learning_rate": 6.431257092368336e-05, + "loss": 0.83146024, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.73291016, + "step": 4375, + "time_per_iteration": 2.8986310958862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114067, + "balance_loss_mlp": 1.06737995, + "epoch": 0.8418622547133513, + "flos": 760043232768.0, + "grad_norm": 0.04161434529267318, + "language_loss": 0.84811461, + "learning_rate": 6.415980729547543e-05, + "loss": 0.85952127, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.73291016, + "step": 4376, + "time_per_iteration": 2.9330646991729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140768, + "balance_loss_mlp": 1.06743073, + "epoch": 0.8420546363986149, + "flos": 1075921015296.0, + "grad_norm": 0.04130069201888351, + "language_loss": 0.78135824, + "learning_rate": 6.40072128754366e-05, + "loss": 0.79276592, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.73339844, + "step": 4377, + "time_per_iteration": 3.4237923622131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140552, + "balance_loss_mlp": 1.06735754, + "epoch": 0.8422470180838784, + "flos": 527016738816.0, + "grad_norm": 0.03545536535288648, + "language_loss": 0.87165993, + "learning_rate": 6.385478772280933e-05, + "loss": 0.88306552, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.73193359, + "step": 4378, + "time_per_iteration": 2.753131628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141175, + "balance_loss_mlp": 1.06793308, + "epoch": 0.842439399769142, + "flos": 601963352064.0, + "grad_norm": 0.03434358981966458, + "language_loss": 0.86777276, + "learning_rate": 6.370253189677038e-05, + "loss": 0.87918454, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.73242188, + "step": 4379, + "time_per_iteration": 2.779681921005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114171, + "balance_loss_mlp": 1.06846821, + "epoch": 0.8426317814544055, + "flos": 553375362048.0, + "grad_norm": 0.03541517543705223, + "language_loss": 0.90755582, + "learning_rate": 6.355044545643073e-05, + "loss": 0.91897291, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.73242188, + "step": 4380, + "time_per_iteration": 2.812915802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142077, + "balance_loss_mlp": 1.06878674, + "epoch": 0.8428241631396691, + "flos": 680044838400.0, + "grad_norm": 0.03810176337310906, + "language_loss": 0.82064164, + "learning_rate": 6.33985284608356e-05, + "loss": 0.83206236, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.73291016, + "step": 4381, + "time_per_iteration": 3.037733554840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138351, + "balance_loss_mlp": 1.0651089, + "epoch": 0.8430165448249327, + "flos": 755198295552.0, + "grad_norm": 0.028303447358351223, + "language_loss": 0.8332209, + "learning_rate": 6.324678096896435e-05, + "loss": 0.84460437, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.73242188, + "step": 4382, + "time_per_iteration": 3.35500431060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136887, + "balance_loss_mlp": 1.06354892, + "epoch": 0.8432089265101962, + "flos": 700435886592.0, + "grad_norm": 0.03473950502542374, + "language_loss": 0.85785019, + "learning_rate": 6.30952030397306e-05, + "loss": 0.86921906, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.73339844, + "step": 4383, + "time_per_iteration": 2.925360918045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135912, + "balance_loss_mlp": 1.06262255, + "epoch": 0.8434013081954598, + "flos": 486790594560.0, + "grad_norm": 0.03830758033053903, + "language_loss": 0.88952708, + "learning_rate": 6.294379473198208e-05, + "loss": 0.90088624, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.73291016, + "step": 4384, + "time_per_iteration": 2.6873929500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135663, + "balance_loss_mlp": 1.06251621, + "epoch": 0.8435936898807234, + "flos": 521630585856.0, + "grad_norm": 0.03664735464592092, + "language_loss": 0.89606541, + "learning_rate": 6.279255610450068e-05, + "loss": 0.90742207, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.73144531, + "step": 4385, + "time_per_iteration": 2.619441509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136436, + "balance_loss_mlp": 1.06328917, + "epoch": 0.843786071565987, + "flos": 787313647104.0, + "grad_norm": 0.03681711065218231, + "language_loss": 0.85414076, + "learning_rate": 6.264148721600254e-05, + "loss": 0.8655051, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.73144531, + "step": 4386, + "time_per_iteration": 3.0707485675811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140541, + "balance_loss_mlp": 1.06882477, + "epoch": 0.8439784532512504, + "flos": 1449513609216.0, + "grad_norm": 0.00413751236378941, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76977056, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.71875, + "step": 4387, + "time_per_iteration": 5.089155197143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113603, + "balance_loss_mlp": 1.06278777, + "epoch": 0.844170834936514, + "flos": 709968033792.0, + "grad_norm": 0.06407093609242513, + "language_loss": 0.88289285, + "learning_rate": 6.23398588904906e-05, + "loss": 0.89425313, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.73242188, + "step": 4388, + "time_per_iteration": 3.0436534881591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.06205094, + "epoch": 0.8443632166217776, + "flos": 484409058816.0, + "grad_norm": 0.03790339659307899, + "language_loss": 0.8391732, + "learning_rate": 6.218929957057922e-05, + "loss": 0.85052609, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.73242188, + "step": 4389, + "time_per_iteration": 2.7934298515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137913, + "balance_loss_mlp": 1.0647186, + "epoch": 0.8445555983070412, + "flos": 679923314688.0, + "grad_norm": 0.03718559505154548, + "language_loss": 0.8493886, + "learning_rate": 6.2038910223856e-05, + "loss": 0.86076784, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.73193359, + "step": 4390, + "time_per_iteration": 2.9792392253875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137881, + "balance_loss_mlp": 1.06468666, + "epoch": 0.8447479799923048, + "flos": 742858916352.0, + "grad_norm": 0.03376774595397736, + "language_loss": 0.78831851, + "learning_rate": 6.18886909087073e-05, + "loss": 0.79969728, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.73193359, + "step": 4391, + "time_per_iteration": 3.1305229663848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_mlp": 1.06367922, + "epoch": 0.8449403616775683, + "flos": 954949870080.0, + "grad_norm": 0.036571969449469936, + "language_loss": 0.84915316, + "learning_rate": 6.173864168345344e-05, + "loss": 0.86052191, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.73193359, + "step": 4392, + "time_per_iteration": 3.35559344291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137009, + "balance_loss_mlp": 1.06371963, + "epoch": 0.8451327433628318, + "flos": 658607740416.0, + "grad_norm": 0.04080767890774202, + "language_loss": 0.78550094, + "learning_rate": 6.158876260634871e-05, + "loss": 0.79687101, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.73291016, + "step": 4393, + "time_per_iteration": 2.8861243724823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136802, + "balance_loss_mlp": 1.06360781, + "epoch": 0.8453251250480954, + "flos": 447048543744.0, + "grad_norm": 0.03643076078950129, + "language_loss": 0.87869531, + "learning_rate": 6.143905373558112e-05, + "loss": 0.89006329, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.73193359, + "step": 4394, + "time_per_iteration": 2.601045846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136522, + "balance_loss_mlp": 1.06332743, + "epoch": 0.845517506733359, + "flos": 543873414144.0, + "grad_norm": 0.04754169737380615, + "language_loss": 0.75916922, + "learning_rate": 6.128951512927305e-05, + "loss": 0.77053452, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.73193359, + "step": 4395, + "time_per_iteration": 2.6586995124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136024, + "balance_loss_mlp": 1.06282973, + "epoch": 0.8457098884186226, + "flos": 503506280448.0, + "grad_norm": 0.034957513190318694, + "language_loss": 0.88970757, + "learning_rate": 6.114014684548046e-05, + "loss": 0.90106773, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.73193359, + "step": 4396, + "time_per_iteration": 2.641904592514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136391, + "balance_loss_mlp": 1.06319618, + "epoch": 0.8459022701038861, + "flos": 449894707200.0, + "grad_norm": 0.03727348899635202, + "language_loss": 0.85077035, + "learning_rate": 6.099094894219326e-05, + "loss": 0.86213428, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.73193359, + "step": 4397, + "time_per_iteration": 2.7485921382904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138262, + "balance_loss_mlp": 1.06516242, + "epoch": 0.8460946517891497, + "flos": 744471651840.0, + "grad_norm": 0.03568111304963743, + "language_loss": 0.79751641, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.80889904, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.73095703, + "step": 4398, + "time_per_iteration": 3.0065886974334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137749, + "balance_loss_mlp": 1.06469774, + "epoch": 0.8462870334744133, + "flos": 554326084608.0, + "grad_norm": 0.034126813456360164, + "language_loss": 0.84568942, + "learning_rate": 6.069306450876389e-05, + "loss": 0.85706693, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.73095703, + "step": 4399, + "time_per_iteration": 2.758197069168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142326, + "balance_loss_mlp": 1.07080078, + "epoch": 0.8464794151596768, + "flos": 1568268403200.0, + "grad_norm": 0.004082399579893022, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82850897, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.71679688, + "step": 4400, + "time_per_iteration": 5.1885364055633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113736, + "balance_loss_mlp": 1.06416523, + "epoch": 0.8466717968449403, + "flos": 551265071616.0, + "grad_norm": 0.03422118197100462, + "language_loss": 0.84376073, + "learning_rate": 6.039586229158084e-05, + "loss": 0.85513437, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.73193359, + "step": 4401, + "time_per_iteration": 2.866410255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137584, + "balance_loss_mlp": 1.06438947, + "epoch": 0.8468641785302039, + "flos": 553095384576.0, + "grad_norm": 0.04013122246303511, + "language_loss": 0.89010692, + "learning_rate": 6.024751715835314e-05, + "loss": 0.90148282, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.73193359, + "step": 4402, + "time_per_iteration": 2.8533406257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137073, + "balance_loss_mlp": 1.06402123, + "epoch": 0.8470565602154675, + "flos": 573824807424.0, + "grad_norm": 0.04032328985760824, + "language_loss": 0.91560149, + "learning_rate": 6.009934275218049e-05, + "loss": 0.92697221, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.73095703, + "step": 4403, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137261, + "balance_loss_mlp": 1.06406605, + "epoch": 0.8472489419007311, + "flos": 473780470272.0, + "grad_norm": 0.040727002498919716, + "language_loss": 0.89137018, + "learning_rate": 5.995133913058936e-05, + "loss": 0.90274274, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.73193359, + "step": 4404, + "time_per_iteration": 2.5842621326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137106, + "balance_loss_mlp": 1.06405413, + "epoch": 0.8474413235859947, + "flos": 799377051648.0, + "grad_norm": 0.036020961775101966, + "language_loss": 0.84674489, + "learning_rate": 5.980350635103954e-05, + "loss": 0.85811591, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.73095703, + "step": 4405, + "time_per_iteration": 3.0260725021362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138055, + "balance_loss_mlp": 1.06495583, + "epoch": 0.8476337052712581, + "flos": 503378025984.0, + "grad_norm": 0.03673815005033266, + "language_loss": 0.85231286, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.86369342, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.73144531, + "step": 4406, + "time_per_iteration": 2.5721280574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011379, + "balance_loss_mlp": 1.06475341, + "epoch": 0.8478260869565217, + "flos": 933516774912.0, + "grad_norm": 0.029177319887610593, + "language_loss": 0.87274981, + "learning_rate": 5.9508353547573e-05, + "loss": 0.88412881, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.73193359, + "step": 4407, + "time_per_iteration": 3.267518997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138465, + "balance_loss_mlp": 1.0652225, + "epoch": 0.8480184686417853, + "flos": 710052627456.0, + "grad_norm": 0.039132750442480525, + "language_loss": 0.85530651, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.86669123, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.73242188, + "step": 4408, + "time_per_iteration": 2.9040720462799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138377, + "balance_loss_mlp": 1.06513441, + "epoch": 0.8482108503270489, + "flos": 615598559232.0, + "grad_norm": 0.03128645050494452, + "language_loss": 0.8671034, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.87848717, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.73242188, + "step": 4409, + "time_per_iteration": 2.814863443374634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138014, + "balance_loss_mlp": 1.06486738, + "epoch": 0.8484032320123124, + "flos": 532072522752.0, + "grad_norm": 0.0338980139670295, + "language_loss": 0.86382216, + "learning_rate": 5.906690709037194e-05, + "loss": 0.8752023, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.73193359, + "step": 4410, + "time_per_iteration": 2.678199291229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142998, + "balance_loss_mlp": 1.07147217, + "epoch": 0.848595613697576, + "flos": 1546171293696.0, + "grad_norm": 0.005786644875246692, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.7743991, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.71679688, + "step": 4411, + "time_per_iteration": 4.905268669128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113754, + "balance_loss_mlp": 1.06439316, + "epoch": 0.8487879953828396, + "flos": 678618754560.0, + "grad_norm": 0.03786348460058995, + "language_loss": 0.78656065, + "learning_rate": 5.877346528406635e-05, + "loss": 0.79793596, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.73144531, + "step": 4412, + "time_per_iteration": 2.9538323879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113752, + "balance_loss_mlp": 1.06432509, + "epoch": 0.8489803770681031, + "flos": 504671852544.0, + "grad_norm": 0.03662625673681008, + "language_loss": 0.84200561, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.8533808, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.73193359, + "step": 4413, + "time_per_iteration": 2.631989002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137281, + "balance_loss_mlp": 1.06408703, + "epoch": 0.8491727587533667, + "flos": 564349056000.0, + "grad_norm": 0.0365734841662918, + "language_loss": 0.81773579, + "learning_rate": 5.84807086750247e-05, + "loss": 0.82910866, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.73193359, + "step": 4414, + "time_per_iteration": 2.7764105796813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137553, + "balance_loss_mlp": 1.06435871, + "epoch": 0.8493651404386302, + "flos": 460748878848.0, + "grad_norm": 0.050320136156211864, + "language_loss": 0.83642417, + "learning_rate": 5.833458746159243e-05, + "loss": 0.84779972, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.73193359, + "step": 4415, + "time_per_iteration": 2.55906343460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136357, + "balance_loss_mlp": 1.06321061, + "epoch": 0.8495575221238938, + "flos": 462144763392.0, + "grad_norm": 0.042827503999962074, + "language_loss": 0.86903214, + "learning_rate": 5.818863771788013e-05, + "loss": 0.88039577, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.73193359, + "step": 4416, + "time_per_iteration": 2.7008659839630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141524, + "balance_loss_mlp": 1.06790054, + "epoch": 0.8497499038091574, + "flos": 872152975872.0, + "grad_norm": 0.03663907725736085, + "language_loss": 0.85962868, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.87104392, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.734375, + "step": 4417, + "time_per_iteration": 3.1430251598358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113909, + "balance_loss_mlp": 1.06594312, + "epoch": 0.849942285494421, + "flos": 780974770176.0, + "grad_norm": 0.037432401008812614, + "language_loss": 0.82071102, + "learning_rate": 5.789725286620018e-05, + "loss": 0.83210188, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.73193359, + "step": 4418, + "time_per_iteration": 3.003854990005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138933, + "balance_loss_mlp": 1.0654043, + "epoch": 0.8501346671796844, + "flos": 514907672064.0, + "grad_norm": 0.035344238090593685, + "language_loss": 0.8925063, + "learning_rate": 5.775181787135819e-05, + "loss": 0.90389562, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.73388672, + "step": 4419, + "time_per_iteration": 2.6802642345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140063, + "balance_loss_mlp": 1.06663048, + "epoch": 0.850327048864948, + "flos": 622634377728.0, + "grad_norm": 0.045521781734965405, + "language_loss": 0.87826395, + "learning_rate": 5.76065545724877e-05, + "loss": 0.88966453, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.73339844, + "step": 4420, + "time_per_iteration": 2.812560558319092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113939, + "balance_loss_mlp": 1.06595683, + "epoch": 0.8505194305502116, + "flos": 775549685760.0, + "grad_norm": 0.03647510347249887, + "language_loss": 0.84107387, + "learning_rate": 5.746146302598454e-05, + "loss": 0.85246778, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.73339844, + "step": 4421, + "time_per_iteration": 3.0192792415618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140317, + "balance_loss_mlp": 1.06697929, + "epoch": 0.8507118122354752, + "flos": 466212894720.0, + "grad_norm": 0.037024341612432836, + "language_loss": 0.90897202, + "learning_rate": 5.731654328817859e-05, + "loss": 0.92037523, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.73291016, + "step": 4422, + "time_per_iteration": 2.584484100341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139141, + "balance_loss_mlp": 1.06580317, + "epoch": 0.8509041939207388, + "flos": 535469908992.0, + "grad_norm": 0.035199882567299716, + "language_loss": 0.8991701, + "learning_rate": 5.717179541533257e-05, + "loss": 0.9105615, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.73291016, + "step": 4423, + "time_per_iteration": 2.732942819595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139224, + "balance_loss_mlp": 1.06588686, + "epoch": 0.8510965756060023, + "flos": 584828700672.0, + "grad_norm": 0.037111715680716625, + "language_loss": 0.89189512, + "learning_rate": 5.702721946364264e-05, + "loss": 0.90328735, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.73291016, + "step": 4424, + "time_per_iteration": 2.698284864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139191, + "balance_loss_mlp": 1.0658536, + "epoch": 0.8512889572912659, + "flos": 602017746432.0, + "grad_norm": 0.06811401099002824, + "language_loss": 0.81721288, + "learning_rate": 5.688281548923796e-05, + "loss": 0.82860482, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.73291016, + "step": 4425, + "time_per_iteration": 2.8075883388519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137527, + "balance_loss_mlp": 1.06409407, + "epoch": 0.8514813389765294, + "flos": 656065749504.0, + "grad_norm": 0.035446247672874326, + "language_loss": 0.82858717, + "learning_rate": 5.673858354818151e-05, + "loss": 0.83996248, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.73388672, + "step": 4426, + "time_per_iteration": 2.880490303039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136979, + "balance_loss_mlp": 1.06359351, + "epoch": 0.851673720661793, + "flos": 430658497536.0, + "grad_norm": 0.03977079168614994, + "language_loss": 0.84184194, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.8532117, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.73388672, + "step": 4427, + "time_per_iteration": 2.5517382621765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.06356657, + "epoch": 0.8518661023470565, + "flos": 642758183424.0, + "grad_norm": 0.03736572659166184, + "language_loss": 0.84144545, + "learning_rate": 5.645063599002875e-05, + "loss": 0.85281491, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.73388672, + "step": 4428, + "time_per_iteration": 2.7877635955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136697, + "balance_loss_mlp": 1.06307364, + "epoch": 0.8520584840323201, + "flos": 563198220288.0, + "grad_norm": 0.038754285899443935, + "language_loss": 0.83934295, + "learning_rate": 5.630692048472363e-05, + "loss": 0.85070992, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.73535156, + "step": 4429, + "time_per_iteration": 2.690920352935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137169, + "balance_loss_mlp": 1.06344974, + "epoch": 0.8522508657175837, + "flos": 528080252928.0, + "grad_norm": 0.04107244986742461, + "language_loss": 0.83775079, + "learning_rate": 5.61633772363489e-05, + "loss": 0.84912252, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.73583984, + "step": 4430, + "time_per_iteration": 2.6325595378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136253, + "balance_loss_mlp": 1.06272459, + "epoch": 0.8524432474028473, + "flos": 500102163456.0, + "grad_norm": 0.03352438353947398, + "language_loss": 0.84562439, + "learning_rate": 5.602000630063298e-05, + "loss": 0.85698688, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.73486328, + "step": 4431, + "time_per_iteration": 2.6214230060577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135919, + "balance_loss_mlp": 1.06239092, + "epoch": 0.8526356290881109, + "flos": 422216060928.0, + "grad_norm": 0.048049255454419064, + "language_loss": 0.86048019, + "learning_rate": 5.587680773323706e-05, + "loss": 0.8718394, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.73486328, + "step": 4432, + "time_per_iteration": 2.535344362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136977, + "balance_loss_mlp": 1.06349599, + "epoch": 0.8528280107733743, + "flos": 508329750528.0, + "grad_norm": 0.034970015630649706, + "language_loss": 0.8575263, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.86889607, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.73388672, + "step": 4433, + "time_per_iteration": 2.598065137863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136944, + "balance_loss_mlp": 1.06360638, + "epoch": 0.8530203924586379, + "flos": 446816229888.0, + "grad_norm": 0.03606846672239564, + "language_loss": 0.87374574, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.88511515, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.73339844, + "step": 4434, + "time_per_iteration": 2.5089426040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136784, + "balance_loss_mlp": 1.06330335, + "epoch": 0.8532127741439015, + "flos": 658989775872.0, + "grad_norm": 0.0360649650839633, + "language_loss": 0.88019717, + "learning_rate": 5.54482467965825e-05, + "loss": 0.89156508, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.73388672, + "step": 4435, + "time_per_iteration": 2.8504323959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137737, + "balance_loss_mlp": 1.06420863, + "epoch": 0.8534051558291651, + "flos": 537098107392.0, + "grad_norm": 0.03019065878399416, + "language_loss": 0.87391806, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.88529551, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.734375, + "step": 4436, + "time_per_iteration": 2.728482246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137806, + "balance_loss_mlp": 1.06399131, + "epoch": 0.8535975375144286, + "flos": 534037094400.0, + "grad_norm": 0.04283357460488269, + "language_loss": 0.84772766, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.85910571, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.73632812, + "step": 4437, + "time_per_iteration": 2.6375861167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137872, + "balance_loss_mlp": 1.06405759, + "epoch": 0.8537899191996922, + "flos": 575268355584.0, + "grad_norm": 0.04299966443974174, + "language_loss": 0.8751781, + "learning_rate": 5.502123917219848e-05, + "loss": 0.88655686, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.73632812, + "step": 4438, + "time_per_iteration": 2.698176145553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137907, + "balance_loss_mlp": 1.0640924, + "epoch": 0.8539823008849557, + "flos": 466006777344.0, + "grad_norm": 0.03463807162353114, + "language_loss": 0.87774605, + "learning_rate": 5.48792487359433e-05, + "loss": 0.88912511, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.73632812, + "step": 4439, + "time_per_iteration": 2.6831352710723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137679, + "balance_loss_mlp": 1.06410253, + "epoch": 0.8541746825702193, + "flos": 555806562816.0, + "grad_norm": 0.03867022608803846, + "language_loss": 0.86941582, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.88079262, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.73486328, + "step": 4440, + "time_per_iteration": 2.6928815841674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136981, + "balance_loss_mlp": 1.06331003, + "epoch": 0.8543670642554829, + "flos": 547557508608.0, + "grad_norm": 0.03384967972445922, + "language_loss": 0.81909108, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.83046091, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.73535156, + "step": 4441, + "time_per_iteration": 2.747842311859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137327, + "balance_loss_mlp": 1.06360793, + "epoch": 0.8545594459407464, + "flos": 513075357696.0, + "grad_norm": 0.032234703238349205, + "language_loss": 0.86772889, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.87910211, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.73583984, + "step": 4442, + "time_per_iteration": 2.6481122970581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136922, + "balance_loss_mlp": 1.0632025, + "epoch": 0.85475182762601, + "flos": 422085805056.0, + "grad_norm": 0.03761893009858474, + "language_loss": 0.86693609, + "learning_rate": 5.431301565318786e-05, + "loss": 0.87830532, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.73583984, + "step": 4443, + "time_per_iteration": 2.4967923164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136971, + "balance_loss_mlp": 1.06339502, + "epoch": 0.8549442093112736, + "flos": 390291363840.0, + "grad_norm": 0.04115905585379076, + "language_loss": 0.82256216, + "learning_rate": 5.41718898228542e-05, + "loss": 0.83393186, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.73486328, + "step": 4444, + "time_per_iteration": 2.5440807342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137026, + "balance_loss_mlp": 1.0632118, + "epoch": 0.8551365909965372, + "flos": 607154121216.0, + "grad_norm": 0.035375940453208764, + "language_loss": 0.84474754, + "learning_rate": 5.403093707834334e-05, + "loss": 0.85611778, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.73632812, + "step": 4445, + "time_per_iteration": 2.843111515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136392, + "balance_loss_mlp": 1.0628165, + "epoch": 0.8553289726818007, + "flos": 505155945984.0, + "grad_norm": 0.03917502988089021, + "language_loss": 0.83616102, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.84752494, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.73486328, + "step": 4446, + "time_per_iteration": 2.59338641166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135534, + "balance_loss_mlp": 1.06162477, + "epoch": 0.8555213543670642, + "flos": 558105506304.0, + "grad_norm": 0.03523140729629835, + "language_loss": 0.80791306, + "learning_rate": 5.374955106561324e-05, + "loss": 0.81926841, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.73779297, + "step": 4447, + "time_per_iteration": 2.766433000564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135051, + "balance_loss_mlp": 1.06114113, + "epoch": 0.8557137360523278, + "flos": 549152779776.0, + "grad_norm": 0.042335426638136726, + "language_loss": 0.80681795, + "learning_rate": 5.360911790663775e-05, + "loss": 0.81816846, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.73779297, + "step": 4448, + "time_per_iteration": 2.69462251663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135659, + "balance_loss_mlp": 1.0617491, + "epoch": 0.8559061177375914, + "flos": 729503686656.0, + "grad_norm": 0.03336299345483442, + "language_loss": 0.82454473, + "learning_rate": 5.346885805197238e-05, + "loss": 0.83590126, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.73779297, + "step": 4449, + "time_per_iteration": 3.009009838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136011, + "balance_loss_mlp": 1.06238735, + "epoch": 0.856098499422855, + "flos": 536976583680.0, + "grad_norm": 0.039322841970345704, + "language_loss": 0.88345414, + "learning_rate": 5.332877155607085e-05, + "loss": 0.89481425, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.73583984, + "step": 4450, + "time_per_iteration": 2.6745853424072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135355, + "balance_loss_mlp": 1.06163609, + "epoch": 0.8562908811081185, + "flos": 574775529984.0, + "grad_norm": 0.03966072419835989, + "language_loss": 0.88200045, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.893354, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.73681641, + "step": 4451, + "time_per_iteration": 2.7596144676208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136075, + "balance_loss_mlp": 1.06249857, + "epoch": 0.856483262793382, + "flos": 783215316480.0, + "grad_norm": 0.03609964177893848, + "language_loss": 0.85612303, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.86748379, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.73535156, + "step": 4452, + "time_per_iteration": 3.099785089492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.06311262, + "epoch": 0.8566756444786456, + "flos": 456756609024.0, + "grad_norm": 0.03265431385486569, + "language_loss": 0.89154232, + "learning_rate": 5.290955276447651e-05, + "loss": 0.90290874, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.73486328, + "step": 4453, + "time_per_iteration": 2.553025007247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135846, + "balance_loss_mlp": 1.06236541, + "epoch": 0.8568680261639092, + "flos": 450315674112.0, + "grad_norm": 0.036031278358889064, + "language_loss": 0.88903332, + "learning_rate": 5.277016024682091e-05, + "loss": 0.9003917, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.73486328, + "step": 4454, + "time_per_iteration": 2.578143835067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142006, + "balance_loss_mlp": 1.0684303, + "epoch": 0.8570604078491728, + "flos": 480937812480.0, + "grad_norm": 0.0381879382744482, + "language_loss": 0.87082827, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.88224834, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.73583984, + "step": 4455, + "time_per_iteration": 2.5473132133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141253, + "balance_loss_mlp": 1.06762922, + "epoch": 0.8572527895344363, + "flos": 506933865984.0, + "grad_norm": 0.04281102576641978, + "language_loss": 0.8965286, + "learning_rate": 5.249189615562627e-05, + "loss": 0.9079411, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.73632812, + "step": 4456, + "time_per_iteration": 2.581775665283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_mlp": 1.06748414, + "epoch": 0.8574451712196999, + "flos": 788475216384.0, + "grad_norm": 0.03185344103864885, + "language_loss": 0.87001526, + "learning_rate": 5.235302469011905e-05, + "loss": 0.88142449, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.734375, + "step": 4457, + "time_per_iteration": 3.0588223934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_mlp": 1.06711328, + "epoch": 0.8576375529049635, + "flos": 510346715136.0, + "grad_norm": 0.037812671186274974, + "language_loss": 0.79738897, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.80879498, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.73486328, + "step": 4458, + "time_per_iteration": 2.681156635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146698, + "balance_loss_mlp": 1.07498169, + "epoch": 0.857829934590227, + "flos": 1463888692224.0, + "grad_norm": 0.008556411684699908, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85913986, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.71875, + "step": 4459, + "time_per_iteration": 4.9717326164245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143149, + "balance_loss_mlp": 1.06976426, + "epoch": 0.8580223162754905, + "flos": 480258335232.0, + "grad_norm": 0.03181762715741318, + "language_loss": 0.93217885, + "learning_rate": 5.193745326073118e-05, + "loss": 0.94361031, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.73388672, + "step": 4460, + "time_per_iteration": 2.633009672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142727, + "balance_loss_mlp": 1.06934178, + "epoch": 0.8582146979607541, + "flos": 707456242176.0, + "grad_norm": 0.040093751457138914, + "language_loss": 0.83515179, + "learning_rate": 5.179927728591227e-05, + "loss": 0.84657907, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.73388672, + "step": 4461, + "time_per_iteration": 2.835998773574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142655, + "balance_loss_mlp": 1.06922185, + "epoch": 0.8584070796460177, + "flos": 766492899840.0, + "grad_norm": 0.04020414939935447, + "language_loss": 0.87611806, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.88754463, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.734375, + "step": 4462, + "time_per_iteration": 2.9936819076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142285, + "balance_loss_mlp": 1.06885219, + "epoch": 0.8585994613312813, + "flos": 588009235968.0, + "grad_norm": 0.034025859465722855, + "language_loss": 0.905936, + "learning_rate": 5.152344741070919e-05, + "loss": 0.91735888, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.734375, + "step": 4463, + "time_per_iteration": 2.7997395992279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142411, + "balance_loss_mlp": 1.06912124, + "epoch": 0.8587918430165449, + "flos": 609509460480.0, + "grad_norm": 0.03526777948899912, + "language_loss": 0.83016932, + "learning_rate": 5.138579361741169e-05, + "loss": 0.8415935, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.73291016, + "step": 4464, + "time_per_iteration": 2.799365520477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141716, + "balance_loss_mlp": 1.06833065, + "epoch": 0.8589842247018084, + "flos": 590069134848.0, + "grad_norm": 0.038611970938618144, + "language_loss": 0.86071271, + "learning_rate": 5.124831399159535e-05, + "loss": 0.87212992, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.73388672, + "step": 4465, + "time_per_iteration": 2.7324819564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139539, + "balance_loss_mlp": 1.06610572, + "epoch": 0.8591766063870719, + "flos": 544963124736.0, + "grad_norm": 0.04312248482760193, + "language_loss": 0.83882284, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.85021818, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.734375, + "step": 4466, + "time_per_iteration": 2.703601360321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137187, + "balance_loss_mlp": 1.06365895, + "epoch": 0.8593689880723355, + "flos": 494785867776.0, + "grad_norm": 0.03941312585989275, + "language_loss": 0.86610931, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.87748122, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.73486328, + "step": 4467, + "time_per_iteration": 2.681820869445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136714, + "balance_loss_mlp": 1.06309068, + "epoch": 0.8595613697575991, + "flos": 534940153344.0, + "grad_norm": 0.03756039109661357, + "language_loss": 0.88946462, + "learning_rate": 5.083692065243822e-05, + "loss": 0.9008317, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.73535156, + "step": 4468, + "time_per_iteration": 2.6121115684509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136802, + "balance_loss_mlp": 1.06327391, + "epoch": 0.8597537514428626, + "flos": 618754899456.0, + "grad_norm": 0.03953585832407336, + "language_loss": 0.80730748, + "learning_rate": 5.070013822961328e-05, + "loss": 0.81867552, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.73486328, + "step": 4469, + "time_per_iteration": 2.729743719100952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136854, + "balance_loss_mlp": 1.06332588, + "epoch": 0.8599461331281262, + "flos": 609856567296.0, + "grad_norm": 0.039611412927669135, + "language_loss": 0.88193107, + "learning_rate": 5.056353024046462e-05, + "loss": 0.89329958, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.73486328, + "step": 4470, + "time_per_iteration": 2.747981071472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139553, + "balance_loss_mlp": 1.06573892, + "epoch": 0.8601385148133898, + "flos": 552344048640.0, + "grad_norm": 0.036428161077625955, + "language_loss": 0.87615812, + "learning_rate": 5.042709673802786e-05, + "loss": 0.88755369, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.73632812, + "step": 4471, + "time_per_iteration": 2.732907772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_mlp": 1.071509, + "epoch": 0.8603308964986534, + "flos": 582378034176.0, + "grad_norm": 0.031295899789225104, + "language_loss": 0.85058415, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.86203361, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.734375, + "step": 4472, + "time_per_iteration": 2.87262225151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144552, + "balance_loss_mlp": 1.07111919, + "epoch": 0.8605232781839169, + "flos": 630147558912.0, + "grad_norm": 0.04037586195823243, + "language_loss": 0.79786807, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.8093136, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.734375, + "step": 4473, + "time_per_iteration": 2.787599802017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144625, + "balance_loss_mlp": 1.07119215, + "epoch": 0.8607156598691804, + "flos": 469089257472.0, + "grad_norm": 0.040989177055780444, + "language_loss": 0.82621419, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.83766043, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.734375, + "step": 4474, + "time_per_iteration": 2.5246458053588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143896, + "balance_loss_mlp": 1.0704627, + "epoch": 0.860908041554444, + "flos": 489406445568.0, + "grad_norm": 0.03447505359677043, + "language_loss": 0.87655497, + "learning_rate": 4.988310865374945e-05, + "loss": 0.88799393, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.734375, + "step": 4475, + "time_per_iteration": 2.644601583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143481, + "balance_loss_mlp": 1.06995285, + "epoch": 0.8611004232397076, + "flos": 593169079296.0, + "grad_norm": 0.04484226543219231, + "language_loss": 0.85604751, + "learning_rate": 4.974754837804057e-05, + "loss": 0.8674823, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.73535156, + "step": 4476, + "time_per_iteration": 2.718604326248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143628, + "balance_loss_mlp": 1.07014692, + "epoch": 0.8612928049249712, + "flos": 775621544448.0, + "grad_norm": 0.035398978535946514, + "language_loss": 0.90864736, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.92008364, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.73486328, + "step": 4477, + "time_per_iteration": 3.0402839183807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143583, + "balance_loss_mlp": 1.07014966, + "epoch": 0.8614851866102347, + "flos": 538606783488.0, + "grad_norm": 0.045897520744467304, + "language_loss": 0.878411, + "learning_rate": 4.947695228969718e-05, + "loss": 0.88984686, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.734375, + "step": 4478, + "time_per_iteration": 2.653444528579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141916, + "balance_loss_mlp": 1.06838739, + "epoch": 0.8616775682954982, + "flos": 566995106304.0, + "grad_norm": 0.04005533562206663, + "language_loss": 0.84431696, + "learning_rate": 4.934191658211729e-05, + "loss": 0.85573614, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.73486328, + "step": 4479, + "time_per_iteration": 2.6883862018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114245, + "balance_loss_mlp": 1.06844449, + "epoch": 0.8618699499807618, + "flos": 482557278720.0, + "grad_norm": 0.04408793841080807, + "language_loss": 0.87477684, + "learning_rate": 4.92070558355221e-05, + "loss": 0.88620138, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.73828125, + "step": 4480, + "time_per_iteration": 2.6091084480285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142157, + "balance_loss_mlp": 1.06815219, + "epoch": 0.8620623316660254, + "flos": 650679596544.0, + "grad_norm": 0.04658475745596792, + "language_loss": 0.80903435, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.82045591, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.73828125, + "step": 4481, + "time_per_iteration": 2.7939393520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.06806207, + "epoch": 0.862254713351289, + "flos": 753081274368.0, + "grad_norm": 0.03549223597269206, + "language_loss": 0.90676355, + "learning_rate": 4.893785943464801e-05, + "loss": 0.91817951, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.73535156, + "step": 4482, + "time_per_iteration": 2.9854023456573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141507, + "balance_loss_mlp": 1.06788337, + "epoch": 0.8624470950365525, + "flos": 843135567360.0, + "grad_norm": 0.03295717035983083, + "language_loss": 0.82174349, + "learning_rate": 4.880352388488024e-05, + "loss": 0.83315861, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.73583984, + "step": 4483, + "time_per_iteration": 3.2930996417999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141344, + "balance_loss_mlp": 1.0677681, + "epoch": 0.8626394767218161, + "flos": 756087892992.0, + "grad_norm": 0.03698694034231399, + "language_loss": 0.87834418, + "learning_rate": 4.866936350511969e-05, + "loss": 0.88975763, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.73535156, + "step": 4484, + "time_per_iteration": 2.905592918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141202, + "balance_loss_mlp": 1.06767344, + "epoch": 0.8628318584070797, + "flos": 704857855488.0, + "grad_norm": 0.040701360718788646, + "language_loss": 0.86439824, + "learning_rate": 4.853537834745203e-05, + "loss": 0.87581027, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.73535156, + "step": 4485, + "time_per_iteration": 2.876677989959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141051, + "balance_loss_mlp": 1.0674752, + "epoch": 0.8630242400923432, + "flos": 472197934080.0, + "grad_norm": 0.0356487521331988, + "language_loss": 0.82481432, + "learning_rate": 4.840156846389487e-05, + "loss": 0.83622479, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.73583984, + "step": 4486, + "time_per_iteration": 2.5704009532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141683, + "balance_loss_mlp": 1.06810677, + "epoch": 0.8632166217776067, + "flos": 965962495488.0, + "grad_norm": 0.042485813473706315, + "language_loss": 0.82875609, + "learning_rate": 4.826793390639783e-05, + "loss": 0.84017289, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.73535156, + "step": 4487, + "time_per_iteration": 3.2337405681610107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141296, + "balance_loss_mlp": 1.06772029, + "epoch": 0.8634090034628703, + "flos": 769239006720.0, + "grad_norm": 0.03930910636761154, + "language_loss": 0.82854676, + "learning_rate": 4.813447472684246e-05, + "loss": 0.83995974, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.73583984, + "step": 4488, + "time_per_iteration": 3.0039660930633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114056, + "balance_loss_mlp": 1.06693602, + "epoch": 0.8636013851481339, + "flos": 521719908864.0, + "grad_norm": 0.035635459683833186, + "language_loss": 0.88014925, + "learning_rate": 4.800119097704214e-05, + "loss": 0.89155483, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.73583984, + "step": 4489, + "time_per_iteration": 2.762113332748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141983, + "balance_loss_mlp": 1.06826377, + "epoch": 0.8637937668333975, + "flos": 633293165568.0, + "grad_norm": 0.0371692275655829, + "language_loss": 0.85686231, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.86828208, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.73681641, + "step": 4490, + "time_per_iteration": 2.7638096809387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114124, + "balance_loss_mlp": 1.06771219, + "epoch": 0.8639861485186611, + "flos": 857521383936.0, + "grad_norm": 0.03348350646617803, + "language_loss": 0.80966526, + "learning_rate": 4.773514997362e-05, + "loss": 0.8210777, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.73535156, + "step": 4491, + "time_per_iteration": 3.1014699935913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141741, + "balance_loss_mlp": 1.06826007, + "epoch": 0.8641785302039245, + "flos": 482240371200.0, + "grad_norm": 0.04238731422676562, + "language_loss": 0.83083439, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.84225178, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.73486328, + "step": 4492, + "time_per_iteration": 2.5285348892211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114229, + "balance_loss_mlp": 1.06871402, + "epoch": 0.8643709118891881, + "flos": 505648771584.0, + "grad_norm": 0.03789343460075339, + "language_loss": 0.85717583, + "learning_rate": 4.746981130927675e-05, + "loss": 0.86859876, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.73583984, + "step": 4493, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141993, + "balance_loss_mlp": 1.06856048, + "epoch": 0.8645632935744517, + "flos": 553551280128.0, + "grad_norm": 0.03757320956431773, + "language_loss": 0.86991334, + "learning_rate": 4.733740548306908e-05, + "loss": 0.88133329, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.734375, + "step": 4494, + "time_per_iteration": 2.798293352127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142343, + "balance_loss_mlp": 1.06876707, + "epoch": 0.8647556752597153, + "flos": 525735647232.0, + "grad_norm": 0.037128189922481854, + "language_loss": 0.88154763, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.89297104, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.73583984, + "step": 4495, + "time_per_iteration": 2.585801124572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114259, + "balance_loss_mlp": 1.06915712, + "epoch": 0.8649480569449788, + "flos": 789237285888.0, + "grad_norm": 0.036509667691993125, + "language_loss": 0.87320912, + "learning_rate": 4.707312109960471e-05, + "loss": 0.88463509, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.734375, + "step": 4496, + "time_per_iteration": 3.11242413520813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142515, + "balance_loss_mlp": 1.06903481, + "epoch": 0.8651404386302424, + "flos": 765199073280.0, + "grad_norm": 0.037756570686122495, + "language_loss": 0.81536937, + "learning_rate": 4.694124264495225e-05, + "loss": 0.82679451, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.73486328, + "step": 4497, + "time_per_iteration": 3.061692476272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141615, + "balance_loss_mlp": 1.06813455, + "epoch": 0.865332820315506, + "flos": 540988319232.0, + "grad_norm": 0.03448303115707208, + "language_loss": 0.86013806, + "learning_rate": 4.680954008330851e-05, + "loss": 0.87155426, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.73486328, + "step": 4498, + "time_per_iteration": 2.7776076793670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146858, + "balance_loss_mlp": 1.07495117, + "epoch": 0.8655252020007695, + "flos": 1479677124096.0, + "grad_norm": 0.010203912282881854, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.803213, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.72070312, + "step": 4499, + "time_per_iteration": 4.785112619400024 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139577, + "balance_loss_mlp": 1.06576228, + "epoch": 0.8657175836860331, + "flos": 518472244224.0, + "grad_norm": 0.03309637596200986, + "language_loss": 0.86845696, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.87985277, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.73632812, + "step": 4500, + "time_per_iteration": 2.7067041397094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140055, + "balance_loss_mlp": 1.06628788, + "epoch": 0.8659099653712966, + "flos": 591632205312.0, + "grad_norm": 0.036173409641408416, + "language_loss": 0.85177112, + "learning_rate": 4.641548826740394e-05, + "loss": 0.8631717, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.73583984, + "step": 4501, + "time_per_iteration": 2.7207436561584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140299, + "balance_loss_mlp": 1.06667542, + "epoch": 0.8661023470565602, + "flos": 591575809536.0, + "grad_norm": 0.03801706750898132, + "language_loss": 0.9136349, + "learning_rate": 4.628448978842731e-05, + "loss": 0.92503786, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.73535156, + "step": 4502, + "time_per_iteration": 2.809257745742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140286, + "balance_loss_mlp": 1.06647146, + "epoch": 0.8662947287418238, + "flos": 568736096256.0, + "grad_norm": 0.03693136535041395, + "language_loss": 0.84185296, + "learning_rate": 4.61536674574336e-05, + "loss": 0.85325581, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.73632812, + "step": 4503, + "time_per_iteration": 2.7448463439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141065, + "balance_loss_mlp": 1.06729817, + "epoch": 0.8664871104270874, + "flos": 517002499584.0, + "grad_norm": 0.029797244201928218, + "language_loss": 0.85579336, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.86720395, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.73583984, + "step": 4504, + "time_per_iteration": 2.771195411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141096, + "balance_loss_mlp": 1.06728137, + "epoch": 0.866679492112351, + "flos": 558429144576.0, + "grad_norm": 0.03508013517718755, + "language_loss": 0.82380766, + "learning_rate": 4.589255144248561e-05, + "loss": 0.83521855, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.73632812, + "step": 4505, + "time_per_iteration": 2.779545545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142726, + "balance_loss_mlp": 1.0692935, + "epoch": 0.8668718737976144, + "flos": 723661638144.0, + "grad_norm": 0.04291164810102399, + "language_loss": 0.87122786, + "learning_rate": 4.57622578599054e-05, + "loss": 0.88265514, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.73388672, + "step": 4506, + "time_per_iteration": 2.866483211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142832, + "balance_loss_mlp": 1.06935108, + "epoch": 0.867064255482878, + "flos": 601833096192.0, + "grad_norm": 0.044988032903290696, + "language_loss": 0.90554643, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.91697466, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.734375, + "step": 4507, + "time_per_iteration": 2.7110989093780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142288, + "balance_loss_mlp": 1.06880796, + "epoch": 0.8672566371681416, + "flos": 804932391936.0, + "grad_norm": 0.03964357174424219, + "language_loss": 0.81517231, + "learning_rate": 4.550219979745529e-05, + "loss": 0.82659519, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.734375, + "step": 4508, + "time_per_iteration": 3.0471880435943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142367, + "balance_loss_mlp": 1.06883836, + "epoch": 0.8674490188534052, + "flos": 628554289152.0, + "grad_norm": 0.035932979941611695, + "language_loss": 0.88173008, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.89315367, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.734375, + "step": 4509, + "time_per_iteration": 2.7578866481781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114294, + "balance_loss_mlp": 1.06941152, + "epoch": 0.8676414005386687, + "flos": 729204243456.0, + "grad_norm": 0.03320098636721179, + "language_loss": 0.90483028, + "learning_rate": 4.524284754171615e-05, + "loss": 0.91625965, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.734375, + "step": 4510, + "time_per_iteration": 2.963334321975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142596, + "balance_loss_mlp": 1.06901991, + "epoch": 0.8678337822239323, + "flos": 541162235904.0, + "grad_norm": 0.03785696811984203, + "language_loss": 0.85416347, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.86558938, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.734375, + "step": 4511, + "time_per_iteration": 2.8185770511627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142668, + "balance_loss_mlp": 1.06894934, + "epoch": 0.8680261639091958, + "flos": 508525134336.0, + "grad_norm": 0.039845679615304476, + "language_loss": 0.84207547, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.85350215, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.73583984, + "step": 4512, + "time_per_iteration": 2.585066795349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141066, + "balance_loss_mlp": 1.06729949, + "epoch": 0.8682185455944594, + "flos": 488149549056.0, + "grad_norm": 0.038660182567623144, + "language_loss": 0.85638297, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.86779356, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.73583984, + "step": 4513, + "time_per_iteration": 2.633535861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143624, + "balance_loss_mlp": 1.07019103, + "epoch": 0.868410927279723, + "flos": 604802784768.0, + "grad_norm": 0.04017621150999441, + "language_loss": 0.86356068, + "learning_rate": 4.472626206030528e-05, + "loss": 0.8749969, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.734375, + "step": 4514, + "time_per_iteration": 2.7051877975463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143529, + "balance_loss_mlp": 1.07009649, + "epoch": 0.8686033089649865, + "flos": 1120720851456.0, + "grad_norm": 0.03707200576292934, + "language_loss": 0.88939041, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.90082574, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.734375, + "step": 4515, + "time_per_iteration": 3.379136562347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142134, + "balance_loss_mlp": 1.06870151, + "epoch": 0.8687956906502501, + "flos": 569098665984.0, + "grad_norm": 0.03976409225750092, + "language_loss": 0.89278877, + "learning_rate": 4.446902963685862e-05, + "loss": 0.90421009, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.734375, + "step": 4516, + "time_per_iteration": 2.688634157180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140065, + "balance_loss_mlp": 1.06663203, + "epoch": 0.8689880723355137, + "flos": 545410288128.0, + "grad_norm": 0.03650159916701781, + "language_loss": 0.89403987, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.90544057, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.734375, + "step": 4517, + "time_per_iteration": 2.6805598735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114102, + "balance_loss_mlp": 1.06763518, + "epoch": 0.8691804540207773, + "flos": 458384807424.0, + "grad_norm": 0.03496696120486749, + "language_loss": 0.90544659, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.91685677, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.73388672, + "step": 4518, + "time_per_iteration": 2.601616859436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141562, + "balance_loss_mlp": 1.06812906, + "epoch": 0.8693728357060407, + "flos": 592999891968.0, + "grad_norm": 0.035835180224579856, + "language_loss": 0.8468256, + "learning_rate": 4.40845075221456e-05, + "loss": 0.8582412, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.734375, + "step": 4519, + "time_per_iteration": 2.711921215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141689, + "balance_loss_mlp": 1.06835151, + "epoch": 0.8695652173913043, + "flos": 681523315200.0, + "grad_norm": 0.03942817475285988, + "language_loss": 0.84818816, + "learning_rate": 4.395668742181164e-05, + "loss": 0.85960507, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.73339844, + "step": 4520, + "time_per_iteration": 2.9093902111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140477, + "balance_loss_mlp": 1.06709146, + "epoch": 0.8697575990765679, + "flos": 493335588864.0, + "grad_norm": 0.037682038057646666, + "language_loss": 0.83001059, + "learning_rate": 4.38290443731934e-05, + "loss": 0.84141541, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.73388672, + "step": 4521, + "time_per_iteration": 2.5499000549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140213, + "balance_loss_mlp": 1.06682801, + "epoch": 0.8699499807618315, + "flos": 527986927104.0, + "grad_norm": 0.03154316551914982, + "language_loss": 0.85485643, + "learning_rate": 4.370157842584671e-05, + "loss": 0.86625856, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.73388672, + "step": 4522, + "time_per_iteration": 2.7108314037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140429, + "balance_loss_mlp": 1.06699562, + "epoch": 0.8701423624470951, + "flos": 815793294336.0, + "grad_norm": 0.03787775248383424, + "language_loss": 0.84961677, + "learning_rate": 4.357428962925808e-05, + "loss": 0.86102104, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.734375, + "step": 4523, + "time_per_iteration": 3.114084482192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140204, + "balance_loss_mlp": 1.06681871, + "epoch": 0.8703347441323586, + "flos": 557873192448.0, + "grad_norm": 0.037626849509955144, + "language_loss": 0.93374288, + "learning_rate": 4.344717803284542e-05, + "loss": 0.94514489, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.73388672, + "step": 4524, + "time_per_iteration": 2.702937602996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139648, + "balance_loss_mlp": 1.06631005, + "epoch": 0.8705271258176221, + "flos": 586613351424.0, + "grad_norm": 0.0317274327667996, + "language_loss": 0.88659638, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.89799285, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.73339844, + "step": 4525, + "time_per_iteration": 2.8281044960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140144, + "balance_loss_mlp": 1.06675887, + "epoch": 0.8707195075028857, + "flos": 670501957632.0, + "grad_norm": 0.03755252318995871, + "language_loss": 0.89142346, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.90282488, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.73388672, + "step": 4526, + "time_per_iteration": 2.8868792057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137755, + "balance_loss_mlp": 1.06436968, + "epoch": 0.8709118891881493, + "flos": 521470130688.0, + "grad_norm": 0.03465882180034492, + "language_loss": 0.88376933, + "learning_rate": 4.306690693781007e-05, + "loss": 0.89514691, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.73388672, + "step": 4527, + "time_per_iteration": 2.7600021362304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137372, + "balance_loss_mlp": 1.06384361, + "epoch": 0.8711042708734128, + "flos": 554271690240.0, + "grad_norm": 0.0382661525421971, + "language_loss": 0.86503428, + "learning_rate": 4.294050463490401e-05, + "loss": 0.87640798, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.73486328, + "step": 4528, + "time_per_iteration": 2.6349923610687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137844, + "balance_loss_mlp": 1.06445885, + "epoch": 0.8712966525586764, + "flos": 503237036544.0, + "grad_norm": 0.04010187218615475, + "language_loss": 0.87453485, + "learning_rate": 4.281427977823094e-05, + "loss": 0.88591325, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.73388672, + "step": 4529, + "time_per_iteration": 2.699385166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113745, + "balance_loss_mlp": 1.06411278, + "epoch": 0.87148903424394, + "flos": 805527275520.0, + "grad_norm": 0.03499624240949085, + "language_loss": 0.7799021, + "learning_rate": 4.268823241679593e-05, + "loss": 0.79127657, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.73339844, + "step": 4530, + "time_per_iteration": 3.0969526767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113759, + "balance_loss_mlp": 1.06425273, + "epoch": 0.8716814159292036, + "flos": 774840009216.0, + "grad_norm": 0.04260127752626609, + "language_loss": 0.89968532, + "learning_rate": 4.256236259953489e-05, + "loss": 0.91106123, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.73339844, + "step": 4531, + "time_per_iteration": 3.010664224624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113744, + "balance_loss_mlp": 1.06405497, + "epoch": 0.8718737976144671, + "flos": 487797712896.0, + "grad_norm": 0.03878344757926045, + "language_loss": 0.9016605, + "learning_rate": 4.243667037531468e-05, + "loss": 0.91303492, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.73339844, + "step": 4532, + "time_per_iteration": 2.5791871547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137314, + "balance_loss_mlp": 1.06402397, + "epoch": 0.8720661792997306, + "flos": 585219468288.0, + "grad_norm": 0.034654863878580654, + "language_loss": 0.83150959, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.84288275, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.73291016, + "step": 4533, + "time_per_iteration": 2.711474657058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143066, + "balance_loss_mlp": 1.0713501, + "epoch": 0.8722585609849942, + "flos": 1498999928832.0, + "grad_norm": 0.008770633490120042, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.82109869, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.71875, + "step": 4534, + "time_per_iteration": 4.842734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137401, + "balance_loss_mlp": 1.06396782, + "epoch": 0.8724509426702578, + "flos": 597309069312.0, + "grad_norm": 0.03609431409406132, + "language_loss": 0.91708696, + "learning_rate": 4.206065974853479e-05, + "loss": 0.92846096, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.73388672, + "step": 4535, + "time_per_iteration": 2.740379810333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140364, + "balance_loss_mlp": 1.06702685, + "epoch": 0.8726433243555214, + "flos": 444545484288.0, + "grad_norm": 0.042510018256880736, + "language_loss": 0.86475211, + "learning_rate": 4.193567838376888e-05, + "loss": 0.87615573, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.73339844, + "step": 4536, + "time_per_iteration": 2.634587526321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142129, + "balance_loss_mlp": 1.06907749, + "epoch": 0.8728357060407849, + "flos": 554234760192.0, + "grad_norm": 0.042982945041552326, + "language_loss": 0.87478817, + "learning_rate": 4.181087485534402e-05, + "loss": 0.88620949, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.73046875, + "step": 4537, + "time_per_iteration": 2.6632931232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141929, + "balance_loss_mlp": 1.06878173, + "epoch": 0.8730280877260485, + "flos": 629018916864.0, + "grad_norm": 0.03625222734252447, + "language_loss": 0.8318783, + "learning_rate": 4.16862492117136e-05, + "loss": 0.8432976, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.73144531, + "step": 4538, + "time_per_iteration": 2.8200526237487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140845, + "balance_loss_mlp": 1.06750751, + "epoch": 0.873220469411312, + "flos": 536501222400.0, + "grad_norm": 0.03838073368509028, + "language_loss": 0.85009706, + "learning_rate": 4.156180150126143e-05, + "loss": 0.86150557, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.73339844, + "step": 4539, + "time_per_iteration": 2.720931053161621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140177, + "balance_loss_mlp": 1.06688702, + "epoch": 0.8734128510965756, + "flos": 563000835072.0, + "grad_norm": 0.036962465734187516, + "language_loss": 0.89154851, + "learning_rate": 4.143753177230242e-05, + "loss": 0.90295029, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.73291016, + "step": 4540, + "time_per_iteration": 2.7204575538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140918, + "balance_loss_mlp": 1.06762838, + "epoch": 0.8736052327818392, + "flos": 687803794944.0, + "grad_norm": 0.05823857081406219, + "language_loss": 0.83594728, + "learning_rate": 4.131344007308224e-05, + "loss": 0.8473565, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.73291016, + "step": 4541, + "time_per_iteration": 3.0238983631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140844, + "balance_loss_mlp": 1.06750619, + "epoch": 0.8737976144671027, + "flos": 532832590848.0, + "grad_norm": 0.03481069740007844, + "language_loss": 0.85935038, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.87075877, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.73339844, + "step": 4542, + "time_per_iteration": 2.805119752883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141863, + "balance_loss_mlp": 1.06871605, + "epoch": 0.8739899961523663, + "flos": 576729368064.0, + "grad_norm": 0.03488368865297959, + "language_loss": 0.86241484, + "learning_rate": 4.106579095649649e-05, + "loss": 0.87383342, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.73144531, + "step": 4543, + "time_per_iteration": 2.8203420639038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011421, + "balance_loss_mlp": 1.06885803, + "epoch": 0.8741823778376299, + "flos": 732631828992.0, + "grad_norm": 0.04473609359833568, + "language_loss": 0.83021426, + "learning_rate": 4.094223363527666e-05, + "loss": 0.84163529, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.73242188, + "step": 4544, + "time_per_iteration": 2.9382483959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140903, + "balance_loss_mlp": 1.06766069, + "epoch": 0.8743747595228935, + "flos": 568221803520.0, + "grad_norm": 0.0362289518248913, + "language_loss": 0.88223737, + "learning_rate": 4.081885453608747e-05, + "loss": 0.89364642, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.73242188, + "step": 4545, + "time_per_iteration": 2.7575058937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140609, + "balance_loss_mlp": 1.06731939, + "epoch": 0.8745671412081569, + "flos": 494395100160.0, + "grad_norm": 0.03736605456447314, + "language_loss": 0.86481446, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.87622052, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.73291016, + "step": 4546, + "time_per_iteration": 2.600027322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141765, + "balance_loss_mlp": 1.06866539, + "epoch": 0.8747595228934205, + "flos": 525166960128.0, + "grad_norm": 0.03010216092213021, + "language_loss": 0.87510192, + "learning_rate": 4.057263119533233e-05, + "loss": 0.88651955, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.73095703, + "step": 4547, + "time_per_iteration": 2.6267926692962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142092, + "balance_loss_mlp": 1.06899297, + "epoch": 0.8749519045786841, + "flos": 745752743424.0, + "grad_norm": 0.036693225963323806, + "language_loss": 0.849769, + "learning_rate": 4.044978704935853e-05, + "loss": 0.86118996, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.73095703, + "step": 4548, + "time_per_iteration": 3.072727918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_mlp": 1.06758618, + "epoch": 0.8751442862639477, + "flos": 595383429120.0, + "grad_norm": 0.032788799851171016, + "language_loss": 0.84310943, + "learning_rate": 4.032712131660027e-05, + "loss": 0.85451728, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.73193359, + "step": 4549, + "time_per_iteration": 2.878819465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138154, + "balance_loss_mlp": 1.06486428, + "epoch": 0.8753366679492113, + "flos": 497514510336.0, + "grad_norm": 0.037587751687951164, + "language_loss": 0.83288509, + "learning_rate": 4.020463404468055e-05, + "loss": 0.84426665, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.73291016, + "step": 4550, + "time_per_iteration": 2.7538514137268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_mlp": 1.06475925, + "epoch": 0.8755290496344748, + "flos": 490849993728.0, + "grad_norm": 0.036673671086796596, + "language_loss": 0.87328094, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.88466096, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.73242188, + "step": 4551, + "time_per_iteration": 2.5642802715301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137849, + "balance_loss_mlp": 1.06446373, + "epoch": 0.8757214313197383, + "flos": 593071750656.0, + "grad_norm": 0.03525869575859479, + "language_loss": 0.86262238, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.87400079, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.73388672, + "step": 4552, + "time_per_iteration": 2.8446478843688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138023, + "balance_loss_mlp": 1.06473362, + "epoch": 0.8759138130050019, + "flos": 978399203328.0, + "grad_norm": 0.052190711444307536, + "language_loss": 0.83496857, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.84634876, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.73291016, + "step": 4553, + "time_per_iteration": 3.229661464691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138498, + "balance_loss_mlp": 1.06520855, + "epoch": 0.8761061946902655, + "flos": 804205251072.0, + "grad_norm": 0.0321030761247515, + "language_loss": 0.80983669, + "learning_rate": 3.971647051542243e-05, + "loss": 0.82122165, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.73291016, + "step": 4554, + "time_per_iteration": 3.0523788928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137491, + "balance_loss_mlp": 1.06420088, + "epoch": 0.8762985763755291, + "flos": 699847733760.0, + "grad_norm": 0.035078141939390024, + "language_loss": 0.80103445, + "learning_rate": 3.95948762596155e-05, + "loss": 0.8124094, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.73291016, + "step": 4555, + "time_per_iteration": 2.972339391708374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138129, + "balance_loss_mlp": 1.06488729, + "epoch": 0.8764909580607926, + "flos": 630927092736.0, + "grad_norm": 0.0358178830175899, + "language_loss": 0.85281265, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.86419404, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.73242188, + "step": 4556, + "time_per_iteration": 2.8507936000823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137844, + "balance_loss_mlp": 1.06455374, + "epoch": 0.8766833397460562, + "flos": 482537812992.0, + "grad_norm": 0.035589487880799825, + "language_loss": 0.85349488, + "learning_rate": 3.935222403050304e-05, + "loss": 0.86487329, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.73291016, + "step": 4557, + "time_per_iteration": 2.686985969543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138302, + "balance_loss_mlp": 1.06506014, + "epoch": 0.8768757214313198, + "flos": 408617783808.0, + "grad_norm": 0.03886308693669829, + "language_loss": 0.83731771, + "learning_rate": 3.923116615140354e-05, + "loss": 0.84870076, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.73242188, + "step": 4558, + "time_per_iteration": 2.5058376789093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_mlp": 1.06545591, + "epoch": 0.8770681031165833, + "flos": 583656397824.0, + "grad_norm": 0.050661458115567146, + "language_loss": 0.87454987, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.88593686, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.73242188, + "step": 4559, + "time_per_iteration": 2.669386625289917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138391, + "balance_loss_mlp": 1.06495833, + "epoch": 0.8772604848018468, + "flos": 509688705024.0, + "grad_norm": 0.03644513335402904, + "language_loss": 0.85219496, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.86357886, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.73388672, + "step": 4560, + "time_per_iteration": 2.6710524559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138458, + "balance_loss_mlp": 1.06521559, + "epoch": 0.8774528664871104, + "flos": 409716226560.0, + "grad_norm": 0.04166962676030205, + "language_loss": 0.9057163, + "learning_rate": 3.886906601970913e-05, + "loss": 0.91710079, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.73242188, + "step": 4561, + "time_per_iteration": 2.4726264476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138891, + "balance_loss_mlp": 1.06574452, + "epoch": 0.877645248172374, + "flos": 501869349888.0, + "grad_norm": 0.03332122726470747, + "language_loss": 0.87716341, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.88855237, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.73144531, + "step": 4562, + "time_per_iteration": 2.6267993450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139069, + "balance_loss_mlp": 1.06582642, + "epoch": 0.8778376298576376, + "flos": 634298282496.0, + "grad_norm": 0.03625990929087617, + "language_loss": 0.82094103, + "learning_rate": 3.862856098834189e-05, + "loss": 0.83233178, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.73242188, + "step": 4563, + "time_per_iteration": 2.874626398086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138987, + "balance_loss_mlp": 1.06569707, + "epoch": 0.8780300115429012, + "flos": 535114070016.0, + "grad_norm": 0.033329550364358154, + "language_loss": 0.84246641, + "learning_rate": 3.850857712974976e-05, + "loss": 0.85385627, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.73291016, + "step": 4564, + "time_per_iteration": 2.865466833114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138742, + "balance_loss_mlp": 1.06550014, + "epoch": 0.8782223932281646, + "flos": 512667125760.0, + "grad_norm": 0.035748918412903466, + "language_loss": 0.81673437, + "learning_rate": 3.838877243801758e-05, + "loss": 0.82812178, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.73242188, + "step": 4565, + "time_per_iteration": 2.6305251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113867, + "balance_loss_mlp": 1.06547523, + "epoch": 0.8784147749134282, + "flos": 782245128192.0, + "grad_norm": 0.039934883887501355, + "language_loss": 0.74876142, + "learning_rate": 3.826914695965766e-05, + "loss": 0.76014817, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.73193359, + "step": 4566, + "time_per_iteration": 3.187756299972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138841, + "balance_loss_mlp": 1.06550372, + "epoch": 0.8786071565986918, + "flos": 562071579648.0, + "grad_norm": 0.044145845900659855, + "language_loss": 0.81758606, + "learning_rate": 3.814970074111279e-05, + "loss": 0.82897443, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.73339844, + "step": 4567, + "time_per_iteration": 2.694370746612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138507, + "balance_loss_mlp": 1.06516922, + "epoch": 0.8787995382839554, + "flos": 604651061760.0, + "grad_norm": 0.03484451232050219, + "language_loss": 0.81663251, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.82801759, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.73291016, + "step": 4568, + "time_per_iteration": 2.8261232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137882, + "balance_loss_mlp": 1.06444907, + "epoch": 0.8789919199692189, + "flos": 561290044416.0, + "grad_norm": 0.034253757816549546, + "language_loss": 0.892627, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.90400583, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.73388672, + "step": 4569, + "time_per_iteration": 2.671189546585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114104, + "balance_loss_mlp": 1.06789315, + "epoch": 0.8791843016544825, + "flos": 540152389632.0, + "grad_norm": 0.03918561185928757, + "language_loss": 0.87219656, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.88360703, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.73144531, + "step": 4570, + "time_per_iteration": 2.627720355987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140909, + "balance_loss_mlp": 1.06780934, + "epoch": 0.8793766833397461, + "flos": 1010404491264.0, + "grad_norm": 0.03486713685308289, + "language_loss": 0.83421218, + "learning_rate": 3.767370939150167e-05, + "loss": 0.84562135, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.73095703, + "step": 4571, + "time_per_iteration": 3.3709144592285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_mlp": 1.06791723, + "epoch": 0.8795690650250096, + "flos": 679912581120.0, + "grad_norm": 0.03284343034146008, + "language_loss": 0.85293531, + "learning_rate": 3.755516016623628e-05, + "loss": 0.86434591, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.73144531, + "step": 4572, + "time_per_iteration": 2.883894205093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_mlp": 1.06717467, + "epoch": 0.8797614467102732, + "flos": 454355607552.0, + "grad_norm": 0.038996415271177934, + "language_loss": 0.93823111, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.94963527, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.73242188, + "step": 4573, + "time_per_iteration": 2.5188074111938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114079, + "balance_loss_mlp": 1.06773829, + "epoch": 0.8799538283955367, + "flos": 551972746752.0, + "grad_norm": 0.03577674735145117, + "language_loss": 0.8895998, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.90100765, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.73046875, + "step": 4574, + "time_per_iteration": 2.6594581604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139479, + "balance_loss_mlp": 1.06614149, + "epoch": 0.8801462100808003, + "flos": 808859533824.0, + "grad_norm": 0.03486958865574067, + "language_loss": 0.89314497, + "learning_rate": 3.720058989624681e-05, + "loss": 0.90453982, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.73339844, + "step": 4575, + "time_per_iteration": 3.0489046573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138104, + "balance_loss_mlp": 1.06481373, + "epoch": 0.8803385917660639, + "flos": 770011809792.0, + "grad_norm": 0.035651765700735125, + "language_loss": 0.88622105, + "learning_rate": 3.708275909447079e-05, + "loss": 0.89760214, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.73291016, + "step": 4576, + "time_per_iteration": 2.9586453437805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138145, + "balance_loss_mlp": 1.06490231, + "epoch": 0.8805309734513275, + "flos": 568419188736.0, + "grad_norm": 0.032922624832929834, + "language_loss": 0.85456908, + "learning_rate": 3.696510801310632e-05, + "loss": 0.86595052, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.73242188, + "step": 4577, + "time_per_iteration": 2.719613790512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137266, + "balance_loss_mlp": 1.06397581, + "epoch": 0.880723355136591, + "flos": 680976095232.0, + "grad_norm": 0.03544954996381365, + "language_loss": 0.8560704, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.86744308, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.73291016, + "step": 4578, + "time_per_iteration": 2.8218014240264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137911, + "balance_loss_mlp": 1.06462061, + "epoch": 0.8809157368218545, + "flos": 566760791040.0, + "grad_norm": 0.03362495082799701, + "language_loss": 0.83221316, + "learning_rate": 3.673034519424734e-05, + "loss": 0.84359229, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.73291016, + "step": 4579, + "time_per_iteration": 2.7465338706970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139025, + "balance_loss_mlp": 1.06578302, + "epoch": 0.8811081185071181, + "flos": 516427081728.0, + "grad_norm": 0.03125001754888258, + "language_loss": 0.79574335, + "learning_rate": 3.661323354789586e-05, + "loss": 0.80713362, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.73242188, + "step": 4580, + "time_per_iteration": 2.690438985824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139132, + "balance_loss_mlp": 1.06589007, + "epoch": 0.8813005001923817, + "flos": 595448557056.0, + "grad_norm": 0.03786361904540541, + "language_loss": 0.8583113, + "learning_rate": 3.649630180424191e-05, + "loss": 0.86970258, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.73242188, + "step": 4581, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113771, + "balance_loss_mlp": 1.06446779, + "epoch": 0.8814928818776453, + "flos": 668185549824.0, + "grad_norm": 0.03829692440387713, + "language_loss": 0.82977974, + "learning_rate": 3.637955000868254e-05, + "loss": 0.84115684, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.73242188, + "step": 4582, + "time_per_iteration": 2.8873000144958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138319, + "balance_loss_mlp": 1.06507647, + "epoch": 0.8816852635629088, + "flos": 610275532800.0, + "grad_norm": 0.034998121361190335, + "language_loss": 0.90240663, + "learning_rate": 3.626297820654467e-05, + "loss": 0.91378981, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.73242188, + "step": 4583, + "time_per_iteration": 2.7176356315612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138968, + "balance_loss_mlp": 1.06567812, + "epoch": 0.8818776452481724, + "flos": 481374242304.0, + "grad_norm": 0.0376212060911988, + "language_loss": 0.86705077, + "learning_rate": 3.614658644308572e-05, + "loss": 0.87844038, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.73291016, + "step": 4584, + "time_per_iteration": 2.6146843433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138915, + "balance_loss_mlp": 1.0655303, + "epoch": 0.882070026933436, + "flos": 1047033136128.0, + "grad_norm": 0.040308027049788406, + "language_loss": 0.78901362, + "learning_rate": 3.60303747634928e-05, + "loss": 0.80040276, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.73388672, + "step": 4585, + "time_per_iteration": 3.30761456489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136732, + "balance_loss_mlp": 1.06344187, + "epoch": 0.8822624086186995, + "flos": 475434865152.0, + "grad_norm": 0.03393344724745408, + "language_loss": 0.84516394, + "learning_rate": 3.591434321288345e-05, + "loss": 0.8565312, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.73291016, + "step": 4586, + "time_per_iteration": 2.680474042892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113674, + "balance_loss_mlp": 1.06345069, + "epoch": 0.882454790303963, + "flos": 655221087744.0, + "grad_norm": 0.039082630684481784, + "language_loss": 0.86279416, + "learning_rate": 3.579849183630485e-05, + "loss": 0.87416154, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.73291016, + "step": 4587, + "time_per_iteration": 2.8492140769958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136645, + "balance_loss_mlp": 1.06335557, + "epoch": 0.8826471719892266, + "flos": 471303607296.0, + "grad_norm": 0.039436934050180984, + "language_loss": 0.83528584, + "learning_rate": 3.568282067873468e-05, + "loss": 0.84665227, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.73291016, + "step": 4588, + "time_per_iteration": 2.562138319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136884, + "balance_loss_mlp": 1.06364226, + "epoch": 0.8828395536744902, + "flos": 469766733312.0, + "grad_norm": 0.033013862791337924, + "language_loss": 0.88277167, + "learning_rate": 3.556732978508048e-05, + "loss": 0.89414054, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.73242188, + "step": 4589, + "time_per_iteration": 2.7143378257751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141687, + "balance_loss_mlp": 1.06844449, + "epoch": 0.8830319353597538, + "flos": 722717646336.0, + "grad_norm": 0.03609529277559126, + "language_loss": 0.85748345, + "learning_rate": 3.545201920017971e-05, + "loss": 0.8689003, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.73242188, + "step": 4590, + "time_per_iteration": 2.939535140991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114124, + "balance_loss_mlp": 1.06790292, + "epoch": 0.8832243170450174, + "flos": 444191646720.0, + "grad_norm": 0.03979161587651804, + "language_loss": 0.85422397, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.86563635, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.73339844, + "step": 4591, + "time_per_iteration": 2.594569683074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141267, + "balance_loss_mlp": 1.06792951, + "epoch": 0.8834166987302808, + "flos": 567746442240.0, + "grad_norm": 0.04357275936054337, + "language_loss": 0.87711227, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.88852489, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.73339844, + "step": 4592, + "time_per_iteration": 2.7693564891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141211, + "balance_loss_mlp": 1.06763518, + "epoch": 0.8836090804155444, + "flos": 610497113088.0, + "grad_norm": 0.036235581662511764, + "language_loss": 0.86945099, + "learning_rate": 3.510716974532352e-05, + "loss": 0.88086307, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.73486328, + "step": 4593, + "time_per_iteration": 2.823115587234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141441, + "balance_loss_mlp": 1.06786549, + "epoch": 0.883801462100808, + "flos": 558116239872.0, + "grad_norm": 0.037409309315743274, + "language_loss": 0.84331363, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.85472804, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.73486328, + "step": 4594, + "time_per_iteration": 2.6731603145599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142652, + "balance_loss_mlp": 1.06917179, + "epoch": 0.8839938437860716, + "flos": 517199884800.0, + "grad_norm": 0.05623624543417451, + "language_loss": 0.82118529, + "learning_rate": 3.487817247139064e-05, + "loss": 0.8326118, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.734375, + "step": 4595, + "time_per_iteration": 2.643226385116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142905, + "balance_loss_mlp": 1.06966281, + "epoch": 0.8841862254713351, + "flos": 714939224064.0, + "grad_norm": 0.03953602235880356, + "language_loss": 0.84327024, + "learning_rate": 3.47639446766777e-05, + "loss": 0.85469925, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.73242188, + "step": 4596, + "time_per_iteration": 2.8558902740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142903, + "balance_loss_mlp": 1.06966054, + "epoch": 0.8843786071565987, + "flos": 835378612224.0, + "grad_norm": 0.03630937996165782, + "language_loss": 0.8742218, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.88565087, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.73242188, + "step": 4597, + "time_per_iteration": 3.0525734424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142043, + "balance_loss_mlp": 1.06875324, + "epoch": 0.8845709888418622, + "flos": 658178041344.0, + "grad_norm": 0.03258789526355552, + "language_loss": 0.86930513, + "learning_rate": 3.453603099349462e-05, + "loss": 0.88072556, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.73291016, + "step": 4598, + "time_per_iteration": 2.912843704223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141884, + "balance_loss_mlp": 1.06859386, + "epoch": 0.8847633705271258, + "flos": 524483480064.0, + "grad_norm": 0.03479113833885251, + "language_loss": 0.84803116, + "learning_rate": 3.442234519350823e-05, + "loss": 0.85944992, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.73291016, + "step": 4599, + "time_per_iteration": 2.7513442039489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114178, + "balance_loss_mlp": 1.06844211, + "epoch": 0.8849557522123894, + "flos": 549636873216.0, + "grad_norm": 0.03798845472112611, + "language_loss": 0.88343596, + "learning_rate": 3.430884014679786e-05, + "loss": 0.89485371, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.73339844, + "step": 4600, + "time_per_iteration": 2.665273666381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141776, + "balance_loss_mlp": 1.06848598, + "epoch": 0.8851481338976529, + "flos": 623583098880.0, + "grad_norm": 0.03350151892147519, + "language_loss": 0.88500738, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.89642519, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.73291016, + "step": 4601, + "time_per_iteration": 2.8266654014587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139622, + "balance_loss_mlp": 1.0663321, + "epoch": 0.8853405155829165, + "flos": 445307553792.0, + "grad_norm": 0.035348073668552936, + "language_loss": 0.85571676, + "learning_rate": 3.408237248940088e-05, + "loss": 0.86711299, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.73291016, + "step": 4602, + "time_per_iteration": 2.556607246398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141214, + "balance_loss_mlp": 1.06816256, + "epoch": 0.8855328972681801, + "flos": 731748235776.0, + "grad_norm": 0.03825998754316307, + "language_loss": 0.82411921, + "learning_rate": 3.396940996663683e-05, + "loss": 0.83553129, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.73046875, + "step": 4603, + "time_per_iteration": 2.8917107582092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140786, + "balance_loss_mlp": 1.06763935, + "epoch": 0.8857252789534437, + "flos": 488355666432.0, + "grad_norm": 0.038685533641598824, + "language_loss": 0.83611298, + "learning_rate": 3.385662837299375e-05, + "loss": 0.84752083, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.73144531, + "step": 4604, + "time_per_iteration": 2.548560857772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140966, + "balance_loss_mlp": 1.067819, + "epoch": 0.8859176606387072, + "flos": 509621575680.0, + "grad_norm": 0.042063998825786784, + "language_loss": 0.87247568, + "learning_rate": 3.374402775225727e-05, + "loss": 0.88388538, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.73144531, + "step": 4605, + "time_per_iteration": 2.7407033443450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139586, + "balance_loss_mlp": 1.06634402, + "epoch": 0.8861100423239707, + "flos": 517664512512.0, + "grad_norm": 0.03414528469711758, + "language_loss": 0.89563382, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.90702963, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.73242188, + "step": 4606, + "time_per_iteration": 2.652094602584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113943, + "balance_loss_mlp": 1.06623542, + "epoch": 0.8863024240092343, + "flos": 628109127168.0, + "grad_norm": 0.03551682642921559, + "language_loss": 0.83570439, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.84709865, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.73193359, + "step": 4607, + "time_per_iteration": 2.725616455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113989, + "balance_loss_mlp": 1.06679058, + "epoch": 0.8864948056944979, + "flos": 768297016320.0, + "grad_norm": 0.030729524445201942, + "language_loss": 0.87768084, + "learning_rate": 3.340731216429083e-05, + "loss": 0.88907969, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.73095703, + "step": 4608, + "time_per_iteration": 3.0135393142700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143433, + "balance_loss_mlp": 1.07171631, + "epoch": 0.8866871873797615, + "flos": 1505665171968.0, + "grad_norm": 0.005181000489045015, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79974389, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.71875, + "step": 4609, + "time_per_iteration": 4.8497114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139247, + "balance_loss_mlp": 1.06619585, + "epoch": 0.886879569065025, + "flos": 812927665152.0, + "grad_norm": 0.03659826934115083, + "language_loss": 0.86593419, + "learning_rate": 3.3183740769755e-05, + "loss": 0.87732661, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.73046875, + "step": 4610, + "time_per_iteration": 3.0547640323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143288, + "balance_loss_mlp": 1.07176208, + "epoch": 0.8870719507502886, + "flos": 1586223521280.0, + "grad_norm": 0.0047245300791828836, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.78054118, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.71679688, + "step": 4611, + "time_per_iteration": 4.970493316650391 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140218, + "balance_loss_mlp": 1.06716621, + "epoch": 0.8872643324355521, + "flos": 635164411392.0, + "grad_norm": 0.0365799977682868, + "language_loss": 0.79885757, + "learning_rate": 3.296089431172811e-05, + "loss": 0.8102597, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.73046875, + "step": 4612, + "time_per_iteration": 2.800936698913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140152, + "balance_loss_mlp": 1.06705284, + "epoch": 0.8874567141208157, + "flos": 536783201280.0, + "grad_norm": 0.03880516552904762, + "language_loss": 0.88008845, + "learning_rate": 3.284974304209532e-05, + "loss": 0.89148998, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.73095703, + "step": 4613, + "time_per_iteration": 2.6119205951690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139668, + "balance_loss_mlp": 1.06652081, + "epoch": 0.8876490958060793, + "flos": 1568717389824.0, + "grad_norm": 0.03468157692994687, + "language_loss": 0.83196402, + "learning_rate": 3.27387731362766e-05, + "loss": 0.84336072, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.73144531, + "step": 4614, + "time_per_iteration": 3.8848578929901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_mlp": 1.06754243, + "epoch": 0.8878414774913428, + "flos": 637797726720.0, + "grad_norm": 0.03727846125722482, + "language_loss": 0.90132129, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.91272724, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.73046875, + "step": 4615, + "time_per_iteration": 2.821709156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140764, + "balance_loss_mlp": 1.0677129, + "epoch": 0.8880338591766064, + "flos": 497421184512.0, + "grad_norm": 0.04463724567610171, + "language_loss": 0.86964601, + "learning_rate": 3.251737758834084e-05, + "loss": 0.88105357, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.73046875, + "step": 4616, + "time_per_iteration": 2.6447269916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_mlp": 1.06804681, + "epoch": 0.88822624086187, + "flos": 543912345600.0, + "grad_norm": 0.03827212430271638, + "language_loss": 0.84569329, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.85710424, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.73046875, + "step": 4617, + "time_per_iteration": 2.6946191787719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141215, + "balance_loss_mlp": 1.06816316, + "epoch": 0.8884186225471336, + "flos": 552875805696.0, + "grad_norm": 0.042682461995962664, + "language_loss": 0.88825953, + "learning_rate": 3.229670801173418e-05, + "loss": 0.89967167, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.73046875, + "step": 4618, + "time_per_iteration": 2.617229700088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144875, + "balance_loss_mlp": 1.073349, + "epoch": 0.888611004232397, + "flos": 1568659170816.0, + "grad_norm": 0.003196569224435078, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79657078, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.71679688, + "step": 4619, + "time_per_iteration": 5.0100486278533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140753, + "balance_loss_mlp": 1.06770194, + "epoch": 0.8888033859176606, + "flos": 768436004352.0, + "grad_norm": 0.031145339209085954, + "language_loss": 0.86609745, + "learning_rate": 3.207676474914301e-05, + "loss": 0.877505, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.73046875, + "step": 4620, + "time_per_iteration": 3.0852935314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140488, + "balance_loss_mlp": 1.06738901, + "epoch": 0.8889957676029242, + "flos": 935648532480.0, + "grad_norm": 0.034367536832817513, + "language_loss": 0.88588071, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.89728558, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.73095703, + "step": 4621, + "time_per_iteration": 3.1627614498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140338, + "balance_loss_mlp": 1.06728625, + "epoch": 0.8891881492881878, + "flos": 590792272896.0, + "grad_norm": 0.03508210471851401, + "language_loss": 0.86302722, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.87443054, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.73046875, + "step": 4622, + "time_per_iteration": 2.8091282844543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141091, + "balance_loss_mlp": 1.06803989, + "epoch": 0.8893805309734514, + "flos": 541843714560.0, + "grad_norm": 0.040725276818425686, + "language_loss": 0.87760389, + "learning_rate": 3.174821244088466e-05, + "loss": 0.88901484, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.73046875, + "step": 4623, + "time_per_iteration": 2.712893486022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138455, + "balance_loss_mlp": 1.06530809, + "epoch": 0.8895729126587149, + "flos": 561168520704.0, + "grad_norm": 0.036429232224768356, + "language_loss": 0.86250001, + "learning_rate": 3.163905853111054e-05, + "loss": 0.87388456, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.73144531, + "step": 4624, + "time_per_iteration": 2.683321475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138327, + "balance_loss_mlp": 1.06522739, + "epoch": 0.8897652943439784, + "flos": 611280649728.0, + "grad_norm": 0.034860067275865936, + "language_loss": 0.85074407, + "learning_rate": 3.153008645517996e-05, + "loss": 0.86212736, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.73144531, + "step": 4625, + "time_per_iteration": 2.78021240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140004, + "balance_loss_mlp": 1.06685686, + "epoch": 0.889957676029242, + "flos": 919423670784.0, + "grad_norm": 0.0398902332567692, + "language_loss": 0.81782848, + "learning_rate": 3.142129625539969e-05, + "loss": 0.82922852, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.73144531, + "step": 4626, + "time_per_iteration": 3.2139408588409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138118, + "balance_loss_mlp": 1.06501937, + "epoch": 0.8901500577145056, + "flos": 489686423040.0, + "grad_norm": 0.038017552561291156, + "language_loss": 0.85747802, + "learning_rate": 3.131268797400588e-05, + "loss": 0.86885923, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.73095703, + "step": 4627, + "time_per_iteration": 2.599820852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138395, + "balance_loss_mlp": 1.06520021, + "epoch": 0.8903424393997691, + "flos": 734913308160.0, + "grad_norm": 0.040511574906705955, + "language_loss": 0.84754193, + "learning_rate": 3.120426165316398e-05, + "loss": 0.85892582, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.73193359, + "step": 4628, + "time_per_iteration": 3.002224922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138975, + "balance_loss_mlp": 1.0660187, + "epoch": 0.8905348210850327, + "flos": 520883979264.0, + "grad_norm": 0.035652036973535486, + "language_loss": 0.86524069, + "learning_rate": 3.109601733496881e-05, + "loss": 0.87663043, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.73046875, + "step": 4629, + "time_per_iteration": 2.6983273029327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138867, + "balance_loss_mlp": 1.0656724, + "epoch": 0.8907272027702963, + "flos": 580198612992.0, + "grad_norm": 0.03507449840097698, + "language_loss": 0.84010351, + "learning_rate": 3.098795506144458e-05, + "loss": 0.85149217, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.73193359, + "step": 4630, + "time_per_iteration": 2.8263354301452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138725, + "balance_loss_mlp": 1.06567347, + "epoch": 0.8909195844555599, + "flos": 895114212864.0, + "grad_norm": 0.03741426633430978, + "language_loss": 0.83983612, + "learning_rate": 3.088007487454475e-05, + "loss": 0.85122335, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.73095703, + "step": 4631, + "time_per_iteration": 3.1382222175598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138682, + "balance_loss_mlp": 1.06548715, + "epoch": 0.8911119661408234, + "flos": 550948164096.0, + "grad_norm": 0.036182534075673435, + "language_loss": 0.89434344, + "learning_rate": 3.077237681615208e-05, + "loss": 0.90573025, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.73193359, + "step": 4632, + "time_per_iteration": 2.678633689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138526, + "balance_loss_mlp": 1.06533146, + "epoch": 0.8913043478260869, + "flos": 482164509696.0, + "grad_norm": 0.04328943324944268, + "language_loss": 0.89203089, + "learning_rate": 3.066486092807874e-05, + "loss": 0.90341616, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.73193359, + "step": 4633, + "time_per_iteration": 2.677217483520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138784, + "balance_loss_mlp": 1.06568491, + "epoch": 0.8914967295113505, + "flos": 485644488192.0, + "grad_norm": 0.03105234799668386, + "language_loss": 0.88713467, + "learning_rate": 3.055752725206601e-05, + "loss": 0.8985225, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.73144531, + "step": 4634, + "time_per_iteration": 2.649566411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113871, + "balance_loss_mlp": 1.06561065, + "epoch": 0.8916891111966141, + "flos": 446592648192.0, + "grad_norm": 0.03682675744399267, + "language_loss": 0.86206222, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.87344927, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.73095703, + "step": 4635, + "time_per_iteration": 2.5900418758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138801, + "balance_loss_mlp": 1.06560659, + "epoch": 0.8918814928818777, + "flos": 565078198272.0, + "grad_norm": 0.03804470729703714, + "language_loss": 0.82817543, + "learning_rate": 3.034340670283453e-05, + "loss": 0.83956349, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.73193359, + "step": 4636, + "time_per_iteration": 2.741692543029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137613, + "balance_loss_mlp": 1.06460917, + "epoch": 0.8920738745671412, + "flos": 577028811264.0, + "grad_norm": 0.032886435040047124, + "language_loss": 0.85431588, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.86569202, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.73046875, + "step": 4637, + "time_per_iteration": 2.67724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137852, + "balance_loss_mlp": 1.06470549, + "epoch": 0.8922662562524047, + "flos": 621314354688.0, + "grad_norm": 0.033521285935624826, + "language_loss": 0.88356864, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.89494717, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.73144531, + "step": 4638, + "time_per_iteration": 2.7949366569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137013, + "balance_loss_mlp": 1.06396186, + "epoch": 0.8924586379376683, + "flos": 584807233536.0, + "grad_norm": 0.03559045547501193, + "language_loss": 0.84122229, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.85259241, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.73046875, + "step": 4639, + "time_per_iteration": 2.7761454582214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137562, + "balance_loss_mlp": 1.06441486, + "epoch": 0.8926510196229319, + "flos": 526200274944.0, + "grad_norm": 0.03227679041862644, + "language_loss": 0.85516953, + "learning_rate": 2.991735397786538e-05, + "loss": 0.8665452, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.73144531, + "step": 4640, + "time_per_iteration": 2.7680256366729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137327, + "balance_loss_mlp": 1.06422806, + "epoch": 0.8928434013081955, + "flos": 487639259136.0, + "grad_norm": 0.040770764772957185, + "language_loss": 0.85741651, + "learning_rate": 2.981129694909146e-05, + "loss": 0.86878973, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.73095703, + "step": 4641, + "time_per_iteration": 2.579289674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140709, + "balance_loss_mlp": 1.06918335, + "epoch": 0.893035782993459, + "flos": 1451199478272.0, + "grad_norm": 0.004510853592179409, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81471562, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.71679688, + "step": 4642, + "time_per_iteration": 4.69758939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136696, + "balance_loss_mlp": 1.06345379, + "epoch": 0.8932281646787226, + "flos": 612444220416.0, + "grad_norm": 0.03833301661243837, + "language_loss": 0.86010414, + "learning_rate": 2.95997305629786e-05, + "loss": 0.87147105, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.73242188, + "step": 4643, + "time_per_iteration": 2.8750672340393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136763, + "balance_loss_mlp": 1.06352127, + "epoch": 0.8934205463639862, + "flos": 566827920384.0, + "grad_norm": 0.03653494632059431, + "language_loss": 0.89745998, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.90882766, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.73242188, + "step": 4644, + "time_per_iteration": 2.695143222808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136836, + "balance_loss_mlp": 1.06359375, + "epoch": 0.8936129280492497, + "flos": 489434643456.0, + "grad_norm": 0.042946143094068745, + "language_loss": 0.83516526, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.84653366, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.73242188, + "step": 4645, + "time_per_iteration": 2.6457924842834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137122, + "balance_loss_mlp": 1.06388009, + "epoch": 0.8938053097345132, + "flos": 888074391552.0, + "grad_norm": 0.03223112269549949, + "language_loss": 0.84166312, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.85303438, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.73242188, + "step": 4646, + "time_per_iteration": 3.3025524616241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135843, + "balance_loss_mlp": 1.06260049, + "epoch": 0.8939976914197768, + "flos": 594432706560.0, + "grad_norm": 0.03742744544217847, + "language_loss": 0.88538921, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.89674759, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.73242188, + "step": 4647, + "time_per_iteration": 2.732344627380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136223, + "balance_loss_mlp": 1.06307614, + "epoch": 0.8941900731050404, + "flos": 524309563392.0, + "grad_norm": 0.041291033915724536, + "language_loss": 0.8619101, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.87327242, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.73144531, + "step": 4648, + "time_per_iteration": 2.6516520977020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136671, + "balance_loss_mlp": 1.0635246, + "epoch": 0.894382454790304, + "flos": 801927774720.0, + "grad_norm": 0.03416583650485881, + "language_loss": 0.85338318, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.86474991, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.73144531, + "step": 4649, + "time_per_iteration": 3.0272881984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136105, + "balance_loss_mlp": 1.06305349, + "epoch": 0.8945748364755676, + "flos": 480060950016.0, + "grad_norm": 0.033926472362053865, + "language_loss": 0.88941896, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.90078008, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.73046875, + "step": 4650, + "time_per_iteration": 2.660466194152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136079, + "balance_loss_mlp": 1.06293249, + "epoch": 0.894767218160831, + "flos": 509853889536.0, + "grad_norm": 0.038719839462236214, + "language_loss": 0.87774134, + "learning_rate": 2.876077330953042e-05, + "loss": 0.8891021, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.73144531, + "step": 4651, + "time_per_iteration": 2.9371914863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137185, + "balance_loss_mlp": 1.06408608, + "epoch": 0.8949595998460946, + "flos": 687063192576.0, + "grad_norm": 0.035863421919143566, + "language_loss": 0.8627305, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.87410235, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.73095703, + "step": 4652, + "time_per_iteration": 2.943850040435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113756, + "balance_loss_mlp": 1.0645082, + "epoch": 0.8951519815313582, + "flos": 801293959680.0, + "grad_norm": 0.037185614582169284, + "language_loss": 0.81720811, + "learning_rate": 2.855286269747981e-05, + "loss": 0.82858372, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.73046875, + "step": 4653, + "time_per_iteration": 3.2343595027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113662, + "balance_loss_mlp": 1.06347299, + "epoch": 0.8953443632166218, + "flos": 667935771648.0, + "grad_norm": 0.03649889337751892, + "language_loss": 0.90619528, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.91756141, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.73144531, + "step": 4654, + "time_per_iteration": 2.87142014503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113725, + "balance_loss_mlp": 1.06419849, + "epoch": 0.8955367449018854, + "flos": 646209964032.0, + "grad_norm": 0.036322322502662, + "language_loss": 0.8830961, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.89446861, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.73046875, + "step": 4655, + "time_per_iteration": 2.8817336559295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137331, + "balance_loss_mlp": 1.06423211, + "epoch": 0.8957291265871489, + "flos": 810162092544.0, + "grad_norm": 0.03904529135922208, + "language_loss": 0.82293046, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.83430374, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.73095703, + "step": 4656, + "time_per_iteration": 3.0670013427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137676, + "balance_loss_mlp": 1.06452966, + "epoch": 0.8959215082724125, + "flos": 519963456000.0, + "grad_norm": 0.036966986897206296, + "language_loss": 0.81371593, + "learning_rate": 2.813923817903391e-05, + "loss": 0.82509267, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.73144531, + "step": 4657, + "time_per_iteration": 2.6919400691986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137331, + "balance_loss_mlp": 1.06423163, + "epoch": 0.896113889957676, + "flos": 477911728128.0, + "grad_norm": 0.03989276240480501, + "language_loss": 0.82006389, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.83143717, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.73095703, + "step": 4658, + "time_per_iteration": 2.607644557952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137714, + "balance_loss_mlp": 1.06456733, + "epoch": 0.8963062716429396, + "flos": 519173188608.0, + "grad_norm": 0.033528793307819646, + "language_loss": 0.87108302, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.88246012, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.73144531, + "step": 4659, + "time_per_iteration": 2.6183245182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113748, + "balance_loss_mlp": 1.06433296, + "epoch": 0.8964986533282031, + "flos": 509502053376.0, + "grad_norm": 0.037292402625012336, + "language_loss": 0.86541545, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.87679029, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.73144531, + "step": 4660, + "time_per_iteration": 2.6836605072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137687, + "balance_loss_mlp": 1.06454027, + "epoch": 0.8966910350134667, + "flos": 537108840960.0, + "grad_norm": 0.04787249223130083, + "language_loss": 0.87312889, + "learning_rate": 2.77285447406756e-05, + "loss": 0.88450575, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.73144531, + "step": 4661, + "time_per_iteration": 2.6272199153900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137712, + "balance_loss_mlp": 1.0647558, + "epoch": 0.8968834166987303, + "flos": 724497567744.0, + "grad_norm": 0.03914389932725733, + "language_loss": 0.88940513, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.90078223, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.72998047, + "step": 4662, + "time_per_iteration": 2.923161268234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137766, + "balance_loss_mlp": 1.06495285, + "epoch": 0.8970757983839939, + "flos": 682947397632.0, + "grad_norm": 0.02836643100094979, + "language_loss": 0.87210166, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.88347936, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.72949219, + "step": 4663, + "time_per_iteration": 2.9333901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137571, + "balance_loss_mlp": 1.06451964, + "epoch": 0.8972681800692575, + "flos": 614157012480.0, + "grad_norm": 0.04594668712214378, + "language_loss": 0.82504487, + "learning_rate": 2.742244971856006e-05, + "loss": 0.83642054, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.73095703, + "step": 4664, + "time_per_iteration": 2.7572762966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136784, + "balance_loss_mlp": 1.06363738, + "epoch": 0.8974605617545209, + "flos": 573499167744.0, + "grad_norm": 0.03351248965112738, + "language_loss": 0.87172771, + "learning_rate": 2.732078493352913e-05, + "loss": 0.8830955, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.73144531, + "step": 4665, + "time_per_iteration": 2.7434494495391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.0637176, + "epoch": 0.8976529434397845, + "flos": 521507060736.0, + "grad_norm": 0.03367433914500393, + "language_loss": 0.92143202, + "learning_rate": 2.721930365884434e-05, + "loss": 0.93280119, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.73193359, + "step": 4666, + "time_per_iteration": 2.816922664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136738, + "balance_loss_mlp": 1.06359124, + "epoch": 0.8978453251250481, + "flos": 472282527744.0, + "grad_norm": 0.03434454323546124, + "language_loss": 0.8620975, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.87346482, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.73144531, + "step": 4667, + "time_per_iteration": 2.7096781730651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113737, + "balance_loss_mlp": 1.06441426, + "epoch": 0.8980377068103117, + "flos": 592821972480.0, + "grad_norm": 0.030419293496563398, + "language_loss": 0.86279666, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.8741703, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.72998047, + "step": 4668, + "time_per_iteration": 3.019211530685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137373, + "balance_loss_mlp": 1.06427431, + "epoch": 0.8982300884955752, + "flos": 768950297088.0, + "grad_norm": 0.03649542042278363, + "language_loss": 0.87581873, + "learning_rate": 2.691596129049556e-05, + "loss": 0.88719249, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.73095703, + "step": 4669, + "time_per_iteration": 3.122833728790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113727, + "balance_loss_mlp": 1.064219, + "epoch": 0.8984224701808388, + "flos": 846124721664.0, + "grad_norm": 0.0371250323019601, + "language_loss": 0.81804687, + "learning_rate": 2.681521445046775e-05, + "loss": 0.82941949, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.73046875, + "step": 4670, + "time_per_iteration": 3.369352340698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138035, + "balance_loss_mlp": 1.06484008, + "epoch": 0.8986148518661023, + "flos": 759099240960.0, + "grad_norm": 0.03474578852123265, + "language_loss": 0.80345845, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.81483877, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.73193359, + "step": 4671, + "time_per_iteration": 3.138118267059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113809, + "balance_loss_mlp": 1.06484783, + "epoch": 0.8988072335513659, + "flos": 564146941440.0, + "grad_norm": 0.03574727497124782, + "language_loss": 0.81828249, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.8296634, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.73242188, + "step": 4672, + "time_per_iteration": 2.695159673690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136953, + "balance_loss_mlp": 1.06366277, + "epoch": 0.8989996152366295, + "flos": 493661228544.0, + "grad_norm": 0.03847010617944712, + "language_loss": 0.91765416, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.92902374, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.73291016, + "step": 4673, + "time_per_iteration": 2.5458626747131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136749, + "balance_loss_mlp": 1.0634588, + "epoch": 0.899191996921893, + "flos": 543623635968.0, + "grad_norm": 0.04589399919654321, + "language_loss": 0.80554837, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.81691587, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.73291016, + "step": 4674, + "time_per_iteration": 2.642505168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136848, + "balance_loss_mlp": 1.06355786, + "epoch": 0.8993843786071566, + "flos": 472308724224.0, + "grad_norm": 0.03589158641039823, + "language_loss": 0.84531856, + "learning_rate": 2.631423662948984e-05, + "loss": 0.85668707, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.73291016, + "step": 4675, + "time_per_iteration": 2.6165904998779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136516, + "balance_loss_mlp": 1.0631305, + "epoch": 0.8995767602924202, + "flos": 527817739776.0, + "grad_norm": 0.0341476422766562, + "language_loss": 0.86405528, + "learning_rate": 2.621459261342196e-05, + "loss": 0.87542045, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.73388672, + "step": 4676, + "time_per_iteration": 2.719243049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137023, + "balance_loss_mlp": 1.06363761, + "epoch": 0.8997691419776838, + "flos": 558711123456.0, + "grad_norm": 0.0347905092588009, + "language_loss": 0.88358057, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.89495075, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.73388672, + "step": 4677, + "time_per_iteration": 2.7013773918151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136646, + "balance_loss_mlp": 1.06321299, + "epoch": 0.8999615236629472, + "flos": 640253122560.0, + "grad_norm": 0.03439496525861691, + "language_loss": 0.84559703, + "learning_rate": 2.601585643932436e-05, + "loss": 0.85696346, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.734375, + "step": 4678, + "time_per_iteration": 2.8610715866088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139397, + "balance_loss_mlp": 1.06768036, + "epoch": 0.9001539053482108, + "flos": 1434588578304.0, + "grad_norm": 0.0055187474782550615, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.8692342, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.71875, + "step": 4679, + "time_per_iteration": 4.780034780502319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136508, + "balance_loss_mlp": 1.06321776, + "epoch": 0.9003462870334744, + "flos": 568035151872.0, + "grad_norm": 0.039726434733231085, + "language_loss": 0.84240907, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.85377413, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.73291016, + "step": 4680, + "time_per_iteration": 2.8599278926849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137032, + "balance_loss_mlp": 1.06369436, + "epoch": 0.900538668718738, + "flos": 539705226240.0, + "grad_norm": 0.03640877309681453, + "language_loss": 0.82617021, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.83754051, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.73339844, + "step": 4681, + "time_per_iteration": 2.7729578018188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137597, + "balance_loss_mlp": 1.06435442, + "epoch": 0.9007310504040016, + "flos": 489352051200.0, + "grad_norm": 0.0368162628628219, + "language_loss": 0.90017235, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.91154826, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.73242188, + "step": 4682, + "time_per_iteration": 2.755814552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113733, + "balance_loss_mlp": 1.06403971, + "epoch": 0.9009234320892651, + "flos": 654140109312.0, + "grad_norm": 0.038557562175802175, + "language_loss": 0.83839279, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.84976614, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.73291016, + "step": 4683, + "time_per_iteration": 2.8485474586486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137175, + "balance_loss_mlp": 1.06393278, + "epoch": 0.9011158137745287, + "flos": 546638986752.0, + "grad_norm": 0.033729496474815886, + "language_loss": 0.89113462, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.90250635, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.73242188, + "step": 4684, + "time_per_iteration": 2.6239471435546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137283, + "balance_loss_mlp": 1.06404042, + "epoch": 0.9013081954597922, + "flos": 560786485248.0, + "grad_norm": 0.03771517464908619, + "language_loss": 0.87072444, + "learning_rate": 2.532607837883011e-05, + "loss": 0.88209724, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.73242188, + "step": 4685, + "time_per_iteration": 2.668337345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136586, + "balance_loss_mlp": 1.06320047, + "epoch": 0.9015005771450558, + "flos": 729942117888.0, + "grad_norm": 0.031716062736378385, + "language_loss": 0.84871745, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.86008328, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.73388672, + "step": 4686, + "time_per_iteration": 2.903815507888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113734, + "balance_loss_mlp": 1.06419337, + "epoch": 0.9016929588303193, + "flos": 518491709952.0, + "grad_norm": 0.037159626638255984, + "language_loss": 0.85474777, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.86612117, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.73144531, + "step": 4687, + "time_per_iteration": 2.7882134914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137561, + "balance_loss_mlp": 1.0643189, + "epoch": 0.9018853405155829, + "flos": 623554900992.0, + "grad_norm": 0.030497030476657076, + "language_loss": 0.90075636, + "learning_rate": 2.503322271810171e-05, + "loss": 0.91213191, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.73242188, + "step": 4688, + "time_per_iteration": 2.863872766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137148, + "balance_loss_mlp": 1.06381035, + "epoch": 0.9020777222008465, + "flos": 524337761280.0, + "grad_norm": 0.0356508664141184, + "language_loss": 0.82390887, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.8352803, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.73339844, + "step": 4689, + "time_per_iteration": 2.6352643966674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137658, + "balance_loss_mlp": 1.06451106, + "epoch": 0.9022701038861101, + "flos": 634893166080.0, + "grad_norm": 0.03217572249444131, + "language_loss": 0.85226208, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.8636387, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.73144531, + "step": 4690, + "time_per_iteration": 2.7964348793029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137305, + "balance_loss_mlp": 1.06411064, + "epoch": 0.9024624855713737, + "flos": 514332254208.0, + "grad_norm": 0.03518480344616928, + "language_loss": 0.8914479, + "learning_rate": 2.474202664305253e-05, + "loss": 0.90282094, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.73193359, + "step": 4691, + "time_per_iteration": 2.6292026042938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113622, + "balance_loss_mlp": 1.06283426, + "epoch": 0.9026548672566371, + "flos": 478450215936.0, + "grad_norm": 0.034512274724425716, + "language_loss": 0.8996951, + "learning_rate": 2.464533025754673e-05, + "loss": 0.91105729, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.73388672, + "step": 4692, + "time_per_iteration": 2.6084232330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.0630908, + "epoch": 0.9028472489419007, + "flos": 663170698752.0, + "grad_norm": 0.04470517923282093, + "language_loss": 0.78629088, + "learning_rate": 2.454881842109058e-05, + "loss": 0.79765511, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.73339844, + "step": 4693, + "time_per_iteration": 2.81938099861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136126, + "balance_loss_mlp": 1.06288338, + "epoch": 0.9030396306271643, + "flos": 535619630592.0, + "grad_norm": 0.03960598704445331, + "language_loss": 0.87602615, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.88738739, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.73242188, + "step": 4694, + "time_per_iteration": 2.607726812362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135271, + "balance_loss_mlp": 1.06193364, + "epoch": 0.9032320123124279, + "flos": 802383670272.0, + "grad_norm": 0.03396233932640605, + "language_loss": 0.86772144, + "learning_rate": 2.43563485451328e-05, + "loss": 0.87907416, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.73339844, + "step": 4695, + "time_per_iteration": 2.946852684020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135645, + "balance_loss_mlp": 1.06221211, + "epoch": 0.9034243939976914, + "flos": 555025027584.0, + "grad_norm": 0.04144086028744623, + "language_loss": 0.81962967, + "learning_rate": 2.426039058035451e-05, + "loss": 0.83098608, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.734375, + "step": 4696, + "time_per_iteration": 2.6538476943969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137462, + "balance_loss_mlp": 1.06417239, + "epoch": 0.903616775682955, + "flos": 504895434240.0, + "grad_norm": 0.04262123189824164, + "language_loss": 0.88294876, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.89432335, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.73291016, + "step": 4697, + "time_per_iteration": 2.611482620239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011379, + "balance_loss_mlp": 1.06465781, + "epoch": 0.9038091573682185, + "flos": 437255884800.0, + "grad_norm": 0.03845558802533531, + "language_loss": 0.83261943, + "learning_rate": 2.406902878347017e-05, + "loss": 0.84399843, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.73242188, + "step": 4698, + "time_per_iteration": 2.6136317253112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137364, + "balance_loss_mlp": 1.0641216, + "epoch": 0.9040015390534821, + "flos": 533989430784.0, + "grad_norm": 0.043430243161230425, + "language_loss": 0.86828995, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.87966359, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.73242188, + "step": 4699, + "time_per_iteration": 2.619580030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137638, + "balance_loss_mlp": 1.06434846, + "epoch": 0.9041939207387457, + "flos": 565430034432.0, + "grad_norm": 0.038139504979905946, + "language_loss": 0.85678428, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.86816067, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.73291016, + "step": 4700, + "time_per_iteration": 2.775902509689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135693, + "balance_loss_mlp": 1.06230736, + "epoch": 0.9043863024240092, + "flos": 516520407552.0, + "grad_norm": 0.042725558603523235, + "language_loss": 0.8274883, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.83884525, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.73388672, + "step": 4701, + "time_per_iteration": 2.564540386199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139694, + "balance_loss_mlp": 1.06797791, + "epoch": 0.9045786841092728, + "flos": 1280782946304.0, + "grad_norm": 0.004733647973715265, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.74069482, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.71875, + "step": 4702, + "time_per_iteration": 4.953817367553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135708, + "balance_loss_mlp": 1.0623225, + "epoch": 0.9047710657945364, + "flos": 586932260352.0, + "grad_norm": 0.037178314529548034, + "language_loss": 0.87704772, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.88840485, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.73388672, + "step": 4703, + "time_per_iteration": 2.657202959060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135469, + "balance_loss_mlp": 1.06203628, + "epoch": 0.9049634474798, + "flos": 573071470080.0, + "grad_norm": 0.03668762127255847, + "language_loss": 0.83756787, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.84892261, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.734375, + "step": 4704, + "time_per_iteration": 2.7138781547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136388, + "balance_loss_mlp": 1.06295526, + "epoch": 0.9051558291650635, + "flos": 573687820800.0, + "grad_norm": 0.04230135132201795, + "language_loss": 0.80652225, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.81788611, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.73388672, + "step": 4705, + "time_per_iteration": 2.652484655380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136762, + "balance_loss_mlp": 1.06342399, + "epoch": 0.905348210850327, + "flos": 541576472064.0, + "grad_norm": 0.03894186985703792, + "language_loss": 0.8417691, + "learning_rate": 2.331097546131783e-05, + "loss": 0.85313666, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.73339844, + "step": 4706, + "time_per_iteration": 2.646650791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136958, + "balance_loss_mlp": 1.0637157, + "epoch": 0.9055405925355906, + "flos": 517395268608.0, + "grad_norm": 0.03706201229587213, + "language_loss": 0.86367965, + "learning_rate": 2.321705121319956e-05, + "loss": 0.87504923, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.73242188, + "step": 4707, + "time_per_iteration": 2.578150510787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136926, + "balance_loss_mlp": 1.0636363, + "epoch": 0.9057329742208542, + "flos": 916221668352.0, + "grad_norm": 0.027988535833480262, + "language_loss": 0.8856324, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.89700168, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.73291016, + "step": 4708, + "time_per_iteration": 3.2058019638061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137016, + "balance_loss_mlp": 1.06363082, + "epoch": 0.9059253559061178, + "flos": 906776116224.0, + "grad_norm": 0.040646490031674046, + "language_loss": 0.87692308, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.88829321, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.73388672, + "step": 4709, + "time_per_iteration": 3.121534824371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136893, + "balance_loss_mlp": 1.06355548, + "epoch": 0.9061177375913813, + "flos": 665802012672.0, + "grad_norm": 0.05005893347039075, + "language_loss": 0.82001013, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.83137906, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.73339844, + "step": 4710, + "time_per_iteration": 2.845099449157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137048, + "balance_loss_mlp": 1.06366277, + "epoch": 0.9063101192766448, + "flos": 566778255360.0, + "grad_norm": 0.03738260666061765, + "language_loss": 0.87451136, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.88588178, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.73388672, + "step": 4711, + "time_per_iteration": 2.749617338180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136766, + "balance_loss_mlp": 1.06342876, + "epoch": 0.9065025009619084, + "flos": 728630827008.0, + "grad_norm": 0.03643976718461331, + "language_loss": 0.82941359, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.84078121, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.73339844, + "step": 4712, + "time_per_iteration": 2.8681652545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136904, + "balance_loss_mlp": 1.06370974, + "epoch": 0.906694882647172, + "flos": 532547884032.0, + "grad_norm": 0.041072095585484664, + "language_loss": 0.85065079, + "learning_rate": 2.265739417041418e-05, + "loss": 0.86201984, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.73193359, + "step": 4713, + "time_per_iteration": 2.6370742321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113708, + "balance_loss_mlp": 1.06388533, + "epoch": 0.9068872643324356, + "flos": 430695427584.0, + "grad_norm": 0.035065691956439445, + "language_loss": 0.89791685, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.90928769, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.73193359, + "step": 4714, + "time_per_iteration": 2.588728427886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136841, + "balance_loss_mlp": 1.06350315, + "epoch": 0.9070796460176991, + "flos": 589454785536.0, + "grad_norm": 0.04403478134734124, + "language_loss": 0.84667605, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.85804451, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.73339844, + "step": 4715, + "time_per_iteration": 2.7413907051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136965, + "balance_loss_mlp": 1.06357956, + "epoch": 0.9072720277029627, + "flos": 572654505984.0, + "grad_norm": 0.03890461174208685, + "language_loss": 0.8084088, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.81977844, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.73388672, + "step": 4716, + "time_per_iteration": 2.7009265422821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136751, + "balance_loss_mlp": 1.06350887, + "epoch": 0.9074644093882263, + "flos": 556859343360.0, + "grad_norm": 0.035784983001337665, + "language_loss": 0.92963278, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.94100022, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.73242188, + "step": 4717, + "time_per_iteration": 2.658792734146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136681, + "balance_loss_mlp": 1.06334293, + "epoch": 0.9076567910734898, + "flos": 642172032000.0, + "grad_norm": 0.035461342657685004, + "language_loss": 0.87066031, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.88202703, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.73339844, + "step": 4718, + "time_per_iteration": 2.774747371673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113687, + "balance_loss_mlp": 1.06353295, + "epoch": 0.9078491727587533, + "flos": 735456525312.0, + "grad_norm": 0.03318515468824905, + "language_loss": 0.86531991, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.8766886, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.73339844, + "step": 4719, + "time_per_iteration": 3.110316753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136868, + "balance_loss_mlp": 1.06357777, + "epoch": 0.9080415544440169, + "flos": 656020087296.0, + "grad_norm": 0.03252250742039747, + "language_loss": 0.90962839, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.92099708, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.73291016, + "step": 4720, + "time_per_iteration": 2.8136444091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137209, + "balance_loss_mlp": 1.06382358, + "epoch": 0.9082339361292805, + "flos": 598602895872.0, + "grad_norm": 0.03508499547859316, + "language_loss": 0.84060097, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.85197306, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.73388672, + "step": 4721, + "time_per_iteration": 2.728701114654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137056, + "balance_loss_mlp": 1.0636704, + "epoch": 0.9084263178145441, + "flos": 505425189888.0, + "grad_norm": 0.03720654975675441, + "language_loss": 0.89186943, + "learning_rate": 2.183042016731457e-05, + "loss": 0.90324003, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.73339844, + "step": 4722, + "time_per_iteration": 2.6093122959136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137238, + "balance_loss_mlp": 1.06380546, + "epoch": 0.9086186994998077, + "flos": 551106617856.0, + "grad_norm": 0.03925189384717369, + "language_loss": 0.84773749, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.85910988, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.734375, + "step": 4723, + "time_per_iteration": 2.706056594848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137174, + "balance_loss_mlp": 1.06393194, + "epoch": 0.9088110811850711, + "flos": 1135908395520.0, + "grad_norm": 0.031906636087630606, + "language_loss": 0.78563046, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.7970022, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.73242188, + "step": 4724, + "time_per_iteration": 3.5388522148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137122, + "balance_loss_mlp": 1.06388009, + "epoch": 0.9090034628703347, + "flos": 558059844096.0, + "grad_norm": 0.038899730458288276, + "language_loss": 0.81937408, + "learning_rate": 2.155810244111628e-05, + "loss": 0.83074534, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.73242188, + "step": 4725, + "time_per_iteration": 2.6709446907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136742, + "balance_loss_mlp": 1.06350017, + "epoch": 0.9091958445555983, + "flos": 545065182720.0, + "grad_norm": 0.034504955767497236, + "language_loss": 0.89321834, + "learning_rate": 2.146770131403658e-05, + "loss": 0.90458584, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.73242188, + "step": 4726, + "time_per_iteration": 2.685490608215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137086, + "balance_loss_mlp": 1.06379664, + "epoch": 0.9093882262408619, + "flos": 527140263936.0, + "grad_norm": 0.040107209375530216, + "language_loss": 0.86455953, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.87593037, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.73291016, + "step": 4727, + "time_per_iteration": 2.6698527336120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137417, + "balance_loss_mlp": 1.06417525, + "epoch": 0.9095806079261254, + "flos": 549571745280.0, + "grad_norm": 0.03978461900871093, + "language_loss": 0.86923885, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.88061309, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.73242188, + "step": 4728, + "time_per_iteration": 2.621840476989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113738, + "balance_loss_mlp": 1.06413734, + "epoch": 0.909772989611389, + "flos": 573640157184.0, + "grad_norm": 0.036584315059023036, + "language_loss": 0.89296705, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.90434086, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.73242188, + "step": 4729, + "time_per_iteration": 2.700291395187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136817, + "balance_loss_mlp": 1.06352687, + "epoch": 0.9099653712966526, + "flos": 562881312768.0, + "grad_norm": 0.0404955741903976, + "language_loss": 0.85047817, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.86184633, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.73291016, + "step": 4730, + "time_per_iteration": 2.678140640258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136149, + "balance_loss_mlp": 1.06271577, + "epoch": 0.9101577529819161, + "flos": 1095497601024.0, + "grad_norm": 0.03929606258638513, + "language_loss": 0.84986031, + "learning_rate": 2.101848311877069e-05, + "loss": 0.86122179, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.734375, + "step": 4731, + "time_per_iteration": 3.3611509799957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135878, + "balance_loss_mlp": 1.06249321, + "epoch": 0.9103501346671797, + "flos": 446360334336.0, + "grad_norm": 0.04307227071554131, + "language_loss": 0.87402189, + "learning_rate": 2.092919721190678e-05, + "loss": 0.88538074, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.73388672, + "step": 4732, + "time_per_iteration": 2.5086095333099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135658, + "balance_loss_mlp": 1.06227303, + "epoch": 0.9105425163524432, + "flos": 501812954112.0, + "grad_norm": 0.03966317690451211, + "language_loss": 0.8330757, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.84443229, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.73388672, + "step": 4733, + "time_per_iteration": 2.6233813762664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011355, + "balance_loss_mlp": 1.06211519, + "epoch": 0.9107348980377068, + "flos": 658774926336.0, + "grad_norm": 0.0391604867021726, + "language_loss": 0.88541472, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.89676976, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.73388672, + "step": 4734, + "time_per_iteration": 2.8359274864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137103, + "balance_loss_mlp": 1.06381297, + "epoch": 0.9109272797229704, + "flos": 554718853632.0, + "grad_norm": 0.03421844082243491, + "language_loss": 0.8903842, + "learning_rate": 2.066245558029256e-05, + "loss": 0.90175527, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.73291016, + "step": 4735, + "time_per_iteration": 2.6057238578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136816, + "balance_loss_mlp": 1.06352627, + "epoch": 0.911119661408234, + "flos": 520011119616.0, + "grad_norm": 0.03846629204542353, + "language_loss": 0.89047289, + "learning_rate": 2.057391384781182e-05, + "loss": 0.90184104, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.73291016, + "step": 4736, + "time_per_iteration": 2.633537530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136729, + "balance_loss_mlp": 1.06348717, + "epoch": 0.9113120430934974, + "flos": 555435260928.0, + "grad_norm": 0.039830009072267566, + "language_loss": 0.87907994, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.89044726, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.73242188, + "step": 4737, + "time_per_iteration": 2.6331467628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136847, + "balance_loss_mlp": 1.06350923, + "epoch": 0.911504424778761, + "flos": 502957059072.0, + "grad_norm": 0.03552190117680254, + "language_loss": 0.85479963, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.86616814, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.73339844, + "step": 4738, + "time_per_iteration": 2.657090663909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137184, + "balance_loss_mlp": 1.06394231, + "epoch": 0.9116968064640246, + "flos": 612211906560.0, + "grad_norm": 0.03175859953298452, + "language_loss": 0.85633034, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.86770225, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.73242188, + "step": 4739, + "time_per_iteration": 2.7278242111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137098, + "balance_loss_mlp": 1.06385577, + "epoch": 0.9118891881492882, + "flos": 574094051328.0, + "grad_norm": 0.029792698419162895, + "language_loss": 0.86312258, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.87449354, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.73242188, + "step": 4740, + "time_per_iteration": 2.763500452041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113712, + "balance_loss_mlp": 1.06378198, + "epoch": 0.9120815698345518, + "flos": 637172643840.0, + "grad_norm": 0.034763930832622233, + "language_loss": 0.82391727, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.83528852, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.73339844, + "step": 4741, + "time_per_iteration": 2.817636489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137084, + "balance_loss_mlp": 1.06384206, + "epoch": 0.9122739515198153, + "flos": 703555296768.0, + "grad_norm": 0.038607205451932886, + "language_loss": 0.90239573, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.91376662, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.73242188, + "step": 4742, + "time_per_iteration": 2.8458170890808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138421, + "balance_loss_mlp": 1.06508315, + "epoch": 0.9124663332050789, + "flos": 525716181504.0, + "grad_norm": 0.04055009874504829, + "language_loss": 0.93180835, + "learning_rate": 1.995933526832239e-05, + "loss": 0.9431926, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.73291016, + "step": 4743, + "time_per_iteration": 2.59576678276062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138396, + "balance_loss_mlp": 1.06501067, + "epoch": 0.9126587148903424, + "flos": 564370523136.0, + "grad_norm": 0.03672916386573753, + "language_loss": 0.8672806, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.87866455, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.73339844, + "step": 4744, + "time_per_iteration": 2.640869379043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137273, + "balance_loss_mlp": 1.06398344, + "epoch": 0.912851096575606, + "flos": 506933865984.0, + "grad_norm": 0.03925828506694119, + "language_loss": 0.84253651, + "learning_rate": 1.978541819374574e-05, + "loss": 0.85390925, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.73291016, + "step": 4745, + "time_per_iteration": 2.6787405014038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.06389141, + "epoch": 0.9130434782608695, + "flos": 551768630784.0, + "grad_norm": 0.03898701708502903, + "language_loss": 0.87371671, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.88508856, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.73291016, + "step": 4746, + "time_per_iteration": 2.6251258850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137732, + "balance_loss_mlp": 1.064538, + "epoch": 0.9132358599461331, + "flos": 469935920640.0, + "grad_norm": 0.037506103614932354, + "language_loss": 0.87836325, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.88974053, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.73193359, + "step": 4747, + "time_per_iteration": 2.536179542541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137446, + "balance_loss_mlp": 1.06415629, + "epoch": 0.9134282416313967, + "flos": 507101051904.0, + "grad_norm": 0.038265188221768345, + "language_loss": 0.84132433, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.8526988, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.73291016, + "step": 4748, + "time_per_iteration": 2.6317527294158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138034, + "balance_loss_mlp": 1.06479192, + "epoch": 0.9136206233166603, + "flos": 605938157568.0, + "grad_norm": 0.038780374815894, + "language_loss": 0.88831162, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.89969194, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.73242188, + "step": 4749, + "time_per_iteration": 2.7483599185943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113715, + "balance_loss_mlp": 1.06390798, + "epoch": 0.9138130050019239, + "flos": 562824916992.0, + "grad_norm": 0.03593036056465836, + "language_loss": 0.87584126, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.88721275, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.73242188, + "step": 4750, + "time_per_iteration": 2.6593310832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137274, + "balance_loss_mlp": 1.06398451, + "epoch": 0.9140053866871873, + "flos": 691344172032.0, + "grad_norm": 0.033756057406165677, + "language_loss": 0.94630772, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.95768046, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.73291016, + "step": 4751, + "time_per_iteration": 2.8558006286621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136083, + "balance_loss_mlp": 1.06279266, + "epoch": 0.9141977683724509, + "flos": 552129199104.0, + "grad_norm": 0.0351497110671635, + "language_loss": 0.88143069, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.89279151, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.73291016, + "step": 4752, + "time_per_iteration": 2.6890971660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135831, + "balance_loss_mlp": 1.06258917, + "epoch": 0.9143901500577145, + "flos": 541120576512.0, + "grad_norm": 0.039948380347975404, + "language_loss": 0.80258191, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.81394029, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.73242188, + "step": 4753, + "time_per_iteration": 2.660975217819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135888, + "balance_loss_mlp": 1.06264615, + "epoch": 0.9145825317429781, + "flos": 529793044992.0, + "grad_norm": 0.03802405513720637, + "language_loss": 0.85889542, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.87025428, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.73242188, + "step": 4754, + "time_per_iteration": 2.6234130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135989, + "balance_loss_mlp": 1.06269932, + "epoch": 0.9147749134282416, + "flos": 515812732416.0, + "grad_norm": 0.0330610975308954, + "language_loss": 0.83169824, + "learning_rate": 1.892702433097776e-05, + "loss": 0.84305817, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.73291016, + "step": 4755, + "time_per_iteration": 2.6349074840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136034, + "balance_loss_mlp": 1.06293452, + "epoch": 0.9149672951135052, + "flos": 515513289216.0, + "grad_norm": 0.03561497864158172, + "language_loss": 0.90493286, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.91629314, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.73095703, + "step": 4756, + "time_per_iteration": 2.672971725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137657, + "balance_loss_mlp": 1.06441462, + "epoch": 0.9151596767987688, + "flos": 578227310592.0, + "grad_norm": 0.0357639019467354, + "language_loss": 0.86071813, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.87209469, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.73242188, + "step": 4757, + "time_per_iteration": 2.7354896068573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113794, + "balance_loss_mlp": 1.06484115, + "epoch": 0.9153520584840323, + "flos": 620476423680.0, + "grad_norm": 0.033473586287839016, + "language_loss": 0.87076652, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.88214588, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.73095703, + "step": 4758, + "time_per_iteration": 2.744753122329712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137943, + "balance_loss_mlp": 1.06470096, + "epoch": 0.9155444401692959, + "flos": 469862060544.0, + "grad_norm": 0.03673386334031248, + "language_loss": 0.87150836, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.88288778, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.73242188, + "step": 4759, + "time_per_iteration": 2.589590311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142845, + "balance_loss_mlp": 1.07131958, + "epoch": 0.9157368218545594, + "flos": 1413839689728.0, + "grad_norm": 0.005825750154504474, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75961918, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.71679688, + "step": 4760, + "time_per_iteration": 4.916935682296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143028, + "balance_loss_mlp": 1.07150269, + "epoch": 0.915929203539823, + "flos": 1525324349952.0, + "grad_norm": 0.00593786079998211, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.8071909, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.71679688, + "step": 4761, + "time_per_iteration": 4.881082534790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136307, + "balance_loss_mlp": 1.06306517, + "epoch": 0.9161215852250866, + "flos": 536846327808.0, + "grad_norm": 0.03600435736689933, + "language_loss": 0.85723937, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.86860245, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.73242188, + "step": 4762, + "time_per_iteration": 2.6991817951202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011356, + "balance_loss_mlp": 1.06245291, + "epoch": 0.9163139669103502, + "flos": 591725531136.0, + "grad_norm": 0.04011016842573452, + "language_loss": 0.86041784, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.87177384, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.73144531, + "step": 4763, + "time_per_iteration": 2.699273109436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134482, + "balance_loss_mlp": 1.06114411, + "epoch": 0.9165063485956138, + "flos": 823371603456.0, + "grad_norm": 0.03153796906678494, + "language_loss": 0.88287377, + "learning_rate": 1.817043762598397e-05, + "loss": 0.89421856, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.73339844, + "step": 4764, + "time_per_iteration": 3.0631844997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113468, + "balance_loss_mlp": 1.0613898, + "epoch": 0.9166987302808772, + "flos": 526245937152.0, + "grad_norm": 0.03701950876229616, + "language_loss": 0.87147516, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.88282192, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.73291016, + "step": 4765, + "time_per_iteration": 2.6542019844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.06152701, + "epoch": 0.9168911119661408, + "flos": 656345726976.0, + "grad_norm": 0.033448675815540965, + "language_loss": 0.88564223, + "learning_rate": 1.800436410449058e-05, + "loss": 0.89699042, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.73291016, + "step": 4766, + "time_per_iteration": 2.9484171867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134985, + "balance_loss_mlp": 1.06174314, + "epoch": 0.9170834936514044, + "flos": 492721239552.0, + "grad_norm": 0.03145874781003063, + "language_loss": 0.89064819, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.90199804, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.73242188, + "step": 4767, + "time_per_iteration": 2.54239821434021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134486, + "balance_loss_mlp": 1.06124353, + "epoch": 0.917275875336668, + "flos": 629179372032.0, + "grad_norm": 0.03937996598674544, + "language_loss": 0.85276043, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.86410534, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.73242188, + "step": 4768, + "time_per_iteration": 2.788365125656128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139496, + "balance_loss_mlp": 1.06777954, + "epoch": 0.9174682570219315, + "flos": 1521212557824.0, + "grad_norm": 0.003465998436582984, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79319733, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.71875, + "step": 4769, + "time_per_iteration": 4.939180850982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134868, + "balance_loss_mlp": 1.06157768, + "epoch": 0.917660638707195, + "flos": 561112124928.0, + "grad_norm": 0.03362556891440619, + "language_loss": 0.8936972, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.90504587, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.73291016, + "step": 4770, + "time_per_iteration": 2.734116315841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113516, + "balance_loss_mlp": 1.06186974, + "epoch": 0.9178530203924586, + "flos": 448175184384.0, + "grad_norm": 0.03565950552809895, + "language_loss": 0.88209128, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.89344281, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.73291016, + "step": 4771, + "time_per_iteration": 2.482034683227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134749, + "balance_loss_mlp": 1.06150699, + "epoch": 0.9180454020777222, + "flos": 466974964224.0, + "grad_norm": 0.033285195978275374, + "language_loss": 0.83965075, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.85099828, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.73242188, + "step": 4772, + "time_per_iteration": 2.5653374195098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_mlp": 1.06174767, + "epoch": 0.9182377837629858, + "flos": 597484987392.0, + "grad_norm": 0.03234819221060202, + "language_loss": 0.91231674, + "learning_rate": 1.74290029706784e-05, + "loss": 0.92366672, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.73242188, + "step": 4773, + "time_per_iteration": 2.758915901184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134829, + "balance_loss_mlp": 1.06139612, + "epoch": 0.9184301654482493, + "flos": 998360552448.0, + "grad_norm": 0.03268667368696316, + "language_loss": 0.87101263, + "learning_rate": 1.734755767142876e-05, + "loss": 0.88236094, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.734375, + "step": 4774, + "time_per_iteration": 3.328178644180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134649, + "balance_loss_mlp": 1.06140733, + "epoch": 0.9186225471335129, + "flos": 509901553152.0, + "grad_norm": 0.029942945001472855, + "language_loss": 0.87889773, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.89024425, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.73242188, + "step": 4775, + "time_per_iteration": 2.658120632171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134933, + "balance_loss_mlp": 1.06164348, + "epoch": 0.9188149288187765, + "flos": 942076732416.0, + "grad_norm": 0.03844935783294636, + "language_loss": 0.83205068, + "learning_rate": 1.718522925136551e-05, + "loss": 0.8434, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.73291016, + "step": 4776, + "time_per_iteration": 3.2743020057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134929, + "balance_loss_mlp": 1.06173444, + "epoch": 0.91900731050404, + "flos": 584763572736.0, + "grad_norm": 0.03633610266670935, + "language_loss": 0.87877005, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.89011931, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.73193359, + "step": 4777, + "time_per_iteration": 2.6747422218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136109, + "balance_loss_mlp": 1.06277132, + "epoch": 0.9191996921893035, + "flos": 582306175488.0, + "grad_norm": 0.04168169923395777, + "language_loss": 0.85453916, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.86590028, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.73339844, + "step": 4778, + "time_per_iteration": 2.6795010566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136418, + "balance_loss_mlp": 1.06317592, + "epoch": 0.9193920738745671, + "flos": 910416549888.0, + "grad_norm": 0.03761875549388394, + "language_loss": 0.84188634, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.8532505, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.73242188, + "step": 4779, + "time_per_iteration": 3.1361474990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142517, + "balance_loss_mlp": 1.07080078, + "epoch": 0.9195844555598307, + "flos": 1561644819456.0, + "grad_norm": 0.005775441395861982, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.8093791, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.71875, + "step": 4780, + "time_per_iteration": 4.66200065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136054, + "balance_loss_mlp": 1.06271684, + "epoch": 0.9197768372450943, + "flos": 475017901056.0, + "grad_norm": 0.042723214120450784, + "language_loss": 0.83727241, + "learning_rate": 1.678268904252317e-05, + "loss": 0.84863299, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.73339844, + "step": 4781, + "time_per_iteration": 2.5478897094726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134824, + "balance_loss_mlp": 1.06143892, + "epoch": 0.9199692189303579, + "flos": 858596358144.0, + "grad_norm": 0.044037253062345634, + "language_loss": 0.89346141, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.90480959, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.73388672, + "step": 4782, + "time_per_iteration": 3.2057340145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134616, + "balance_loss_mlp": 1.06137359, + "epoch": 0.9201616006156214, + "flos": 505379527680.0, + "grad_norm": 0.03661647161350629, + "language_loss": 0.82697654, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.83832264, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.73242188, + "step": 4783, + "time_per_iteration": 2.646583080291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134557, + "balance_loss_mlp": 1.06117201, + "epoch": 0.9203539823008849, + "flos": 549895383552.0, + "grad_norm": 0.04183695528673719, + "language_loss": 0.89185143, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.90319705, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.73388672, + "step": 4784, + "time_per_iteration": 2.70615816116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135056, + "balance_loss_mlp": 1.06176567, + "epoch": 0.9205463639861485, + "flos": 541072912896.0, + "grad_norm": 0.038118566916411155, + "language_loss": 0.86795676, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.87930727, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.73291016, + "step": 4785, + "time_per_iteration": 2.640362501144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134779, + "balance_loss_mlp": 1.06144154, + "epoch": 0.9207387456714121, + "flos": 801161702400.0, + "grad_norm": 0.03691419431117059, + "language_loss": 0.82699919, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.83834696, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.73339844, + "step": 4786, + "time_per_iteration": 3.035921573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134791, + "balance_loss_mlp": 1.06150079, + "epoch": 0.9209311273566756, + "flos": 503816457216.0, + "grad_norm": 0.03887199086882918, + "language_loss": 0.8393299, + "learning_rate": 1.630583198044333e-05, + "loss": 0.85067785, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.73291016, + "step": 4787, + "time_per_iteration": 2.648547887802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136404, + "balance_loss_mlp": 1.06316197, + "epoch": 0.9211235090419392, + "flos": 570383760384.0, + "grad_norm": 0.034570845531176744, + "language_loss": 0.86524636, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.8766104, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.73242188, + "step": 4788, + "time_per_iteration": 2.6737005710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.0629878, + "epoch": 0.9213158907272028, + "flos": 807930278400.0, + "grad_norm": 0.038736420027196794, + "language_loss": 0.88138419, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.89274597, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.73193359, + "step": 4789, + "time_per_iteration": 2.984248161315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136426, + "balance_loss_mlp": 1.06308794, + "epoch": 0.9215082724124664, + "flos": 491650994688.0, + "grad_norm": 0.03447141076986377, + "language_loss": 0.80724669, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.81861091, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.73339844, + "step": 4790, + "time_per_iteration": 2.5614049434661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139565, + "balance_loss_mlp": 1.06765747, + "epoch": 0.9217006540977299, + "flos": 1517893761024.0, + "grad_norm": 0.0033789664426223543, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78209823, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.72070312, + "step": 4791, + "time_per_iteration": 4.978902578353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134776, + "balance_loss_mlp": 1.06153357, + "epoch": 0.9218930357829934, + "flos": 745086001152.0, + "grad_norm": 0.03665734830285374, + "language_loss": 0.809376, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.82072377, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.73242188, + "step": 4792, + "time_per_iteration": 2.9215128421783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134751, + "balance_loss_mlp": 1.06146133, + "epoch": 0.922085417468257, + "flos": 453973572096.0, + "grad_norm": 0.04198200068683094, + "language_loss": 0.85471809, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.86606556, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.73291016, + "step": 4793, + "time_per_iteration": 2.502872943878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134773, + "balance_loss_mlp": 1.06148362, + "epoch": 0.9222777991535206, + "flos": 501237536256.0, + "grad_norm": 0.04225847617164951, + "language_loss": 0.89807576, + "learning_rate": 1.575804349061616e-05, + "loss": 0.90942347, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.73291016, + "step": 4794, + "time_per_iteration": 2.576061964035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134907, + "balance_loss_mlp": 1.06147456, + "epoch": 0.9224701808387842, + "flos": 528983311872.0, + "grad_norm": 0.03721796107962599, + "language_loss": 0.8360222, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.84737134, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.734375, + "step": 4795, + "time_per_iteration": 2.583193778991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134896, + "balance_loss_mlp": 1.06160617, + "epoch": 0.9226625625240477, + "flos": 876117047808.0, + "grad_norm": 0.03443008595735349, + "language_loss": 0.79559839, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.80694729, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.73291016, + "step": 4796, + "time_per_iteration": 3.1217310428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134541, + "balance_loss_mlp": 1.06125164, + "epoch": 0.9228549442093112, + "flos": 503760061440.0, + "grad_norm": 0.036776332050838995, + "language_loss": 0.92655843, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.93790388, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.73291016, + "step": 4797, + "time_per_iteration": 2.5615105628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134953, + "balance_loss_mlp": 1.06161523, + "epoch": 0.9230473258945748, + "flos": 601125421056.0, + "grad_norm": 0.033291935221544965, + "language_loss": 0.89235032, + "learning_rate": 1.544915681564829e-05, + "loss": 0.90369982, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.73339844, + "step": 4798, + "time_per_iteration": 2.877967596054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134901, + "balance_loss_mlp": 1.06165874, + "epoch": 0.9232397075798384, + "flos": 823875162624.0, + "grad_norm": 0.038339368705079924, + "language_loss": 0.84685349, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.85820246, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.73242188, + "step": 4799, + "time_per_iteration": 3.0926709175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135028, + "balance_loss_mlp": 1.06173778, + "epoch": 0.923432089265102, + "flos": 708274707456.0, + "grad_norm": 0.03568827047974618, + "language_loss": 0.89519256, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.9065429, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.73291016, + "step": 4800, + "time_per_iteration": 2.881060838699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136822, + "balance_loss_mlp": 1.06362712, + "epoch": 0.9236244709503655, + "flos": 703090669056.0, + "grad_norm": 0.0411673786427115, + "language_loss": 0.82487589, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.83624411, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.73193359, + "step": 4801, + "time_per_iteration": 2.840782403945923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136403, + "balance_loss_mlp": 1.0632081, + "epoch": 0.9238168526356291, + "flos": 516081976320.0, + "grad_norm": 0.03540606312834152, + "language_loss": 0.88255292, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.89391702, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.73193359, + "step": 4802, + "time_per_iteration": 2.6457712650299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136586, + "balance_loss_mlp": 1.06334352, + "epoch": 0.9240092343208927, + "flos": 492964286976.0, + "grad_norm": 0.04044553968836264, + "language_loss": 0.86154222, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.87290812, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.73242188, + "step": 4803, + "time_per_iteration": 2.580083131790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135778, + "balance_loss_mlp": 1.06239247, + "epoch": 0.9242016160061562, + "flos": 648435047424.0, + "grad_norm": 0.0375252651835897, + "language_loss": 0.78042829, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.79178602, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.73388672, + "step": 4804, + "time_per_iteration": 2.8536152839660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136397, + "balance_loss_mlp": 1.0632025, + "epoch": 0.9243939976914197, + "flos": 730778047488.0, + "grad_norm": 0.037173114265174334, + "language_loss": 0.84313226, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.85449624, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.73193359, + "step": 4805, + "time_per_iteration": 2.968522310256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.0618844, + "epoch": 0.9245863793766833, + "flos": 453209501184.0, + "grad_norm": 0.03394031409690086, + "language_loss": 0.94972181, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.96107405, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.73339844, + "step": 4806, + "time_per_iteration": 2.591217517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135015, + "balance_loss_mlp": 1.06162941, + "epoch": 0.9247787610619469, + "flos": 756365869056.0, + "grad_norm": 0.03881181193239194, + "language_loss": 0.82753104, + "learning_rate": 1.476516966469732e-05, + "loss": 0.83888113, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.73388672, + "step": 4807, + "time_per_iteration": 2.9434964656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135205, + "balance_loss_mlp": 1.06186795, + "epoch": 0.9249711427472105, + "flos": 563083427328.0, + "grad_norm": 0.034947383902908004, + "language_loss": 0.89372003, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.90507203, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.73339844, + "step": 4808, + "time_per_iteration": 2.770357370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134947, + "balance_loss_mlp": 1.06156158, + "epoch": 0.9251635244324741, + "flos": 527780809728.0, + "grad_norm": 0.03910850874583782, + "language_loss": 0.89453298, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.90588242, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.73388672, + "step": 4809, + "time_per_iteration": 2.6631083488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135004, + "balance_loss_mlp": 1.0615716, + "epoch": 0.9253559061177375, + "flos": 612479149056.0, + "grad_norm": 0.03802586190927124, + "language_loss": 0.83715951, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.84850955, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.734375, + "step": 4810, + "time_per_iteration": 2.8262386322021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139725, + "balance_loss_mlp": 1.06800842, + "epoch": 0.9255482878030011, + "flos": 1554461280768.0, + "grad_norm": 0.004137695643331225, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.78065115, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.71875, + "step": 4811, + "time_per_iteration": 4.7207818031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.06151867, + "epoch": 0.9257406694882647, + "flos": 767802189312.0, + "grad_norm": 0.03858144301478165, + "language_loss": 0.85714322, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.86849177, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.73339844, + "step": 4812, + "time_per_iteration": 3.0623562335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136229, + "balance_loss_mlp": 1.06298673, + "epoch": 0.9259330511735283, + "flos": 498966790656.0, + "grad_norm": 0.03833501566517131, + "language_loss": 0.8808893, + "learning_rate": 1.431765421986686e-05, + "loss": 0.89225155, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.73242188, + "step": 4813, + "time_per_iteration": 2.6300573348999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136541, + "balance_loss_mlp": 1.06339419, + "epoch": 0.9261254328587919, + "flos": 628015801344.0, + "grad_norm": 0.036925045587933254, + "language_loss": 0.8380208, + "learning_rate": 1.424372809925273e-05, + "loss": 0.84938622, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.73144531, + "step": 4814, + "time_per_iteration": 2.739515542984009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136367, + "balance_loss_mlp": 1.06312442, + "epoch": 0.9263178145440554, + "flos": 598492105728.0, + "grad_norm": 0.036427674031464095, + "language_loss": 0.89815581, + "learning_rate": 1.416999056594831e-05, + "loss": 0.90951943, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.73242188, + "step": 4815, + "time_per_iteration": 2.7244396209716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113381, + "balance_loss_mlp": 1.06042469, + "epoch": 0.926510196229319, + "flos": 389416502784.0, + "grad_norm": 0.03761333342393075, + "language_loss": 0.88639969, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.8977378, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.73388672, + "step": 4816, + "time_per_iteration": 2.497323513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134114, + "balance_loss_mlp": 1.06082404, + "epoch": 0.9267025779145825, + "flos": 546862568448.0, + "grad_norm": 0.04104132157625523, + "language_loss": 0.8884635, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.89980459, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.73291016, + "step": 4817, + "time_per_iteration": 2.657047986984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134159, + "balance_loss_mlp": 1.06086874, + "epoch": 0.9268949595998461, + "flos": 500790372864.0, + "grad_norm": 0.03579000656747544, + "language_loss": 0.86026472, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.87160635, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.73291016, + "step": 4818, + "time_per_iteration": 2.6788973808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135314, + "balance_loss_mlp": 1.06202364, + "epoch": 0.9270873412851096, + "flos": 433738976256.0, + "grad_norm": 0.03546119064203232, + "language_loss": 0.86793125, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.87928438, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.73291016, + "step": 4819, + "time_per_iteration": 2.6300439834594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135322, + "balance_loss_mlp": 1.06217515, + "epoch": 0.9272797229703732, + "flos": 467802161664.0, + "grad_norm": 0.039403892128954024, + "language_loss": 0.9138974, + "learning_rate": 1.380413270847164e-05, + "loss": 0.92525059, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.73144531, + "step": 4820, + "time_per_iteration": 2.6474528312683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134919, + "balance_loss_mlp": 1.06172454, + "epoch": 0.9274721046556368, + "flos": 706249737216.0, + "grad_norm": 0.036493835710477124, + "language_loss": 0.83149821, + "learning_rate": 1.373152729763938e-05, + "loss": 0.84284735, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.73193359, + "step": 4821, + "time_per_iteration": 3.0488803386688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140076, + "balance_loss_mlp": 1.06835938, + "epoch": 0.9276644863409004, + "flos": 1405342858752.0, + "grad_norm": 0.0042348399486481225, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83520538, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.71875, + "step": 4822, + "time_per_iteration": 4.881706237792969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113384, + "balance_loss_mlp": 1.06054974, + "epoch": 0.927856868026164, + "flos": 743136892416.0, + "grad_norm": 0.036665981072277615, + "language_loss": 0.84963113, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.86096954, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.73291016, + "step": 4823, + "time_per_iteration": 3.027317523956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133668, + "balance_loss_mlp": 1.06028235, + "epoch": 0.9280492497114274, + "flos": 413122344960.0, + "grad_norm": 0.044707757388090734, + "language_loss": 0.79886949, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.81020617, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.73388672, + "step": 4824, + "time_per_iteration": 2.4648141860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133648, + "balance_loss_mlp": 1.06035805, + "epoch": 0.928241631396691, + "flos": 647664245760.0, + "grad_norm": 0.0394631241951201, + "language_loss": 0.90115678, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.91249329, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.73291016, + "step": 4825, + "time_per_iteration": 2.8606808185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133488, + "balance_loss_mlp": 1.06019819, + "epoch": 0.9284340130819546, + "flos": 698128210944.0, + "grad_norm": 0.037269219229585766, + "language_loss": 0.85544008, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.86677498, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.73291016, + "step": 4826, + "time_per_iteration": 2.960580348968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135082, + "balance_loss_mlp": 1.06174421, + "epoch": 0.9286263947672182, + "flos": 760542789120.0, + "grad_norm": 0.033270395094925145, + "language_loss": 0.88126981, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.89262056, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.73339844, + "step": 4827, + "time_per_iteration": 3.026780366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135119, + "balance_loss_mlp": 1.06187654, + "epoch": 0.9288187764524817, + "flos": 674140389888.0, + "grad_norm": 0.03346604176423535, + "language_loss": 0.85396868, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.86531985, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.73242188, + "step": 4828, + "time_per_iteration": 2.9438445568084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113508, + "balance_loss_mlp": 1.06183743, + "epoch": 0.9290111581377453, + "flos": 501469850112.0, + "grad_norm": 0.03828039220289202, + "language_loss": 0.87901628, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.89036709, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.73242188, + "step": 4829, + "time_per_iteration": 2.577852725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113945, + "balance_loss_mlp": 1.06773376, + "epoch": 0.9292035398230089, + "flos": 1567057168896.0, + "grad_norm": 0.003695990156438286, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73261511, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.71875, + "step": 4830, + "time_per_iteration": 4.9167375564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139557, + "balance_loss_mlp": 1.06784058, + "epoch": 0.9293959215082724, + "flos": 1522063950336.0, + "grad_norm": 0.003745427392518177, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80651391, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.71875, + "step": 4831, + "time_per_iteration": 4.895474195480347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133447, + "balance_loss_mlp": 1.06006205, + "epoch": 0.929588303193536, + "flos": 558897775104.0, + "grad_norm": 0.05587972312929897, + "language_loss": 0.89084888, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.90218329, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.73388672, + "step": 4832, + "time_per_iteration": 2.6715126037597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134017, + "balance_loss_mlp": 1.06063223, + "epoch": 0.9297806848787995, + "flos": 479550660096.0, + "grad_norm": 0.04310011942892276, + "language_loss": 0.85959709, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.87093729, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.73388672, + "step": 4833, + "time_per_iteration": 2.5311076641082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133753, + "balance_loss_mlp": 1.06036782, + "epoch": 0.9299730665640631, + "flos": 565653616128.0, + "grad_norm": 0.03259048154405644, + "language_loss": 0.84302491, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.85436249, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.73388672, + "step": 4834, + "time_per_iteration": 2.917907476425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133453, + "balance_loss_mlp": 1.06006742, + "epoch": 0.9301654482493267, + "flos": 561342437376.0, + "grad_norm": 0.03578896280013595, + "language_loss": 0.87560201, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.88693655, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.73388672, + "step": 4835, + "time_per_iteration": 3.398090362548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113652, + "balance_loss_mlp": 1.06480408, + "epoch": 0.9303578299345903, + "flos": 1523488032768.0, + "grad_norm": 0.004265178869550273, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.7798897, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.71875, + "step": 4836, + "time_per_iteration": 5.208449840545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113369, + "balance_loss_mlp": 1.06040013, + "epoch": 0.9305502116198537, + "flos": 531859674624.0, + "grad_norm": 0.03622258066971115, + "language_loss": 0.88041896, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.89175594, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.73291016, + "step": 4837, + "time_per_iteration": 2.700305461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133586, + "balance_loss_mlp": 1.06020057, + "epoch": 0.9307425933051173, + "flos": 475855832064.0, + "grad_norm": 0.0423398747183289, + "language_loss": 0.86512882, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.87646472, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.73388672, + "step": 4838, + "time_per_iteration": 2.5601203441619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135535, + "balance_loss_mlp": 1.06219733, + "epoch": 0.9309349749903809, + "flos": 586064130048.0, + "grad_norm": 0.03684050044649056, + "language_loss": 0.90734005, + "learning_rate": 1.245693929549213e-05, + "loss": 0.91869539, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.73339844, + "step": 4839, + "time_per_iteration": 2.814164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_mlp": 1.06168175, + "epoch": 0.9311273566756445, + "flos": 863141852160.0, + "grad_norm": 0.031996461961234596, + "language_loss": 0.80324173, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.81459093, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.73242188, + "step": 4840, + "time_per_iteration": 3.157158374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134839, + "balance_loss_mlp": 1.06154966, + "epoch": 0.9313197383609081, + "flos": 549161511936.0, + "grad_norm": 0.037830595917140816, + "language_loss": 0.87318212, + "learning_rate": 1.231910112890411e-05, + "loss": 0.88453048, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.73291016, + "step": 4841, + "time_per_iteration": 2.7342753410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134557, + "balance_loss_mlp": 1.0612191, + "epoch": 0.9315121200461716, + "flos": 469703606784.0, + "grad_norm": 0.04359539081936152, + "language_loss": 0.86872697, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.88007247, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.73339844, + "step": 4842, + "time_per_iteration": 2.5657942295074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113488, + "balance_loss_mlp": 1.06154215, + "epoch": 0.9317045017314352, + "flos": 418558162944.0, + "grad_norm": 0.03823873856936876, + "language_loss": 0.82610798, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.83745676, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.73339844, + "step": 4843, + "time_per_iteration": 2.549171209335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134819, + "balance_loss_mlp": 1.06152916, + "epoch": 0.9318968834166987, + "flos": 541620132864.0, + "grad_norm": 0.03905937038896375, + "language_loss": 0.82102406, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.83237225, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.73291016, + "step": 4844, + "time_per_iteration": 2.782175302505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135109, + "balance_loss_mlp": 1.06172371, + "epoch": 0.9320892651019623, + "flos": 522346993152.0, + "grad_norm": 0.03778476990300089, + "language_loss": 0.84996724, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.86131835, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.73388672, + "step": 4845, + "time_per_iteration": 2.640185832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135247, + "balance_loss_mlp": 1.06205273, + "epoch": 0.9322816467872258, + "flos": 583252895232.0, + "grad_norm": 0.03215108173886952, + "language_loss": 0.84850752, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.85986006, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.73193359, + "step": 4846, + "time_per_iteration": 2.77775239944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135397, + "balance_loss_mlp": 1.06215477, + "epoch": 0.9324740284724894, + "flos": 485802941952.0, + "grad_norm": 0.03897238462940964, + "language_loss": 0.85641253, + "learning_rate": 1.191013150742537e-05, + "loss": 0.8677665, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.73242188, + "step": 4847, + "time_per_iteration": 2.7562150955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113494, + "balance_loss_mlp": 1.06160247, + "epoch": 0.932666410157753, + "flos": 734023710720.0, + "grad_norm": 0.035990757069540615, + "language_loss": 0.87008613, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.88143551, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.73339844, + "step": 4848, + "time_per_iteration": 3.0684380531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134114, + "balance_loss_mlp": 1.06077683, + "epoch": 0.9328587918430166, + "flos": 967180460544.0, + "grad_norm": 0.03473747152051204, + "language_loss": 0.83081275, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.84215385, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.73339844, + "step": 4849, + "time_per_iteration": 3.298288583755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133382, + "balance_loss_mlp": 1.06004477, + "epoch": 0.9330511735282802, + "flos": 615683152896.0, + "grad_norm": 0.04047956220186344, + "language_loss": 0.85783911, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.86917299, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.73339844, + "step": 4850, + "time_per_iteration": 2.7613956928253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.06080151, + "epoch": 0.9332435552135436, + "flos": 560217798144.0, + "grad_norm": 0.03457415903450117, + "language_loss": 0.89681101, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.90815145, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.73242188, + "step": 4851, + "time_per_iteration": 2.7369134426116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.06153619, + "epoch": 0.9334359368988072, + "flos": 516557337600.0, + "grad_norm": 0.035468780719106426, + "language_loss": 0.8622269, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.87357557, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.73339844, + "step": 4852, + "time_per_iteration": 2.609017848968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134841, + "balance_loss_mlp": 1.06159878, + "epoch": 0.9336283185840708, + "flos": 540940655616.0, + "grad_norm": 0.03211276800808927, + "language_loss": 0.86742085, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.87876928, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.73242188, + "step": 4853, + "time_per_iteration": 2.800187587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139191, + "balance_loss_mlp": 1.06747437, + "epoch": 0.9338207002693344, + "flos": 1566121182720.0, + "grad_norm": 0.003325990500550125, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79594207, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.71875, + "step": 4854, + "time_per_iteration": 4.910603046417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134905, + "balance_loss_mlp": 1.06161559, + "epoch": 0.9340130819545979, + "flos": 646507405824.0, + "grad_norm": 0.032821826781519965, + "language_loss": 0.85680681, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.86815584, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.73291016, + "step": 4855, + "time_per_iteration": 2.939924478530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113405, + "balance_loss_mlp": 1.06061697, + "epoch": 0.9342054636398615, + "flos": 504511397376.0, + "grad_norm": 0.037538841009704504, + "language_loss": 0.8107596, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.8221001, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.734375, + "step": 4856, + "time_per_iteration": 2.6526336669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133343, + "balance_loss_mlp": 1.05995786, + "epoch": 0.934397845325125, + "flos": 594235321344.0, + "grad_norm": 0.029967610162658413, + "language_loss": 0.88165474, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.89298815, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.73388672, + "step": 4857, + "time_per_iteration": 2.887981414794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134666, + "balance_loss_mlp": 1.06137609, + "epoch": 0.9345902270103886, + "flos": 500883698688.0, + "grad_norm": 0.036598265959695855, + "language_loss": 0.84688962, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.85823631, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.73291016, + "step": 4858, + "time_per_iteration": 2.704299211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139069, + "balance_loss_mlp": 1.06735229, + "epoch": 0.9347826086956522, + "flos": 1523404713984.0, + "grad_norm": 0.00324066268031166, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.77126789, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.71875, + "step": 4859, + "time_per_iteration": 4.773599147796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134721, + "balance_loss_mlp": 1.06147838, + "epoch": 0.9349749903809157, + "flos": 505664234496.0, + "grad_norm": 0.033069357773198756, + "language_loss": 0.8570931, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.86844027, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.73242188, + "step": 4860, + "time_per_iteration": 2.8723926544189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133433, + "balance_loss_mlp": 1.06009555, + "epoch": 0.9351673720661793, + "flos": 569964794880.0, + "grad_norm": 0.046471377956300595, + "language_loss": 0.84156215, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.85289651, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.73339844, + "step": 4861, + "time_per_iteration": 2.6781229972839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.06011534, + "epoch": 0.9353597537514429, + "flos": 545662067712.0, + "grad_norm": 0.03778800547137944, + "language_loss": 0.90822428, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.91955978, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.734375, + "step": 4862, + "time_per_iteration": 2.66455340385437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135025, + "balance_loss_mlp": 1.06173515, + "epoch": 0.9355521354367065, + "flos": 520019851776.0, + "grad_norm": 0.029824520949781164, + "language_loss": 0.88586128, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.89721155, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.73291016, + "step": 4863, + "time_per_iteration": 2.698141098022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.06215656, + "epoch": 0.93574451712197, + "flos": 447235195392.0, + "grad_norm": 0.037674472562729544, + "language_loss": 0.83579856, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.84715259, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.73242188, + "step": 4864, + "time_per_iteration": 2.512160062789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135421, + "balance_loss_mlp": 1.06217897, + "epoch": 0.9359368988072335, + "flos": 481495766016.0, + "grad_norm": 0.046001044108411895, + "language_loss": 0.81934822, + "learning_rate": 1.072417553472832e-05, + "loss": 0.83070242, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.73242188, + "step": 4865, + "time_per_iteration": 2.5373268127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135169, + "balance_loss_mlp": 1.06197476, + "epoch": 0.9361292804924971, + "flos": 498091929600.0, + "grad_norm": 0.04032803456119548, + "language_loss": 0.90056789, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.91191959, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.73193359, + "step": 4866, + "time_per_iteration": 2.6002197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135017, + "balance_loss_mlp": 1.06187046, + "epoch": 0.9363216621777607, + "flos": 619293387264.0, + "grad_norm": 0.03580675503506335, + "language_loss": 0.88945127, + "learning_rate": 1.059619902982184e-05, + "loss": 0.90080142, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.73144531, + "step": 4867, + "time_per_iteration": 2.777174711227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113549, + "balance_loss_mlp": 1.06377411, + "epoch": 0.9365140438630243, + "flos": 1418980067328.0, + "grad_norm": 0.003775098340926471, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.8033883, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.71875, + "step": 4868, + "time_per_iteration": 4.925109624862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134027, + "balance_loss_mlp": 1.06068969, + "epoch": 0.9367064255482878, + "flos": 591649669632.0, + "grad_norm": 0.03396019612935237, + "language_loss": 0.85704494, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.8683852, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.73339844, + "step": 4869, + "time_per_iteration": 2.752171754837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133687, + "balance_loss_mlp": 1.06044507, + "epoch": 0.9368988072335513, + "flos": 527652555264.0, + "grad_norm": 0.03952131288198883, + "language_loss": 0.86593235, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.87726915, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.73242188, + "step": 4870, + "time_per_iteration": 2.7232959270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134053, + "balance_loss_mlp": 1.06071544, + "epoch": 0.9370911889188149, + "flos": 744508581888.0, + "grad_norm": 0.03463108069269443, + "language_loss": 0.83654445, + "learning_rate": 1.034252625822113e-05, + "loss": 0.84788495, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.73339844, + "step": 4871, + "time_per_iteration": 2.9093987941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135096, + "balance_loss_mlp": 1.06199658, + "epoch": 0.9372835706040785, + "flos": 547077417984.0, + "grad_norm": 0.039804478611465105, + "language_loss": 0.82813054, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.83948147, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.73095703, + "step": 4872, + "time_per_iteration": 2.61991286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135188, + "balance_loss_mlp": 1.06194568, + "epoch": 0.9374759522893421, + "flos": 492699772416.0, + "grad_norm": 0.03924108622044038, + "language_loss": 0.8609668, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.87231869, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.73242188, + "step": 4873, + "time_per_iteration": 2.662440061569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135351, + "balance_loss_mlp": 1.06206155, + "epoch": 0.9376683339746056, + "flos": 579531870720.0, + "grad_norm": 0.040838494467933396, + "language_loss": 0.87158096, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.88293445, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.73291016, + "step": 4874, + "time_per_iteration": 2.6864585876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134875, + "balance_loss_mlp": 1.06153762, + "epoch": 0.9378607156598692, + "flos": 507296435712.0, + "grad_norm": 0.041653799515210505, + "language_loss": 0.86168003, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.87302876, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.73339844, + "step": 4875, + "time_per_iteration": 2.6001012325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136122, + "balance_loss_mlp": 1.06302249, + "epoch": 0.9380530973451328, + "flos": 521070630912.0, + "grad_norm": 0.05437496502115945, + "language_loss": 0.82745492, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.83881617, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.73095703, + "step": 4876, + "time_per_iteration": 2.6492278575897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135619, + "balance_loss_mlp": 1.06242442, + "epoch": 0.9382454790303963, + "flos": 558869577216.0, + "grad_norm": 0.035653858877996346, + "language_loss": 0.89391607, + "learning_rate": 9.967720642029999e-06, + "loss": 0.90527225, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.73193359, + "step": 4877, + "time_per_iteration": 2.6514732837677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134863, + "balance_loss_mlp": 1.06166816, + "epoch": 0.9384378607156598, + "flos": 696786720768.0, + "grad_norm": 0.03491740156282248, + "language_loss": 0.85915047, + "learning_rate": 9.905918764418153e-06, + "loss": 0.87049913, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.73193359, + "step": 4878, + "time_per_iteration": 2.908747673034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134971, + "balance_loss_mlp": 1.06182373, + "epoch": 0.9386302424009234, + "flos": 555834760704.0, + "grad_norm": 0.040753856632951786, + "language_loss": 0.85157609, + "learning_rate": 9.844307158203058e-06, + "loss": 0.86292583, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.73144531, + "step": 4879, + "time_per_iteration": 2.6491734981536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134453, + "balance_loss_mlp": 1.06116271, + "epoch": 0.938822624086187, + "flos": 568065351168.0, + "grad_norm": 0.04395633401499817, + "language_loss": 0.8441397, + "learning_rate": 9.782885847304469e-06, + "loss": 0.85548419, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.73291016, + "step": 4880, + "time_per_iteration": 2.7252390384674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134587, + "balance_loss_mlp": 1.06153524, + "epoch": 0.9390150057714506, + "flos": 418547429376.0, + "grad_norm": 0.03347739941940771, + "language_loss": 0.8443892, + "learning_rate": 9.721654855568196e-06, + "loss": 0.85573506, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.73046875, + "step": 4881, + "time_per_iteration": 2.583867311477661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06301677, + "epoch": 0.9392073874567142, + "flos": 1556082570240.0, + "grad_norm": 0.03746627101283315, + "language_loss": 0.80632669, + "learning_rate": 9.660614206766394e-06, + "loss": 0.81768787, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.73095703, + "step": 4882, + "time_per_iteration": 3.714630126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135223, + "balance_loss_mlp": 1.06198061, + "epoch": 0.9393997691419776, + "flos": 653731877376.0, + "grad_norm": 0.0382645062266071, + "language_loss": 0.82485741, + "learning_rate": 9.59976392459705e-06, + "loss": 0.83620965, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.73242188, + "step": 4883, + "time_per_iteration": 3.2966370582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138214, + "balance_loss_mlp": 1.0664978, + "epoch": 0.9395921508272412, + "flos": 1556562839040.0, + "grad_norm": 0.003695333595737308, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79308891, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.71875, + "step": 4884, + "time_per_iteration": 5.404622554779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135325, + "balance_loss_mlp": 1.06212997, + "epoch": 0.9397845325125048, + "flos": 499197103104.0, + "grad_norm": 0.03656984791754246, + "language_loss": 0.82897919, + "learning_rate": 9.478634554578314e-06, + "loss": 0.84033239, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.73193359, + "step": 4885, + "time_per_iteration": 2.7291576862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135657, + "balance_loss_mlp": 1.06251049, + "epoch": 0.9399769141977684, + "flos": 499589872128.0, + "grad_norm": 0.036644251179858374, + "language_loss": 0.88491553, + "learning_rate": 9.418355513755638e-06, + "loss": 0.89627206, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.73144531, + "step": 4886, + "time_per_iteration": 2.620981216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135735, + "balance_loss_mlp": 1.06401825, + "epoch": 0.9401692958830319, + "flos": 1405675229184.0, + "grad_norm": 0.003512744995628987, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80467921, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.71875, + "step": 4887, + "time_per_iteration": 4.856574296951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133934, + "balance_loss_mlp": 1.06073952, + "epoch": 0.9403616775682955, + "flos": 541211900928.0, + "grad_norm": 0.0305164549996701, + "language_loss": 0.88444626, + "learning_rate": 9.298368837495575e-06, + "loss": 0.89578557, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.73193359, + "step": 4888, + "time_per_iteration": 2.739971399307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135735, + "balance_loss_mlp": 1.06401825, + "epoch": 0.9405540592535591, + "flos": 1324938233856.0, + "grad_norm": 0.0035002189725473307, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76305169, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.71875, + "step": 4889, + "time_per_iteration": 4.893186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135497, + "balance_loss_mlp": 1.06230211, + "epoch": 0.9407464409388226, + "flos": 573427309056.0, + "grad_norm": 0.04031631625697337, + "language_loss": 0.88505602, + "learning_rate": 9.179144190235799e-06, + "loss": 0.896411, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.73193359, + "step": 4890, + "time_per_iteration": 2.6828339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135232, + "balance_loss_mlp": 1.06199026, + "epoch": 0.9409388226240862, + "flos": 512348216832.0, + "grad_norm": 0.03147351995793952, + "language_loss": 0.81225574, + "learning_rate": 9.119817685386112e-06, + "loss": 0.82360804, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.73242188, + "step": 4891, + "time_per_iteration": 2.752286911010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140076, + "balance_loss_mlp": 1.06835938, + "epoch": 0.9411312043093497, + "flos": 1573276523520.0, + "grad_norm": 0.004486626700418182, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81381959, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.71875, + "step": 4892, + "time_per_iteration": 4.878049850463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136128, + "balance_loss_mlp": 1.06298077, + "epoch": 0.9413235859946133, + "flos": 570559678464.0, + "grad_norm": 0.041259003272787025, + "language_loss": 0.831617, + "learning_rate": 9.001736428410234e-06, + "loss": 0.84297824, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.73144531, + "step": 4893, + "time_per_iteration": 2.7614989280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134748, + "balance_loss_mlp": 1.06150591, + "epoch": 0.9415159676798769, + "flos": 783264981504.0, + "grad_norm": 0.04024659681002993, + "language_loss": 0.84358162, + "learning_rate": 8.942981722127263e-06, + "loss": 0.85492909, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.73242188, + "step": 4894, + "time_per_iteration": 3.074845552444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113637, + "balance_loss_mlp": 1.06312764, + "epoch": 0.9417083493651405, + "flos": 850872330240.0, + "grad_norm": 0.02979508524031529, + "language_loss": 0.84446144, + "learning_rate": 8.884417661086331e-06, + "loss": 0.85582519, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.73242188, + "step": 4895, + "time_per_iteration": 3.244321346282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135905, + "balance_loss_mlp": 1.06280613, + "epoch": 0.941900731050404, + "flos": 530451055104.0, + "grad_norm": 0.03415903081576368, + "language_loss": 0.90385509, + "learning_rate": 8.826044268024025e-06, + "loss": 0.91521418, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.73095703, + "step": 4896, + "time_per_iteration": 2.7122864723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134586, + "balance_loss_mlp": 1.06134343, + "epoch": 0.9420931127356675, + "flos": 558170634240.0, + "grad_norm": 0.03438694613546509, + "language_loss": 0.84335274, + "learning_rate": 8.767861565602997e-06, + "loss": 0.85469854, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.73242188, + "step": 4897, + "time_per_iteration": 2.7777915000915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134624, + "balance_loss_mlp": 1.06142986, + "epoch": 0.9422854944209311, + "flos": 653786271744.0, + "grad_norm": 0.03610817623575041, + "language_loss": 0.90061867, + "learning_rate": 8.709869576411733e-06, + "loss": 0.91196489, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.73193359, + "step": 4898, + "time_per_iteration": 2.8397042751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136714, + "balance_loss_mlp": 1.06351972, + "epoch": 0.9424778761061947, + "flos": 554764515840.0, + "grad_norm": 0.032200962869082285, + "language_loss": 0.88306475, + "learning_rate": 8.65206832296478e-06, + "loss": 0.89443189, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.73193359, + "step": 4899, + "time_per_iteration": 2.758490800857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136296, + "balance_loss_mlp": 1.06314933, + "epoch": 0.9426702577914583, + "flos": 589650169344.0, + "grad_norm": 0.04146685259937853, + "language_loss": 0.84754741, + "learning_rate": 8.594457827702406e-06, + "loss": 0.85891032, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.73144531, + "step": 4900, + "time_per_iteration": 2.6957013607025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.06292105, + "epoch": 0.9428626394767218, + "flos": 617812909056.0, + "grad_norm": 0.04053390873945447, + "language_loss": 0.83133346, + "learning_rate": 8.537038112991114e-06, + "loss": 0.84269458, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.73193359, + "step": 4901, + "time_per_iteration": 2.8101513385772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136485, + "balance_loss_mlp": 1.06329107, + "epoch": 0.9430550211619854, + "flos": 611541161472.0, + "grad_norm": 0.036057292363132605, + "language_loss": 0.86717069, + "learning_rate": 8.479809201123178e-06, + "loss": 0.87853551, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.73193359, + "step": 4902, + "time_per_iteration": 2.7493042945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136839, + "balance_loss_mlp": 1.06364477, + "epoch": 0.943247402847249, + "flos": 567051502080.0, + "grad_norm": 0.03817021033168505, + "language_loss": 0.82748675, + "learning_rate": 8.422771114316885e-06, + "loss": 0.83885515, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.73193359, + "step": 4903, + "time_per_iteration": 2.731077194213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135115, + "balance_loss_mlp": 1.06187308, + "epoch": 0.9434397845325125, + "flos": 528088985088.0, + "grad_norm": 0.04132634874172125, + "language_loss": 0.86513394, + "learning_rate": 8.365923874716297e-06, + "loss": 0.87648505, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.73242188, + "step": 4904, + "time_per_iteration": 2.6607890129089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135447, + "balance_loss_mlp": 1.06229973, + "epoch": 0.943632166217776, + "flos": 594591160320.0, + "grad_norm": 0.03589040439105028, + "language_loss": 0.87627959, + "learning_rate": 8.309267504391593e-06, + "loss": 0.88763404, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.73144531, + "step": 4905, + "time_per_iteration": 2.725121021270752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135257, + "balance_loss_mlp": 1.06206262, + "epoch": 0.9438245479030396, + "flos": 573981259776.0, + "grad_norm": 0.028116257659022252, + "language_loss": 0.88786232, + "learning_rate": 8.252802025338623e-06, + "loss": 0.89921498, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.73193359, + "step": 4906, + "time_per_iteration": 2.84151291847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137071, + "balance_loss_mlp": 1.06387651, + "epoch": 0.9440169295883032, + "flos": 489221795328.0, + "grad_norm": 0.03908331871996133, + "language_loss": 0.86816639, + "learning_rate": 8.196527459479242e-06, + "loss": 0.87953711, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.73193359, + "step": 4907, + "time_per_iteration": 2.593106269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136761, + "balance_loss_mlp": 1.06361377, + "epoch": 0.9442093112735668, + "flos": 733122653184.0, + "grad_norm": 0.03263207151306975, + "language_loss": 0.78277397, + "learning_rate": 8.140443828661137e-06, + "loss": 0.79414153, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.73144531, + "step": 4908, + "time_per_iteration": 2.9979734420776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136747, + "balance_loss_mlp": 1.06355298, + "epoch": 0.9444016929588304, + "flos": 572105284608.0, + "grad_norm": 0.039051820427737964, + "language_loss": 0.86598486, + "learning_rate": 8.084551154658004e-06, + "loss": 0.8773523, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.73193359, + "step": 4909, + "time_per_iteration": 2.6849961280822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136144, + "balance_loss_mlp": 1.06299686, + "epoch": 0.9445940746440938, + "flos": 510311786496.0, + "grad_norm": 0.03853248508401035, + "language_loss": 0.91414893, + "learning_rate": 8.028849459169318e-06, + "loss": 0.92551035, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.73144531, + "step": 4910, + "time_per_iteration": 2.5958712100982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136817, + "balance_loss_mlp": 1.06357515, + "epoch": 0.9447864563293574, + "flos": 625797448704.0, + "grad_norm": 0.03483487859921532, + "language_loss": 0.85226071, + "learning_rate": 7.97333876382028e-06, + "loss": 0.86362892, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.73242188, + "step": 4911, + "time_per_iteration": 2.8528859615325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134822, + "balance_loss_mlp": 1.06158018, + "epoch": 0.944978838014621, + "flos": 506308783104.0, + "grad_norm": 0.03612723857831656, + "language_loss": 0.85505927, + "learning_rate": 7.918019090162098e-06, + "loss": 0.86640745, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.73242188, + "step": 4912, + "time_per_iteration": 2.7557713985443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.06826782, + "epoch": 0.9451712196998846, + "flos": 1487551600128.0, + "grad_norm": 0.004706549025358334, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79427326, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.71875, + "step": 4913, + "time_per_iteration": 4.964468955993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135191, + "balance_loss_mlp": 1.06194913, + "epoch": 0.9453636013851482, + "flos": 522151609344.0, + "grad_norm": 0.03617704302923612, + "language_loss": 0.95077229, + "learning_rate": 7.80795289375219e-06, + "loss": 0.96212423, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.73242188, + "step": 4914, + "time_per_iteration": 2.6678929328918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138802, + "balance_loss_mlp": 1.06708527, + "epoch": 0.9455559830704117, + "flos": 1500283748352.0, + "grad_norm": 0.004548904069174758, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84701157, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.71875, + "step": 4915, + "time_per_iteration": 4.94046950340271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113512, + "balance_loss_mlp": 1.06187737, + "epoch": 0.9457483647556753, + "flos": 499151440896.0, + "grad_norm": 0.034056935768259265, + "language_loss": 0.86546624, + "learning_rate": 7.698651040865534e-06, + "loss": 0.87681735, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.73242188, + "step": 4916, + "time_per_iteration": 2.6402246952056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136151, + "balance_loss_mlp": 1.0630039, + "epoch": 0.9459407464409388, + "flos": 1021117673472.0, + "grad_norm": 0.03091693708004351, + "language_loss": 0.86156452, + "learning_rate": 7.644286796333222e-06, + "loss": 0.872926, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.73144531, + "step": 4917, + "time_per_iteration": 3.370896816253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136157, + "balance_loss_mlp": 1.06300974, + "epoch": 0.9461331281262024, + "flos": 514620963840.0, + "grad_norm": 0.03805401706614232, + "language_loss": 0.86857271, + "learning_rate": 7.590113701241075e-06, + "loss": 0.87993431, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.73144531, + "step": 4918, + "time_per_iteration": 2.6039915084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136067, + "balance_loss_mlp": 1.06282437, + "epoch": 0.9463255098114659, + "flos": 529048439808.0, + "grad_norm": 0.04139599350872911, + "language_loss": 0.83497351, + "learning_rate": 7.536131776620936e-06, + "loss": 0.84633422, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.73242188, + "step": 4919, + "time_per_iteration": 2.6238739490509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135985, + "balance_loss_mlp": 1.06283832, + "epoch": 0.9465178914967295, + "flos": 507027191808.0, + "grad_norm": 0.044536709524851746, + "language_loss": 0.88624299, + "learning_rate": 7.482341043430485e-06, + "loss": 0.89760286, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.73144531, + "step": 4920, + "time_per_iteration": 2.5972156524658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134582, + "balance_loss_mlp": 1.06133986, + "epoch": 0.9467102731819931, + "flos": 661538497536.0, + "grad_norm": 0.045944769490510115, + "language_loss": 0.89346719, + "learning_rate": 7.428741522553184e-06, + "loss": 0.90481305, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.73242188, + "step": 4921, + "time_per_iteration": 2.878498077392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134699, + "balance_loss_mlp": 1.06145644, + "epoch": 0.9469026548672567, + "flos": 676504461312.0, + "grad_norm": 0.03622409343837378, + "language_loss": 0.93210799, + "learning_rate": 7.375333234798054e-06, + "loss": 0.94345504, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.73242188, + "step": 4922, + "time_per_iteration": 2.9211013317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136193, + "balance_loss_mlp": 1.06295109, + "epoch": 0.9470950365525203, + "flos": 515020463616.0, + "grad_norm": 0.07987170801903949, + "language_loss": 0.84155279, + "learning_rate": 7.32211620090012e-06, + "loss": 0.85291469, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.73242188, + "step": 4923, + "time_per_iteration": 2.6229920387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136284, + "balance_loss_mlp": 1.06304216, + "epoch": 0.9472874182377837, + "flos": 551226140160.0, + "grad_norm": 0.03359870786609723, + "language_loss": 0.85794783, + "learning_rate": 7.269090441520132e-06, + "loss": 0.86931068, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.73242188, + "step": 4924, + "time_per_iteration": 4.327451705932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136476, + "balance_loss_mlp": 1.06332874, + "epoch": 0.9474797999230473, + "flos": 543810287616.0, + "grad_norm": 0.04461289962569648, + "language_loss": 0.84685075, + "learning_rate": 7.216255977244457e-06, + "loss": 0.85821545, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.73144531, + "step": 4925, + "time_per_iteration": 2.628014326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136189, + "balance_loss_mlp": 1.06294644, + "epoch": 0.9476721816083109, + "flos": 846063596544.0, + "grad_norm": 0.03676518142184114, + "language_loss": 0.90082932, + "learning_rate": 7.163612828585242e-06, + "loss": 0.91219121, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.73242188, + "step": 4926, + "time_per_iteration": 3.1086716651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136732, + "balance_loss_mlp": 1.06368101, + "epoch": 0.9478645632935745, + "flos": 639147949056.0, + "grad_norm": 0.037886935855288933, + "language_loss": 0.83596742, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.84733474, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.73046875, + "step": 4927, + "time_per_iteration": 2.840261220932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134894, + "balance_loss_mlp": 1.06174707, + "epoch": 0.948056944978838, + "flos": 658041054720.0, + "grad_norm": 0.03537137119366953, + "language_loss": 0.80161017, + "learning_rate": 7.058900559793469e-06, + "loss": 0.81295913, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.73193359, + "step": 4928, + "time_per_iteration": 2.820704936981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134616, + "balance_loss_mlp": 1.06137371, + "epoch": 0.9482493266641016, + "flos": 441836307456.0, + "grad_norm": 0.03955323262094278, + "language_loss": 0.87748522, + "learning_rate": 7.00683148031378e-06, + "loss": 0.88883138, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.73242188, + "step": 4929, + "time_per_iteration": 2.5240581035614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136268, + "balance_loss_mlp": 1.06302619, + "epoch": 0.9484417083493651, + "flos": 547121078784.0, + "grad_norm": 0.03887739915879212, + "language_loss": 0.82831037, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.83967304, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.73242188, + "step": 4930, + "time_per_iteration": 2.7851428985595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136252, + "balance_loss_mlp": 1.0630095, + "epoch": 0.9486340900346287, + "flos": 539694492672.0, + "grad_norm": 0.0339786344788922, + "language_loss": 0.83988905, + "learning_rate": 6.903267532262003e-06, + "loss": 0.8512516, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.73242188, + "step": 4931, + "time_per_iteration": 2.6893911361694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135873, + "balance_loss_mlp": 1.06267822, + "epoch": 0.9488264717198923, + "flos": 682901735424.0, + "grad_norm": 0.03750385652355195, + "language_loss": 0.90455812, + "learning_rate": 6.851772703896975e-06, + "loss": 0.91591686, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.73193359, + "step": 4932, + "time_per_iteration": 2.870084762573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136544, + "balance_loss_mlp": 1.06330168, + "epoch": 0.9490188534051558, + "flos": 463560113664.0, + "grad_norm": 0.04146699354604264, + "language_loss": 0.93162906, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.94299448, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.73242188, + "step": 4933, + "time_per_iteration": 2.523359775543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136279, + "balance_loss_mlp": 1.06308496, + "epoch": 0.9492112350904194, + "flos": 544218519552.0, + "grad_norm": 0.03412343034174357, + "language_loss": 0.87004709, + "learning_rate": 6.7493574384489e-06, + "loss": 0.88140994, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.73193359, + "step": 4934, + "time_per_iteration": 2.6940860748291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136518, + "balance_loss_mlp": 1.06332338, + "epoch": 0.949403616775683, + "flos": 551458454016.0, + "grad_norm": 0.03617720765095602, + "language_loss": 0.8781929, + "learning_rate": 6.698437041126992e-06, + "loss": 0.88955808, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.73193359, + "step": 4935, + "time_per_iteration": 2.790689706802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134956, + "balance_loss_mlp": 1.06171405, + "epoch": 0.9495959984609466, + "flos": 599497222656.0, + "grad_norm": 0.032619945002332076, + "language_loss": 0.86929369, + "learning_rate": 6.647708160456678e-06, + "loss": 0.88064325, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.73242188, + "step": 4936, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113519, + "balance_loss_mlp": 1.06194746, + "epoch": 0.94978838014621, + "flos": 609530927616.0, + "grad_norm": 0.03651321025229267, + "language_loss": 0.87489212, + "learning_rate": 6.597170816132702e-06, + "loss": 0.88624406, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.73242188, + "step": 4937, + "time_per_iteration": 2.800729513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136433, + "balance_loss_mlp": 1.0631907, + "epoch": 0.9499807618314736, + "flos": 541865181696.0, + "grad_norm": 0.03285741477727048, + "language_loss": 0.90760124, + "learning_rate": 6.546825027775427e-06, + "loss": 0.91896558, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.73242188, + "step": 4938, + "time_per_iteration": 2.683340311050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136285, + "balance_loss_mlp": 1.0631386, + "epoch": 0.9501731435167372, + "flos": 595709068800.0, + "grad_norm": 0.03334591399320501, + "language_loss": 0.86523139, + "learning_rate": 6.496670814930717e-06, + "loss": 0.87659431, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.73144531, + "step": 4939, + "time_per_iteration": 2.82743763923645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136188, + "balance_loss_mlp": 1.06304121, + "epoch": 0.9503655252020008, + "flos": 455072014848.0, + "grad_norm": 0.03930006662979796, + "language_loss": 0.85443276, + "learning_rate": 6.446708197070161e-06, + "loss": 0.86579466, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.73144531, + "step": 4940, + "time_per_iteration": 2.613368034362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113632, + "balance_loss_mlp": 1.06307828, + "epoch": 0.9505579068872644, + "flos": 669127540224.0, + "grad_norm": 0.0356696809458609, + "language_loss": 0.89633119, + "learning_rate": 6.396937193591079e-06, + "loss": 0.90769434, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.73242188, + "step": 4941, + "time_per_iteration": 2.8095662593841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134768, + "balance_loss_mlp": 1.06147814, + "epoch": 0.9507502885725279, + "flos": 403079907840.0, + "grad_norm": 0.038919580018142184, + "language_loss": 0.87087023, + "learning_rate": 6.347357823816235e-06, + "loss": 0.88221788, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.73291016, + "step": 4942, + "time_per_iteration": 2.473461627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113482, + "balance_loss_mlp": 1.06157768, + "epoch": 0.9509426702577914, + "flos": 701736443904.0, + "grad_norm": 0.03427667838843753, + "language_loss": 0.84288859, + "learning_rate": 6.297970106994011e-06, + "loss": 0.85423684, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.73242188, + "step": 4943, + "time_per_iteration": 2.9936366081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135576, + "balance_loss_mlp": 1.06233358, + "epoch": 0.951135051943055, + "flos": 502401106944.0, + "grad_norm": 0.03656450632617296, + "language_loss": 0.86557579, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.87693161, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.73242188, + "step": 4944, + "time_per_iteration": 2.610600233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136645, + "balance_loss_mlp": 1.06354642, + "epoch": 0.9513274336283186, + "flos": 615865801728.0, + "grad_norm": 0.03295078964621213, + "language_loss": 0.85542595, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.86679238, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.73095703, + "step": 4945, + "time_per_iteration": 2.9333925247192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136839, + "balance_loss_mlp": 1.06369233, + "epoch": 0.9515198153135821, + "flos": 520597271040.0, + "grad_norm": 0.04029361545540468, + "language_loss": 0.86667025, + "learning_rate": 6.150957065611363e-06, + "loss": 0.87803864, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.73144531, + "step": 4946, + "time_per_iteration": 2.5970242023468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136296, + "balance_loss_mlp": 1.06314898, + "epoch": 0.9517121969988457, + "flos": 666284104704.0, + "grad_norm": 0.033604894008419074, + "language_loss": 0.80945677, + "learning_rate": 6.102336151595667e-06, + "loss": 0.82081974, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.73144531, + "step": 4947, + "time_per_iteration": 2.9714138507843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138525, + "balance_loss_mlp": 1.06537843, + "epoch": 0.9519045786841093, + "flos": 677615639040.0, + "grad_norm": 0.040926124550095325, + "language_loss": 0.8053059, + "learning_rate": 6.053906985658553e-06, + "loss": 0.81669116, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.73144531, + "step": 4948, + "time_per_iteration": 2.809159278869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138321, + "balance_loss_mlp": 1.06507838, + "epoch": 0.9520969603693729, + "flos": 654140109312.0, + "grad_norm": 0.03095345074034261, + "language_loss": 0.84655893, + "learning_rate": 6.005669586601814e-06, + "loss": 0.8579421, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.73242188, + "step": 4949, + "time_per_iteration": 2.910127878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138692, + "balance_loss_mlp": 1.06554544, + "epoch": 0.9522893420546364, + "flos": 744682498560.0, + "grad_norm": 0.032408881572200024, + "language_loss": 0.87415892, + "learning_rate": 5.957623973152748e-06, + "loss": 0.88554585, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.73144531, + "step": 4950, + "time_per_iteration": 3.021373987197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.06521976, + "epoch": 0.9524817237398999, + "flos": 763030385664.0, + "grad_norm": 0.03881087544404618, + "language_loss": 0.85428655, + "learning_rate": 5.909770163964545e-06, + "loss": 0.86567014, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.73144531, + "step": 4951, + "time_per_iteration": 2.9622764587402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138142, + "balance_loss_mlp": 1.06499469, + "epoch": 0.9526741054251635, + "flos": 530146882560.0, + "grad_norm": 0.038541049170088305, + "language_loss": 0.85973597, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.87111747, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.73144531, + "step": 4952, + "time_per_iteration": 2.5878281593322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136477, + "balance_loss_mlp": 1.06337738, + "epoch": 0.9528664871104271, + "flos": 489425911296.0, + "grad_norm": 0.03895213525755141, + "language_loss": 0.86453646, + "learning_rate": 5.814638032609787e-06, + "loss": 0.87590122, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.73095703, + "step": 4953, + "time_per_iteration": 2.6211817264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136503, + "balance_loss_mlp": 1.06340432, + "epoch": 0.9530588687956907, + "flos": 518871744000.0, + "grad_norm": 0.033652335193776035, + "language_loss": 0.8942554, + "learning_rate": 5.76735974737691e-06, + "loss": 0.90562046, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.73095703, + "step": 4954, + "time_per_iteration": 2.7593400478363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134917, + "balance_loss_mlp": 1.06167483, + "epoch": 0.9532512504809542, + "flos": 676413136896.0, + "grad_norm": 0.040464559019193894, + "language_loss": 0.86070359, + "learning_rate": 5.720273340271864e-06, + "loss": 0.87205279, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.73242188, + "step": 4955, + "time_per_iteration": 2.8816840648651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134706, + "balance_loss_mlp": 1.06146348, + "epoch": 0.9534436321662177, + "flos": 490541818368.0, + "grad_norm": 0.03782014800574082, + "language_loss": 0.88387191, + "learning_rate": 5.673378829575249e-06, + "loss": 0.89521897, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.73242188, + "step": 4956, + "time_per_iteration": 2.583472967147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134542, + "balance_loss_mlp": 1.06129992, + "epoch": 0.9536360138514813, + "flos": 497588370432.0, + "grad_norm": 0.03567484815320272, + "language_loss": 0.86718768, + "learning_rate": 5.626676233493167e-06, + "loss": 0.87853312, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.73242188, + "step": 4957, + "time_per_iteration": 2.6281793117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113481, + "balance_loss_mlp": 1.06156778, + "epoch": 0.9538283955367449, + "flos": 802857030144.0, + "grad_norm": 0.03957427847301793, + "language_loss": 0.87529492, + "learning_rate": 5.580165570157114e-06, + "loss": 0.88664305, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.73242188, + "step": 4958, + "time_per_iteration": 3.0466809272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136277, + "balance_loss_mlp": 1.06317747, + "epoch": 0.9540207772220085, + "flos": 557797330944.0, + "grad_norm": 0.03074291397770573, + "language_loss": 0.83816719, + "learning_rate": 5.533846857624203e-06, + "loss": 0.84952998, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.73095703, + "step": 4959, + "time_per_iteration": 2.7519495487213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.0633173, + "epoch": 0.954213158907272, + "flos": 685758632448.0, + "grad_norm": 0.035505648918623366, + "language_loss": 0.86093831, + "learning_rate": 5.487720113876882e-06, + "loss": 0.87230206, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.73046875, + "step": 4960, + "time_per_iteration": 2.910886764526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136287, + "balance_loss_mlp": 1.06318796, + "epoch": 0.9544055405925356, + "flos": 536846327808.0, + "grad_norm": 0.04174534847869379, + "language_loss": 0.87276769, + "learning_rate": 5.441785356823214e-06, + "loss": 0.88413054, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.73095703, + "step": 4961, + "time_per_iteration": 2.7283856868743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135918, + "balance_loss_mlp": 1.06281853, + "epoch": 0.9545979222777992, + "flos": 826923440640.0, + "grad_norm": 0.04693224510811112, + "language_loss": 0.84321594, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.8545751, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.73095703, + "step": 4962, + "time_per_iteration": 3.1215646266937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135961, + "balance_loss_mlp": 1.0628618, + "epoch": 0.9547903039630627, + "flos": 763156638720.0, + "grad_norm": 0.0399330944835338, + "language_loss": 0.81885892, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.83021849, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.73095703, + "step": 4963, + "time_per_iteration": 3.1090612411499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136424, + "balance_loss_mlp": 1.06332457, + "epoch": 0.9549826856483262, + "flos": 516333755904.0, + "grad_norm": 0.03824273588558422, + "language_loss": 0.87225604, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.88362026, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.73095703, + "step": 4964, + "time_per_iteration": 2.620351552963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134867, + "balance_loss_mlp": 1.06172025, + "epoch": 0.9551750673335898, + "flos": 644266859520.0, + "grad_norm": 0.03397371897405953, + "language_loss": 0.87095642, + "learning_rate": 5.259966551095341e-06, + "loss": 0.88230509, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.73193359, + "step": 4965, + "time_per_iteration": 2.814934015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134469, + "balance_loss_mlp": 1.06127489, + "epoch": 0.9553674490188534, + "flos": 473174853120.0, + "grad_norm": 0.03543650438605603, + "language_loss": 0.8735832, + "learning_rate": 5.214991993520546e-06, + "loss": 0.88492787, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.73193359, + "step": 4966, + "time_per_iteration": 2.6101207733154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134749, + "balance_loss_mlp": 1.06150663, + "epoch": 0.955559830704117, + "flos": 529337149440.0, + "grad_norm": 0.04293839693076082, + "language_loss": 0.87281948, + "learning_rate": 5.170209528521763e-06, + "loss": 0.88416696, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.73242188, + "step": 4967, + "time_per_iteration": 2.5984079837799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135008, + "balance_loss_mlp": 1.06181312, + "epoch": 0.9557522123893806, + "flos": 549217907712.0, + "grad_norm": 0.038038109123601054, + "language_loss": 0.88284183, + "learning_rate": 5.125619173485196e-06, + "loss": 0.89419186, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.73193359, + "step": 4968, + "time_per_iteration": 2.634786605834961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_mlp": 1.06175089, + "epoch": 0.955944594074644, + "flos": 510524634624.0, + "grad_norm": 0.029523963923908957, + "language_loss": 0.85467374, + "learning_rate": 5.08122094572222e-06, + "loss": 0.86602366, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.73242188, + "step": 4969, + "time_per_iteration": 2.6917636394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136238, + "balance_loss_mlp": 1.0630914, + "epoch": 0.9561369757599076, + "flos": 528710065152.0, + "grad_norm": 0.036722318154549516, + "language_loss": 0.84130347, + "learning_rate": 5.037014862469824e-06, + "loss": 0.85266584, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.73144531, + "step": 4970, + "time_per_iteration": 2.764472723007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136347, + "balance_loss_mlp": 1.06329584, + "epoch": 0.9563293574451712, + "flos": 499207836672.0, + "grad_norm": 0.035098427244714854, + "language_loss": 0.83948302, + "learning_rate": 4.993000940890391e-06, + "loss": 0.85084653, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.73046875, + "step": 4971, + "time_per_iteration": 2.6011996269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141472, + "balance_loss_mlp": 1.06994629, + "epoch": 0.9565217391304348, + "flos": 1411744135680.0, + "grad_norm": 0.0046380775984094435, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82915032, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.71679688, + "step": 4972, + "time_per_iteration": 4.86350417137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136441, + "balance_loss_mlp": 1.06329453, + "epoch": 0.9567141208156984, + "flos": 504884700672.0, + "grad_norm": 0.036181124300498, + "language_loss": 0.8206802, + "learning_rate": 4.905549651026464e-06, + "loss": 0.8320446, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.73144531, + "step": 4973, + "time_per_iteration": 2.7482728958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113485, + "balance_loss_mlp": 1.06160808, + "epoch": 0.9569065025009619, + "flos": 434129743872.0, + "grad_norm": 0.045997872643652196, + "language_loss": 0.84962678, + "learning_rate": 4.86211231669359e-06, + "loss": 0.86097533, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.73242188, + "step": 4974, + "time_per_iteration": 2.470872163772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134551, + "balance_loss_mlp": 1.06130922, + "epoch": 0.9570988841862255, + "flos": 591154842624.0, + "grad_norm": 0.0403367254829792, + "language_loss": 0.84212631, + "learning_rate": 4.818867211936806e-06, + "loss": 0.85347188, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.73242188, + "step": 4975, + "time_per_iteration": 2.7816882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135143, + "balance_loss_mlp": 1.06190073, + "epoch": 0.957291265871489, + "flos": 768642121728.0, + "grad_norm": 0.04652333923499507, + "language_loss": 0.835931, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.84728247, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.73242188, + "step": 4976, + "time_per_iteration": 2.957157850265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.0615381, + "epoch": 0.9574836475567526, + "flos": 640246391808.0, + "grad_norm": 0.03712988268786209, + "language_loss": 0.89267516, + "learning_rate": 4.732953758233849e-06, + "loss": 0.90402251, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.73193359, + "step": 4977, + "time_per_iteration": 2.803529977798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_mlp": 1.06980896, + "epoch": 0.9576760292420161, + "flos": 1579398549504.0, + "grad_norm": 0.004511171675373937, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79748785, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.71875, + "step": 4978, + "time_per_iteration": 4.911847352981567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134672, + "balance_loss_mlp": 1.0614301, + "epoch": 0.9578684109272797, + "flos": 497373520896.0, + "grad_norm": 0.03570297995537699, + "language_loss": 0.91898167, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.93032837, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.73242188, + "step": 4979, + "time_per_iteration": 2.59523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135113, + "balance_loss_mlp": 1.06182265, + "epoch": 0.9580607926125433, + "flos": 430853881344.0, + "grad_norm": 0.043029309448741025, + "language_loss": 0.91334265, + "learning_rate": 4.605525716805337e-06, + "loss": 0.92469382, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.73291016, + "step": 4980, + "time_per_iteration": 2.4755971431732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136154, + "balance_loss_mlp": 1.0630554, + "epoch": 0.9582531742978069, + "flos": 1129131087360.0, + "grad_norm": 0.042821653988821394, + "language_loss": 0.8443023, + "learning_rate": 4.563434339466599e-06, + "loss": 0.8556639, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.73095703, + "step": 4981, + "time_per_iteration": 3.5472586154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136224, + "balance_loss_mlp": 1.06312537, + "epoch": 0.9584455559830705, + "flos": 525555726336.0, + "grad_norm": 0.03335114170802168, + "language_loss": 0.83248258, + "learning_rate": 4.521535307661085e-06, + "loss": 0.84384483, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.73095703, + "step": 4982, + "time_per_iteration": 2.6682260036468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113622, + "balance_loss_mlp": 1.06307316, + "epoch": 0.9586379376683339, + "flos": 635449118208.0, + "grad_norm": 0.03182275504909025, + "language_loss": 0.84402609, + "learning_rate": 4.479828637655392e-06, + "loss": 0.85538828, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.73144531, + "step": 4983, + "time_per_iteration": 2.840589761734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136281, + "balance_loss_mlp": 1.06313407, + "epoch": 0.9588303193535975, + "flos": 416984358912.0, + "grad_norm": 0.03935201485071488, + "language_loss": 0.88144433, + "learning_rate": 4.438314345641459e-06, + "loss": 0.89280713, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.73144531, + "step": 4984, + "time_per_iteration": 2.549217700958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136295, + "balance_loss_mlp": 1.06310058, + "epoch": 0.9590227010388611, + "flos": 482659336704.0, + "grad_norm": 0.03510699411251916, + "language_loss": 0.82830805, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.83967102, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.73193359, + "step": 4985, + "time_per_iteration": 2.6106717586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134857, + "balance_loss_mlp": 1.06180549, + "epoch": 0.9592150827241247, + "flos": 685849956864.0, + "grad_norm": 0.034999035587186825, + "language_loss": 0.84885329, + "learning_rate": 4.355862959983359e-06, + "loss": 0.86020184, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.73095703, + "step": 4986, + "time_per_iteration": 2.933217763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135131, + "balance_loss_mlp": 1.06198394, + "epoch": 0.9594074644093882, + "flos": 575630925312.0, + "grad_norm": 0.04204182022141106, + "language_loss": 0.74685031, + "learning_rate": 4.314925898349642e-06, + "loss": 0.7582016, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.73193359, + "step": 4987, + "time_per_iteration": 2.726092576980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134792, + "balance_loss_mlp": 1.06155026, + "epoch": 0.9595998460946518, + "flos": 547987207680.0, + "grad_norm": 0.03775455227306167, + "language_loss": 0.82959723, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.84094512, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.73242188, + "step": 4988, + "time_per_iteration": 2.7773516178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135054, + "balance_loss_mlp": 1.06181157, + "epoch": 0.9597922277799154, + "flos": 475026633216.0, + "grad_norm": 0.041401816345422476, + "language_loss": 0.82861459, + "learning_rate": 4.233629116938809e-06, + "loss": 0.83996511, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.73242188, + "step": 4989, + "time_per_iteration": 2.551558494567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134589, + "balance_loss_mlp": 1.06134653, + "epoch": 0.9599846094651789, + "flos": 515719406592.0, + "grad_norm": 0.052249401603679996, + "language_loss": 0.90226066, + "learning_rate": 4.193269428723889e-06, + "loss": 0.91360652, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.73242188, + "step": 4990, + "time_per_iteration": 2.641939163208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134583, + "balance_loss_mlp": 1.06134093, + "epoch": 0.9601769911504425, + "flos": 596162962944.0, + "grad_norm": 0.03785738083806385, + "language_loss": 0.82735097, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.83869678, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.73242188, + "step": 4991, + "time_per_iteration": 2.772304058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136451, + "balance_loss_mlp": 1.06330407, + "epoch": 0.960369372835706, + "flos": 494041262592.0, + "grad_norm": 0.034704241516027634, + "language_loss": 0.83890998, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.85027456, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.73144531, + "step": 4992, + "time_per_iteration": 2.6465232372283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136442, + "balance_loss_mlp": 1.06339037, + "epoch": 0.9605617545209696, + "flos": 580406731776.0, + "grad_norm": 0.033359643790349336, + "language_loss": 0.86629891, + "learning_rate": 4.073345361845171e-06, + "loss": 0.87766337, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.73046875, + "step": 4993, + "time_per_iteration": 2.689033269882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135961, + "balance_loss_mlp": 1.06290936, + "epoch": 0.9607541362062332, + "flos": 929298921984.0, + "grad_norm": 0.029146870910398723, + "language_loss": 0.89981806, + "learning_rate": 4.033755723872767e-06, + "loss": 0.91117764, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.73046875, + "step": 4994, + "time_per_iteration": 3.2702882289886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136365, + "balance_loss_mlp": 1.06312311, + "epoch": 0.9609465178914968, + "flos": 574280702976.0, + "grad_norm": 0.03393299990449358, + "language_loss": 0.80548346, + "learning_rate": 3.994358637073036e-06, + "loss": 0.81684709, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.73242188, + "step": 4995, + "time_per_iteration": 2.7817986011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.0630002, + "epoch": 0.9611388995767602, + "flos": 531914068992.0, + "grad_norm": 0.033026252404674224, + "language_loss": 0.89345288, + "learning_rate": 3.955154116741244e-06, + "loss": 0.90481436, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.73144531, + "step": 4996, + "time_per_iteration": 2.655974864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113658, + "balance_loss_mlp": 1.06343305, + "epoch": 0.9613312812620238, + "flos": 647403734016.0, + "grad_norm": 0.0373910335582963, + "language_loss": 0.87061286, + "learning_rate": 3.916142178097881e-06, + "loss": 0.88197875, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.73144531, + "step": 4997, + "time_per_iteration": 2.7723019123077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136218, + "balance_loss_mlp": 1.06311882, + "epoch": 0.9615236629472874, + "flos": 497178137088.0, + "grad_norm": 0.03336855538209936, + "language_loss": 0.81832653, + "learning_rate": 3.877322836288888e-06, + "loss": 0.82968867, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.73095703, + "step": 4998, + "time_per_iteration": 2.844299554824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136341, + "balance_loss_mlp": 1.06319392, + "epoch": 0.961716044632551, + "flos": 514006614528.0, + "grad_norm": 0.03899261635106141, + "language_loss": 0.80403006, + "learning_rate": 3.838696106385153e-06, + "loss": 0.81539345, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.73144531, + "step": 4999, + "time_per_iteration": 2.6195151805877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136197, + "balance_loss_mlp": 1.0630976, + "epoch": 0.9619084263178146, + "flos": 502084199424.0, + "grad_norm": 0.03786304088384279, + "language_loss": 0.85582483, + "learning_rate": 3.800262003382904e-06, + "loss": 0.86718684, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.73095703, + "step": 5000, + "time_per_iteration": 2.5949509143829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134465, + "balance_loss_mlp": 1.06122255, + "epoch": 0.9621008080030781, + "flos": 596805510144.0, + "grad_norm": 0.041941865277851494, + "language_loss": 0.80558175, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.81692636, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.73242188, + "step": 5001, + "time_per_iteration": 2.773188829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134714, + "balance_loss_mlp": 1.0614723, + "epoch": 0.9622931896883417, + "flos": 503247770112.0, + "grad_norm": 0.04000138367761118, + "language_loss": 0.87168002, + "learning_rate": 3.723971737693899e-06, + "loss": 0.88302714, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.73242188, + "step": 5002, + "time_per_iteration": 2.6144204139709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134777, + "balance_loss_mlp": 1.06153464, + "epoch": 0.9624855713736052, + "flos": 608449949184.0, + "grad_norm": 0.03656605710173359, + "language_loss": 0.85194814, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.86329585, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.73242188, + "step": 5003, + "time_per_iteration": 2.772636890411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136303, + "balance_loss_mlp": 1.06320393, + "epoch": 0.9626779530588688, + "flos": 511735868928.0, + "grad_norm": 0.044650316551590984, + "language_loss": 0.89575279, + "learning_rate": 3.648452157695936e-06, + "loss": 0.90711582, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.73095703, + "step": 5004, + "time_per_iteration": 2.5866780281066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136273, + "balance_loss_mlp": 1.06322193, + "epoch": 0.9628703347441323, + "flos": 628497893376.0, + "grad_norm": 0.037572642245888015, + "language_loss": 0.87363774, + "learning_rate": 3.610981411526937e-06, + "loss": 0.88500047, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.73046875, + "step": 5005, + "time_per_iteration": 2.814835548400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113604, + "balance_loss_mlp": 1.06294048, + "epoch": 0.9630627164293959, + "flos": 631897281024.0, + "grad_norm": 0.03692802527340189, + "language_loss": 0.82178611, + "learning_rate": 3.573703380666149e-06, + "loss": 0.83314651, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.73095703, + "step": 5006, + "time_per_iteration": 2.7788455486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113652, + "balance_loss_mlp": 1.06346869, + "epoch": 0.9632550981146595, + "flos": 571729979904.0, + "grad_norm": 0.03764323441994214, + "language_loss": 0.82586932, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.83723456, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.73046875, + "step": 5007, + "time_per_iteration": 2.8145768642425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134652, + "balance_loss_mlp": 1.06141019, + "epoch": 0.9634474797999231, + "flos": 467159614464.0, + "grad_norm": 0.03643507504396426, + "language_loss": 0.86381149, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.87515807, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.73242188, + "step": 5008, + "time_per_iteration": 2.641538619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134761, + "balance_loss_mlp": 1.06151867, + "epoch": 0.9636398614851867, + "flos": 527624357376.0, + "grad_norm": 0.03653594954025797, + "language_loss": 0.89453661, + "learning_rate": 3.463025724284974e-06, + "loss": 0.90588421, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.73242188, + "step": 5009, + "time_per_iteration": 2.6100451946258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135987, + "balance_loss_mlp": 1.06284046, + "epoch": 0.9638322431704501, + "flos": 565942325760.0, + "grad_norm": 0.035991126690817755, + "language_loss": 0.79672241, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.80808234, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.73144531, + "step": 5010, + "time_per_iteration": 2.768517255783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136278, + "balance_loss_mlp": 1.06317854, + "epoch": 0.9640246248557137, + "flos": 478740926976.0, + "grad_norm": 0.03726077990698358, + "language_loss": 0.89582598, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.90718877, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.73095703, + "step": 5011, + "time_per_iteration": 2.578130006790161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135669, + "balance_loss_mlp": 1.06252217, + "epoch": 0.9642170065409773, + "flos": 540339041280.0, + "grad_norm": 0.036587267985256175, + "language_loss": 0.92892486, + "learning_rate": 3.354083022201859e-06, + "loss": 0.94028151, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.73144531, + "step": 5012, + "time_per_iteration": 2.626784563064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136225, + "balance_loss_mlp": 1.06317353, + "epoch": 0.9644093882262409, + "flos": 524776192512.0, + "grad_norm": 0.03589608787010189, + "language_loss": 0.88128811, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.89265037, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.73046875, + "step": 5013, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137481, + "balance_loss_mlp": 1.06452537, + "epoch": 0.9646017699115044, + "flos": 575381147136.0, + "grad_norm": 0.036469182684706475, + "language_loss": 0.83875465, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.85012949, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.72998047, + "step": 5014, + "time_per_iteration": 2.6983656883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135531, + "balance_loss_mlp": 1.06238461, + "epoch": 0.964794151596768, + "flos": 637956180480.0, + "grad_norm": 0.040034570453418294, + "language_loss": 0.89572299, + "learning_rate": 3.246875655074588e-06, + "loss": 0.90707827, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.73193359, + "step": 5015, + "time_per_iteration": 2.774064064025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136038, + "balance_loss_mlp": 1.06279588, + "epoch": 0.9649865332820315, + "flos": 618559515648.0, + "grad_norm": 0.038560774155918465, + "language_loss": 0.90913039, + "learning_rate": 3.211525560941675e-06, + "loss": 0.92049074, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.73242188, + "step": 5016, + "time_per_iteration": 2.7157909870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135584, + "balance_loss_mlp": 1.06243753, + "epoch": 0.9651789149672951, + "flos": 517326137856.0, + "grad_norm": 0.03416472134449421, + "language_loss": 0.85285097, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.86420679, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.73193359, + "step": 5017, + "time_per_iteration": 2.729053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136208, + "balance_loss_mlp": 1.06315696, + "epoch": 0.9653712966525587, + "flos": 493921740288.0, + "grad_norm": 0.04119563726090097, + "language_loss": 0.85390657, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.86526859, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.73046875, + "step": 5018, + "time_per_iteration": 2.563650131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113648, + "balance_loss_mlp": 1.06338084, + "epoch": 0.9655636783378222, + "flos": 537656060928.0, + "grad_norm": 0.03021172693995666, + "language_loss": 0.85570192, + "learning_rate": 3.106632555409328e-06, + "loss": 0.86706674, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.73095703, + "step": 5019, + "time_per_iteration": 2.7251713275909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136099, + "balance_loss_mlp": 1.06290472, + "epoch": 0.9657560600230858, + "flos": 459958611456.0, + "grad_norm": 0.03436013437508305, + "language_loss": 0.86592716, + "learning_rate": 3.072054024435167e-06, + "loss": 0.87728816, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.73193359, + "step": 5020, + "time_per_iteration": 2.6252498626708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136711, + "balance_loss_mlp": 1.06356394, + "epoch": 0.9659484417083494, + "flos": 687388832256.0, + "grad_norm": 0.043622735099904504, + "language_loss": 0.88656896, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.89793605, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.73144531, + "step": 5021, + "time_per_iteration": 2.8548264503479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140694, + "balance_loss_mlp": 1.06916809, + "epoch": 0.966140823393613, + "flos": 1505456326656.0, + "grad_norm": 0.004755883898104752, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81834936, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.71679688, + "step": 5022, + "time_per_iteration": 4.785803556442261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011346, + "balance_loss_mlp": 1.06135833, + "epoch": 0.9663332050788765, + "flos": 465859057152.0, + "grad_norm": 0.04060247816118618, + "language_loss": 0.85319602, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.86454201, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.73242188, + "step": 5023, + "time_per_iteration": 2.615492820739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134536, + "balance_loss_mlp": 1.06129432, + "epoch": 0.96652558676414, + "flos": 501878082048.0, + "grad_norm": 0.036856046559520406, + "language_loss": 0.90339649, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.91474187, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.73242188, + "step": 5024, + "time_per_iteration": 2.659637451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134714, + "balance_loss_mlp": 1.06147206, + "epoch": 0.9667179684494036, + "flos": 425743703040.0, + "grad_norm": 0.04260558113745741, + "language_loss": 0.88175714, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.89310426, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.73242188, + "step": 5025, + "time_per_iteration": 2.48905611038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134596, + "balance_loss_mlp": 1.06149662, + "epoch": 0.9669103501346672, + "flos": 518009617920.0, + "grad_norm": 0.03460776123355322, + "language_loss": 0.90789652, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.91924238, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.73144531, + "step": 5026, + "time_per_iteration": 2.6764590740203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134794, + "balance_loss_mlp": 1.06155145, + "epoch": 0.9671027318199308, + "flos": 457175574528.0, + "grad_norm": 0.04902786366657777, + "language_loss": 0.82283497, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.83418286, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.73242188, + "step": 5027, + "time_per_iteration": 2.595550537109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134603, + "balance_loss_mlp": 1.06136048, + "epoch": 0.9672951135051943, + "flos": 526061286912.0, + "grad_norm": 0.03802823500081439, + "language_loss": 0.84784377, + "learning_rate": 2.802372171957057e-06, + "loss": 0.85918975, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.73242188, + "step": 5028, + "time_per_iteration": 2.674833059310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136455, + "balance_loss_mlp": 1.06335628, + "epoch": 0.9674874951904578, + "flos": 575101169664.0, + "grad_norm": 0.03757979852199149, + "language_loss": 0.84332544, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.85469002, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.73095703, + "step": 5029, + "time_per_iteration": 2.787973403930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136141, + "balance_loss_mlp": 1.06289899, + "epoch": 0.9676798768757214, + "flos": 630423533568.0, + "grad_norm": 0.03236731472285776, + "language_loss": 0.83900696, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.85036838, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.73242188, + "step": 5030, + "time_per_iteration": 2.92444109916687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140633, + "balance_loss_mlp": 1.06910706, + "epoch": 0.967872258560985, + "flos": 1467114889728.0, + "grad_norm": 0.004700971558181271, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.7670399, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.71679688, + "step": 5031, + "time_per_iteration": 4.658704519271851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136242, + "balance_loss_mlp": 1.0631907, + "epoch": 0.9680646402462486, + "flos": 566567408640.0, + "grad_norm": 0.045787284444390154, + "language_loss": 0.85227001, + "learning_rate": 2.672163531181049e-06, + "loss": 0.86363238, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.73046875, + "step": 5032, + "time_per_iteration": 2.662707805633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137848, + "balance_loss_mlp": 1.06632233, + "epoch": 0.9682570219315121, + "flos": 1437647589888.0, + "grad_norm": 0.0038661012253674927, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79212654, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.71679688, + "step": 5033, + "time_per_iteration": 4.825839519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134757, + "balance_loss_mlp": 1.06156242, + "epoch": 0.9684494036167757, + "flos": 585703561728.0, + "grad_norm": 0.037836121912765926, + "language_loss": 0.86821753, + "learning_rate": 2.608217639166688e-06, + "loss": 0.87956512, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.73193359, + "step": 5034, + "time_per_iteration": 2.733405351638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134865, + "balance_loss_mlp": 1.0616231, + "epoch": 0.9686417853020393, + "flos": 560189600256.0, + "grad_norm": 0.033762716228182665, + "language_loss": 0.88299072, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.89433932, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.73242188, + "step": 5035, + "time_per_iteration": 2.694063186645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113452, + "balance_loss_mlp": 1.06132543, + "epoch": 0.9688341669873028, + "flos": 786262867968.0, + "grad_norm": 0.040583945106096336, + "language_loss": 0.88091248, + "learning_rate": 2.545044165539745e-06, + "loss": 0.89225769, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.73193359, + "step": 5036, + "time_per_iteration": 2.9456684589385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_mlp": 1.06168199, + "epoch": 0.9690265486725663, + "flos": 396769228800.0, + "grad_norm": 0.038331219578498374, + "language_loss": 0.8455385, + "learning_rate": 2.513747116326126e-06, + "loss": 0.8568877, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.73242188, + "step": 5037, + "time_per_iteration": 2.523125648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134975, + "balance_loss_mlp": 1.06173313, + "epoch": 0.9692189303578299, + "flos": 477416901120.0, + "grad_norm": 0.041475216157481225, + "language_loss": 0.82368696, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.83503664, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.73242188, + "step": 5038, + "time_per_iteration": 2.7746524810791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134895, + "balance_loss_mlp": 1.06170058, + "epoch": 0.9694113120430935, + "flos": 598687489536.0, + "grad_norm": 0.040629799686120044, + "language_loss": 0.83335608, + "learning_rate": 2.451732453851385e-06, + "loss": 0.84470499, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.73193359, + "step": 5039, + "time_per_iteration": 4.120795726776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113648, + "balance_loss_mlp": 1.06338096, + "epoch": 0.9696036937283571, + "flos": 501897547776.0, + "grad_norm": 0.033826903503827166, + "language_loss": 0.86580127, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.87716603, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.73095703, + "step": 5040, + "time_per_iteration": 2.607876777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135933, + "balance_loss_mlp": 1.06278634, + "epoch": 0.9697960754136207, + "flos": 433189754880.0, + "grad_norm": 0.04362735320956941, + "language_loss": 0.92283428, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.93419361, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.73144531, + "step": 5041, + "time_per_iteration": 2.4580559730529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136172, + "balance_loss_mlp": 1.06307268, + "epoch": 0.9699884570988841, + "flos": 569674083840.0, + "grad_norm": 0.03235624014830649, + "language_loss": 0.89051294, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.90187466, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.73095703, + "step": 5042, + "time_per_iteration": 2.713972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136359, + "balance_loss_mlp": 1.06321263, + "epoch": 0.9701808387841477, + "flos": 517236814848.0, + "grad_norm": 0.03727061706685101, + "language_loss": 0.85871363, + "learning_rate": 2.33002120820458e-06, + "loss": 0.87007725, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.73144531, + "step": 5043, + "time_per_iteration": 2.6875967979431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113646, + "balance_loss_mlp": 1.06326568, + "epoch": 0.9703732204694113, + "flos": 492497657856.0, + "grad_norm": 0.03840937503625704, + "language_loss": 0.80693823, + "learning_rate": 2.300076399000206e-06, + "loss": 0.81830281, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.73193359, + "step": 5044, + "time_per_iteration": 2.5949554443359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113635, + "balance_loss_mlp": 1.06320333, + "epoch": 0.9705656021546749, + "flos": 627279928320.0, + "grad_norm": 0.03683083642331674, + "language_loss": 0.85812724, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.8694908, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.73144531, + "step": 5045, + "time_per_iteration": 2.8123650550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136233, + "balance_loss_mlp": 1.06308591, + "epoch": 0.9707579838399384, + "flos": 472393317888.0, + "grad_norm": 0.03632831837945052, + "language_loss": 0.87609589, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.88745821, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.73144531, + "step": 5046, + "time_per_iteration": 2.5618367195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113635, + "balance_loss_mlp": 1.06315589, + "epoch": 0.970950365525202, + "flos": 493138203648.0, + "grad_norm": 0.038642032843630054, + "language_loss": 0.85051489, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.8618784, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.73193359, + "step": 5047, + "time_per_iteration": 4.11439061164856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136071, + "balance_loss_mlp": 1.06301963, + "epoch": 0.9711427472104656, + "flos": 558376751616.0, + "grad_norm": 0.04056698166765332, + "language_loss": 0.85194492, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.86330569, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.73046875, + "step": 5048, + "time_per_iteration": 2.6787452697753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134824, + "balance_loss_mlp": 1.06158209, + "epoch": 0.9713351288957291, + "flos": 627100007424.0, + "grad_norm": 0.030987251047231726, + "language_loss": 0.87520432, + "learning_rate": 2.153250946564489e-06, + "loss": 0.88655257, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.73242188, + "step": 5049, + "time_per_iteration": 2.9055373668670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134922, + "balance_loss_mlp": 1.0616796, + "epoch": 0.9715275105809927, + "flos": 500082697728.0, + "grad_norm": 0.03604755550471877, + "language_loss": 0.86542779, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.87677705, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.73242188, + "step": 5050, + "time_per_iteration": 2.7245774269104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113493, + "balance_loss_mlp": 1.06168818, + "epoch": 0.9717198922662562, + "flos": 478480415232.0, + "grad_norm": 0.03989506366730262, + "language_loss": 0.82222277, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.83357209, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.73242188, + "step": 5051, + "time_per_iteration": 2.549938201904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134999, + "balance_loss_mlp": 1.06180418, + "epoch": 0.9719122739515198, + "flos": 554549666304.0, + "grad_norm": 0.03271132462984947, + "language_loss": 0.82110488, + "learning_rate": 2.067474959040916e-06, + "loss": 0.83245492, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.73193359, + "step": 5052, + "time_per_iteration": 2.7398674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.06178653, + "epoch": 0.9721046556367834, + "flos": 566929978368.0, + "grad_norm": 0.03652890903263657, + "language_loss": 0.85459185, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.86594218, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.73242188, + "step": 5053, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135966, + "balance_loss_mlp": 1.06291485, + "epoch": 0.972297037322047, + "flos": 561400834560.0, + "grad_norm": 0.04122701334842068, + "language_loss": 0.8283239, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.83968359, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.73046875, + "step": 5054, + "time_per_iteration": 2.773737907409668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136236, + "balance_loss_mlp": 1.06304181, + "epoch": 0.9724894190073105, + "flos": 513503055360.0, + "grad_norm": 0.04021059743942707, + "language_loss": 0.8332113, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.84457362, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.73193359, + "step": 5055, + "time_per_iteration": 2.712599992752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136328, + "balance_loss_mlp": 1.06313324, + "epoch": 0.972681800692574, + "flos": 615038604288.0, + "grad_norm": 0.04232559781751974, + "language_loss": 0.85386884, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.86523211, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.73193359, + "step": 5056, + "time_per_iteration": 2.8266754150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.06325388, + "epoch": 0.9728741823778376, + "flos": 835313484288.0, + "grad_norm": 0.03448022317319212, + "language_loss": 0.87796867, + "learning_rate": 1.92838141509849e-06, + "loss": 0.88933218, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.73095703, + "step": 5057, + "time_per_iteration": 3.078075885772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136379, + "balance_loss_mlp": 1.06323254, + "epoch": 0.9730665640631012, + "flos": 572587376640.0, + "grad_norm": 0.03571508034746827, + "language_loss": 0.89210469, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.90346849, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.73144531, + "step": 5058, + "time_per_iteration": 2.743687391281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136509, + "balance_loss_mlp": 1.06345737, + "epoch": 0.9732589457483648, + "flos": 507520017408.0, + "grad_norm": 0.03560266740855486, + "language_loss": 0.82347834, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.83484346, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.73046875, + "step": 5059, + "time_per_iteration": 2.603219985961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113618, + "balance_loss_mlp": 1.06308138, + "epoch": 0.9734513274336283, + "flos": 928482458112.0, + "grad_norm": 0.03831156338681025, + "language_loss": 0.84692299, + "learning_rate": 1.84724562509897e-06, + "loss": 0.85828483, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.73095703, + "step": 5060, + "time_per_iteration": 3.1661386489868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134848, + "balance_loss_mlp": 1.06165326, + "epoch": 0.9736437091188919, + "flos": 492925355520.0, + "grad_norm": 0.03299060222462335, + "language_loss": 0.81984901, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.8311975, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.73193359, + "step": 5061, + "time_per_iteration": 2.7532899379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134743, + "balance_loss_mlp": 1.06150103, + "epoch": 0.9738360908041555, + "flos": 614454454272.0, + "grad_norm": 0.044137149894814875, + "language_loss": 0.88850021, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.89984763, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.73242188, + "step": 5062, + "time_per_iteration": 2.7332098484039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138641, + "balance_loss_mlp": 1.06692505, + "epoch": 0.974028472489419, + "flos": 1552731024384.0, + "grad_norm": 0.003870058232261716, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77130735, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.71875, + "step": 5063, + "time_per_iteration": 4.949795484542847 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138702, + "balance_loss_mlp": 1.06698608, + "epoch": 0.9742208541746825, + "flos": 1414178064384.0, + "grad_norm": 0.0038928950863822815, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80816418, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.71875, + "step": 5064, + "time_per_iteration": 4.926048994064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134852, + "balance_loss_mlp": 1.06160998, + "epoch": 0.9744132358599461, + "flos": 676098230784.0, + "grad_norm": 0.030093067662967588, + "language_loss": 0.80718327, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.81853181, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.73242188, + "step": 5065, + "time_per_iteration": 2.866382360458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135156, + "balance_loss_mlp": 1.06196105, + "epoch": 0.9746056175452097, + "flos": 599597279232.0, + "grad_norm": 0.03459907750020676, + "language_loss": 0.82514048, + "learning_rate": 1.690196122544896e-06, + "loss": 0.836492, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.73193359, + "step": 5066, + "time_per_iteration": 2.8023762702941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135035, + "balance_loss_mlp": 1.06179249, + "epoch": 0.9747979992304733, + "flos": 733532886528.0, + "grad_norm": 0.03471604647902471, + "language_loss": 0.86751151, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.8788619, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.73242188, + "step": 5067, + "time_per_iteration": 3.010525941848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134934, + "balance_loss_mlp": 1.06174004, + "epoch": 0.9749903809157369, + "flos": 617619526656.0, + "grad_norm": 0.04453093202467409, + "language_loss": 0.81295151, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.82430089, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.73193359, + "step": 5068, + "time_per_iteration": 2.7091329097747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134722, + "balance_loss_mlp": 1.06148005, + "epoch": 0.9751827626010003, + "flos": 469349769216.0, + "grad_norm": 0.03581121919344097, + "language_loss": 0.88265562, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.89400285, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.73242188, + "step": 5069, + "time_per_iteration": 2.6038756370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134695, + "balance_loss_mlp": 1.06145287, + "epoch": 0.9753751442862639, + "flos": 600407012352.0, + "grad_norm": 0.04136761381890335, + "language_loss": 0.91069138, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.92203832, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.73242188, + "step": 5070, + "time_per_iteration": 2.826425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136024, + "balance_loss_mlp": 1.06292439, + "epoch": 0.9755675259715275, + "flos": 652090944000.0, + "grad_norm": 0.03089674785401136, + "language_loss": 0.86145902, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.87281919, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.73095703, + "step": 5071, + "time_per_iteration": 2.940932512283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011359, + "balance_loss_mlp": 1.06275284, + "epoch": 0.9757599076567911, + "flos": 564724360704.0, + "grad_norm": 0.04003681230801716, + "language_loss": 0.83221871, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.84357774, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.73144531, + "step": 5072, + "time_per_iteration": 2.683784246444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135923, + "balance_loss_mlp": 1.06287193, + "epoch": 0.9759522893420547, + "flos": 505648771584.0, + "grad_norm": 0.03649073694406785, + "language_loss": 0.85017049, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.86152965, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.73046875, + "step": 5073, + "time_per_iteration": 2.621758222579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135904, + "balance_loss_mlp": 1.06280482, + "epoch": 0.9761446710273182, + "flos": 584837432832.0, + "grad_norm": 0.04240200515467586, + "language_loss": 0.86889368, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.88025272, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.73095703, + "step": 5074, + "time_per_iteration": 2.754220485687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135867, + "balance_loss_mlp": 1.06276762, + "epoch": 0.9763370527125818, + "flos": 483171628032.0, + "grad_norm": 0.041938466654606696, + "language_loss": 0.87228501, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.88364369, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.73095703, + "step": 5075, + "time_per_iteration": 2.660871982574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135876, + "balance_loss_mlp": 1.06272912, + "epoch": 0.9765294343978453, + "flos": 620113853952.0, + "grad_norm": 0.034349586662843025, + "language_loss": 0.82321155, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.83457041, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.73144531, + "step": 5076, + "time_per_iteration": 2.7837605476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136031, + "balance_loss_mlp": 1.06293166, + "epoch": 0.9767218160831089, + "flos": 527587427328.0, + "grad_norm": 0.03936217857713211, + "language_loss": 0.89625615, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.9076165, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.73095703, + "step": 5077, + "time_per_iteration": 2.5941243171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135825, + "balance_loss_mlp": 1.06263041, + "epoch": 0.9769141977683724, + "flos": 526245937152.0, + "grad_norm": 0.034114352455168806, + "language_loss": 0.88253415, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.89389241, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.73193359, + "step": 5078, + "time_per_iteration": 2.6603527069091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135907, + "balance_loss_mlp": 1.06276, + "epoch": 0.977106579453636, + "flos": 458643317760.0, + "grad_norm": 0.03736684310262229, + "language_loss": 0.84752488, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.85888398, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.73144531, + "step": 5079, + "time_per_iteration": 2.8190555572509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113459, + "balance_loss_mlp": 1.06134772, + "epoch": 0.9772989611388996, + "flos": 533134035456.0, + "grad_norm": 0.03786927366079968, + "language_loss": 0.86551404, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.87685996, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.73242188, + "step": 5080, + "time_per_iteration": 2.6154069900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134748, + "balance_loss_mlp": 1.06150591, + "epoch": 0.9774913428241632, + "flos": 756754635264.0, + "grad_norm": 0.037853043258092404, + "language_loss": 0.8976739, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.90902144, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.73242188, + "step": 5081, + "time_per_iteration": 3.048454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138092, + "balance_loss_mlp": 1.06637573, + "epoch": 0.9776837245094268, + "flos": 1557668012544.0, + "grad_norm": 0.00376334878312987, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.80033588, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.71875, + "step": 5082, + "time_per_iteration": 5.043825149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134654, + "balance_loss_mlp": 1.06141222, + "epoch": 0.9778761061946902, + "flos": 593633707008.0, + "grad_norm": 0.04337083767470995, + "language_loss": 0.89383692, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.90518343, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.73242188, + "step": 5083, + "time_per_iteration": 2.7039098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135781, + "balance_loss_mlp": 1.06263411, + "epoch": 0.9780684878799538, + "flos": 415831521792.0, + "grad_norm": 0.03593395529924556, + "language_loss": 0.86301732, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.8743751, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.73144531, + "step": 5084, + "time_per_iteration": 4.800846815109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136122, + "balance_loss_mlp": 1.06302321, + "epoch": 0.9782608695652174, + "flos": 569543827968.0, + "grad_norm": 0.03668547357374544, + "language_loss": 0.89074433, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.90210557, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.73095703, + "step": 5085, + "time_per_iteration": 2.7352962493896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135976, + "balance_loss_mlp": 1.06287682, + "epoch": 0.978453251250481, + "flos": 691761136128.0, + "grad_norm": 0.03913427215526849, + "language_loss": 0.87911779, + "learning_rate": 1.217009190543239e-06, + "loss": 0.89047754, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.73095703, + "step": 5086, + "time_per_iteration": 2.8892364501953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135825, + "balance_loss_mlp": 1.06263065, + "epoch": 0.9786456329357445, + "flos": 503571408384.0, + "grad_norm": 0.034620175401031496, + "language_loss": 0.81605828, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.82741642, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.73193359, + "step": 5087, + "time_per_iteration": 2.67069149017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134413, + "balance_loss_mlp": 1.06117117, + "epoch": 0.9788380146210081, + "flos": 864604866048.0, + "grad_norm": 0.039272428340046274, + "language_loss": 0.85826278, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.86960691, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.73242188, + "step": 5088, + "time_per_iteration": 3.05206561088562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134797, + "balance_loss_mlp": 1.06155455, + "epoch": 0.9790303963062716, + "flos": 513746102784.0, + "grad_norm": 0.034545752771366, + "language_loss": 0.88846779, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.8998158, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.73242188, + "step": 5089, + "time_per_iteration": 2.6102468967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134619, + "balance_loss_mlp": 1.06132865, + "epoch": 0.9792227779915352, + "flos": 495410950656.0, + "grad_norm": 0.042612868246076144, + "language_loss": 0.91103876, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.92238486, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.73291016, + "step": 5090, + "time_per_iteration": 2.5904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.06275654, + "epoch": 0.9794151596767988, + "flos": 609483264000.0, + "grad_norm": 0.038327834107812486, + "language_loss": 0.86390072, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.87525976, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.73144531, + "step": 5091, + "time_per_iteration": 2.7942652702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135987, + "balance_loss_mlp": 1.06284022, + "epoch": 0.9796075413620623, + "flos": 479196822528.0, + "grad_norm": 0.04242679412713505, + "language_loss": 0.91551888, + "learning_rate": 1.09015417612357e-06, + "loss": 0.92687881, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.73144531, + "step": 5092, + "time_per_iteration": 2.6029610633850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113605, + "balance_loss_mlp": 1.06285572, + "epoch": 0.9797999230473259, + "flos": 593362461696.0, + "grad_norm": 0.038287668132117786, + "language_loss": 0.88482207, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.8961826, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.73193359, + "step": 5093, + "time_per_iteration": 2.7648696899414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134535, + "balance_loss_mlp": 1.06129241, + "epoch": 0.9799923047325895, + "flos": 557563015680.0, + "grad_norm": 0.03420994841763029, + "language_loss": 0.86238348, + "learning_rate": 1.049418636655919e-06, + "loss": 0.87372881, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.73242188, + "step": 5094, + "time_per_iteration": 2.912834644317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136088, + "balance_loss_mlp": 1.06284571, + "epoch": 0.9801846864178531, + "flos": 580628312064.0, + "grad_norm": 0.03371993676263859, + "language_loss": 0.89129627, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.90265721, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.73242188, + "step": 5095, + "time_per_iteration": 2.773477792739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134792, + "balance_loss_mlp": 1.06154943, + "epoch": 0.9803770681031165, + "flos": 516210230784.0, + "grad_norm": 0.034566414625280935, + "language_loss": 0.83682495, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.8481729, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.73242188, + "step": 5096, + "time_per_iteration": 2.712693691253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.06315458, + "epoch": 0.9805694497883801, + "flos": 568119745536.0, + "grad_norm": 0.03425876820589903, + "language_loss": 0.82894945, + "learning_rate": 9.897681702160654e-07, + "loss": 0.840312, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.73095703, + "step": 5097, + "time_per_iteration": 2.737246036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.06241155, + "epoch": 0.9807618314736437, + "flos": 480332195328.0, + "grad_norm": 0.04046674037063813, + "language_loss": 0.78180015, + "learning_rate": 9.702721370922208e-07, + "loss": 0.79315621, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.73193359, + "step": 5098, + "time_per_iteration": 2.652815341949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135541, + "balance_loss_mlp": 1.0623461, + "epoch": 0.9809542131589073, + "flos": 546341544960.0, + "grad_norm": 0.04086563357176875, + "language_loss": 0.85544622, + "learning_rate": 9.509698444908344e-07, + "loss": 0.86680162, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.73193359, + "step": 5099, + "time_per_iteration": 2.6499040126800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134438, + "balance_loss_mlp": 1.06119621, + "epoch": 0.9811465948441709, + "flos": 521862899712.0, + "grad_norm": 0.04248805685521767, + "language_loss": 0.85820013, + "learning_rate": 9.318612999057452e-07, + "loss": 0.86954451, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.73242188, + "step": 5100, + "time_per_iteration": 2.6109817028045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134571, + "balance_loss_mlp": 1.06132865, + "epoch": 0.9813389765294344, + "flos": 542321077248.0, + "grad_norm": 0.03689155006089091, + "language_loss": 0.84802127, + "learning_rate": 9.129465107554635e-07, + "loss": 0.85936701, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.73242188, + "step": 5101, + "time_per_iteration": 2.646704912185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134579, + "balance_loss_mlp": 1.06133687, + "epoch": 0.981531358214698, + "flos": 568464850944.0, + "grad_norm": 0.03755425810198059, + "language_loss": 0.88694, + "learning_rate": 8.942254843834485e-07, + "loss": 0.89828575, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.73242188, + "step": 5102, + "time_per_iteration": 2.7322897911071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136296, + "balance_loss_mlp": 1.06314886, + "epoch": 0.9817237398999615, + "flos": 578413962240.0, + "grad_norm": 0.03455798640068261, + "language_loss": 0.85217297, + "learning_rate": 8.756982280578307e-07, + "loss": 0.86353588, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.73144531, + "step": 5103, + "time_per_iteration": 2.751131057739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136072, + "balance_loss_mlp": 1.06282985, + "epoch": 0.9819161215852251, + "flos": 702854352384.0, + "grad_norm": 0.03555623235695427, + "language_loss": 0.85993326, + "learning_rate": 8.573647489714676e-07, + "loss": 0.87129396, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.73242188, + "step": 5104, + "time_per_iteration": 2.951957941055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135655, + "balance_loss_mlp": 1.0624609, + "epoch": 0.9821085032704886, + "flos": 625452343296.0, + "grad_norm": 0.03465418860850988, + "language_loss": 0.88711596, + "learning_rate": 8.392250542421653e-07, + "loss": 0.89847255, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.73193359, + "step": 5105, + "time_per_iteration": 2.886805772781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136322, + "balance_loss_mlp": 1.06327093, + "epoch": 0.9823008849557522, + "flos": 500492931072.0, + "grad_norm": 0.03689529509653958, + "language_loss": 0.86079448, + "learning_rate": 8.212791509122353e-07, + "loss": 0.87215769, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.73046875, + "step": 5106, + "time_per_iteration": 2.687134265899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134787, + "balance_loss_mlp": 1.06154442, + "epoch": 0.9824932666410158, + "flos": 524904446976.0, + "grad_norm": 0.040173053897464624, + "language_loss": 0.78432387, + "learning_rate": 8.035270459489929e-07, + "loss": 0.79567176, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.73242188, + "step": 5107, + "time_per_iteration": 2.6810905933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.06178021, + "epoch": 0.9826856483262794, + "flos": 503675467776.0, + "grad_norm": 0.03566590525509119, + "language_loss": 0.87364811, + "learning_rate": 7.859687462443698e-07, + "loss": 0.88499832, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.73242188, + "step": 5108, + "time_per_iteration": 2.653001546859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134775, + "balance_loss_mlp": 1.06153297, + "epoch": 0.982878030011543, + "flos": 563213683200.0, + "grad_norm": 0.04574005448539413, + "language_loss": 0.88620985, + "learning_rate": 7.686042586151354e-07, + "loss": 0.89755762, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.73242188, + "step": 5109, + "time_per_iteration": 2.8465735912323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136369, + "balance_loss_mlp": 1.06331754, + "epoch": 0.9830704116968064, + "flos": 538214014464.0, + "grad_norm": 0.034798278837774685, + "language_loss": 0.8696683, + "learning_rate": 7.514335898027857e-07, + "loss": 0.88103199, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.73046875, + "step": 5110, + "time_per_iteration": 2.779977321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113608, + "balance_loss_mlp": 1.06298041, + "epoch": 0.98326279338207, + "flos": 459902215680.0, + "grad_norm": 0.03838898388533907, + "language_loss": 0.88750166, + "learning_rate": 7.344567464735441e-07, + "loss": 0.89886248, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.73095703, + "step": 5111, + "time_per_iteration": 2.5905652046203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136156, + "balance_loss_mlp": 1.06310439, + "epoch": 0.9834551750673336, + "flos": 642189496320.0, + "grad_norm": 0.03516170903549916, + "language_loss": 0.83847117, + "learning_rate": 7.17673735218416e-07, + "loss": 0.84983265, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.73046875, + "step": 5112, + "time_per_iteration": 2.8230271339416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135233, + "balance_loss_mlp": 1.06199098, + "epoch": 0.9836475567525972, + "flos": 1073548211712.0, + "grad_norm": 0.03562811843552658, + "language_loss": 0.83895671, + "learning_rate": 7.010845625530782e-07, + "loss": 0.85030913, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.73242188, + "step": 5113, + "time_per_iteration": 3.4172170162200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134582, + "balance_loss_mlp": 1.0613873, + "epoch": 0.9838399384378607, + "flos": 566278699008.0, + "grad_norm": 0.043401730302991125, + "language_loss": 0.81372494, + "learning_rate": 6.846892349181566e-07, + "loss": 0.82507074, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.73193359, + "step": 5114, + "time_per_iteration": 2.6795566082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134656, + "balance_loss_mlp": 1.061414, + "epoch": 0.9840323201231242, + "flos": 774179997696.0, + "grad_norm": 0.042339759208220466, + "language_loss": 0.85027516, + "learning_rate": 6.684877586787819e-07, + "loss": 0.86162174, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.73242188, + "step": 5115, + "time_per_iteration": 3.0095579624176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136423, + "balance_loss_mlp": 1.06322873, + "epoch": 0.9842247018083878, + "flos": 473248713216.0, + "grad_norm": 0.0363602378953053, + "language_loss": 0.89681566, + "learning_rate": 6.524801401249225e-07, + "loss": 0.90817988, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.73193359, + "step": 5116, + "time_per_iteration": 2.5631868839263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136374, + "balance_loss_mlp": 1.06332254, + "epoch": 0.9844170834936514, + "flos": 526311065088.0, + "grad_norm": 0.035086314947572486, + "language_loss": 0.8950007, + "learning_rate": 6.366663854713295e-07, + "loss": 0.90636444, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.73046875, + "step": 5117, + "time_per_iteration": 2.6976704597473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139755, + "balance_loss_mlp": 1.06803894, + "epoch": 0.984609465178915, + "flos": 1570623742464.0, + "grad_norm": 0.005251722325346967, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78302276, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.71875, + "step": 5118, + "time_per_iteration": 4.95673942565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134864, + "balance_loss_mlp": 1.06166935, + "epoch": 0.9848018468641785, + "flos": 520569073152.0, + "grad_norm": 0.04534599796839803, + "language_loss": 0.8812722, + "learning_rate": 6.056204923473584e-07, + "loss": 0.8926208, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.73193359, + "step": 5119, + "time_per_iteration": 2.6061763763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134973, + "balance_loss_mlp": 1.06173038, + "epoch": 0.9849942285494421, + "flos": 493986868224.0, + "grad_norm": 0.034301666318635994, + "language_loss": 0.87063777, + "learning_rate": 5.903883659301167e-07, + "loss": 0.88198745, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.73242188, + "step": 5120, + "time_per_iteration": 2.6077840328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134811, + "balance_loss_mlp": 1.06161654, + "epoch": 0.9851866102347057, + "flos": 547049220096.0, + "grad_norm": 0.03687618838408007, + "language_loss": 0.85899603, + "learning_rate": 5.753501275193029e-07, + "loss": 0.87034416, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.73193359, + "step": 5121, + "time_per_iteration": 2.6531834602355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113473, + "balance_loss_mlp": 1.06148791, + "epoch": 0.9853789919199692, + "flos": 477214786560.0, + "grad_norm": 0.04121503477449517, + "language_loss": 0.85198522, + "learning_rate": 5.605057829531912e-07, + "loss": 0.86333251, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.73242188, + "step": 5122, + "time_per_iteration": 2.5439565181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134707, + "balance_loss_mlp": 1.06146467, + "epoch": 0.9855713736052328, + "flos": 1034307718656.0, + "grad_norm": 0.03796282782398555, + "language_loss": 0.80304152, + "learning_rate": 5.458553379950049e-07, + "loss": 0.81438863, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.73242188, + "step": 5123, + "time_per_iteration": 3.3912107944488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.06169021, + "epoch": 0.9857637552904963, + "flos": 496079694336.0, + "grad_norm": 0.0481766672977676, + "language_loss": 0.8670826, + "learning_rate": 5.31398798332472e-07, + "loss": 0.87843192, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.73242188, + "step": 5124, + "time_per_iteration": 2.6348800659179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136248, + "balance_loss_mlp": 1.06314898, + "epoch": 0.9859561369757599, + "flos": 593381927424.0, + "grad_norm": 0.042122648622967405, + "language_loss": 0.89123881, + "learning_rate": 5.17136169578103e-07, + "loss": 0.9026013, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.73095703, + "step": 5125, + "time_per_iteration": 2.7288503646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136176, + "balance_loss_mlp": 1.06298196, + "epoch": 0.9861485186610235, + "flos": 487982363136.0, + "grad_norm": 0.0358846591177453, + "language_loss": 0.83094305, + "learning_rate": 5.030674572691907e-07, + "loss": 0.84230483, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.73193359, + "step": 5126, + "time_per_iteration": 2.660942792892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113627, + "balance_loss_mlp": 1.06317127, + "epoch": 0.9863409003462871, + "flos": 519833200128.0, + "grad_norm": 0.030624136680643108, + "language_loss": 0.86946189, + "learning_rate": 4.891926668676994e-07, + "loss": 0.88082457, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.73095703, + "step": 5127, + "time_per_iteration": 2.7073521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139725, + "balance_loss_mlp": 1.06800842, + "epoch": 0.9865332820315506, + "flos": 1489294591488.0, + "grad_norm": 0.005262688675018299, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80322456, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.71875, + "step": 5128, + "time_per_iteration": 4.899366617202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113637, + "balance_loss_mlp": 1.06327081, + "epoch": 0.9867256637168141, + "flos": 583217966592.0, + "grad_norm": 0.03678420177070357, + "language_loss": 0.83516836, + "learning_rate": 4.620248732582488e-07, + "loss": 0.84653205, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.73095703, + "step": 5129, + "time_per_iteration": 2.7090418338775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135775, + "balance_loss_mlp": 1.06272316, + "epoch": 0.9869180454020777, + "flos": 960926177280.0, + "grad_norm": 0.03558291852194016, + "language_loss": 0.904948, + "learning_rate": 4.487318805977969e-07, + "loss": 0.91630578, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.73046875, + "step": 5130, + "time_per_iteration": 3.30485463142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134312, + "balance_loss_mlp": 1.06107008, + "epoch": 0.9871104270873413, + "flos": 772113368064.0, + "grad_norm": 0.03765358627123921, + "language_loss": 0.87391722, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8852604, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.73242188, + "step": 5131, + "time_per_iteration": 2.9843320846557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134246, + "balance_loss_mlp": 1.06100392, + "epoch": 0.9873028087726049, + "flos": 447365451264.0, + "grad_norm": 0.043947923730938386, + "language_loss": 0.84125459, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.852597, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.73242188, + "step": 5132, + "time_per_iteration": 2.4963319301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135167, + "balance_loss_mlp": 1.06192493, + "epoch": 0.9874951904578684, + "flos": 508627192320.0, + "grad_norm": 0.035291470132473204, + "language_loss": 0.90447533, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.91582704, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.73242188, + "step": 5133, + "time_per_iteration": 2.5896787643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134492, + "balance_loss_mlp": 1.06124949, + "epoch": 0.987687572143132, + "flos": 718037167104.0, + "grad_norm": 0.034260144513400544, + "language_loss": 0.86916608, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.88051105, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.73242188, + "step": 5134, + "time_per_iteration": 2.9246342182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138206, + "balance_loss_mlp": 1.06629944, + "epoch": 0.9878799538283956, + "flos": 1541957443584.0, + "grad_norm": 0.003697455186378142, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80956161, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.72070312, + "step": 5135, + "time_per_iteration": 4.907610654830933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134617, + "balance_loss_mlp": 1.06142259, + "epoch": 0.9880723355136591, + "flos": 722737112064.0, + "grad_norm": 0.03189445878324839, + "language_loss": 0.85751259, + "learning_rate": 3.730469030412964e-07, + "loss": 0.86885875, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.73193359, + "step": 5136, + "time_per_iteration": 2.918485164642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135851, + "balance_loss_mlp": 1.06279981, + "epoch": 0.9882647171989226, + "flos": 558413681664.0, + "grad_norm": 0.032326338581805884, + "language_loss": 0.88415384, + "learning_rate": 3.611116155572969e-07, + "loss": 0.89551234, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.73046875, + "step": 5137, + "time_per_iteration": 2.6782608032226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136054, + "balance_loss_mlp": 1.06290746, + "epoch": 0.9884570988841862, + "flos": 563940824064.0, + "grad_norm": 0.041268271106656235, + "language_loss": 0.85345703, + "learning_rate": 3.493703050927999e-07, + "loss": 0.86481762, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.73144531, + "step": 5138, + "time_per_iteration": 2.737701416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113618, + "balance_loss_mlp": 1.06303346, + "epoch": 0.9886494805694498, + "flos": 432668731392.0, + "grad_norm": 0.04045018787743159, + "language_loss": 0.91157293, + "learning_rate": 3.378229762062146e-07, + "loss": 0.92293483, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.73144531, + "step": 5139, + "time_per_iteration": 2.5153446197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136158, + "balance_loss_mlp": 1.06310701, + "epoch": 0.9888418622547134, + "flos": 593240937984.0, + "grad_norm": 0.0339250061411206, + "language_loss": 0.94499457, + "learning_rate": 3.264696333806771e-07, + "loss": 0.95635617, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.73046875, + "step": 5140, + "time_per_iteration": 2.8330492973327637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136272, + "balance_loss_mlp": 1.06322026, + "epoch": 0.989034243939977, + "flos": 1136865848832.0, + "grad_norm": 0.048311873953814935, + "language_loss": 0.84138036, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.85274303, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.73046875, + "step": 5141, + "time_per_iteration": 3.5308704376220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136536, + "balance_loss_mlp": 1.06334126, + "epoch": 0.9892266256252404, + "flos": 567730979328.0, + "grad_norm": 0.035998364171371054, + "language_loss": 0.85842848, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.86979377, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.73193359, + "step": 5142, + "time_per_iteration": 2.7318220138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136331, + "balance_loss_mlp": 1.06323171, + "epoch": 0.989419007310504, + "flos": 641870587392.0, + "grad_norm": 0.04445949933168621, + "language_loss": 0.88850874, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.899872, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.73095703, + "step": 5143, + "time_per_iteration": 2.9219346046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136317, + "balance_loss_mlp": 1.0632174, + "epoch": 0.9896113889957676, + "flos": 456448433664.0, + "grad_norm": 0.03712500975558181, + "language_loss": 0.85754621, + "learning_rate": 2.829962097138372e-07, + "loss": 0.86890936, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.73095703, + "step": 5144, + "time_per_iteration": 2.6135852336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113489, + "balance_loss_mlp": 1.06164801, + "epoch": 0.9898037706810312, + "flos": 568419188736.0, + "grad_norm": 0.036970241662831894, + "language_loss": 0.85173666, + "learning_rate": 2.726128618033008e-07, + "loss": 0.86308557, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.73242188, + "step": 5145, + "time_per_iteration": 2.728771209716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138741, + "balance_loss_mlp": 1.06702423, + "epoch": 0.9899961523662947, + "flos": 1553447431680.0, + "grad_norm": 0.0039494611042856405, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79285163, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.71875, + "step": 5146, + "time_per_iteration": 4.958428382873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135081, + "balance_loss_mlp": 1.06183898, + "epoch": 0.9901885340515583, + "flos": 611947392000.0, + "grad_norm": 0.03732558697558194, + "language_loss": 0.89710462, + "learning_rate": 2.524282040715642e-07, + "loss": 0.90845543, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.73242188, + "step": 5147, + "time_per_iteration": 2.9494400024414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135311, + "balance_loss_mlp": 1.06206846, + "epoch": 0.9903809157368219, + "flos": 518493711360.0, + "grad_norm": 0.03472325618842919, + "language_loss": 0.86850142, + "learning_rate": 2.426269020866512e-07, + "loss": 0.87985462, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.73242188, + "step": 5148, + "time_per_iteration": 2.606642007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113491, + "balance_loss_mlp": 1.06166744, + "epoch": 0.9905732974220854, + "flos": 1102197046272.0, + "grad_norm": 0.03711196297456148, + "language_loss": 0.85352963, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.86487871, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.73242188, + "step": 5149, + "time_per_iteration": 3.4157660007476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134965, + "balance_loss_mlp": 1.06177092, + "epoch": 0.990765679107349, + "flos": 859492686336.0, + "grad_norm": 0.04154402943927612, + "language_loss": 0.89084303, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.90219271, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.73193359, + "step": 5150, + "time_per_iteration": 3.1477768421173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136347, + "balance_loss_mlp": 1.06329572, + "epoch": 0.9909580607926125, + "flos": 492274076160.0, + "grad_norm": 0.03777042366534936, + "language_loss": 0.84356183, + "learning_rate": 2.143871490925542e-07, + "loss": 0.85492527, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.73046875, + "step": 5151, + "time_per_iteration": 2.630377769470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136659, + "balance_loss_mlp": 1.06355977, + "epoch": 0.9911504424778761, + "flos": 586159457280.0, + "grad_norm": 0.03962254747551654, + "language_loss": 0.84528565, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.85665214, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.73095703, + "step": 5152, + "time_per_iteration": 2.711332321166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135988, + "balance_loss_mlp": 1.06279361, + "epoch": 0.9913428241631397, + "flos": 571100167680.0, + "grad_norm": 0.04036611749146896, + "language_loss": 0.8638401, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.87520003, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.73193359, + "step": 5153, + "time_per_iteration": 2.7309064865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136524, + "balance_loss_mlp": 1.06347251, + "epoch": 0.9915352058484033, + "flos": 490711005696.0, + "grad_norm": 0.03270171907202174, + "language_loss": 0.90234423, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.91370946, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.73046875, + "step": 5154, + "time_per_iteration": 2.650282382965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113634, + "balance_loss_mlp": 1.06319273, + "epoch": 0.9917275875336667, + "flos": 745409639424.0, + "grad_norm": 0.034109817330924164, + "language_loss": 0.86935675, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.88072014, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.73144531, + "step": 5155, + "time_per_iteration": 2.986468553543091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135757, + "balance_loss_mlp": 1.06256294, + "epoch": 0.9919199692189303, + "flos": 509324133888.0, + "grad_norm": 0.03639310073850552, + "language_loss": 0.84705198, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.85840952, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.73193359, + "step": 5156, + "time_per_iteration": 2.647678852081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136146, + "balance_loss_mlp": 1.06299901, + "epoch": 0.9921123509041939, + "flos": 545010788352.0, + "grad_norm": 0.03592115779060212, + "language_loss": 0.8875376, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.89889908, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.73144531, + "step": 5157, + "time_per_iteration": 2.6703507900238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.0613631, + "epoch": 0.9923047325894575, + "flos": 672757240320.0, + "grad_norm": 0.03851308781628141, + "language_loss": 0.82369369, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.83503973, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.73242188, + "step": 5158, + "time_per_iteration": 2.822913408279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135232, + "balance_loss_mlp": 1.06198978, + "epoch": 0.9924971142747211, + "flos": 467624242176.0, + "grad_norm": 0.03828859510253023, + "language_loss": 0.85407376, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.86542612, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.73242188, + "step": 5159, + "time_per_iteration": 2.701911687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135482, + "balance_loss_mlp": 1.06223953, + "epoch": 0.9926894959599846, + "flos": 492562785792.0, + "grad_norm": 0.035031095902323076, + "language_loss": 0.8758896, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.88724446, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.73242188, + "step": 5160, + "time_per_iteration": 2.5947694778442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134979, + "balance_loss_mlp": 1.06173706, + "epoch": 0.9928818776452482, + "flos": 493372518912.0, + "grad_norm": 0.0398290144943764, + "language_loss": 0.85975552, + "learning_rate": 1.328673533166902e-07, + "loss": 0.87110531, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.73242188, + "step": 5161, + "time_per_iteration": 2.580611228942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136266, + "balance_loss_mlp": 1.06311941, + "epoch": 0.9930742593305117, + "flos": 547466184192.0, + "grad_norm": 0.04374439834283326, + "language_loss": 0.88636076, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.89772344, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.73144531, + "step": 5162, + "time_per_iteration": 2.765444755554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136135, + "balance_loss_mlp": 1.06294107, + "epoch": 0.9932666410157753, + "flos": 586615352832.0, + "grad_norm": 0.03608446377738685, + "language_loss": 0.90740782, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.91876918, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.73193359, + "step": 5163, + "time_per_iteration": 2.8404746055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136098, + "balance_loss_mlp": 1.06295085, + "epoch": 0.9934590227010388, + "flos": 538105225728.0, + "grad_norm": 0.036108153087719384, + "language_loss": 0.88640219, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.89776313, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.73144531, + "step": 5164, + "time_per_iteration": 2.660189390182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136245, + "balance_loss_mlp": 1.06319404, + "epoch": 0.9936514043863024, + "flos": 519060397056.0, + "grad_norm": 0.036393144495114126, + "language_loss": 0.91024756, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.92161, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.73046875, + "step": 5165, + "time_per_iteration": 2.62958025932312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136227, + "balance_loss_mlp": 1.06308019, + "epoch": 0.993843786071566, + "flos": 745995790848.0, + "grad_norm": 0.039050084286539895, + "language_loss": 0.85854822, + "learning_rate": 9.938472493803419e-08, + "loss": 0.86991048, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.73144531, + "step": 5166, + "time_per_iteration": 3.0344748497009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136102, + "balance_loss_mlp": 1.06305063, + "epoch": 0.9940361677568296, + "flos": 527008006656.0, + "grad_norm": 0.038807373304902144, + "language_loss": 0.87782025, + "learning_rate": 9.327042513251893e-08, + "loss": 0.88918126, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.73046875, + "step": 5167, + "time_per_iteration": 2.6882591247558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136249, + "balance_loss_mlp": 1.06310236, + "epoch": 0.9942285494420932, + "flos": 556746551808.0, + "grad_norm": 0.03797309079451297, + "language_loss": 0.85039365, + "learning_rate": 8.735020633177104e-08, + "loss": 0.86175615, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.73144531, + "step": 5168, + "time_per_iteration": 2.7696192264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134782, + "balance_loss_mlp": 1.06153989, + "epoch": 0.9944209311273566, + "flos": 587099446272.0, + "grad_norm": 0.03338211410978879, + "language_loss": 0.86810982, + "learning_rate": 8.162407083411872e-08, + "loss": 0.87945765, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.73242188, + "step": 5169, + "time_per_iteration": 2.7250516414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113501, + "balance_loss_mlp": 1.06176758, + "epoch": 0.9946133128126202, + "flos": 736856412672.0, + "grad_norm": 0.03340787079875126, + "language_loss": 0.8653456, + "learning_rate": 7.609202086272804e-08, + "loss": 0.87669569, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.73242188, + "step": 5170, + "time_per_iteration": 2.9989120960235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134876, + "balance_loss_mlp": 1.06163335, + "epoch": 0.9948056944978838, + "flos": 647180152320.0, + "grad_norm": 0.038233740097927245, + "language_loss": 0.86638784, + "learning_rate": 7.075405856526995e-08, + "loss": 0.87773657, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.73242188, + "step": 5171, + "time_per_iteration": 2.8077123165130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113494, + "balance_loss_mlp": 1.06169748, + "epoch": 0.9949980761831474, + "flos": 446796764160.0, + "grad_norm": 0.03800509693543743, + "language_loss": 0.90174496, + "learning_rate": 6.561018601414226e-08, + "loss": 0.91309434, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.73242188, + "step": 5172, + "time_per_iteration": 2.5135178565979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136048, + "balance_loss_mlp": 1.06285322, + "epoch": 0.995190457868411, + "flos": 436558943232.0, + "grad_norm": 0.036425615927118446, + "language_loss": 0.90128154, + "learning_rate": 6.066040520641414e-08, + "loss": 0.91264206, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.73193359, + "step": 5173, + "time_per_iteration": 2.5291202068328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136192, + "balance_loss_mlp": 1.06309295, + "epoch": 0.9953828395536745, + "flos": 515189650944.0, + "grad_norm": 0.03877686635677472, + "language_loss": 0.85795176, + "learning_rate": 5.590471806377062e-08, + "loss": 0.8693136, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.73095703, + "step": 5174, + "time_per_iteration": 2.562049150466919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113637, + "balance_loss_mlp": 1.06331813, + "epoch": 0.995575221238938, + "flos": 480807556608.0, + "grad_norm": 0.03833934527177391, + "language_loss": 0.86279237, + "learning_rate": 5.134312643245709e-08, + "loss": 0.87415606, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.73046875, + "step": 5175, + "time_per_iteration": 2.563511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136236, + "balance_loss_mlp": 1.06304121, + "epoch": 0.9957676029242016, + "flos": 588931760640.0, + "grad_norm": 0.04190279888706188, + "language_loss": 0.81519473, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.82655716, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.73193359, + "step": 5176, + "time_per_iteration": 2.7635369300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136434, + "balance_loss_mlp": 1.0632391, + "epoch": 0.9959599846094652, + "flos": 427354437120.0, + "grad_norm": 0.03983399888286843, + "language_loss": 0.84399128, + "learning_rate": 4.280223671243588e-08, + "loss": 0.85535556, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.73193359, + "step": 5177, + "time_per_iteration": 2.482015371322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136347, + "balance_loss_mlp": 1.06315267, + "epoch": 0.9961523662947287, + "flos": 612850450944.0, + "grad_norm": 0.03375587395159785, + "language_loss": 0.84842086, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.85978431, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.73193359, + "step": 5178, + "time_per_iteration": 2.859119415283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113476, + "balance_loss_mlp": 1.0615176, + "epoch": 0.9963447479799923, + "flos": 551842490880.0, + "grad_norm": 0.036286768119618104, + "language_loss": 0.78752828, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.79887587, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.73242188, + "step": 5179, + "time_per_iteration": 2.689319372177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134711, + "balance_loss_mlp": 1.0614686, + "epoch": 0.9965371296652559, + "flos": 627010684416.0, + "grad_norm": 0.0387871810816858, + "language_loss": 0.93553257, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.94687963, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.73242188, + "step": 5180, + "time_per_iteration": 2.787081480026245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134841, + "balance_loss_mlp": 1.0615989, + "epoch": 0.9967295113505195, + "flos": 640791610368.0, + "grad_norm": 0.033783594667719394, + "language_loss": 0.86376369, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.87511212, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.73242188, + "step": 5181, + "time_per_iteration": 2.886129856109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134863, + "balance_loss_mlp": 1.06162131, + "epoch": 0.996921893035783, + "flos": 608543275008.0, + "grad_norm": 0.03960364803100891, + "language_loss": 0.8131901, + "learning_rate": 2.484679859793282e-08, + "loss": 0.82453877, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.73242188, + "step": 5182, + "time_per_iteration": 2.773259162902832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135034, + "balance_loss_mlp": 1.06179142, + "epoch": 0.9971142747210465, + "flos": 645345836544.0, + "grad_norm": 0.03666439365730574, + "language_loss": 0.86077094, + "learning_rate": 2.183802848243488e-08, + "loss": 0.87212121, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.73242188, + "step": 5183, + "time_per_iteration": 2.7957136631011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134817, + "balance_loss_mlp": 1.06157458, + "epoch": 0.9973066564063101, + "flos": 1042461445632.0, + "grad_norm": 0.035212511344882604, + "language_loss": 0.85020685, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.86155498, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.73242188, + "step": 5184, + "time_per_iteration": 3.393714427947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134906, + "balance_loss_mlp": 1.06166399, + "epoch": 0.9974990380915737, + "flos": 666342501888.0, + "grad_norm": 0.03904258073685639, + "language_loss": 0.89533353, + "learning_rate": 1.640281555587153e-08, + "loss": 0.90668261, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.73242188, + "step": 5185, + "time_per_iteration": 2.8711843490600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134943, + "balance_loss_mlp": 1.06170106, + "epoch": 0.9976914197768373, + "flos": 719378657280.0, + "grad_norm": 0.03669739544295146, + "language_loss": 0.82640398, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.83775342, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.73242188, + "step": 5186, + "time_per_iteration": 2.8739511966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134603, + "balance_loss_mlp": 1.06136048, + "epoch": 0.9978838014621008, + "flos": 519331642368.0, + "grad_norm": 0.038670541148839846, + "language_loss": 0.84187782, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.8532238, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.73242188, + "step": 5187, + "time_per_iteration": 2.594606637954712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134963, + "balance_loss_mlp": 1.06172121, + "epoch": 0.9980761831473643, + "flos": 604605399552.0, + "grad_norm": 0.03068761649528877, + "language_loss": 0.88198936, + "learning_rate": 9.70582968801148e-09, + "loss": 0.89333904, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.73242188, + "step": 5188, + "time_per_iteration": 2.778276205062866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134568, + "balance_loss_mlp": 1.06132579, + "epoch": 0.9982685648326279, + "flos": 454457665536.0, + "grad_norm": 0.03724729407224267, + "language_loss": 0.94649714, + "learning_rate": 7.861726879943021e-09, + "loss": 0.95784283, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.73242188, + "step": 5189, + "time_per_iteration": 2.542572259902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134686, + "balance_loss_mlp": 1.06144357, + "epoch": 0.9984609465178915, + "flos": 482461951488.0, + "grad_norm": 0.036682028146604845, + "language_loss": 0.83087814, + "learning_rate": 6.211738235173403e-09, + "loss": 0.84222496, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.73242188, + "step": 5190, + "time_per_iteration": 2.675111770629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134834, + "balance_loss_mlp": 1.06159234, + "epoch": 0.9986533282031551, + "flos": 478011784704.0, + "grad_norm": 0.03381508269385847, + "language_loss": 0.87848723, + "learning_rate": 4.755864394301312e-09, + "loss": 0.8898356, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.73242188, + "step": 5191, + "time_per_iteration": 2.699894666671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134426, + "balance_loss_mlp": 1.06118381, + "epoch": 0.9988457098884186, + "flos": 643157683200.0, + "grad_norm": 0.03641547995983512, + "language_loss": 0.90973437, + "learning_rate": 3.494105922541291e-09, + "loss": 0.92107868, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.73242188, + "step": 5192, + "time_per_iteration": 2.7941293716430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134587, + "balance_loss_mlp": 1.06139255, + "epoch": 0.9990380915736822, + "flos": 397188194304.0, + "grad_norm": 0.039725697909644885, + "language_loss": 0.93135947, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.94270533, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.73193359, + "step": 5193, + "time_per_iteration": 2.439404010772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134917, + "balance_loss_mlp": 1.06172252, + "epoch": 0.9992304732589458, + "flos": 577296053760.0, + "grad_norm": 0.03644077357659133, + "language_loss": 0.88674903, + "learning_rate": 1.552936970405927e-09, + "loss": 0.89809811, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.73193359, + "step": 5194, + "time_per_iteration": 2.783804178237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135031, + "balance_loss_mlp": 1.06178868, + "epoch": 0.9994228549442093, + "flos": 545390822400.0, + "grad_norm": 0.047086410884293904, + "language_loss": 0.81329274, + "learning_rate": 8.735272437054853e-10, + "loss": 0.82464302, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.73242188, + "step": 5195, + "time_per_iteration": 2.6740100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134949, + "balance_loss_mlp": 1.06170666, + "epoch": 0.9996152366294728, + "flos": 1473468324864.0, + "grad_norm": 0.039118675807487395, + "language_loss": 0.8557514, + "learning_rate": 3.882343933003796e-10, + "loss": 0.86710095, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.73242188, + "step": 5196, + "time_per_iteration": 3.72202467918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134237, + "balance_loss_mlp": 1.06137657, + "epoch": 0.9998076183147364, + "flos": 620085656064.0, + "grad_norm": 0.07900250756549031, + "language_loss": 0.7408278, + "learning_rate": 9.70586077619906e-11, + "loss": 0.75217021, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.72851562, + "step": 5197, + "time_per_iteration": 4.020706653594971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140059, + "balance_loss_mlp": 1.0678184, + "epoch": 1.0, + "flos": 1293860926464.0, + "grad_norm": 0.020340605077202825, + "language_loss": 0.85357249, + "learning_rate": 0.0, + "loss": 0.86497313, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.72412109, + "step": 5198, + "time_per_iteration": 5.7421464920043945 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.1743145354461184e+16, + "train_loss": 0.9366320672059069, + "train_runtime": 15521.3015, + "train_samples_per_second": 42.864, + "train_steps_per_second": 0.335 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1743145354461184e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_perturbed/training_args.bin b/sft_pretrain/Full_smoe_perturbed/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dec1b7e0db130318069c72434f32c2789119b732 --- /dev/null +++ b/sft_pretrain/Full_smoe_perturbed/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c077e5103b778b39b648e3a5a2e73e36256d052f444290e14e15f87c36156cb +size 7992